linear_programming.py
import numpy as np
import pulp


def i_a_to_indice(i, a, nb_a):
    """Map a state-action pair (i, a) to a unique flat index."""
    return i * nb_a + a


def indice_to_i_a(indice, nb_a):
    """Recover the pair (i, a) from a flat index.

    Inverse of "i_a_to_indice"."""
    return indice // nb_a, indice % nb_a
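
# Example of the flat indexing (hypothetical values): with nb_a = 3, the pair
# (i=2, a=1) maps to index 2*3 + 1 = 7, and indice_to_i_a(7, 3) recovers (2, 1).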

def LP_param_DMDP(mdp):
    """Build the matrices A and r such that the LP constraints read A.v >= r.
    - A = E - gamma*P, of size (nb_s * nb_a) x nb_s,
      where E[(i, a), j] = 1 iff i == j
      and P is the transition matrix of the MDP
    - r is the reward vector, of size (nb_s * nb_a) x 1"""
    r = np.zeros((mdp.nb_s * mdp.nb_a, 1))
    A = np.zeros((mdp.nb_s * mdp.nb_a, mdp.nb_s))
    for a in range(mdp.nb_a):
        for s in range(mdp.nb_s):
            indice = i_a_to_indice(s, a, mdp.nb_a)
            r[indice] = mdp.rewards[s, a]
            for j in range(mdp.nb_s):
                if j == s:
                    A[indice][j] = 1 - mdp.gamma * mdp.transition[s, a, j]
                else:
                    A[indice][j] = -mdp.gamma * mdp.transition[s, a, j]
    return A, r
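
# Worked example (hypothetical numbers): for nb_s = 2, nb_a = 1, gamma = 0.9,
# transition rows P(.|0,0) = [0.8, 0.2] and P(.|1,0) = [0.3, 0.7], and rewards
# r = [1, 2], the constraint matrix is
#     A = I - 0.9 * P = [[ 0.28, -0.18],
#                        [-0.27,  0.37]]
# so the constraints A.v >= r encode v(s) >= r(s, a) + gamma * E[v(s')] for
# every state-action pair.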

def LP_solving_DMDP(mdp):
    """Solve the discounted MDP by linear programming; return (v, pi)."""
    A, r = LP_param_DMDP(mdp)
    # Instantiate the problem (no spaces in the name, to avoid a PuLP warning)
    model = pulp.LpProblem("LP_minimizing_problem", pulp.LpMinimize)
    # Variables: one value per state, left free since rewards may be negative
    v = pulp.LpVariable.dicts("Value_Function",
                              range(mdp.nb_s),
                              cat='Continuous')
    # Objective function: minimize the sum of the state values
    model += pulp.lpSum([v[i] for i in range(mdp.nb_s)])
    # Constraints: A.v >= r, one row per state-action pair
    for j in range(mdp.nb_s * mdp.nb_a):
        model += pulp.lpSum([A[j, k] * v[k] for k in range(mdp.nb_s)]) >= r[j, 0]
    # Solve the problem
    model.solve()
    if pulp.LpStatus[model.status] != 'Optimal':
        raise RuntimeError("No optimal solution found, status = "
                           + pulp.LpStatus[model.status])
    v_final = [v[i].varValue for i in range(mdp.nb_s)]
    # Compute the corresponding greedy policy pi
    pi = np.zeros((mdp.nb_s, 1))
    for state in range(mdp.nb_s):
        values = [mdp.rewards[state, action]
                  + mdp.gamma * mdp.transition[state, action, :].dot(v_final)
                  for action in range(mdp.nb_a)]
        pi[state, 0] = np.argmax(values)
    return np.array(v_final), np.array(pi)
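

# Minimal usage sketch. The MDP class itself is not defined in this file, so a
# hypothetical stand-in (SimpleNamespace) with the attributes used above is
# assumed: nb_s, nb_a, gamma, rewards[s, a] and transition[s, a, s'].
if __name__ == "__main__":
    from types import SimpleNamespace

    toy_mdp = SimpleNamespace(
        nb_s=2,
        nb_a=2,
        gamma=0.9,
        rewards=np.array([[1.0, 0.0],
                          [0.0, 2.0]]),
        # transition[s, a, :] is a probability distribution over next states
        transition=np.array([[[0.8, 0.2], [0.1, 0.9]],
                             [[0.5, 0.5], [0.3, 0.7]]]),
    )
    v, pi = LP_solving_DMDP(toy_mdp)
    print("Optimal values:", v)
    print("Greedy policy:", pi.ravel())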