pilco.py
import hydra
import numpy as np
import gym
import barl.envs
from barl.util.misc_util import Dumper
from barl.envs.wrappers import NormalizedEnv
from pilco.models import PILCO
from pilco.controllers import RbfController, LinearController
from pilco.rewards import ExponentialReward
import tensorflow as tf
from tqdm import trange
from gpflow import set_trainable
# from tensorflow import logging
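

# Execute one episode in `env` for up to `timesteps` steps. Each action is either
# sampled at random or computed by the PILCO controller, and is repeated SUBS times
# (action repeat / frame skip). Returns the stacked state-action inputs X, the
# state-difference targets Y = x' - x, and the sampled and full episode returns.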
def rollout(env, pilco, timesteps, verbose=True, random=False, SUBS=1, render=False):
    X = []
    Y = []
    x = env.reset()
    ep_return_full = 0
    ep_return_sampled = 0
    for timestep in range(timesteps):
        if render:
            env.render()
        u = policy(env, pilco, x, random)
        for i in range(SUBS):
            x_new, r, done, _ = env.step(u)
            ep_return_full += r
            if done:
                break
            if render:
                env.render()
        if verbose:
            print("Action: ", u)
            print("State : ", x_new)
            print("Return so far: ", ep_return_full)
        X.append(np.hstack((x, u)))
        Y.append(x_new - x)
        ep_return_sampled += r
        x = x_new
        if done:
            break
    return np.stack(X), np.stack(Y), ep_return_sampled, ep_return_full
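

# Select an action: a random sample from the action space, or the PILCO
# controller's action for the current state.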
def policy(env, pilco, x, random):
    if random:
        return env.action_space.sample()
    else:
        return pilco.compute_action(x[None, :])[0, :]
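

# Fix the likelihood (observation noise) variance of each GP dynamics model to the
# given value and mark it non-trainable.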
def assign_variance(models, variance):
    for model in models:
        model.likelihood.variance.assign(variance)
        set_trainable(model.likelihood.variance, False)
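

# Main PILCO loop: collect initial random rollouts, then alternate between fitting
# the GP dynamics models, optimizing the policy, evaluating it, and growing the
# dataset with the newly collected transitions.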
@hydra.main(config_path='cfg', config_name='pilco')
def main(config):
    # set seeds
    np.random.seed(config.seed)
    tf.random.set_seed(config.seed)

    env_name = config.env.name if config.env.name != 'bacpendulum-v0' else 'Pendulum-v1'
    env = NormalizedEnv(gym.make(env_name))
    dumper = Dumper(config.name)
    horizon = config.env.max_path_length
    target = np.array(config.env.target).astype(np.float64)
    weights = np.diag(config.env.weights).astype(np.float64)
    m_init = np.array(config.env.m_init)[None, :].astype(np.float64)
    s_init = np.diag(config.env.s_init).astype(np.float64)
    restarts = 2
    maxiter = 50
    max_action = 1. if config.env.name != 'bacpendulum-v0' else 2.

    # Initial random rollouts to generate a dataset
    X, Y, _, _ = rollout(env=env, pilco=None, random=True, timesteps=horizon,
                         render=False, SUBS=config.env.SUBS, verbose=False)
    # keep only the first 10 transitions of the initial rollout
    X = X[:10, ...]
    Y = Y[:10, ...]
    for i in range(config.init_random_rollouts):
        X_, Y_, _, _ = rollout(env=env, pilco=None, random=True, timesteps=horizon,
                               render=False, SUBS=config.env.SUBS, verbose=False)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=config.num_basis_functions, max_action=max_action)
    # controller = LinearController(state_dim=state_dim, control_dim=control_dim)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)

    X = X.astype(np.float64)
    Y = Y.astype(np.float64)
    pilco = PILCO((X, Y), controller=controller, horizon=horizon, reward=R, m_init=m_init, S_init=s_init)

    # Example of a user-provided reward function, setting a custom target state
    # R = ExponentialReward(state_dim=state_dim, t=np.array([0.1, 0, 0, 0]))
    # pilco = PILCO(X, Y, controller=controller, horizon=40, reward=R)

    # Fix the GP likelihood (observation noise) variance to a small value
    variance = 0.001
    assign_variance(pilco.mgpr.models, variance)

    for rollouts in range(config.num_rl_trials):
        pilco.optimize_models(maxiter=maxiter, restarts=restarts)
        try:
            pilco.optimize_policy(maxiter=maxiter, restarts=restarts)
        except Exception:
            # if policy optimization fails, relax the fixed noise variance and retry
            variance *= 10
            assign_variance(pilco.mgpr.models, variance)
            pilco.optimize_policy(maxiter=maxiter, restarts=restarts)

        # Evaluate the current policy
        eval_returns = []
        pbar = trange(config.num_eval_trials)
        for _ in pbar:
            X_new, Y_new, _, ep_return = rollout(env=env, pilco=pilco, timesteps=horizon,
                                                 render=False, SUBS=config.env.SUBS, verbose=False)
            eval_returns.append(ep_return)
            stats = {"Mean Return": np.mean(eval_returns), "Std Return": np.std(eval_returns)}
            pbar.set_postfix(stats)
        dumper.add("Eval Returns", eval_returns)
        dumper.add("Eval ndata", X.shape[0])
        dumper.save()

        # Update dataset with the last evaluation rollout
        X = np.vstack((X, X_new))
        Y = np.vstack((Y, Y_new))
        pilco.mgpr.set_data((X, Y))


if __name__ == '__main__':
    main()
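

# The script is driven by a hydra config at cfg/pilco.yaml (see the @hydra.main
# decorator above). A minimal sketch of the fields it reads is given below; the key
# names are taken from the config accesses in main(), but the values shown are
# illustrative assumptions, not the repository's actual settings.
#
#   name: pilco_pendulum
#   seed: 0
#   init_random_rollouts: 2
#   num_basis_functions: 50
#   num_rl_trials: 10
#   num_eval_trials: 5
#   env:
#     name: bacpendulum-v0
#     max_path_length: 200
#     SUBS: 1
#     target: [0.0, 0.0, 0.0]
#     weights: [1.0, 1.0, 1.0]
#     m_init: [0.0, 0.0, 0.0]
#     s_init: [0.01, 0.01, 0.01]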