pinocchio  3.7.0
A fast and flexible implementation of Rigid Body Dynamics algorithms and their analytical derivatives
 
continuous.py
"""
Deep actor-critic network,
from "Continuous control with deep reinforcement learning",
by Lillicrap et al., arXiv:1509.02971
"""

import random
import signal
import time
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tflearn
from pendulum import Pendulum

# --- Random seed
RANDOM_SEED = int((time.time() % 10) * 1000)
print(f"Seed = {RANDOM_SEED}")
np.random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
n_init = tflearn.initializations.truncated_normal(seed=RANDOM_SEED)
u_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003, seed=RANDOM_SEED)

# --- Hyper parameters
NEPISODES = 100  # Max number of training episodes
NSTEPS = 100  # Max episode length
QVALUE_LEARNING_RATE = 0.001  # Base learning rate for the Q-value network
POLICY_LEARNING_RATE = 0.0001  # Base learning rate for the policy network
DECAY_RATE = 0.99  # Discount factor
UPDATE_RATE = 0.01  # Homotopy rate to update the target networks
REPLAY_SIZE = 10000  # Size of the replay buffer
BATCH_SIZE = 64  # Number of points fed to each stochastic gradient step
NH1 = NH2 = 250  # Hidden layer size
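
# The two rates above drive the two DDPG update rules used in the training loop below:
#   - critic target (Bellman/HJB backup, discount DECAY_RATE = gamma):
#       qref = r + gamma * Q'(x', Pi'(x'))
#   - soft ("homotopy") target-network update (UPDATE_RATE = tau):
#       theta_target <- tau * theta + (1 - tau) * theta_target
# Both appear below, in the batch update and in setupTargetAssign() respectively.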

# --- Environment
env = Pendulum(1)  # Continuous pendulum
env.withSinCos = True  # State is dim-3: (cos q, sin q, qdot) ...
NX = env.nobs  # ... with the raw (q, qdot) state, training converges but needs ~2x more neurons.
NU = env.nu  # Control is dim-1: joint torque
# --- Q-value and policy networks


class QValueNetwork:
    def __init__(self):
        nvars = len(tf.trainable_variables())

        x = tflearn.input_data(shape=[None, NX])
        u = tflearn.input_data(shape=[None, NU])

        netx1 = tflearn.fully_connected(x, NH1, weights_init=n_init, activation="relu")
        netx2 = tflearn.fully_connected(netx1, NH2, weights_init=n_init)
        netu1 = tflearn.fully_connected(u, NH1, weights_init=n_init, activation="linear")
        netu2 = tflearn.fully_connected(netu1, NH2, weights_init=n_init)
        net = tflearn.activation(netx2 + netu2, activation="relu")
        qvalue = tflearn.fully_connected(net, 1, weights_init=u_init)

        self.x = x  # Network state <x> input in Q(x,u)
        self.u = u  # Network control <u> input in Q(x,u)
        self.qvalue = qvalue  # Network output <Q>
        # Variables to be trained
        self.variables = tf.trainable_variables()[nvars:]
        self.hidens = [netx1, netx2, netu1, netu2]  # Hidden layers for debug

    def setupOptim(self):
        qref = tf.placeholder(tf.float32, [None, 1])
        loss = tflearn.mean_square(qref, self.qvalue)
        optim = tf.train.AdamOptimizer(QVALUE_LEARNING_RATE).minimize(loss)
        gradient = tf.gradients(self.qvalue, self.u)[0] / float(BATCH_SIZE)

        self.qref = qref  # Reference Q-values
        self.optim = optim  # Optimizer
        self.gradient = gradient  # Gradient of Q wrt the control dQ/du (for policy training)
        return self

    def setupTargetAssign(self, nominalNet, tau=UPDATE_RATE):
        self.update_variables = [
            target.assign(tau * ref + (1 - tau) * target)
            for target, ref in zip(self.variables, nominalNet.variables)
        ]
        return self

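# A note on the critic above: QValueNetwork feeds the state x (through a ReLU layer)
# and the control u (through linear layers) into two separate branches, sums them at
# the NH2-wide level, applies a ReLU, and reads out a scalar Q(x,u) through a final
# layer initialized with the small uniform weights u_init, which keeps the initial
# Q-values close to zero.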

class PolicyNetwork:
    def __init__(self):
        nvars = len(tf.trainable_variables())

        x = tflearn.input_data(shape=[None, NX])
        net = tflearn.fully_connected(x, NH1, activation="relu", weights_init=n_init)
        net = tflearn.fully_connected(net, NH2, activation="relu", weights_init=n_init)
        policy = (
            tflearn.fully_connected(net, NU, activation="tanh", weights_init=u_init)
            * env.umax
        )

        self.x = x  # Network input <x> in Pi(x)
        self.policy = policy  # Network output <Pi>
        # Variables to be trained
        self.variables = tf.trainable_variables()[nvars:]

    def setupOptim(self):
        qgradient = tf.placeholder(tf.float32, [None, NU])
        grad = tf.gradients(self.policy, self.variables, -qgradient)
        optim = tf.train.AdamOptimizer(POLICY_LEARNING_RATE).apply_gradients(
            zip(grad, self.variables)
        )

        self.qgradient = qgradient  # Q-value gradient wrt control (input value)
        self.optim = optim  # Optimizer
        return self

    def setupTargetAssign(self, nominalNet, tau=UPDATE_RATE):
        self.update_variables = [
            target.assign(tau * ref + (1 - tau) * target)
            for target, ref in zip(self.variables, nominalNet.variables)
        ]
        return self
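# A note on PolicyNetwork.setupOptim() above: tf.gradients(policy, variables, -qgradient)
# weights the Jacobian d(Pi)/d(theta) by -dQ/du, so applying the result with Adam performs
# gradient *ascent* on Q(x, Pi(x)). This is the deterministic policy gradient of DDPG,
# with dQ/du supplied at run time through the qgradient placeholder.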


# --- Replay memory
class ReplayItem:
    def __init__(self, x, u, r, d, x2):
        self.x = x
        self.u = u
        self.reward = r
        self.done = d
        self.x2 = x2


replayDeque = deque()
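# The deque above is the replay buffer: the training loop appends every transition,
# drops the oldest ones once REPLAY_SIZE is exceeded (FIFO), and samples mini-batches
# uniformly at random, which decorrelates consecutive simulator steps.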

# --- TensorFlow initialization

policy = PolicyNetwork().setupOptim()
policyTarget = PolicyNetwork().setupTargetAssign(policy)

qvalue = QValueNetwork().setupOptim()
qvalueTarget = QValueNetwork().setupTargetAssign(qvalue)

sess = tf.InteractiveSession()
tf.global_variables_initializer().run()

# Uncomment to save or restore networks
# tf.train.Saver().restore(sess, "netvalues/actorcritic.pre.ckpt")
# tf.train.Saver().save(sess, "netvalues/actorcritic.full.ckpt")


def rendertrial(maxiter=NSTEPS, verbose=True):
    x = env.reset()
    rsum = 0.0
    for i in range(maxiter):
        u = sess.run(policy.policy, feed_dict={policy.x: x.T})
        x, reward = env.step(u)
        env.render()
        time.sleep(1e-2)
        rsum += reward
    if verbose:
        print("Lasted ", i, " timesteps -- total reward:", rsum)


signal.signal(
    signal.SIGTSTP, lambda x, y: rendertrial()
)  # Roll-out when CTRL-Z is pressed
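# signal.SIGTSTP is only defined on POSIX systems; on Windows this registration
# would have to be skipped.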

# History of search
h_rwd = []
h_qva = []
h_ste = []

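# Each iteration of the inner loop below follows the usual DDPG recipe: act with the
# current policy plus a decaying exploration offset, store the transition in the replay
# buffer, and, once the buffer holds more than BATCH_SIZE samples, regress the critic
# towards its Bellman target, step the actor along dQ/du, and softly update both
# target networks.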
# --- Training
for episode in range(1, NEPISODES):
    x = env.reset().T
    rsum = 0.0

    for step in range(NSTEPS):
        # Greedy policy ...
        u = sess.run(policy.policy, feed_dict={policy.x: x})
        u += 1.0 / (1.0 + episode + step)  # ... with a decaying exploration offset
        x2, r = env.step(u)
        x2 = x2.T
        done = False  # The pendulum scenario is endless.

        # Feed replay memory ...
        replayDeque.append(ReplayItem(x, u, r, done, x2))
        if len(replayDeque) > REPLAY_SIZE:
            replayDeque.popleft()  # ... with FIFO forgetting.

        rsum += r
        if done or np.linalg.norm(x - x2) < 1e-3:
            break  # Break when the pendulum is still.
        x = x2

        # Start optimizing networks when memory size > batch size.
        if len(replayDeque) > BATCH_SIZE:
            batch = random.sample(
                replayDeque, BATCH_SIZE
            )  # Random batch from replay memory.
            x_batch = np.vstack([b.x for b in batch])
            u_batch = np.vstack([b.u for b in batch])
            r_batch = np.vstack([b.reward for b in batch])
            d_batch = np.vstack([b.done for b in batch])
            x2_batch = np.vstack([b.x2 for b in batch])

            # Compute Q(x,u) from the target networks
            u2_batch = sess.run(
                policyTarget.policy, feed_dict={policyTarget.x: x2_batch}
            )
            q2_batch = sess.run(
                qvalueTarget.qvalue,
                feed_dict={qvalueTarget.x: x2_batch, qvalueTarget.u: u2_batch},
            )
            qref_batch = r_batch + (1 - d_batch) * (DECAY_RATE * q2_batch)

            # Update qvalue to solve the HJB constraint: q = r + q'
            sess.run(
                qvalue.optim,
                feed_dict={
                    qvalue.x: x_batch,
                    qvalue.u: u_batch,
                    qvalue.qref: qref_batch,
                },
            )

            # Compute the approximate policy gradient ...
            u_targ = sess.run(policy.policy, feed_dict={policy.x: x_batch})
            qgrad = sess.run(
                qvalue.gradient, feed_dict={qvalue.x: x_batch, qvalue.u: u_targ}
            )
            # ... and take an optimization step along this gradient.
            sess.run(
                policy.optim, feed_dict={policy.x: x_batch, policy.qgradient: qgrad}
            )

            # Update the target networks by homotopy.
            sess.run(policyTarget.update_variables)
            sess.run(qvalueTarget.update_variables)

    # \\\END_FOR step in range(NSTEPS)

    # Display and logging (not mandatory).
    maxq = (
        np.max(
            sess.run(qvalue.qvalue, feed_dict={qvalue.x: x_batch, qvalue.u: u_batch})
        )
        if "x_batch" in locals()
        else 0
    )
    print(
        f"Ep#{episode:3d}: lasted {step:d} steps, "
        f"reward={rsum:3.0f}, max qvalue={maxq:2.3f}"
    )
    h_rwd.append(rsum)
    h_qva.append(maxq)
    h_ste.append(step)
    if not (episode + 1) % 20:
        rendertrial(100)

# \\\END_FOR episode in range(NEPISODES)

print(f"Average reward during trials: {sum(h_rwd) / len(h_rwd):.3f}")
rendertrial()
plt.plot(np.cumsum(h_rwd) / range(1, NEPISODES))
plt.show()
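
# The script above targets TensorFlow 1.x APIs (tf.placeholder, tf.train.AdamOptimizer,
# tf.InteractiveSession) together with tflearn; under TensorFlow 2.x it would most
# likely need the tf.compat.v1 layer with eager execution disabled.
#
# A small optional sketch, not part of the original example: a sliding-window average
# of the per-episode rewards is often easier to read than the cumulative mean plotted
# above. The window length (10) is an arbitrary choice.
window = 10
if len(h_rwd) >= window:
    smoothed = np.convolve(h_rwd, np.ones(window) / window, mode="valid")
    plt.plot(smoothed)
    plt.xlabel("episode")
    plt.ylabel(f"mean reward over the last {window} episodes")
    plt.show()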