pinocchio 3.3.0
A fast and flexible implementation of Rigid Body Dynamics algorithms and their analytical derivatives
continuous.py
1 """
2 Deep actor-critic network,
3 From "Continuous control with deep reinforcement learning",
4 by Lillicrap et al, arXiv:1509.02971
5 """
6 
7 import random
8 import signal
9 import time
10 from collections import deque
11 
12 import matplotlib.pyplot as plt
13 import numpy as np
14 import tensorflow as tf
15 import tflearn
16 from pendulum import Pendulum
17 
18 
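# NOTE: this script uses the TensorFlow 1.x graph API (placeholders, sessions)
# together with tflearn, and the Pendulum environment imported above
# (pendulum.py, presumably shipped alongside this example).
# The random seeds below are derived from the wall clock, so every run differs.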
RANDOM_SEED = int((time.time() % 10) * 1000)
print("Seed = %d" % RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
n_init = tflearn.initializations.truncated_normal(seed=RANDOM_SEED)
u_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003, seed=RANDOM_SEED)

NEPISODES = 100  # Max training episodes
NSTEPS = 100  # Max episode length
QVALUE_LEARNING_RATE = 0.001  # Base learning rate for the Q-value network
POLICY_LEARNING_RATE = 0.0001  # Base learning rate for the policy network
DECAY_RATE = 0.99  # Discount factor
UPDATE_RATE = 0.01  # Homotopy rate to update the target networks
REPLAY_SIZE = 10000  # Size of the replay buffer
BATCH_SIZE = 64  # Number of points fed to each stochastic-gradient step
NH1 = NH2 = 250  # Hidden layer sizes

env = Pendulum(1)  # Continuous pendulum
env.withSinCos = True  # State is dim-3: (cos q, sin q, qdot) ...
NX = env.nobs  # ... with (q, qdot) only, training also converges but needs ~2x more neurons.
NU = env.nu  # Control is dim-1: joint torque


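# Critic: QValueNetwork approximates Q(x,u) with two branches (one for the state,
# one for the control) whose outputs are summed and passed through a ReLU layer.
# setupOptim() adds a mean-square regression toward a reference Q and exposes
# dQ/du, later fed to the actor update. setupTargetAssign() builds the soft
# ("homotopy") update of a target copy:
#     target <- tau * ref + (1 - tau) * target,   with tau = UPDATE_RATE.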
class QValueNetwork:
    def __init__(self):
        nvars = len(tf.trainable_variables())

        x = tflearn.input_data(shape=[None, NX])
        u = tflearn.input_data(shape=[None, NU])

        netx1 = tflearn.fully_connected(x, NH1, weights_init=n_init, activation="relu")
        netx2 = tflearn.fully_connected(netx1, NH2, weights_init=n_init)
        netu1 = tflearn.fully_connected(
            u, NH1, weights_init=n_init, activation="linear"
        )
        netu2 = tflearn.fully_connected(netu1, NH2, weights_init=n_init)
        net = tflearn.activation(netx2 + netu2, activation="relu")
        qvalue = tflearn.fully_connected(net, 1, weights_init=u_init)

        self.x = x  # Network state <x> input in Q(x,u)
        self.u = u  # Network control <u> input in Q(x,u)
        self.qvalue = qvalue  # Network output <Q>
        self.variables = tf.trainable_variables()[nvars:]  # Variables to be trained
        self.hidens = [netx1, netx2, netu1, netu2]  # Hidden layers, for debug

    def setupOptim(self):
        qref = tf.placeholder(tf.float32, [None, 1])
        loss = tflearn.mean_square(qref, self.qvalue)
        optim = tf.train.AdamOptimizer(QVALUE_LEARNING_RATE).minimize(loss)
        gradient = tf.gradients(self.qvalue, self.u)[0] / float(BATCH_SIZE)

        self.qref = qref  # Reference Q-values
        self.optim = optim  # Optimizer
        self.gradient = gradient  # Gradient of Q wrt the control dQ/du (for policy training)
        return self

    def setupTargetAssign(self, nominalNet, tau=UPDATE_RATE):
        self.update_variables = [
            target.assign(tau * ref + (1 - tau) * target)
            for target, ref in zip(self.variables, nominalNet.variables)
        ]
        return self


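# Actor: PolicyNetwork maps a state x to a torque pi(x) in [-umax, umax] through a
# tanh output layer scaled by env.umax. setupOptim() implements the deterministic
# policy gradient: tf.gradients(policy, variables, -qgradient) back-propagates
# -dQ/du through pi, so applying these gradients performs ascent on Q(x, pi(x)).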
class PolicyNetwork:
    def __init__(self):
        nvars = len(tf.trainable_variables())

        x = tflearn.input_data(shape=[None, NX])
        net = tflearn.fully_connected(x, NH1, activation="relu", weights_init=n_init)
        net = tflearn.fully_connected(net, NH2, activation="relu", weights_init=n_init)
        policy = (
            tflearn.fully_connected(net, NU, activation="tanh", weights_init=u_init)
            * env.umax
        )

        self.x = x  # Network input <x> in Pi(x)
        self.policy = policy  # Network output <Pi>
        self.variables = tf.trainable_variables()[nvars:]  # Variables to be trained

    def setupOptim(self):
        qgradient = tf.placeholder(tf.float32, [None, NU])
        grad = tf.gradients(self.policy, self.variables, -qgradient)
        optim = tf.train.AdamOptimizer(POLICY_LEARNING_RATE).apply_gradients(
            zip(grad, self.variables)
        )

        self.qgradient = qgradient  # Q-value gradient wrt control (input value)
        self.optim = optim  # Optimizer
        return self

    def setupTargetAssign(self, nominalNet, tau=UPDATE_RATE):
        self.update_variables = [
            target.assign(tau * ref + (1 - tau) * target)
            for target, ref in zip(self.variables, nominalNet.variables)
        ]
        return self


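# Replay memory: each ReplayItem stores one transition (x, u, r, done, x2); the
# deque below keeps the most recent REPLAY_SIZE transitions so that minibatches
# can be sampled roughly independently of the current trajectory.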
class ReplayItem:
    def __init__(self, x, u, r, d, x2):
        self.x = x
        self.u = u
        self.reward = r
        self.done = d
        self.x2 = x2


replayDeque = deque()


policy = PolicyNetwork().setupOptim()
policyTarget = PolicyNetwork().setupTargetAssign(policy)

qvalue = QValueNetwork().setupOptim()
qvalueTarget = QValueNetwork().setupTargetAssign(qvalue)

sess = tf.InteractiveSession()
tf.global_variables_initializer().run()

# Uncomment to save or restore networks
# tf.train.Saver().restore(sess, "netvalues/actorcritic.pre.ckpt")
# tf.train.Saver().save(sess, "netvalues/actorcritic.full.ckpt")


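# Roll out the current greedy policy in the viewer for visual inspection of the
# training progress; also bound to Ctrl-Z via the signal handler below.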
def rendertrial(maxiter=NSTEPS, verbose=True):
    x = env.reset()
    rsum = 0.0
    for i in range(maxiter):
        u = sess.run(policy.policy, feed_dict={policy.x: x.T})
        x, reward = env.step(u)
        env.render()
        time.sleep(1e-2)
        rsum += reward
    if verbose:
        print("Lasted ", i, " timestep -- total reward:", rsum)


signal.signal(
    signal.SIGTSTP, lambda x, y: rendertrial()
)  # Roll-out when CTRL-Z is pressed


h_rwd = []
h_qva = []
h_ste = []

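# Main DDPG loop (cf. Lillicrap et al.): act with the greedy policy plus a decaying
# exploration noise, store each transition in the replay memory, and, once the
# memory holds more than one batch,
#   - regress Q(x, u) toward r + DECAY_RATE * Q'(x2, pi'(x2))        (critic update),
#   - step the policy parameters along dQ/du evaluated at u = pi(x)  (actor update),
#   - track the nominal networks with their targets by homotopy (UPDATE_RATE).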
for episode in range(1, NEPISODES):
    x = env.reset().T
    rsum = 0.0

    for step in range(NSTEPS):
        u = sess.run(policy.policy, feed_dict={policy.x: x})  # Greedy policy ...
        u += 1.0 / (1.0 + episode + step)  # ... with noise
        x2, r = env.step(u)
        x2 = x2.T
        done = False  # pendulum scenario is endless.

        replayDeque.append(ReplayItem(x, u, r, done, x2))  # Feed replay memory ...
        if len(replayDeque) > REPLAY_SIZE:
            replayDeque.popleft()  # ... with FIFO forgetting.

        rsum += r
        if done or np.linalg.norm(x - x2) < 1e-3:
            break  # Break when pendulum is still.
        x = x2

        # Start optimizing networks when memory size > batch size.
        if len(replayDeque) > BATCH_SIZE:
            batch = random.sample(
                replayDeque, BATCH_SIZE
            )  # Random batch from replay memory.
            x_batch = np.vstack([b.x for b in batch])
            u_batch = np.vstack([b.u for b in batch])
            r_batch = np.vstack([b.reward for b in batch])
            d_batch = np.vstack([b.done for b in batch])
            x2_batch = np.vstack([b.x2 for b in batch])

            # Compute the next-state value Q'(x2, pi'(x2)) from the target networks.
            u2_batch = sess.run(
                policyTarget.policy, feed_dict={policyTarget.x: x2_batch}
            )
            q2_batch = sess.run(
                qvalueTarget.qvalue,
                feed_dict={qvalueTarget.x: x2_batch, qvalueTarget.u: u2_batch},
            )
            qref_batch = r_batch + (d_batch == False) * (DECAY_RATE * q2_batch)

            # Update qvalue to solve HJB constraint: q = r + q'
            sess.run(
                qvalue.optim,
                feed_dict={
                    qvalue.x: x_batch,
                    qvalue.u: u_batch,
                    qvalue.qref: qref_batch,
                },
            )

            # Compute approximate policy gradient ...
            u_targ = sess.run(policy.policy, feed_dict={policy.x: x_batch})
            qgrad = sess.run(
                qvalue.gradient, feed_dict={qvalue.x: x_batch, qvalue.u: u_targ}
            )
            # ... and take an optimization step along this gradient.
            sess.run(
                policy.optim, feed_dict={policy.x: x_batch, policy.qgradient: qgrad}
            )

            # Update target networks by homotopy.
            sess.run(policyTarget.update_variables)
            sess.run(qvalueTarget.update_variables)

    # \\\END_FOR step in range(NSTEPS)

    # Display and logging (not mandatory).
    maxq = (
        np.max(
            sess.run(qvalue.qvalue, feed_dict={qvalue.x: x_batch, qvalue.u: u_batch})
        )
        if "x_batch" in locals()
        else 0
    )
    print(
        f"Ep#{episode:3d}: lasted {step:d} steps, "
        f"reward={rsum:3.0f}, max qvalue={maxq:2.3f}"
    )
    h_rwd.append(rsum)
    h_qva.append(maxq)
    h_ste.append(step)
    if not (episode + 1) % 20:
        rendertrial(100)

# \\\END_FOR episode in range(NEPISODES)

print("Average reward during trials: %.3f" % (sum(h_rwd) / NEPISODES))
rendertrial()
plt.plot(np.cumsum(h_rwd) / range(1, NEPISODES))
plt.show()