pinocchio 2.7.0
A fast and flexible implementation of Rigid Body Dynamics algorithms and their analytical derivatives
continuous.py
'''
Deep actor-critic network,
From "Continuous control with deep reinforcement learning", by Lillicrap et al, arXiv:1509.02971
'''
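# The script below is the DDPG (Deep Deterministic Policy Gradient) scheme of the
# paper cited above: a deterministic actor Pi(x) and a critic Q(x,u), each doubled
# by a slowly-moving "target" copy, trained from mini-batches drawn from a replay
# buffer. As a rough summary of the code below (not a restatement of the paper):
#   for each step:  u = Pi(x) + noise ; store (x, u, r, done, x') in the replay buffer
#                   sample a batch    ; qref = r + gamma*(1-done)*Qtarget(x', Pitarget(x'))
#                   critic step: minimize ||Q(x,u) - qref||^2
#                   actor step:  ascend Q(x, Pi(x)) using dQ/du
#                   targets:     theta_target += tau*(theta - theta_target)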

from pendulum import Pendulum
import tensorflow as tf
import numpy as np
import tflearn
import random
from collections import deque
import time
import signal
import matplotlib.pyplot as plt


RANDOM_SEED = int((time.time() % 10) * 1000)
print "Seed = %d" % RANDOM_SEED
np.random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
n_init = tflearn.initializations.truncated_normal(seed=RANDOM_SEED)
u_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003,
                                         seed=RANDOM_SEED)


NEPISODES = 100                 # Max number of training episodes
NSTEPS = 100                    # Max episode length
QVALUE_LEARNING_RATE = 0.001    # Base learning rate for the Q-value network
POLICY_LEARNING_RATE = 0.0001   # Base learning rate for the policy network
DECAY_RATE = 0.99               # Discount factor
UPDATE_RATE = 0.01              # Homotopy rate to update the target networks
REPLAY_SIZE = 10000             # Size of the replay buffer
BATCH_SIZE = 64                 # Number of points fed to each stochastic gradient step
NH1 = NH2 = 250                 # Hidden layer size


env = Pendulum(1)               # Continuous pendulum
env.withSinCos = True           # State is dim-3: (cos q, sin q, qdot) ...
NX = env.nobs                   # ... training also converges from (q, qdot), but needs 2x more neurons.
NU = env.nu                     # Control is dim-1: joint torque
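# Note: encoding the angle as (cos q, sin q) removes the 2*pi wrap-around
# discontinuity of the raw angle, which is presumably why the raw (q, qdot)
# encoding needs a larger network to converge, as noted above.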


class QValueNetwork:
    def __init__(self):
        nvars = len(tf.trainable_variables())

        x = tflearn.input_data(shape=[None, NX])
        u = tflearn.input_data(shape=[None, NU])

        netx1 = tflearn.fully_connected(x, NH1, weights_init=n_init, activation='relu')
        netx2 = tflearn.fully_connected(netx1, NH2, weights_init=n_init)
        netu1 = tflearn.fully_connected(u, NH1, weights_init=n_init, activation='linear')
        netu2 = tflearn.fully_connected(netu1, NH2, weights_init=n_init)
        net = tflearn.activation(netx2 + netu2, activation='relu')
        qvalue = tflearn.fully_connected(net, 1, weights_init=u_init)

        self.x = x                                           # Network state <x> input in Q(x,u)
        self.u = u                                           # Network control <u> input in Q(x,u)
        self.qvalue = qvalue                                 # Network output <Q>
        self.variables = tf.trainable_variables()[nvars:]    # Variables to be trained
        self.hidens = [netx1, netx2, netu1, netu2]           # Hidden layers, kept for debug

    def setupOptim(self):
        qref = tf.placeholder(tf.float32, [None, 1])
        loss = tflearn.mean_square(qref, self.qvalue)
        optim = tf.train.AdamOptimizer(QVALUE_LEARNING_RATE).minimize(loss)
        gradient = tf.gradients(self.qvalue, self.u)[0] / float(BATCH_SIZE)

        self.qref = qref            # Reference Q-values
        self.optim = optim          # Optimizer
        self.gradient = gradient    # Gradient of Q wrt the control dQ/du (for policy training)
        return self

    def setupTargetAssign(self, nominalNet, tau=UPDATE_RATE):
        self.update_variables = \
            [target.assign(tau * ref + (1 - tau) * target)
             for target, ref in zip(self.variables, nominalNet.variables)]
        return self
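# The target copies of both networks are never trained directly: setupTargetAssign
# only builds the assign ops target <- tau*nominal + (1-tau)*target. With
# tau = UPDATE_RATE = 0.01, the targets trail the trained networks slowly, which
# keeps the bootstrapped Q reference (computed below from the targets) stable.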

class PolicyNetwork:
    def __init__(self):
        nvars = len(tf.trainable_variables())

        x = tflearn.input_data(shape=[None, NX])
        net = tflearn.fully_connected(x, NH1, activation='relu', weights_init=n_init)
        net = tflearn.fully_connected(net, NH2, activation='relu', weights_init=n_init)
        policy = tflearn.fully_connected(net, NU, activation='tanh', weights_init=u_init) * env.umax

        self.x = x                                           # Network input <x> in Pi(x)
        self.policy = policy                                 # Network output <Pi>
        self.variables = tf.trainable_variables()[nvars:]    # Variables to be trained

    def setupOptim(self):
        qgradient = tf.placeholder(tf.float32, [None, NU])
        grad = tf.gradients(self.policy, self.variables, -qgradient)
        optim = tf.train.AdamOptimizer(POLICY_LEARNING_RATE).\
            apply_gradients(zip(grad, self.variables))

        self.qgradient = qgradient    # Q-value gradient wrt control (input value)
        self.optim = optim            # Optimizer
        return self
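    # tf.gradients(policy, variables, -qgradient) back-propagates the externally
    # supplied weights -dQ/du through Pi, i.e. it returns, for each theta, the batch
    # sum of -(dQ/du)*(dPi/dtheta). Feeding that to Adam therefore *ascends*
    # Q(x, Pi(x)), which is the deterministic policy-gradient update of DDPG. The
    # critic divides its dQ/du by BATCH_SIZE (see QValueNetwork.setupOptim), so the
    # sum over the batch acts as an average.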

    def setupTargetAssign(self, nominalNet, tau=UPDATE_RATE):
        self.update_variables = \
            [target.assign(tau * ref + (1 - tau) * target)
             for target, ref in zip(self.variables, nominalNet.variables)]
        return self


class ReplayItem:
    def __init__(self, x, u, r, d, x2):
        self.x = x
        self.u = u
        self.reward = r
        self.done = d
        self.x2 = x2

replayDeque = deque()
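# Transitions (x, u, r, done, x') are pushed into this plain deque and later
# sampled uniformly at random, which de-correlates the mini-batches from the
# trajectory ordering (the usual role of a DDPG replay buffer).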


policy = PolicyNetwork().setupOptim()
policyTarget = PolicyNetwork().setupTargetAssign(policy)

qvalue = QValueNetwork().setupOptim()
qvalueTarget = QValueNetwork().setupTargetAssign(qvalue)
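# Note that each target is a second, independently initialized copy of its network:
# the assign ops above only make it drift toward the trained one at rate UPDATE_RATE;
# there is no initial hard copy of the weights.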

sess = tf.InteractiveSession()
tf.global_variables_initializer().run()

# Uncomment to save or restore networks
# tf.train.Saver().restore(sess, "netvalues/actorcritic.pre.ckpt")
# tf.train.Saver().save(sess, "netvalues/actorcritic.full.ckpt")

def rendertrial(maxiter=NSTEPS, verbose=True):
    x = env.reset()
    rsum = 0.
    for i in range(maxiter):
        u = sess.run(policy.policy, feed_dict={policy.x: x.T})
        x, reward = env.step(u)
        env.render()
        time.sleep(1e-2)
        rsum += reward
    if verbose: print 'Lasted ', i, ' timestep -- total reward:', rsum
signal.signal(signal.SIGTSTP, lambda x, y: rendertrial())  # Roll-out when CTRL-Z is pressed
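# signal.SIGTSTP is only defined on POSIX systems, so this Ctrl-Z hook (and thus
# the interactive roll-out during training) is presumably not available on Windows.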


h_rwd = []   # History of cumulated rewards per episode
h_qva = []   # History of max Q-values per episode
h_ste = []   # History of episode lengths


for episode in range(1, NEPISODES):
    x = env.reset().T
    rsum = 0.0

    for step in range(NSTEPS):
        u = sess.run(policy.policy, feed_dict={policy.x: x})   # Greedy policy ...
        u += 1. / (1. + episode + step)                        # ... with decaying exploration noise
        x2, r = env.step(u)
        x2 = x2.T
        done = False                                           # Pendulum scenario is endless.

        replayDeque.append(ReplayItem(x, u, r, done, x2))      # Feed replay memory ...
        if len(replayDeque) > REPLAY_SIZE: replayDeque.popleft()   # ... with FIFO forgetting.

        rsum += r
        if done or np.linalg.norm(x - x2) < 1e-3: break        # Break when the pendulum is still.
        x = x2

        # Start optimizing networks when memory size > batch size.
        if len(replayDeque) > BATCH_SIZE:
            batch = random.sample(replayDeque, BATCH_SIZE)     # Random batch from replay memory.
            x_batch = np.vstack([b.x for b in batch])
            u_batch = np.vstack([b.u for b in batch])
            r_batch = np.vstack([b.reward for b in batch])
            d_batch = np.vstack([b.done for b in batch])
            x2_batch = np.vstack([b.x2 for b in batch])

            # Compute Q(x,u) from target network
            u2_batch = sess.run(policyTarget.policy, feed_dict={policyTarget.x: x2_batch})
            q2_batch = sess.run(qvalueTarget.qvalue, feed_dict={qvalueTarget.x: x2_batch,
                                                                qvalueTarget.u: u2_batch})
            qref_batch = r_batch + (d_batch == False) * (DECAY_RATE * q2_batch)
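            # This is the TD/Bellman target of DDPG: qref = r + gamma*(1-done)*Q'(x', Pi'(x')),
            # with gamma = DECAY_RATE and both primed terms taken from the target networks.
            # Since done is always False in this endless-pendulum setting, it reduces to r + gamma*Q'.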

            # Update qvalue to solve HJB constraint: q = r + q'
            sess.run(qvalue.optim, feed_dict={qvalue.x: x_batch,
                                              qvalue.u: u_batch,
                                              qvalue.qref: qref_batch})

            # Compute approximate policy gradient ...
            u_targ = sess.run(policy.policy, feed_dict={policy.x: x_batch})
            qgrad = sess.run(qvalue.gradient, feed_dict={qvalue.x: x_batch,
                                                         qvalue.u: u_targ})
            # ... and take an optimization step along this gradient.
            sess.run(policy.optim, feed_dict={policy.x: x_batch,
                                              policy.qgradient: qgrad})

            # Update target networks by homotopy.
            sess.run(policyTarget.update_variables)
            sess.run(qvalueTarget.update_variables)

    # \\\END_FOR step in range(NSTEPS)

    # Display and logging (not mandatory).
    maxq = np.max(sess.run(qvalue.qvalue, feed_dict={qvalue.x: x_batch,
                                                     qvalue.u: u_batch})) \
        if 'x_batch' in locals() else 0
    print 'Ep#{:3d}: lasted {:d} steps, reward={:3.0f}, max qvalue={:2.3f}' \
        .format(episode, step, rsum, maxq)
    h_rwd.append(rsum)
    h_qva.append(maxq)
    h_ste.append(step)
    if not (episode + 1) % 20: rendertrial(100)

# \\\END_FOR episode in range(NEPISODES)

print "Average reward during trials: %.3f" % (sum(h_rwd) / NEPISODES)
rendertrial()
plt.plot(np.cumsum(h_rwd) / range(1, NEPISODES))
plt.show()
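# To reproduce (assumptions inferred from the imports and print syntax, not stated
# in the file): Python 2.7 with TensorFlow 1.x, tflearn, numpy, matplotlib, and the
# pendulum.py model shipped alongside this example. Run `python continuous.py`;
# during training, Ctrl-Z triggers a rendered roll-out of the current policy, and a
# plot of the running average reward is shown at the end.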