'''
Deep actor-critic network,
from "Continuous control with deep reinforcement learning",
by Lillicrap et al., arXiv:1509.02971.
'''
import random
import signal
import time
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tflearn

from pendulum import Pendulum
RANDOM_SEED = int((time.time() % 10) * 1000)
print("Seed = %d" % RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
n_init = tflearn.initializations.truncated_normal(seed=RANDOM_SEED)
u_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003, seed=RANDOM_SEED)
QVALUE_LEARNING_RATE = 0.001    # Learning rate of the Q-value (critic) network
POLICY_LEARNING_RATE = 0.0001   # Learning rate of the policy (actor) network
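# The remaining hyper-parameters and the environment set-up are not shown in
# this excerpt; the values below are assumptions chosen so the listing is
# self-contained, not the author's original settings.
NEPISODES = 100      # Number of training episodes (assumed)
NSTEPS = 100         # Maximum number of steps per episode (assumed)
DECAY_RATE = 0.99    # Discount factor gamma (assumed)
UPDATE_RATE = 0.01   # Soft target-update rate tau (assumed)
REPLAY_SIZE = 10000  # Capacity of the replay memory (assumed)
BATCH_SIZE = 64      # Mini-batch size (assumed)
NH1 = NH2 = 250      # Hidden-layer sizes (assumed)

env = Pendulum(1)    # Single-joint pendulum environment (assumed constructor)
NX = env.nobs        # Observation dimension (assumed attribute name)
NU = env.nu          # Control dimension (assumed attribute name)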
class QValueNetwork:
    def __init__(self):
        nvars = len(tf.trainable_variables())

        x = tflearn.input_data(shape=[None, NX])
        u = tflearn.input_data(shape=[None, NU])

        netx1 = tflearn.fully_connected(x, NH1, weights_init=n_init, activation="relu")
        netx2 = tflearn.fully_connected(netx1, NH2, weights_init=n_init)
        netu1 = tflearn.fully_connected(u, NH1, weights_init=n_init, activation="linear")
        netu2 = tflearn.fully_connected(netu1, NH2, weights_init=n_init)
        net = tflearn.activation(netx2 + netu2, activation="relu")
        qvalue = tflearn.fully_connected(net, 1, weights_init=u_init)
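        # The state branch (netx*) and the control branch (netu*) are merged by
        # summation before the last ReLU; the single linear output estimates Q(x, u).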
        self.x = x                # Network state input of Q(x, u)
        self.u = u                # Network control input of Q(x, u)
        self.qvalue = qvalue      # Network output Q(x, u)
        self.variables = tf.trainable_variables()[nvars:]  # Variables to be trained
        self.hidens = [netx1, netx2, netu1, netu2]          # Hidden layers, kept for debugging
    def setupOptim(self):
        qref = tf.placeholder(tf.float32, [None, 1])
        loss = tflearn.mean_square(qref, self.qvalue)
        optim = tf.train.AdamOptimizer(QVALUE_LEARNING_RATE).minimize(loss)
        gradient = tf.gradients(self.qvalue, self.u)[0] / float(BATCH_SIZE)

        self.qref = qref            # Target Q-values for the regression
        self.optim = optim          # Optimizer of the critic loss
        self.gradient = gradient    # dQ/du, averaged over the batch
        return self
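    # The target copy of each network is pulled toward the learning network by a
    # soft ("Polyak") update: theta_target <- tau * theta + (1 - tau) * theta_target.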
    def setupTargetAssign(self, nominalNet, tau=UPDATE_RATE):
        self.update_variables = [
            target.assign(tau * ref + (1 - tau) * target)
            for target, ref in zip(self.variables, nominalNet.variables)
        ]
        return self
class PolicyNetwork:
    def __init__(self):
        nvars = len(tf.trainable_variables())

        x = tflearn.input_data(shape=[None, NX])
        net = tflearn.fully_connected(x, NH1, activation="relu", weights_init=n_init)
        net = tflearn.fully_connected(net, NH2, activation="relu", weights_init=n_init)
        policy = tflearn.fully_connected(net, NU, activation="tanh", weights_init=u_init)
        self.x = x                # Network state input
        self.policy = policy      # Network output u = pi(x)
        self.variables = tf.trainable_variables()[nvars:]  # Variables to be trained
    def setupOptim(self):
        qgradient = tf.placeholder(tf.float32, [None, NU])
        grad = tf.gradients(self.policy, self.variables, -qgradient)
        optim = tf.train.AdamOptimizer(POLICY_LEARNING_RATE).apply_gradients(
            zip(grad, self.variables))

        self.qgradient = qgradient  # Placeholder for dQ/du, fed from the critic
        self.optim = optim          # Optimizer of the actor
        return self
    def setupTargetAssign(self, nominalNet, tau=UPDATE_RATE):
        self.update_variables = [
            target.assign(tau * ref + (1 - tau) * target)
            for target, ref in zip(self.variables, nominalNet.variables)
        ]
        return self
class ReplayItem:
    def __init__(self, x, u, r, d, x2):
        self.x, self.u, self.reward, self.done, self.x2 = x, u, r, d, x2


replayDeque = deque()   # Replay memory, used as a FIFO buffer
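# The construction of the four networks is elided in the excerpt; the lines
# below are reconstructed from how the objects are used in the training loop
# (a learning network and a target copy for both the actor and the critic).
policy = PolicyNetwork().setupOptim()
policyTarget = PolicyNetwork().setupTargetAssign(policy)
qvalue = QValueNetwork().setupOptim()
qvalueTarget = QValueNetwork().setupTargetAssign(qvalue)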
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()
def rendertrial(maxiter=NSTEPS, verbose=True):
    x = env.reset()
    rsum = 0.0
    for i in range(maxiter):
        u = sess.run(policy.policy, feed_dict={policy.x: x.T})
        x, reward = env.step(u)
        env.render()   # Assumed rendering call of the Pendulum environment
        rsum += reward
    if verbose:
        print("Lasted ", i, " timestep -- total reward:", rsum)


signal.signal(signal.SIGTSTP, lambda x, y: rendertrial())  # Roll-out the current policy when CTRL-Z is pressed
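# Reward history, used for the final statistics and plot below (its
# initialisation is elided in the excerpt).
h_rwd = []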
for episode in range(1, NEPISODES):
    x = env.reset().T
    rsum = 0.0

    for step in range(NSTEPS):
        u = sess.run(policy.policy, feed_dict={policy.x: x})   # Greedy policy ...
        u += 1.0 / (1.0 + episode + step)                       # ... with decaying exploration noise
        x2, r = env.step(u)
        x2 = x2.T
        done = False                                            # The pendulum task has no terminal state

        replayDeque.append(ReplayItem(x, u, r, done, x2))       # Feed the replay memory ...
        if len(replayDeque) > REPLAY_SIZE:
            replayDeque.popleft()                               # ... with FIFO forgetting

        rsum += r
        if done or np.linalg.norm(x - x2) < 1e-3:
            break                                               # Stop the episode once the pendulum is at rest
        x = x2
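        # One DDPG update per environment step, as soon as the replay memory
        # holds more than one mini-batch of transitions.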
        if len(replayDeque) > BATCH_SIZE:
            batch = random.sample(replayDeque, BATCH_SIZE)
            x_batch = np.vstack([b.x for b in batch])
            u_batch = np.vstack([b.u for b in batch])
            r_batch = np.vstack([b.reward for b in batch])
            d_batch = np.vstack([b.done for b in batch])
            x2_batch = np.vstack([b.x2 for b in batch])
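            # Sampling the mini-batch uniformly from the replay memory
            # decorrelates consecutive transitions, following Lillicrap et al.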
            # Q-value targets from the target networks: qref = r + gamma * Q'(x2, pi'(x2))
            u2_batch = sess.run(policyTarget.policy, feed_dict={policyTarget.x: x2_batch})
            q2_batch = sess.run(qvalueTarget.qvalue,
                                feed_dict={qvalueTarget.x: x2_batch, qvalueTarget.u: u2_batch})
            qref_batch = r_batch + (1 - d_batch) * (DECAY_RATE * q2_batch)   # No bootstrap on terminal steps

            # Fit the critic to the regression targets.
            sess.run(qvalue.optim, feed_dict={qvalue.x: x_batch,
                                              qvalue.u: u_batch,
                                              qvalue.qref: qref_batch})

            # Evaluate dQ/du at u = pi(x) ...
            u_targ = sess.run(policy.policy, feed_dict={policy.x: x_batch})
            qgrad = sess.run(qvalue.gradient, feed_dict={qvalue.x: x_batch, qvalue.u: u_targ})
            # ... and take one policy-ascent step along this gradient.
            sess.run(policy.optim, feed_dict={policy.x: x_batch, policy.qgradient: qgrad})

            # Soft update of the target networks.
            sess.run(policyTarget.update_variables)
            sess.run(qvalueTarget.update_variables)
    # Logging and occasional rendering.
    maxq = (np.max(sess.run(qvalue.qvalue, feed_dict={qvalue.x: x_batch, qvalue.u: u_batch}))
            if "x_batch" in locals() else 0)
    print(f"Ep#{episode:3d}: lasted {step:d} steps, "
          f"reward={rsum:3.0f}, max qvalue={maxq:2.3f}")
    h_rwd.append(rsum)
    if not (episode + 1) % 20:
        rendertrial()   # Render a roll-out of the current policy every 20 episodes
print("Average reward during trials: %.3f" % (sum(h_rwd) / NEPISODES))
plt.plot(np.cumsum(h_rwd) / range(1, NEPISODES))
plt.show()
— Q-value and policy networks