— Random seed

| Type | Definition |
| ---- | ---------- |
|      | RANDOM_SEED = int((time.time()%10)*1000) |
|      | n_init = tflearn.initializations.truncated_normal(seed=RANDOM_SEED) |
|      | u_init |
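The clock-derived seed is presumably pushed into every random number generator the script relies on. A minimal sketch of that pattern, assuming TensorFlow 1.x (tf.set_random_seed); only the RANDOM_SEED and n_init lines are taken from the table above, the explicit seeding of Python's random module and NumPy is an assumption:

```python
# Shared-seed setup (sketch). Only RANDOM_SEED and n_init appear in the listing;
# seeding random / NumPy / TensorFlow explicitly here is an assumption.
import random
import time

import numpy as np
import tensorflow as tf
import tflearn

RANDOM_SEED = int((time.time() % 10) * 1000)   # clock-derived seed in [0, 10000)
random.seed(RANDOM_SEED)                       # used by random.sample on the replay buffer
np.random.seed(RANDOM_SEED)                    # NumPy-based randomness
tf.set_random_seed(RANDOM_SEED)                # TensorFlow 1.x graph-level seed

# Seeded weight initializer used when building the networks.
n_init = tflearn.initializations.truncated_normal(seed=RANDOM_SEED)
```

Deriving the seed from time.time() gives a different run on each launch while still letting a run be reproduced if the same seed is fed back in.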
— Hyper parameters

| Type  | Definition |
| ----- | ---------- |
| int   | NEPISODES = 100 |
| int   | NSTEPS = 100 |
| float | QVALUE_LEARNING_RATE = 0.001 |
| float | POLICY_LEARNING_RATE = 0.0001 |
| float | DECAY_RATE = 0.99 |
| float | UPDATE_RATE = 0.01 |
| int   | REPLAY_SIZE = 10000 |
| int   | BATCH_SIZE = 64 |
| int   | NH1 = 250 |
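The roles attached below are inferred from how the constants are used later on this page (DECAY_RATE multiplies the bootstrapped Q-value in qref_batch, BATCH_SIZE feeds random.sample, UPDATE_RATE is the natural soft-update coefficient for the target networks); treat the comments as a reading aid, not documentation taken from the source.

```python
# Hyper-parameters from the table above, with inferred roles as comments.
NEPISODES            = 100      # number of training episodes
NSTEPS               = 100      # maximum number of steps per episode
QVALUE_LEARNING_RATE = 0.001    # step size of the Q-value (critic) optimizer
POLICY_LEARNING_RATE = 0.0001   # step size of the policy (actor) optimizer
DECAY_RATE           = 0.99     # discount factor in the Bellman target qref_batch
UPDATE_RATE          = 0.01     # presumed soft-update rate of the target networks
REPLAY_SIZE          = 10000    # presumed capacity of replayDeque
BATCH_SIZE           = 64       # mini-batch size drawn by random.sample
NH1                  = 250      # presumed width of the networks' first hidden layer
```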
— Environment

| Type | Definition |
| ---- | ---------- |
|      | env = Pendulum(1) |
|      | withSinCos |
|      | NX = env.nobs |
|      | NU = env.nu |
|      | replayDeque = deque() |
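Read together, this block builds the pendulum environment and an initially empty replay buffer. The sketch below is an interpretation: the import path of Pendulum, the value assigned to withSinCos, and the ReplayItem record are assumptions, the latter chosen so that its fields match the b.x / b.u / b.reward / b.done / b.x2 accesses in the training table further down.

```python
# Environment and replay-buffer setup (sketch).
from collections import deque, namedtuple

from pendulum import Pendulum     # import path assumed

env = Pendulum(1)                 # 1-joint continuous pendulum
env.withSinCos = True             # value assumed: observe (cos q, sin q, qdot)
NX = env.nobs                     # observation dimension
NU = env.nu                       # control dimension (joint torque)

# One stored transition; the field names mirror the batch code further down.
ReplayItem = namedtuple('ReplayItem', ['x', 'u', 'reward', 'done', 'x2'])

replayDeque = deque()             # replay buffer (presumably capped at REPLAY_SIZE during training)
```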
— TensorFlow initialization

| Type | Definition |
| ---- | ---------- |
|      | policy = PolicyNetwork().setupOptim() |
|      | policyTarget = PolicyNetwork().setupTargetAssign(policy) |
|      | qvalue = QValueNetwork().setupOptim() |
|      | qvalueTarget = QValueNetwork().setupTargetAssign(qvalue) |
|      | sess = tf.InteractiveSession() |
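Two copies of each network are created: a trained one (setupOptim) and a slowly tracking target (setupTargetAssign), as in the DDPG paper. The bodies of these methods are not shown here, so the following is only a plausible sketch of the soft target-update ops they are expected to build, assuming TensorFlow 1.x and that each network exposes its list of variables; none of the names in the sketch come from the listing except UPDATE_RATE.

```python
# Sketch of a DDPG-style soft target update in TensorFlow 1.x.
import tensorflow as tf

UPDATE_RATE = 0.01   # tau, from the hyper-parameter table

def soft_target_assign(nominal_vars, target_vars, tau=UPDATE_RATE):
    """Ops moving each target variable a fraction tau toward its trained twin."""
    return [target.assign(tau * nominal + (1.0 - tau) * target)
            for nominal, target in zip(nominal_vars, target_vars)]
```

Running these ops once per gradient step keeps the target networks changing slowly, which is what stabilizes the bootstrapped target qref_batch used below.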
— History of search

| Type | Definition |
| ---- | ---------- |
| list | h_rwd = [] |
| list | h_qva = [] |
| list | h_ste = [] |
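The three history lists are plain bookkeeping. Judging only from their names and from the other variables on this page (rsum, maxq), they presumably record, per episode, the cumulative reward, a Q-value statistic, and the number of steps, roughly as sketched here:

```python
# Presumed end-of-episode bookkeeping; only the list names come from the listing.
h_rwd.append(rsum)      # cumulative reward gathered during the episode
h_qva.append(maxq)      # Q-value statistic from the last training step
h_ste.append(step)      # episode length in steps ('step' is an assumed name)
```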
— Training

| Type  | Definition |
| ----- | ---------- |
|       | x = env.reset().T |
| float | rsum = 0.0 |
|       | u = sess.run(policy.policy, feed_dict={ policy.x: x }) |
|       | x2 = x2.T |
|       | r |
| bool  | done = False |
|       | batch = random.sample(replayDeque, BATCH_SIZE) |
|       | x_batch = np.vstack([ b.x for b in batch ]) |
|       | u_batch = np.vstack([ b.u for b in batch ]) |
|       | r_batch = np.vstack([ b.reward for b in batch ]) |
|       | d_batch = np.vstack([ b.done for b in batch ]) |
|       | x2_batch = np.vstack([ b.x2 for b in batch ]) |
|       | u2_batch = sess.run(policyTarget.policy, feed_dict={ policyTarget.x: x2_batch }) |
|       | q2_batch |
|       | qref_batch = r_batch + (d_batch==False)*(DECAY_RATE*q2_batch) |
|       | optim |
|       | feed_dict |
|       | u_targ = sess.run(policy.policy, feed_dict={ policy.x: x_batch }) |
|       | qgrad |
|       | maxq |
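The training variables spell out one DDPG update: act with the current policy, store the transition, sample a mini-batch, build the Bellman target with the target networks, regress the critic toward it, push the actor along the critic's action gradient, and finally let the targets track the trained networks. The sketch below stitches those lines together; every attribute not shown in the table (qvalueTarget.qvalue, qvalue.optim, qvalue.qref, qvalue.gradient, policy.qgradient, policy.optim, the update_variables ops) is an assumption about how the network classes are laid out.

```python
# One DDPG update step assembled from the variables above (sketch).
import random
import numpy as np

def training_step(sess, policy, policyTarget, qvalue, qvalueTarget,
                  replayDeque, BATCH_SIZE=64, DECAY_RATE=0.99):
    if len(replayDeque) < BATCH_SIZE:
        return None                                   # not enough experience yet

    # Sample a mini-batch of stored transitions.
    batch    = random.sample(replayDeque, BATCH_SIZE)
    x_batch  = np.vstack([b.x      for b in batch])
    u_batch  = np.vstack([b.u      for b in batch])
    r_batch  = np.vstack([b.reward for b in batch])
    d_batch  = np.vstack([b.done   for b in batch])
    x2_batch = np.vstack([b.x2     for b in batch])

    # Bellman target, bootstrapped with the *target* actor and critic.
    u2_batch = sess.run(policyTarget.policy, feed_dict={policyTarget.x: x2_batch})
    q2_batch = sess.run(qvalueTarget.qvalue,              # attribute assumed
                        feed_dict={qvalueTarget.x: x2_batch,
                                   qvalueTarget.u: u2_batch})
    qref_batch = r_batch + (d_batch == False) * (DECAY_RATE * q2_batch)

    # Critic update: fit Q(x, u) to the Bellman target.
    sess.run(qvalue.optim, feed_dict={qvalue.x: x_batch,  # attributes assumed
                                      qvalue.u: u_batch,
                                      qvalue.qref: qref_batch})

    # Actor update: follow the gradient of Q with respect to the action.
    u_targ = sess.run(policy.policy, feed_dict={policy.x: x_batch})
    qgrad  = sess.run(qvalue.gradient,                    # attribute assumed
                      feed_dict={qvalue.x: x_batch, qvalue.u: u_targ})
    sess.run(policy.optim, feed_dict={policy.x: x_batch,  # attributes assumed
                                      policy.qgradient: qgrad})

    # Let the target networks slowly track the trained ones (UPDATE_RATE step).
    sess.run(policyTarget.update_variables)               # op name assumed
    sess.run(qvalueTarget.update_variables)               # op name assumed

    maxq = np.mean(q2_batch)    # monitoring signal in the spirit of maxq / h_qva
    return maxq
```

On the rollout side, the table shows the action being drawn from the current policy (u = sess.run(policy.policy, feed_dict={ policy.x: x })); the resulting (x, u, r, done, x2) tuple is what gets appended to replayDeque before a step like the one above is run, though the exact stepping and termination handling of Pendulum is not visible on this page.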
Deep actor-critic network, from "Continuous control with deep reinforcement learning" by Lillicrap et al., arXiv:1509.02971.