# pinocchio 3.3.1 — A fast and flexible implementation of Rigid Body Dynamics
# algorithms and their analytical derivatives.
# File: qnet.py
1 """
2 Example of Q-table learning with a simple discretized 1-pendulum environment using a
3 linear Q network.
4 """
5 
6 import signal
7 import time
8 
9 import matplotlib.pyplot as plt
10 import numpy as np
11 import tensorflow as tf
12 from dpendulum import DPendulum
13 
14 
# Seed NumPy and TensorFlow from the wall clock so successive runs differ;
# the seed is printed so a run can be reproduced.
RANDOM_SEED = int((time.time() % 10) * 1000)
print("Seed = %d" % RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)


# --- Hyperparameters ---
NEPISODES = 500  # Number of training episodes
NSTEPS = 50  # Max episode length
LEARNING_RATE = 0.1  # Step length in optimizer
DECAY_RATE = 0.99  # Discount factor


# Discretized 1-pendulum environment; exposes the number of discrete states
# (nx) and discrete controls (nu) used to size the Q-network below.
env = DPendulum()
NX = env.nx
NU = env.nu
30 
31 
32 
class QValueNetwork:
    """Linear Q-network: Q(x, .) = x @ W for a one-hot state encoding.

    Builds the TF1 graph nodes once and exposes them as attributes, which the
    training loop feeds through ``sess.run``:
      x      -- placeholder, one-hot state input, shape [1, NX]
      qvalue -- Q-values for every control at state x, shape [1, NU]
      u      -- greedy policy: argmax of qvalue over controls
      qref   -- placeholder for the reference Q-value at next step (l + Q o f)
      optim  -- one SGD step minimizing the squared TD error ||qref - qvalue||^2
    """

    def __init__(self):
        x = tf.placeholder(shape=[1, NX], dtype=tf.float32)
        # Small positive random init (fixed sub-seed) so initial Q-values are
        # near zero but break argmax ties randomly.
        W = tf.Variable(tf.random_uniform([NX, NU], 0, 0.01, seed=100))
        qvalue = tf.matmul(x, W)
        u = tf.argmax(qvalue, 1)

        qref = tf.placeholder(shape=[1, NU], dtype=tf.float32)
        loss = tf.reduce_sum(tf.square(qref - qvalue))
        optim = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)

        # Attribute names must match the usage sites (qvalue.x, qvalue.u, ...);
        # the extracted source had them garbled (self.xx, self.qvalueqvalue, ...).
        self.x = x  # Network input
        self.qvalue = qvalue  # Q-value as a function of x
        self.u = u  # Policy as a function of x
        self.qref = qref  # Reference Q-value at next step (to be set to l+Q o f)
        self.optim = optim  # Optimizer
49 
50 
51 
# Build the graph from scratch and start a TF1 interactive session, then
# initialize W (and any other variables) before training.
tf.reset_default_graph()
qvalue = QValueNetwork()
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()
56 
57 
def onehot(ix, n=NX):
    """Return a 1-by-n float array that is 0 everywhere except index *ix*, set to 1.

    The leading singleton dimension matches the [1, NX] placeholder shape.
    """
    # np.float was deprecated in NumPy 1.20 and removed in 1.24; use the
    # concrete np.float64 dtype instead.
    return np.array([[i == ix for i in range(n)]], np.float64)
66 
67 
def disturb(u, i):
    """Add integer exploration noise to control *u*, clipped to the valid range.

    The noise magnitude decays as episode index *i* grows (scale 10/(i/50+10)).
    """
    noise = int(np.random.randn() * 10 / (i / 50 + 10))
    return np.clip(u + noise, 0, NU - 1)
71 
72 
def rendertrial(maxiter=100):
    """Roll out the greedy policy from a fresh reset, rendering every step.

    Stops early (printing "Reward!") as soon as the environment returns r == 1.
    """
    state = env.reset()
    for _ in range(maxiter):
        control = sess.run(qvalue.u, feed_dict={qvalue.x: onehot(state)})
        state, reward = env.step(control)
        env.render()
        if reward == 1:
            print("Reward!")
            break
82 
83 
# Install a SIGTSTP handler so pressing CTRL-Z triggers a rendered greedy
# roll-out instead of suspending the process.
signal.signal(
    signal.SIGTSTP, lambda x, y: rendertrial()
)  # Roll-out when CTRL-Z is pressed
87 
88 
h_rwd = []  # Learning history: total reward of each episode (for the plot).


# Q-learning main loop. NOTE: range(1, NEPISODES) runs NEPISODES-1 episodes;
# kept as-is so the noise schedule in disturb() matches the original behavior.
for episode in range(1, NEPISODES):
    x = env.reset()
    rsum = 0.0

    for step in range(NSTEPS - 1):
        u = sess.run(qvalue.u, feed_dict={qvalue.x: onehot(x)})[0]  # Greedy policy ...
        u = disturb(u, episode)  # ... with exploration noise
        x2, reward = env.step(u)

        # Compute reference Q-value at state x respecting HJB: the target for
        # the taken control u is reward + gamma * max_u' Q(x2, u').
        Q2 = sess.run(qvalue.qvalue, feed_dict={qvalue.x: onehot(x2)})
        Qref = sess.run(qvalue.qvalue, feed_dict={qvalue.x: onehot(x)})
        Qref[0, u] = reward + DECAY_RATE * np.max(Q2)

        # One gradient step pulling Q(x, .) toward the reference.
        sess.run(qvalue.optim, feed_dict={qvalue.x: onehot(x), qvalue.qref: Qref})

        rsum += reward
        x = x2
        if reward == 1:  # Goal reached: terminate the episode early.
            break

    h_rwd.append(rsum)
    if not episode % 20:
        # Typo fixed in user-facing message: "sucess" -> "success".
        print("Episode #%d done with %d success" % (episode, sum(h_rwd[-20:])))

print("Total rate of success: %.3f" % (sum(h_rwd) / NEPISODES))
rendertrial()
# Running average of the reward over episodes.
plt.plot(np.cumsum(h_rwd) / range(1, NEPISODES))
plt.show()
# Extraction index (kept as comments): Q-value network class QValueNetwork
# defined at qnet.py:33; onehot(ix, n=NX) defined at qnet.py:58.