pinocchio  2.6.21 A fast and flexible implementation of Rigid Body Dynamics algorithms and their analytical derivatives
qtable.py
1 '''
2 Example of Q-table learning with a simple discretized 1-pendulum environment.
3 '''
4
5 import numpy as np
6 from dpendulum import DPendulum
7 import matplotlib.pyplot as plt
8 import signal
9 import time
10
11
# Derive a seed in [0, 10000) from the wall clock and print it, so that an
# interesting run can be reproduced later by hard-coding the printed value.
RANDOM_SEED = int((time.time() % 10) * 1000)
# Parenthesised single-argument form: valid on both Python 2 and Python 3.
print("Seed = %d" % RANDOM_SEED)
np.random.seed(RANDOM_SEED)
15
16
# --- Hyper-parameters -------------------------------------------------------
NEPISODES = 500       # Number of training episodes
NSTEPS = 50           # Max episode length
LEARNING_RATE = 0.85  # Step size of the Q-table update
DECAY_RATE = 0.99     # Discount factor

# --- Environment and Q-table ------------------------------------------------
env = DPendulum()
NX = env.nx  # Number of (discrete) states
NU = env.nu  # Number of (discrete) controls

# One row per discrete state, one column per discrete control, all zeros.
Q = np.zeros((NX, NU))
28
def rendertrial(maxiter=100):
    '''
    Roll out one rendered episode using the greedy policy of the current Q-table.

    The episode starts from the state returned by env.reset() and, at every
    step, applies the control with the largest Q-value for the current state.
    The roll-out stops as soon as a step returns a reward equal to 1, or after
    maxiter steps, whichever comes first.

    maxiter -- maximum number of environment steps to simulate.
    '''
    s = env.reset()
    for _ in range(maxiter):
        a = np.argmax(Q[s, :])  # Greedy control for the current state
        s, r = env.step(a)
        env.render()
        if r == 1:
            # print(...) with a single argument works on Python 2 and 3 alike.
            print('Reward!')
            break
37
# Pressing CTRL-Z (SIGTSTP) triggers a rendered greedy roll-out.
signal.signal(signal.SIGTSTP, lambda signum, frame: rendertrial())

# Per-episode reward sums, accumulated during training (for the final plot).
h_rwd = []
40
# Q-learning main loop. Episodes are numbered from 1 so that the exploration
# noise, scaled by 1/episode, shrinks as training progresses.
for episode in range(1, NEPISODES):
    x = env.reset()
    rsum = 0.0
    for _ in range(NSTEPS):
        # Greedy action with noise: perturb the Q-values of the current state
        # with Gaussian noise before taking the argmax.
        u = np.argmax(Q[x, :] + np.random.randn(1, NU) / episode)
        x2, reward = env.step(u)

        # Compute reference Q-value at state x respecting HJB
        Qref = reward + DECAY_RATE * np.max(Q[x2, :])

        # Update Q-Table to better fit HJB
        Q[x, u] += LEARNING_RATE * (Qref - Q[x, u])
        x = x2
        rsum += reward
        if reward == 1:
            break  # A reward of 1 ends the episode early.

    h_rwd.append(rsum)
    if episode % 20 == 0:
        # Progress report over the last 20 episodes; parenthesised print is
        # valid on both Python 2 and Python 3.
        print('Episode #%d done with %d sucess' % (episode, sum(h_rwd[-20:])))
59
# Normalise by the number of episodes actually recorded in h_rwd rather than
# by NEPISODES: the training loop iterates over range(1, NEPISODES), i.e. one
# episode fewer than NEPISODES, so dividing by NEPISODES under-reports the rate.
nrecorded = len(h_rwd)
print("Total rate of success: %.3f" % (sum(h_rwd) / float(max(nrecorded, 1))))

# Running average of the per-episode reward: cumulative sum divided by the
# 1-based episode count.
plt.plot(np.cumsum(h_rwd) / np.arange(1, nrecorded + 1))
plt.show()
dpendulum.DPendulum
Definition: dpendulum.py:58
qtable.rendertrial
def rendertrial(maxiter=100)
Definition: qtable.py:29