@@ -79,7 +79,7 @@ def to_one_hot(i, n_classes=None):
             ## Choose an action by greedily (with e chance of random action) from the Q-network
             a, allQ = sess.run([predict, y], feed_dict={inputs: [to_one_hot(s, 16)]})
             ## e-Greedy Exploration !!! sample random action
-            if np.random.rand(1) < e:
+            if np.random.rand(1) < e:
                 a[0] = env.action_space.sample()
             ## Get new state and reward from environment
             s1, r, d, _ = env.step(a[0])
@@ -88,7 +88,7 @@ def to_one_hot(i, n_classes=None):
             ## Obtain maxQ' and set our target value for the chosen action.
             maxQ1 = np.max(Q1)  # in Q-Learning, the policy is greedy, so we use "max" to select the next action.
             targetQ = allQ
-            targetQ[0, a[0]] = r + lambd * maxQ1
+            targetQ[0, a[0]] = r + lambd * maxQ1
             ## Train network using target and predicted Q values
             # this is not the real target Q value, only an estimate,
             # but check the Q-Learning update formula:
@@ -99,7 +99,7 @@ def to_one_hot(i, n_classes=None):
             rAll += r
             s = s1
             ## Reduce chance of random action if an episode is done.
-            if d == True:
+            if d == True:
                 e = 1. / ((i / 50) + 10)  # reduce e, GLIE: Greedy in the Limit with Infinite Exploration
                 break
 
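The hunks above touch three pieces of the tutorial's training loop: e-greedy action selection, the one-step Q-learning target r + lambd * maxQ1, and the GLIE-style decay of e when an episode terminates. As a minimal sketch of that logic outside the TensorFlow session, assuming the Q-network's outputs for the current and next state are available as NumPy arrays of shape [1, n_actions], the helpers below restate it in plain NumPy; the function names and the defaults n_actions=4 and lambd=0.99 are illustrative choices, not part of the repository.

import numpy as np

def epsilon_greedy(q_values, e, n_actions=4):
    # Explore with probability e, otherwise exploit the greedy action argmax Q(s, .).
    if np.random.rand() < e:
        return np.random.randint(n_actions)
    return int(np.argmax(q_values, axis=1)[0])

def q_learning_target(q_values, action, reward, next_q_values, lambd=0.99):
    # One-step Q-learning target, mirroring targetQ[0, a[0]] = r + lambd * maxQ1:
    # copy the current predictions and overwrite only the chosen action's entry
    # with the bootstrapped return r + lambd * max_a' Q(s', a').
    target = q_values.copy()
    target[0, action] = reward + lambd * np.max(next_q_values)
    return target

def glie_epsilon(episode):
    # Epsilon schedule applied when an episode ends: e = 1 / (episode / 50 + 10).
    return 1. / ((episode / 50) + 10)

With helpers like these, the training step shown in the diff amounts to computing target = q_learning_target(allQ, a[0], r, Q1) and taking one gradient step that pulls the network's prediction for (s, a) toward that target.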