
import tensorflow as tf
import tensorlayer as tl
+ from tensorlayer.layers import *
import gym, time
import numpy as np

learning_rate = 1e-4
gamma = 0.99
decay_rate = 0.99
- render = False          # display the game environment
- resume = False          # load existing policy network
+ render = False          # display the game environment
+ # resume = True         # load existing policy network
model_file_name = "model_pong"
np.set_printoptions(threshold=np.nan)

-
def prepro(I):
    """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
    I = I[35:195]
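The remainder of prepro falls outside the hunks shown here. As a point of reference, this is a minimal sketch of the Karpathy-style Pong preprocessing that the docstring describes (crop, downsample by 2, erase the background, binarize, flatten to 6400 values); the helper name prepro_sketch is hypothetical and the file's actual body may differ:

# Hedged sketch only -- not the diffed file's exact code.
def prepro_sketch(I):
    I = I[35:195]        # crop to the playing field
    I = I[::2, ::2, 0]   # downsample by a factor of 2, keep one channel
    I[I == 144] = 0      # erase background (type 1)
    I[I == 109] = 0      # erase background (type 2)
    I[I != 0] = 1        # paddles and ball become 1
    return I.astype(np.float32).ravel()   # 80*80 = 6400-dim vector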
@@ -59,29 +59,25 @@ def prepro(I):

xs, ys, rs = [], [], []
# observation for training and inference
- states_batch_pl = tf.placeholder(tf.float32, shape=[None, D])
+ t_states = tf.placeholder(tf.float32, shape=[None, D])
# policy network
- network = tl.layers.InputLayer(states_batch_pl, name='input_layer')
- network = tl.layers.DenseLayer(network, n_units=H,
-                                act=tf.nn.relu, name='relu1')
- network = tl.layers.DenseLayer(network, n_units=3,
-                                act=tf.identity, name='output_layer')
+ network = InputLayer(t_states, name='input')
+ network = DenseLayer(network, n_units=H, act=tf.nn.relu, name='hidden')
+ network = DenseLayer(network, n_units=3, name='output')
probs = network.outputs
sampling_prob = tf.nn.softmax(probs)

- actions_batch_pl = tf.placeholder(tf.int32, shape=[None])
- discount_rewards_batch_pl = tf.placeholder(tf.float32, shape=[None])
- loss = tl.rein.cross_entropy_reward_loss(probs, actions_batch_pl,
-                                          discount_rewards_batch_pl)
+ t_actions = tf.placeholder(tf.int32, shape=[None])
+ t_discount_rewards = tf.placeholder(tf.float32, shape=[None])
+ loss = tl.rein.cross_entropy_reward_loss(probs, t_actions, t_discount_rewards)
train_op = tf.train.RMSPropOptimizer(learning_rate, decay_rate).minimize(loss)

with tf.Session() as sess:
-     # init = tf.initialize_all_variables()
-     # sess.run(init)
    tl.layers.initialize_global_variables(sess)
-     if resume:
-         load_params = tl.files.load_npz(name=model_file_name + '.npz')
-         tl.files.assign_params(sess, load_params, network)
+     # if resume:
+     #     load_params = tl.files.load_npz(name=model_file_name + '.npz')
+     #     tl.files.assign_params(sess, load_params, network)
+     tl.files.load_and_assign_npz(sess, model_file_name + '.npz', network)
    network.print_params()
    network.print_layers()

@@ -97,17 +93,18 @@ def prepro(I):

        prob = sess.run(
            sampling_prob,
-             feed_dict={states_batch_pl: x}
+             feed_dict={t_states: x}
        )
        # action. 1: STOP 2: UP 3: DOWN
        # action = np.random.choice([1,2,3], p=prob.flatten())
-         action = tl.rein.choice_action_by_probs(prob.flatten(), [1, 2, 3])
+         action = tl.rein.choice_action_by_probs(prob.flatten(), [1, 2, 3])

        observation, reward, done, _ = env.step(action)
        reward_sum += reward
        xs.append(x)           # all observations in an episode
        ys.append(action - 1)  # all fake labels in an episode (action begins from 1, so minus 1)
        rs.append(reward)      # all rewards in an episode
+
        if done:
            episode_number += 1
            game_number = 0
@@ -126,9 +123,9 @@ def prepro(I):
                sess.run(
                    train_op,
                    feed_dict={
-                         states_batch_pl: epx,
-                         actions_batch_pl: epy,
-                         discount_rewards_batch_pl: disR
+                         t_states: epx,
+                         t_actions: epy,
+                         t_discount_rewards: disR
                    }
                )

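The hunks above do not show how epx, epy and disR are assembled from the per-episode buffers xs, ys and rs. A plausible sketch of that batching step, assuming tl.rein.discount_episode_rewards is used for the discounted returns (the names epx, epy and disR are taken from the feed_dict above; the rest is an assumption):

# Hedged sketch of the batching step that feeds the train_op above;
# the actual code lives in a part of the file these hunks do not show.
epx = np.vstack(xs)                                  # stacked difference frames
epy = np.asarray(ys)                                 # actions shifted to 0..2
epr = np.asarray(rs)
disR = tl.rein.discount_episode_rewards(epr, gamma)  # discounted returns
disR -= np.mean(disR)                                # normalize returns to
disR /= np.std(disR)                                 # reduce gradient variance
xs, ys, rs = [], [], []                              # reset the episode buffers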