
Commit cf6b095

update atari pong
1 parent 37388d7 commit cf6b095

1 file changed: +20, -23 lines


example/tutorial_atari_pong.py

Lines changed: 20 additions & 23 deletions
@@ -24,6 +24,7 @@

 import tensorflow as tf
 import tensorlayer as tl
+from tensorlayer.layers import *
 import gym, time
 import numpy as np

@@ -35,12 +36,11 @@
 learning_rate = 1e-4
 gamma = 0.99
 decay_rate = 0.99
-render = False # display the game environment
-resume = False # load existing policy network
+render = False # display the game environment
+# resume = True # load existing policy network
 model_file_name = "model_pong"
 np.set_printoptions(threshold=np.nan)

-
 def prepro(I):
     """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
     I = I[35:195]
@@ -59,29 +59,25 @@ def prepro(I):

 xs, ys, rs = [], [], []
 # observation for training and inference
-states_batch_pl = tf.placeholder(tf.float32, shape=[None, D])
+t_states = tf.placeholder(tf.float32, shape=[None, D])
 # policy network
-network = tl.layers.InputLayer(states_batch_pl, name='input_layer')
-network = tl.layers.DenseLayer(network, n_units=H,
-                act = tf.nn.relu, name='relu1')
-network = tl.layers.DenseLayer(network, n_units=3,
-                act = tf.identity, name='output_layer')
+network = InputLayer(t_states, name='input')
+network = DenseLayer(network, n_units=H, act=tf.nn.relu, name='hidden')
+network = DenseLayer(network, n_units=3, name='output')
 probs = network.outputs
 sampling_prob = tf.nn.softmax(probs)

-actions_batch_pl = tf.placeholder(tf.int32, shape=[None])
-discount_rewards_batch_pl = tf.placeholder(tf.float32, shape=[None])
-loss = tl.rein.cross_entropy_reward_loss(probs, actions_batch_pl,
-                discount_rewards_batch_pl)
+t_actions = tf.placeholder(tf.int32, shape=[None])
+t_discount_rewards = tf.placeholder(tf.float32, shape=[None])
+loss = tl.rein.cross_entropy_reward_loss(probs, t_actions, t_discount_rewards)
 train_op = tf.train.RMSPropOptimizer(learning_rate, decay_rate).minimize(loss)

 with tf.Session() as sess:
-    # init = tf.initialize_all_variables()
-    # sess.run(init)
     tl.layers.initialize_global_variables(sess)
-    if resume:
-        load_params = tl.files.load_npz(name=model_file_name+'.npz')
-        tl.files.assign_params(sess, load_params, network)
+    # if resume:
+    #     load_params = tl.files.load_npz(name=model_file_name+'.npz')
+    #     tl.files.assign_params(sess, load_params, network)
+    tl.files.load_and_assign_npz(sess, model_file_name+'.npz', network)
     network.print_params()
     network.print_layers()

@@ -97,17 +93,18 @@ def prepro(I):

         prob = sess.run(
             sampling_prob,
-            feed_dict={states_batch_pl: x}
+            feed_dict={t_states: x}
         )
         # action. 1: STOP 2: UP 3: DOWN
         # action = np.random.choice([1,2,3], p=prob.flatten())
-        action = tl.rein.choice_action_by_probs(prob.flatten(), [1,2,3])
+        action = tl.rein.choice_action_by_probs(prob.flatten(), [1,2,3])

         observation, reward, done, _ = env.step(action)
         reward_sum += reward
         xs.append(x) # all observations in a episode
         ys.append(action - 1) # all fake labels in a episode (action begins from 1, so minus 1)
         rs.append(reward) # all rewards in a episode
+
         if done:
             episode_number += 1
             game_number = 0
126123
sess.run(
127124
train_op,
128125
feed_dict={
129-
states_batch_pl: epx,
130-
actions_batch_pl: epy,
131-
discount_rewards_batch_pl: disR
126+
t_states: epx,
127+
t_actions: epy,
128+
t_discount_rewards: disR
132129
}
133130
)
134131

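Taken together, the renamed tensors and the new checkpoint call fit into the tutorial roughly as sketched below. This is a minimal sketch, not the full script: the placeholder names, layer calls, loss, and tl.files.load_and_assign_npz come straight from the diff above, while D = 80*80 (fixed by prepro) is stated in the docstring, and H = 200, the abbreviated session body, and the dummy observation are assumptions added here for a self-contained example.

import numpy as np
import tensorflow as tf
import tensorlayer as tl
from tensorlayer.layers import InputLayer, DenseLayer

D = 80 * 80          # input size produced by prepro()
H = 200              # assumed hidden size; not shown in this diff
model_file_name = "model_pong"

# placeholders, renamed as in this commit
t_states = tf.placeholder(tf.float32, shape=[None, D])
t_actions = tf.placeholder(tf.int32, shape=[None])
t_discount_rewards = tf.placeholder(tf.float32, shape=[None])

# policy network built from the directly imported layer classes
network = InputLayer(t_states, name='input')
network = DenseLayer(network, n_units=H, act=tf.nn.relu, name='hidden')
network = DenseLayer(network, n_units=3, name='output')   # logits for STOP/UP/DOWN
probs = network.outputs
sampling_prob = tf.nn.softmax(probs)

loss = tl.rein.cross_entropy_reward_loss(probs, t_actions, t_discount_rewards)
train_op = tf.train.RMSPropOptimizer(1e-4, 0.99).minimize(loss)

with tf.Session() as sess:
    tl.layers.initialize_global_variables(sess)
    # one call instead of the old load_npz + assign_params pair
    tl.files.load_and_assign_npz(sess, model_file_name + '.npz', network)

    # single inference step with the renamed placeholder (dummy observation)
    x = np.zeros((1, D), dtype=np.float32)
    prob = sess.run(sampling_prob, feed_dict={t_states: x})
    action = tl.rein.choice_action_by_probs(prob.flatten(), [1, 2, 3])

The training update then feeds epx, epy, and disR into the same three placeholders, exactly as the last hunk above shows.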