Commit f55ddc4

Add flags for normalization and variable layers

1 parent 7a54506

File tree: 4 files changed, +69 -42 lines

  python/PPO.ipynb
  python/ppo.py
  python/ppo/models.py
  python/ppo/trainer.py

python/PPO.ipynb

Lines changed: 14 additions & 8 deletions

@@ -58,11 +58,13 @@
     "time_horizon = 2048 # How many steps to collect per agent before adding to buffer.\n",
     "beta = 1e-3 # Strength of entropy regularization\n",
     "num_epoch = 5 # Number of gradient descent steps per batch of experiences.\n",
+    "num_layers = 2 # Number of hidden layers between state/observation encoding and value/policy layers.\n",
     "epsilon = 0.2 # Acceptable threshold around ratio of old and new policy probabilities.\n",
     "buffer_size = 2048 # How large the experience buffer should be before gradient descent.\n",
     "learning_rate = 3e-4 # Model learning rate.\n",
     "hidden_units = 64 # Number of units in hidden layer.\n",
     "batch_size = 64 # How many experiences per gradient descent update step.\n",
+    "normalize = False\n",
     "\n",
     "### Logging dictionary for hyperparameters\n",
     "hyperparameter_dict = {'max_steps':max_steps, 'run_path':run_path, 'env_name':env_name,\n",
@@ -81,7 +83,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "env = UnityEnvironment(file_name=env_name, curriculum=curriculum_file)\n",
@@ -100,6 +104,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
+    "collapsed": true,
     "scrolled": true
    },
    "outputs": [],
@@ -124,7 +129,8 @@
     "# Create the Tensorflow model graph\n",
     "ppo_model = create_agent_model(env, lr=learning_rate,\n",
     "                               h_size=hidden_units, epsilon=epsilon,\n",
-    "                               beta=beta, max_step=max_steps)\n",
+    "                               beta=beta, max_step=max_steps, \n",
+    "                               normalize=normalize, num_layers=num_layers)\n",
     "\n",
     "is_continuous = (env.brains[brain_name].action_space_type == \"continuous\")\n",
     "use_observations = (env.brains[brain_name].number_observations > 0)\n",
@@ -160,7 +166,7 @@
     "        if env.global_done:\n",
     "            info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]\n",
     "        # Decide and take an action\n",
-    "        new_info = trainer.take_action(info, env, brain_name, steps)\n",
+    "        new_info = trainer.take_action(info, env, brain_name, steps, normalize)\n",
     "        info = new_info\n",
     "        trainer.process_experiences(info, time_horizon, gamma, lambd)\n",
     "        if len(trainer.training_buffer['actions']) > buffer_size and train_model:\n",
@@ -208,21 +214,21 @@
  "metadata": {
   "anaconda-cloud": {},
   "kernelspec": {
-   "display_name": "Python 2",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "python2"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
-    "version": 2
+    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.10"
+   "pygments_lexer": "ipython3",
+   "version": "3.6.2"
   }
  },
 "nbformat": 4,

python/ppo.py

Lines changed: 7 additions & 2 deletions

@@ -27,7 +27,9 @@
     --learning-rate=<rate>     Model learning rate [default: 3e-4].
     --load                     Whether to load the model or randomly initialize [default: False].
     --max-steps=<n>            Maximum number of steps to run environment [default: 1e6].
+    --normalize                Whether to normalize the state input using running statistics [default: False].
     --num-epoch=<n>            Number of gradient descent steps per batch of experiences [default: 5].
+    --num-layers=<n>           Number of hidden layers between state/observation and outputs [default: 2].
     --run-path=<path>          The sub-directory name for model and summary statistics [default: ppo].
     --save-freq=<n>            Frequency at which to save model [default: 50000].
     --summary-freq=<n>         Frequency at which to save training statistics [default: 10000].
@@ -60,11 +62,13 @@
 time_horizon = int(options['--time-horizon'])
 beta = float(options['--beta'])
 num_epoch = int(options['--num-epoch'])
+num_layers = int(options['--num-layers'])
 epsilon = float(options['--epsilon'])
 buffer_size = int(options['--buffer-size'])
 learning_rate = float(options['--learning-rate'])
 hidden_units = int(options['--hidden-units'])
 batch_size = int(options['--batch-size'])
+normalize = options['--normalize']
 
 env = UnityEnvironment(file_name=env_name, worker_id=worker_id, curriculum=curriculum_file)
 print(str(env))
@@ -75,7 +79,8 @@
 # Create the Tensorflow model graph
 ppo_model = create_agent_model(env, lr=learning_rate,
                                h_size=hidden_units, epsilon=epsilon,
-                               beta=beta, max_step=max_steps)
+                               beta=beta, max_step=max_steps,
+                               normalize=normalize, num_layers=num_layers)
 
 is_continuous = (env.brains[brain_name].action_space_type == "continuous")
 use_observations = (env.brains[brain_name].number_observations > 0)
@@ -124,7 +129,7 @@ def get_progress():
             info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]
             trainer.reset_buffers(info, total=True)
         # Decide and take an action
-        new_info = trainer.take_action(info, env, brain_name, steps)
+        new_info = trainer.take_action(info, env, brain_name, steps, normalize)
         info = new_info
         trainer.process_experiences(info, time_horizon, gamma, lambd)
         if len(trainer.training_buffer['actions']) > buffer_size and train_model:
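
Because ppo.py parses its flags with docopt, --normalize is a plain switch (docopt returns True when it is present, False otherwise), while --num-layers=<n> arrives as a string and is cast with int(). A small self-contained sketch of that behaviour, using an abbreviated usage string rather than the full ppo.py docstring:

    # Hedged sketch: abbreviated docopt usage string, not the full ppo.py docstring.
    from docopt import docopt

    _USAGE = """
    Usage:
      ppo.py [options]

    Options:
      --normalize         Whether to normalize the state input using running statistics [default: False].
      --num-layers=<n>    Number of hidden layers between state/observation and outputs [default: 2].
    """

    options = docopt(_USAGE, argv=['--normalize', '--num-layers=3'])
    normalize = options['--normalize']          # True: switches parse to booleans
    num_layers = int(options['--num-layers'])   # value options parse to strings, hence int()

This is why the diff assigns normalize = options['--normalize'] with no cast, while num_layers goes through int() like the other numeric options.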

python/ppo/models.py

Lines changed: 46 additions & 30 deletions

@@ -5,7 +5,7 @@
 from unityagents import UnityEnvironmentException
 
 
-def create_agent_model(env, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6):
+def create_agent_model(env, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6, normalize=False, num_layers=2):
     """
     Takes a Unity environment and model-specific hyper-parameters and returns the
     appropriate PPO agent model for the environment.
@@ -17,12 +17,14 @@ def create_agent_model(env, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_ste
     :return: a sub-class of PPOAgent tailored to the environment.
     :param max_step: Total number of training steps.
     """
+    if num_layers < 1: num_layers = 1
+
     brain_name = env.brain_names[0]
     brain = env.brains[brain_name]
     if brain.action_space_type == "continuous":
-        return ContinuousControlModel(lr, brain, h_size, epsilon, max_step)
+        return ContinuousControlModel(lr, brain, h_size, epsilon, max_step, normalize, num_layers)
     if brain.action_space_type == "discrete":
-        return DiscreteControlModel(lr, brain, h_size, epsilon, beta, max_step)
+        return DiscreteControlModel(lr, brain, h_size, epsilon, beta, max_step, normalize, num_layers)
 
 
 def save_model(sess, saver, model_path="./", steps=0):
@@ -57,6 +59,9 @@ def export_graph(model_path, env_name="env", target_nodes="action,value_estimate
 
 
 class PPOModel(object):
+    def __init__(self):
+        self.normalize = False
+
     def create_global_steps(self):
         """Creates TF ops to track and increment global training step."""
         self.global_step = tf.Variable(0, name="global_step", trainable=False, dtype=tf.int32)
@@ -68,7 +73,7 @@ def create_reward_encoder(self):
         self.new_reward = tf.placeholder(shape=[], dtype=tf.float32, name='new_reward')
         self.update_reward = tf.assign(self.last_reward, self.new_reward)
 
-    def create_visual_encoder(self, o_size_h, o_size_w, bw, h_size, num_streams, activation):
+    def create_visual_encoder(self, o_size_h, o_size_w, bw, h_size, num_streams, activation, num_layers):
         """
         Builds a set of visual (CNN) encoders.
         :param o_size_h: Height observation size.
@@ -92,11 +97,13 @@ def create_visual_encoder(self, o_size_h, o_size_w, bw, h_size, num_streams, act
                                           use_bias=False, activation=activation)
             self.conv2 = tf.layers.conv2d(self.conv1, 32, kernel_size=[4, 4], strides=[2, 2],
                                           use_bias=False, activation=activation)
-            hidden = tf.layers.dense(c_layers.flatten(self.conv2), h_size, use_bias=False, activation=activation)
+            hidden = c_layers.flatten(self.conv2)
+            for j in range(num_layers):
+                hidden = tf.layers.dense(hidden, h_size, use_bias=False, activation=activation)
             streams.append(hidden)
         return streams
 
-    def create_continuous_state_encoder(self, s_size, h_size, num_streams, activation):
+    def create_continuous_state_encoder(self, s_size, h_size, num_streams, activation, num_layers):
         """
         Builds a set of hidden state encoders.
         :param s_size: state input size.
@@ -107,27 +114,30 @@ def create_continuous_state_encoder(self, s_size, h_size, num_streams, activatio
         """
         self.state_in = tf.placeholder(shape=[None, s_size], dtype=tf.float32, name='state')
 
-        self.running_mean = tf.get_variable("running_mean", [s_size], trainable=False, dtype=tf.float32,
-                                            initializer=tf.zeros_initializer())
-        self.running_variance = tf.get_variable("running_variance", [s_size], trainable=False, dtype=tf.float32,
-                                                initializer=tf.ones_initializer())
-
-        self.normalized_state = tf.clip_by_value((self.state_in - self.running_mean) / tf.sqrt(
-            self.running_variance / (tf.cast(self.global_step, tf.float32) + 1)), -5, 5, name="normalized_state")
+        if self.normalize:
+            self.running_mean = tf.get_variable("running_mean", [s_size], trainable=False, dtype=tf.float32,
+                                                initializer=tf.zeros_initializer())
+            self.running_variance = tf.get_variable("running_variance", [s_size], trainable=False, dtype=tf.float32,
+                                                    initializer=tf.ones_initializer())
 
-        self.new_mean = tf.placeholder(shape=[s_size], dtype=tf.float32, name='new_mean')
-        self.new_variance = tf.placeholder(shape=[s_size], dtype=tf.float32, name='new_variance')
-        self.update_mean = tf.assign(self.running_mean, self.new_mean)
-        self.update_variance = tf.assign(self.running_variance, self.new_variance)
+            self.normalized_state = tf.clip_by_value((self.state_in - self.running_mean) / tf.sqrt(
+                self.running_variance / (tf.cast(self.global_step, tf.float32) + 1)), -5, 5, name="normalized_state")
 
+            self.new_mean = tf.placeholder(shape=[s_size], dtype=tf.float32, name='new_mean')
+            self.new_variance = tf.placeholder(shape=[s_size], dtype=tf.float32, name='new_variance')
+            self.update_mean = tf.assign(self.running_mean, self.new_mean)
+            self.update_variance = tf.assign(self.running_variance, self.new_variance)
+        else:
+            self.normalized_state = self.state_in
         streams = []
         for i in range(num_streams):
-            hidden_1 = tf.layers.dense(self.normalized_state, h_size, use_bias=False, activation=activation)
-            hidden_2 = tf.layers.dense(hidden_1, h_size, use_bias=False, activation=activation)
-            streams.append(hidden_2)
+            hidden = self.normalized_state
+            for j in range(num_layers):
+                hidden = tf.layers.dense(hidden, h_size, use_bias=False, activation=activation)
+            streams.append(hidden)
         return streams
 
-    def create_discrete_state_encoder(self, s_size, h_size, num_streams, activation):
+    def create_discrete_state_encoder(self, s_size, h_size, num_streams, activation, num_layers):
         """
         Builds a set of hidden state encoders from discrete state input.
         :param s_size: state input size (discrete).
@@ -140,8 +150,10 @@ def create_discrete_state_encoder(self, s_size, h_size, num_streams, activation)
         state_in = tf.reshape(self.state_in, [-1])
         state_onehot = c_layers.one_hot_encoding(state_in, s_size)
         streams = []
+        hidden = state_onehot
         for i in range(num_streams):
-            hidden = tf.layers.dense(state_onehot, h_size, use_bias=False, activation=activation)
+            for j in range(num_layers):
+                hidden = tf.layers.dense(hidden, h_size, use_bias=False, activation=activation)
             streams.append(hidden)
         return streams
 
@@ -186,29 +198,31 @@ def create_ppo_optimizer(self, probs, old_probs, value, entropy, beta, epsilon,
 
 
 class ContinuousControlModel(PPOModel):
-    def __init__(self, lr, brain, h_size, epsilon, max_step):
+    def __init__(self, lr, brain, h_size, epsilon, max_step, normalize, num_layers):
         """
         Creates Continuous Control Actor-Critic model.
         :param brain: State-space size
        :param h_size: Hidden layer size
         """
+        super().__init__()
         s_size = brain.state_space_size
         a_size = brain.action_space_size
 
+        self.normalize = normalize
         self.create_global_steps()
         self.create_reward_encoder()
 
         hidden_state, hidden_visual, hidden_policy, hidden_value = None, None, None, None
         if brain.number_observations > 0:
             height_size, width_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
             bw = brain.camera_resolutions[0]['blackAndWhite']
-            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh)
+            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh, num_layers)
         if brain.state_space_size > 0:
             s_size = brain.state_space_size
             if brain.state_space_type == "continuous":
-                hidden_state = self.create_continuous_state_encoder(s_size, h_size, 2, tf.nn.tanh)
+                hidden_state = self.create_continuous_state_encoder(s_size, h_size, 2, tf.nn.tanh, num_layers)
             else:
-                hidden_state = self.create_discrete_state_encoder(s_size, h_size, 2, tf.nn.tanh)
+                hidden_state = self.create_discrete_state_encoder(s_size, h_size, 2, tf.nn.tanh, num_layers)
 
         if hidden_visual is None and hidden_state is None:
             raise Exception("No valid network configuration possible. "
@@ -249,26 +263,28 @@ def __init__(self, lr, brain, h_size, epsilon, max_step):
 
 
 class DiscreteControlModel(PPOModel):
-    def __init__(self, lr, brain, h_size, epsilon, beta, max_step):
+    def __init__(self, lr, brain, h_size, epsilon, beta, max_step, normalize, num_layers):
         """
         Creates Discrete Control Actor-Critic model.
         :param brain: State-space size
         :param h_size: Hidden layer size
         """
+        super().__init__()
         self.create_global_steps()
         self.create_reward_encoder()
+        self.normalize = normalize
 
         hidden_state, hidden_visual, hidden = None, None, None
         if brain.number_observations > 0:
             height_size, width_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
             bw = brain.camera_resolutions[0]['blackAndWhite']
-            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu)[0]
+            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu, num_layers)[0]
         if brain.state_space_size > 0:
             s_size = brain.state_space_size
             if brain.state_space_type == "continuous":
-                hidden_state = self.create_continuous_state_encoder(s_size, h_size, 1, tf.nn.elu)[0]
+                hidden_state = self.create_continuous_state_encoder(s_size, h_size, 1, tf.nn.elu, num_layers)[0]
             else:
-                hidden_state = self.create_discrete_state_encoder(s_size, h_size, 1, tf.nn.elu)[0]
+                hidden_state = self.create_discrete_state_encoder(s_size, h_size, 1, tf.nn.elu, num_layers)[0]
 
         if hidden_visual is None and hidden_state is None:
             raise Exception("No valid network configuration possible. "
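
Two things are worth spelling out about the model changes. First, num_layers just repeats the dense-layer construction in a loop, so with the default of 2 the continuous state encoder keeps its previous depth while the visual and discrete encoders gain one layer relative to the old single dense layer. Second, when normalize=True the graph standardizes states against running statistics, dividing the running variance by the step count before the square root and clipping to [-5, 5]. A NumPy sketch of that normalization op (function and argument names are illustrative, not part of the module):

    import numpy as np

    def normalize_state(state, running_mean, running_variance, global_step, clip=5.0):
        # Mirrors the normalized_state op built when normalize=True:
        # (state - mean) / sqrt(variance / (step + 1)), clipped to [-clip, clip].
        normalized = (state - running_mean) / np.sqrt(running_variance / (global_step + 1.0))
        return np.clip(normalized, -clip, clip)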

python/ppo/trainer.py

Lines changed: 2 additions & 2 deletions

@@ -41,7 +41,7 @@ def running_average(self, data, steps, running_mean, running_variance):
         new_variance = var + (current_x - new_mean) * (current_x - mean)
         return new_mean, new_variance
 
-    def take_action(self, info, env, brain_name, steps):
+    def take_action(self, info, env, brain_name, steps, normalize):
         """
         Decides actions given state/observation information, and takes them in environment.
         :param info: Current BrainInfo from environment.
@@ -60,7 +60,7 @@ def take_action(self, info, env, brain_name, steps):
             feed_dict[self.model.observation_in] = np.vstack(info.observations)
         if self.use_states:
             feed_dict[self.model.state_in] = info.states
-        if self.is_training and env.brains[brain_name].state_space_type == "continuous" and self.use_states:
+        if self.is_training and env.brains[brain_name].state_space_type == "continuous" and self.use_states and normalize:
             new_mean, new_variance = self.running_average(info.states, steps, self.model.running_mean,
                                                           self.model.running_variance)
             feed_dict[self.model.new_mean] = new_mean
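
The trainer change only threads the flag through: the incremental running_average update and the new_mean/new_variance feeds already existed, and are now skipped entirely unless normalize is set. For reference, a hedged NumPy reconstruction of that Welford-style update; only the variance line appears in the hunk above, so the mean-update step is inferred from the standard incremental formula rather than taken from the source:

    import numpy as np

    def running_average(data, steps, running_mean, running_variance):
        # Incremental mean/variance update over the batch mean of the new states.
        current_x = np.mean(data, axis=0)
        new_mean = running_mean + (current_x - running_mean) / (steps + 1)                      # inferred step
        new_variance = running_variance + (current_x - new_mean) * (current_x - running_mean)   # from the hunk
        return new_mean, new_variance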
