Commit 77b04d1

PPO additions and warnings

* Add linear decay to learning rate for PPO
* Add warning/exception for unsupported brain configurations w/ PPO

1 parent b8109bb · commit 77b04d1
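A note on the schedule being added (based on the polynomial_decay call in the models.py diff below): with power=1.0 and an end rate of 1e-10, TensorFlow's tf.train.polynomial_decay is effectively a linear ramp, lr(step) ≈ lr * (1 - step / max_step) for step ≤ max_step, so the learning rate reaches approximately zero once global_step hits max_step. How fast that happens in practice depends on how often the trainer runs increment_step.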
5 files changed: +38 −25 lines
python/PPO.ipynb
Lines changed: 1 addition & 1 deletion

@@ -105,7 +105,7 @@
 "# Create the Tensorflow model graph\n",
 "ppo_model = create_agent_model(env, lr=learning_rate,\n",
 "                               h_size=hidden_units, epsilon=epsilon,\n",
-"                               beta=beta)\n",
+"                               beta=beta, max_step=max_steps)\n",
 "\n",
 "is_continuous = (env.brains[brain_name].action_space_type == \"continuous\")\n",
 "use_observations = (env.brains[brain_name].number_observations > 0)\n",

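The notebook cell above now needs a max_steps value defined earlier in PPO.ipynb. That cell is not part of this diff, so the following is only an assumed sketch (the name comes from the call above; the value mirrors ppo.py's --max-steps default):

# Assumed hyperparameter cell in PPO.ipynb -- not shown in this commit's diff.
max_steps = 5e6  # total training steps; also the horizon of the linear learning-rate decay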
python/ppo.py
Lines changed: 3 additions & 3 deletions

@@ -15,7 +15,7 @@
 
 Options:
   --help                     Show this message.
-  --max-step=<n>             Maximum number of steps to run environment [default: 5e6].
+  --max-steps=<n>            Maximum number of steps to run environment [default: 5e6].
   --run-path=<path>          The sub-directory name for model and summary statistics [default: ppo].
   --load                     Whether to load the model or randomly initialize [default: False].
   --train                    Whether to train model, or only run inference [default: True].

@@ -38,7 +38,7 @@
 print(options)
 
 # General parameters
-max_steps = float(options['--max-step'])
+max_steps = float(options['--max-steps'])
 model_path = './models/{}'.format(str(options['--run-path']))
 summary_path = './summaries/{}'.format(str(options['--run-path']))
 load_model = options['--load']

@@ -69,7 +69,7 @@
 # Create the Tensorflow model graph
 ppo_model = create_agent_model(env, lr=learning_rate,
                                h_size=hidden_units, epsilon=epsilon,
-                               beta=beta)
+                               beta=beta, max_step=max_steps)
 
 is_continuous = (env.brains[brain_name].action_space_type == "continuous")
 use_observations = (env.brains[brain_name].number_observations > 0)

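The renamed flag comes back from the options dictionary as a string, hence the float() conversion above. Assuming ppo.py parses its docstring with docopt (which the Options: block and the options[...] lookups suggest, though the import is not shown in this diff), a minimal round-trip sketch:

# Minimal sketch of parsing --max-steps; assumes docopt, which this diff does not show directly.
from docopt import docopt

_USAGE = """
Usage:
  ppo.py [options]

Options:
  --max-steps=<n>    Maximum number of steps to run environment [default: 5e6].
"""

options = docopt(_USAGE)                   # with no CLI arguments, the default applies
max_steps = float(options['--max-steps'])  # docopt returns strings: '5e6' -> 5000000.0
print(max_steps)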
python/ppo/models.py
Lines changed: 27 additions & 16 deletions

@@ -2,9 +2,10 @@
 import tensorflow as tf
 import tensorflow.contrib.layers as c_layers
 from tensorflow.python.tools import freeze_graph
+from unityagents import UnityEnvironmentException
 
 
-def create_agent_model(env, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3):
+def create_agent_model(env, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3, max_step=5e6):
     """
     Takes a Unity environment and model-specific hyperparameters and returns the
     appropriate PPO agent model for the environment.

@@ -17,16 +18,23 @@ def create_agent_model(env, lr=1e-4, h_size=128, epsilon=0.2, beta=1e-3):
     """
     brain_name = env.brain_names[0]
     if env.brains[brain_name].action_space_type == "continuous":
-        return ContinuousControlModel(lr, env.brains[brain_name].state_space_size,
-                                      env.brains[brain_name].action_space_size, h_size, epsilon, beta)
+        if env.brains[brain_name].number_observations == 0:
+            return ContinuousControlModel(lr, env.brains[brain_name].state_space_size,
+                                          env.brains[brain_name].action_space_size, h_size, epsilon, beta, max_step)
+        else:
+            raise UnityEnvironmentException("There is currently no PPO model which supports both a continuous "
+                                            "action space and camera observations.")
     if env.brains[brain_name].action_space_type == "discrete":
         if env.brains[brain_name].number_observations == 0:
             return DiscreteControlModel(lr, env.brains[brain_name].state_space_size,
-                                        env.brains[brain_name].action_space_size, h_size, epsilon, beta)
+                                        env.brains[brain_name].action_space_size, h_size, epsilon, beta, max_step)
         else:
             brain = env.brains[brain_name]
+            if env.brains[brain_name].state_space_size > 0:
+                print("This brain contains agents with both observations and states. There is currently no PPO model"
+                      "which supports this. Defaulting to Vision-based PPO model.")
             h, w = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['height']
-            return VisualDiscreteControlModel(lr, h, w, env.brains[brain_name].action_space_size, h_size, epsilon, beta)
+            return VisualDiscreteControlModel(lr, h, w, env.brains[brain_name].action_space_size, h_size, epsilon, beta, max_step)
 
 
 def save_model(sess, saver, model_path="./", steps=0):

@@ -37,7 +45,7 @@ def save_model(sess, saver, model_path="./", steps=0):
     :param steps: Current number of steps in training process.
     :param saver: Tensorflow saver for session.
     """
-    last_checkpoint = model_path+'/model-'+str(steps)+'.cptk'
+    last_checkpoint = model_path + '/model-' + str(steps) + '.cptk'
     saver.save(sess, last_checkpoint)
     tf.train.write_graph(sess.graph_def, model_path, 'raw_graph_def.pb', as_text=False)
     print("Saved Model")

@@ -61,7 +69,7 @@ def export_graph(model_path, env_name="env", target_nodes="action"):
 
 
 class PPOModel(object):
-    def __init__(self, probs, old_probs, value, entropy, beta, epsilon, lr):
+    def __init__(self, probs, old_probs, value, entropy, beta, epsilon, lr, max_step):
         """
         Creates training-specific Tensorflow ops for PPO models.
         :param probs: Current policy probabilities

@@ -85,15 +93,18 @@ def __init__(self, probs, old_probs, value, entropy, beta, epsilon, lr):
 
         self.loss = self.policy_loss + self.value_loss - beta * tf.reduce_mean(entropy)
 
-        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
+        self.global_step = tf.Variable(0, trainable=False, name='global_step', dtype=tf.int32)
+        self.learning_rate = tf.train.polynomial_decay(lr, self.global_step,
+                                                       max_step, 1e-10,
+                                                       power=1.0)
+        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
         self.update_batch = optimizer.minimize(self.loss)
 
-        self.global_step = tf.Variable(0, trainable=False, name='global_step', dtype=tf.int32)
-        self.increment_step = tf.assign(self.global_step, self.global_step+1)
+        self.increment_step = tf.assign(self.global_step, self.global_step + 1)
 
 
 class ContinuousControlModel(PPOModel):
-    def __init__(self, lr, s_size, a_size, h_size, epsilon, beta):
+    def __init__(self, lr, s_size, a_size, h_size, epsilon, beta, max_step):
         """
         Creates Continuous Control Actor-Critic model.
         :param s_size: State-space size

@@ -127,11 +138,11 @@ def __init__(self, lr, s_size, a_size, h_size, epsilon, beta):
 
         self.old_probs = tf.placeholder(shape=[None, a_size], dtype=tf.float32, name='old_probabilities')
 
-        PPOModel.__init__(self, self.probs, self.old_probs, self.value, self.entropy, 0.0, epsilon, lr)
+        PPOModel.__init__(self, self.probs, self.old_probs, self.value, self.entropy, 0.0, epsilon, lr, max_step)
 
 
 class DiscreteControlModel(PPOModel):
-    def __init__(self, lr, s_size, a_size, h_size, epsilon, beta):
+    def __init__(self, lr, s_size, a_size, h_size, epsilon, beta, max_step):
         """
         Creates Discrete Control Actor-Critic model.
         :param s_size: State-space size

@@ -158,11 +169,11 @@ def __init__(self, lr, s_size, a_size, h_size, epsilon, beta):
         self.old_responsible_probs = tf.reduce_sum(self.old_probs * self.selected_actions, axis=1)
 
         PPOModel.__init__(self, self.responsible_probs, self.old_responsible_probs,
-                          self.value, self.entropy, beta, epsilon, lr)
+                          self.value, self.entropy, beta, epsilon, lr, max_step)
 
 
 class VisualDiscreteControlModel(PPOModel):
-    def __init__(self, lr, o_size_h, o_size_w, a_size, h_size, epsilon, beta):
+    def __init__(self, lr, o_size_h, o_size_w, a_size, h_size, epsilon, beta, max_step):
         """
         Creates Discrete Control Actor-Critic model for use with visual observations (images).
         :param o_size_h: Observation height.

@@ -194,4 +205,4 @@ def __init__(self, lr, o_size_h, o_size_w, a_size, h_size, epsilon, beta):
         self.old_responsible_probs = tf.reduce_sum(self.old_probs * self.selected_actions, axis=1)
 
         PPOModel.__init__(self, self.responsible_probs, self.old_responsible_probs,
-                          self.value, self.entropy, beta, epsilon, lr)
+                          self.value, self.entropy, beta, epsilon, lr, max_step)

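A self-contained sketch of the schedule the PPOModel change wires up, useful for sanity-checking the decay outside the full training loop (TF 1.x graph mode assumed; the lr and max_step values here are illustrative, not taken from the repo):

# Standalone check of the linear learning-rate decay added to PPOModel (TF 1.x assumed).
import tensorflow as tf

lr, max_step = 3e-4, 1000.0   # illustrative values only

global_step = tf.Variable(0, trainable=False, name='global_step', dtype=tf.int32)
learning_rate = tf.train.polynomial_decay(lr, global_step,
                                           max_step, 1e-10,
                                           power=1.0)          # power=1.0 -> linear decay
increment_step = tf.assign(global_step, global_step + 1)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(0, 1001, 250):
        print(step, sess.run(learning_rate))   # drops linearly from 3e-4 toward 1e-10
        for _ in range(250):
            sess.run(increment_step)

Because Adam now consumes self.learning_rate instead of the constant lr, each increment of global_step shrinks the effective step size until it is essentially zero at max_step.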
python/ppo/trainer.py
Lines changed: 6 additions & 4 deletions

@@ -17,7 +17,7 @@ def __init__(self, ppo_model, sess, info, is_continuous, use_observations):
         self.model = ppo_model
         self.sess = sess
         stats = {'cumulative_reward': [], 'episode_length': [], 'value_estimate': [],
-                 'entropy': [], 'value_loss': [], 'policy_loss': []}
+                 'entropy': [], 'value_loss': [], 'policy_loss': [], 'learning_rate': []}
         self.stats = stats
 
         self.training_buffer = vectorize_history(empty_local_history({}))

@@ -45,11 +45,13 @@ def take_action(self, info, env, brain_name):
                          self.model.batch_size: len(info.states)}
         else:
             feed_dict = {self.model.state_in: info.states, self.model.batch_size: len(info.states)}
-        actions, a_dist, value, ent = self.sess.run([self.model.output, self.model.probs,
-                                                     self.model.value, self.model.entropy],
-                                                    feed_dict=feed_dict)
+        actions, a_dist, value, ent, learn_rate = self.sess.run([self.model.output, self.model.probs,
+                                                                 self.model.value, self.model.entropy,
+                                                                 self.model.learning_rate],
+                                                                feed_dict=feed_dict)
         self.stats['value_estimate'].append(value)
         self.stats['entropy'].append(ent)
+        self.stats['learning_rate'].append(learn_rate)
         new_info = env.step(actions, value={brain_name: value})[brain_name]
         self.add_experiences(info, new_info, epsi, actions, a_dist, value)
         return new_info

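How the new 'learning_rate' entry is ultimately reported is outside this diff. One plausible way to surface it, sketched under the assumption that per-key means are written to TensorBoard with a tf.summary.FileWriter (the writer, tag names, and write_stats helper are assumptions, not code from this commit):

# Hedged sketch: flushing trainer.stats (including 'learning_rate') to TensorBoard. TF 1.x assumed.
import numpy as np
import tensorflow as tf

def write_stats(summary_writer, stats, step):
    summary = tf.Summary()
    for key, values in stats.items():
        if len(values) > 0:
            summary.value.add(tag='Info/{}'.format(key), simple_value=float(np.mean(values)))
            stats[key] = []                                  # reset after flushing
    summary_writer.add_summary(summary, step)
    summary_writer.flush()

# Hypothetical usage:
# writer = tf.summary.FileWriter(summary_path)
# write_stats(writer, trainer.stats, steps)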
python/setup.py
Lines changed: 1 addition & 1 deletion

@@ -7,7 +7,7 @@
     required = f.read().splitlines()
 
 setup(name='unityagents',
-      version='0.1',
+      version='0.1.1',
       description='Unity Machine Learning Agents',
       license='Apache License 2.0',
       author='Unity Technologies',
