
Commit a1d35bf

asolano authored and awjuliani committed
Initial support for multiple observations (#256)
* Initial support for multiple observations
* Fix PPO for continuous control
1 parent 921bb15 commit a1d35bf

File tree

3 files changed: +38 additions, -16 deletions


python/ppo/history.py

Lines changed: 14 additions & 1 deletion
@@ -1,6 +1,6 @@
 import numpy as np

-history_keys = ['states', 'observations', 'actions', 'rewards', 'action_probs', 'epsilons',
+history_keys = ['states', 'actions', 'rewards', 'action_probs', 'epsilons',
                 'value_estimates', 'advantages', 'discounted_returns']


@@ -44,6 +44,8 @@ def empty_local_history(agent_dict):
     """
     for key in history_keys:
         agent_dict[key] = []
+    for i, _ in enumerate(key for key in agent_dict.keys() if key.startswith('observations')):
+        agent_dict['observations%d' % i] = []
     return agent_dict


@@ -55,6 +57,8 @@ def vectorize_history(agent_dict):
     """
     for key in history_keys:
         agent_dict[key] = np.array(agent_dict[key])
+    for key in (key for key in agent_dict.keys() if key.startswith('observations')):
+        agent_dict[key] = np.array(agent_dict[key])
     return agent_dict


@@ -70,6 +74,8 @@ def empty_all_history(agent_info):
         history_dict[agent] = empty_local_history(history_dict[agent])
         history_dict[agent]['cumulative_reward'] = 0
         history_dict[agent]['episode_steps'] = 0
+        for i, _ in enumerate(agent_info.observations):
+            history_dict[agent]['observations%d' % i] = []
     return history_dict


@@ -82,6 +88,8 @@ def append_history(global_buffer, local_buffer=None):
     """
     for key in history_keys:
         global_buffer[key] = np.concatenate([global_buffer[key], local_buffer[key]], axis=0)
+    for key in (key for key in local_buffer.keys() if key.startswith('observations')):
+        global_buffer[key] = np.concatenate([global_buffer[key], local_buffer[key]], axis=0)
     return global_buffer


@@ -94,6 +102,8 @@ def set_history(global_buffer, local_buffer=None):
     """
     for key in history_keys:
         global_buffer[key] = np.copy(local_buffer[key])
+    for key in (key for key in local_buffer.keys() if key.startswith('observations')):
+        global_buffer[key] = np.array(local_buffer[key])
     return global_buffer


@@ -108,4 +118,7 @@ def shuffle_buffer(global_buffer):
     for key in history_keys:
         if len(global_buffer[key]) > 0:
             global_buffer[key] = global_buffer[key][s]
+    for key in (key for key in global_buffer.keys() if key.startswith('observations')):
+        if len(global_buffer[key]) > 0:
+            global_buffer[key] = global_buffer[key][s]
     return global_buffer
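
Illustrative note (not part of the commit): a minimal sketch of what a local history buffer looks like after this change, assuming a hypothetical agent with two cameras. Each camera gets its own 'observations<i>' list instead of the single 'observations' key that was removed from history_keys.

    import numpy as np

    history_keys = ['states', 'actions', 'rewards', 'action_probs', 'epsilons',
                    'value_estimates', 'advantages', 'discounted_returns']

    # Hypothetical buffer for one agent with two cameras.
    agent_buffer = {key: [] for key in history_keys}
    for i in range(2):
        agent_buffer['observations%d' % i] = []   # one list per camera

    # Appending a single step: each camera frame goes into its own list.
    rgb_frame = np.zeros((84, 84, 3), dtype=np.float32)   # camera 0 (RGB)
    bw_frame = np.zeros((32, 32, 1), dtype=np.float32)    # camera 1 (black and white)
    agent_buffer['observations0'].append([rgb_frame])
    agent_buffer['observations1'].append([bw_frame])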

python/ppo/models.py

Lines changed: 16 additions & 11 deletions
@@ -61,6 +61,7 @@ def export_graph(model_path, env_name="env", target_nodes="action,value_estimate
 class PPOModel(object):
     def __init__(self):
         self.normalize = False
+        self.observation_in = []

     def create_global_steps(self):
         """Creates TF ops to track and increment global training step."""
@@ -89,11 +90,11 @@ def create_visual_encoder(self, o_size_h, o_size_w, bw, h_size, num_streams, act
         else:
             c_channels = 3

-        self.observation_in = tf.placeholder(shape=[None, o_size_h, o_size_w, c_channels], dtype=tf.float32,
-                                             name='observation_0')
+        self.observation_in.append(tf.placeholder(shape=[None, o_size_h, o_size_w, c_channels], dtype=tf.float32,
+                                                  name='observation_%d' % len(self.observation_in)))
         streams = []
         for i in range(num_streams):
-            self.conv1 = tf.layers.conv2d(self.observation_in, 16, kernel_size=[8, 8], strides=[4, 4],
+            self.conv1 = tf.layers.conv2d(self.observation_in[-1], 16, kernel_size=[8, 8], strides=[4, 4],
                                           use_bias=False, activation=activation)
             self.conv2 = tf.layers.conv2d(self.conv1, 32, kernel_size=[4, 4], strides=[2, 2],
                                           use_bias=False, activation=activation)
@@ -213,10 +214,12 @@ def __init__(self, lr, brain, h_size, epsilon, max_step, normalize, num_layers):
         self.create_reward_encoder()

         hidden_state, hidden_visual, hidden_policy, hidden_value = None, None, None, None
-        if brain.number_observations > 0:
-            height_size, width_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
-            bw = brain.camera_resolutions[0]['blackAndWhite']
-            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh, num_layers)
+        encoders = []
+        for i in range(brain.number_observations):
+            height_size, width_size = brain.camera_resolutions[i]['height'], brain.camera_resolutions[i]['width']
+            bw = brain.camera_resolutions[i]['blackAndWhite']
+            encoders.append(self.create_visual_encoder(height_size, width_size, bw, h_size, 2, tf.nn.tanh, num_layers))
+        hidden_visual = tf.concat(encoders, axis=2)
         if brain.state_space_size > 0:
             s_size = brain.state_space_size
             if brain.state_space_type == "continuous":
@@ -275,10 +278,12 @@ def __init__(self, lr, brain, h_size, epsilon, beta, max_step, normalize, num_la
         self.normalize = normalize

         hidden_state, hidden_visual, hidden = None, None, None
-        if brain.number_observations > 0:
-            height_size, width_size = brain.camera_resolutions[0]['height'], brain.camera_resolutions[0]['width']
-            bw = brain.camera_resolutions[0]['blackAndWhite']
-            hidden_visual = self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu, num_layers)[0]
+        encoders = []
+        for i in range(brain.number_observations):
+            height_size, width_size = brain.camera_resolutions[i]['height'], brain.camera_resolutions[i]['width']
+            bw = brain.camera_resolutions[i]['blackAndWhite']
+            encoders.append(self.create_visual_encoder(height_size, width_size, bw, h_size, 1, tf.nn.elu, num_layers)[0])
+        hidden_visual = tf.concat(encoders, axis=1)
         if brain.state_space_size > 0:
             s_size = brain.state_space_size
             if brain.state_space_type == "continuous":
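
Illustrative note (not from the commit): a simplified sketch, in the same TensorFlow 1.x style as models.py, of the pattern these hunks introduce; build one visual encoder per camera, collect the placeholders in a list, and concatenate the encoder outputs into a single hidden vector. The helper name and layer sizes below are hypothetical.

    import tensorflow as tf  # TensorFlow 1.x API, as used in models.py

    def build_multi_camera_encoder(camera_resolutions, h_size=64):
        """Hypothetical helper: one placeholder and one encoder per camera."""
        observation_in = []   # plays the role of PPOModel.observation_in
        encodings = []
        for i, res in enumerate(camera_resolutions):
            c_channels = 1 if res['blackAndWhite'] else 3
            obs = tf.placeholder(shape=[None, res['height'], res['width'], c_channels],
                                 dtype=tf.float32, name='observation_%d' % i)
            observation_in.append(obs)
            conv = tf.layers.conv2d(obs, 16, kernel_size=[8, 8], strides=[4, 4],
                                    use_bias=False, activation=tf.nn.elu)
            flat = tf.layers.flatten(conv)
            encodings.append(tf.layers.dense(flat, h_size, activation=tf.nn.elu))
        # All per-camera encodings are merged into one hidden representation.
        hidden_visual = tf.concat(encodings, axis=1)
        return observation_in, hidden_visual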

python/ppo/trainer.py

Lines changed: 8 additions & 4 deletions
@@ -57,7 +57,8 @@ def take_action(self, info, env, brain_name, steps, normalize):
             epsi = np.random.randn(len(info.states), env.brains[brain_name].action_space_size)
             feed_dict[self.model.epsilon] = epsi
         if self.use_observations:
-            feed_dict[self.model.observation_in] = np.vstack(info.observations)
+            for i, _ in enumerate(info.observations):
+                feed_dict[self.model.observation_in[i]] = info.observations[i]
         if self.use_states:
             feed_dict[self.model.state_in] = info.states
         if self.is_training and env.brains[brain_name].state_space_type == "continuous" and self.use_states and normalize:
@@ -91,7 +92,8 @@ def add_experiences(self, info, next_info, epsi, actions, a_dist, value):
             idx = info.agents.index(agent)
             if not info.local_done[idx]:
                 if self.use_observations:
-                    history['observations'].append([info.observations[0][idx]])
+                    for i, _ in enumerate(info.observations):
+                        history['observations%d' % i].append([info.observations[i][idx]])
                 if self.use_states:
                     history['states'].append(info.states[idx])
                 if self.is_continuous:
@@ -120,7 +122,8 @@ def process_experiences(self, info, time_horizon, gamma, lambd):
                 else:
                     feed_dict = {self.model.batch_size: len(info.states)}
                     if self.use_observations:
-                        feed_dict[self.model.observation_in] = np.vstack(info.observations)
+                        for i in range(self.info.observations):
+                            feed_dict[self.model.observation_in[i]] = info.observations[i]
                     if self.use_states:
                         feed_dict[self.model.state_in] = info.states
                     value_next = self.sess.run(self.model.value, feed_dict)[l]
@@ -176,7 +179,8 @@ def update_model(self, batch_size, num_epoch):
                 if self.use_states:
                     feed_dict[self.model.state_in] = np.vstack(training_buffer['states'][start:end])
                 if self.use_observations:
-                    feed_dict[self.model.observation_in] = np.vstack(training_buffer['observations'][start:end])
+                    for i, _ in enumerate(self.model.observation_in):
+                        feed_dict[self.model.observation_in[i]] = np.vstack(training_buffer['observations%d' % i][start:end])
                 v_loss, p_loss, _ = self.sess.run([self.model.value_loss, self.model.policy_loss,
                                                    self.model.update_batch], feed_dict=feed_dict)
                 total_v += v_loss
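
Illustrative note (not from the commit): the trainer now fills the feed_dict with one entry per camera placeholder instead of stacking every camera into a single tensor. A minimal sketch of that pattern, using plain strings and NumPy arrays as stand-ins for the TensorFlow placeholders and the per-camera arrays in BrainInfo:

    import numpy as np

    # Stand-ins: in the real trainer these are tf placeholders (model.observation_in)
    # and the per-camera batches from BrainInfo (info.observations).
    observation_in = ['observation_0', 'observation_1']
    info_observations = [np.zeros((4, 84, 84, 3), dtype=np.float32),   # 4 agents, camera 0
                         np.zeros((4, 32, 32, 1), dtype=np.float32)]   # 4 agents, camera 1

    feed_dict = {}
    for i, _ in enumerate(info_observations):
        feed_dict[observation_in[i]] = info_observations[i]

    assert len(feed_dict) == 2   # one batch of frames per camera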
