
Commit 27cd6dd

Author: Ervin T
Modification of reward signals and rl_trainer for SAC (#2433)
* Adds evaluate_batch to reward signals, which evaluates on a minibatch rather than on BrainInfo.
* Changes the way reward signal results are reported in rl_trainer so that the pure, unprocessed environment reward is kept separate from the reward signals.
* Moves end_episode to rl_trainer.
* Fixes a bug in BCModule when used with an RNN.
1 parent a46fd87 commit 27cd6dd

File tree: 11 files changed, +215 and -118 lines


ml-agents/mlagents/trainers/components/bc/module.py

Lines changed: 1 addition & 1 deletion
@@ -133,7 +133,7 @@ def _update_batch(
         else:
             feed_dict[self.policy.model.action_masks] = np.ones(
                 (
-                    self.n_sequences,
+                    self.n_sequences * self.policy.sequence_length,
                     sum(self.policy.model.brain.vector_action_space_size),
                 )
             )
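For context, a minimal NumPy sketch (hypothetical numbers, not part of the commit) of why this fix matters with a recurrent policy: the minibatch holds n_sequences sequences of sequence_length steps each, so the all-ones action mask needs one row per step, not one row per sequence.

import numpy as np

n_sequences = 4
sequence_length = 8
branch_sizes = [3, 2]  # stand-in for brain.vector_action_space_size

# One mask row per step in the recurrent minibatch, one column per discrete action branch entry.
action_masks = np.ones((n_sequences * sequence_length, sum(branch_sizes)))
print(action_masks.shape)  # (32, 5)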

ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py

Lines changed: 35 additions & 25 deletions
@@ -1,8 +1,7 @@
 from typing import Any, Dict, List
 import numpy as np
-from mlagents.envs.brain import BrainInfo
-
 import tensorflow as tf
+from mlagents.envs.brain import BrainInfo
 
 from mlagents.trainers.buffer import Buffer
 from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult
@@ -56,25 +55,40 @@ def evaluate(
         :return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator
         """
         if len(current_info.agents) == 0:
-            return []
+            return RewardSignalResult([], [])
+        mini_batch: Dict[str, np.array] = {}
+        # Construct the batch and use evaluate_batch
+        mini_batch["actions"] = next_info.previous_vector_actions
+        mini_batch["done"] = np.reshape(next_info.local_done, [-1, 1])
+        for i in range(len(current_info.visual_observations)):
+            mini_batch["visual_obs%d" % i] = current_info.visual_observations[i]
+            mini_batch["next_visual_obs%d" % i] = next_info.visual_observations[i]
+        if self.policy.use_vec_obs:
+            mini_batch["vector_obs"] = current_info.vector_observations
+            mini_batch["next_vector_in"] = next_info.vector_observations
 
-        feed_dict = {
-            self.policy.model.batch_size: len(next_info.vector_observations),
-            self.policy.model.sequence_length: 1,
+        result = self.evaluate_batch(mini_batch)
+        return result
+
+    def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
+        feed_dict: Dict[tf.Tensor, Any] = {
+            self.policy.model.batch_size: len(mini_batch["actions"]),
+            self.policy.model.sequence_length: self.policy.sequence_length,
         }
-        feed_dict = self.policy.fill_eval_dict(feed_dict, brain_info=current_info)
+        if self.policy.use_vec_obs:
+            feed_dict[self.policy.model.vector_in] = mini_batch["vector_obs"]
+            feed_dict[self.model.next_vector_in] = mini_batch["next_vector_in"]
+        if self.policy.model.vis_obs_size > 0:
+            for i in range(len(self.policy.model.visual_in)):
+                _obs = mini_batch["visual_obs%d" % i]
+                _next_obs = mini_batch["next_visual_obs%d" % i]
+                feed_dict[self.policy.model.visual_in[i]] = _obs
+                feed_dict[self.model.next_visual_in[i]] = _next_obs
+
         if self.policy.use_continuous_act:
-            feed_dict[
-                self.policy.model.selected_actions
-            ] = next_info.previous_vector_actions
+            feed_dict[self.policy.model.selected_actions] = mini_batch["actions"]
         else:
-            feed_dict[
-                self.policy.model.action_holder
-            ] = next_info.previous_vector_actions
-        for i in range(self.policy.model.vis_obs_size):
-            feed_dict[self.model.next_visual_in[i]] = next_info.visual_observations[i]
-        if self.policy.use_vec_obs:
-            feed_dict[self.model.next_vector_in] = next_info.vector_observations
+            feed_dict[self.policy.model.action_holder] = mini_batch["actions"]
         unscaled_reward = self.policy.sess.run(
             self.model.intrinsic_reward, feed_dict=feed_dict
         )
@@ -110,8 +124,6 @@ def prepare_update(
             policy_model.batch_size: num_sequences,
             policy_model.sequence_length: self.policy.sequence_length,
             policy_model.mask_input: mini_batch["masks"],
-            policy_model.advantage: mini_batch["advantages"],
-            policy_model.all_old_log_probs: mini_batch["action_probs"],
         }
         if self.policy.use_continuous_act:
             feed_dict[policy_model.output_pre] = mini_batch["actions_pre"]
@@ -121,12 +133,10 @@ def prepare_update(
             feed_dict[policy_model.vector_in] = mini_batch["vector_obs"]
             feed_dict[self.model.next_vector_in] = mini_batch["next_vector_in"]
         if policy_model.vis_obs_size > 0:
-            for i, _ in enumerate(policy_model.visual_in):
-                feed_dict[policy_model.visual_in[i]] = mini_batch["visual_obs%d" % i]
-            for i, _ in enumerate(policy_model.visual_in):
-                feed_dict[self.model.next_visual_in[i]] = mini_batch[
-                    "next_visual_obs%d" % i
-                ]
+            for i, vis_in in enumerate(policy_model.visual_in):
+                feed_dict[vis_in] = mini_batch["visual_obs%d" % i]
+            for i, next_vis_in in enumerate(self.model.next_visual_in):
+                feed_dict[next_vis_in] = mini_batch["next_visual_obs%d" % i]
 
         self.has_updated = True
         return feed_dict
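A self-contained toy sketch (stand-in classes, no TensorFlow, not part of the commit) of the delegation pattern introduced above: evaluate() repackages BrainInfo fields into a buffer-style dict and hands it to evaluate_batch(), which in the real signal builds the feed_dict and runs the intrinsic-reward graph.

from typing import Dict, List, NamedTuple
import numpy as np


class RewardSignalResult(NamedTuple):
    scaled_reward: np.ndarray
    unscaled_reward: np.ndarray


class ToyBrainInfo(NamedTuple):  # stand-in for mlagents.envs.brain.BrainInfo
    agents: List[str]
    previous_vector_actions: np.ndarray
    local_done: List[bool]
    vector_observations: np.ndarray


class ToyCuriositySignal:
    strength = 0.01

    def evaluate(self, current_info: ToyBrainInfo, next_info: ToyBrainInfo) -> RewardSignalResult:
        if len(current_info.agents) == 0:
            return RewardSignalResult(np.array([]), np.array([]))
        # Repackage per-step BrainInfo fields into the buffer-style dict format.
        mini_batch: Dict[str, np.ndarray] = {
            "actions": next_info.previous_vector_actions,
            "done": np.reshape(next_info.local_done, [-1, 1]),
            "vector_obs": current_info.vector_observations,
            "next_vector_in": next_info.vector_observations,
        }
        return self.evaluate_batch(mini_batch)

    def evaluate_batch(self, mini_batch: Dict[str, np.ndarray]) -> RewardSignalResult:
        # Placeholder for the TF forward pass that yields the intrinsic reward.
        unscaled = np.zeros(len(mini_batch["actions"]))
        return RewardSignalResult(self.strength * unscaled, unscaled)


info = ToyBrainInfo(["agent-0"], np.zeros((1, 2)), [False], np.zeros((1, 8)))
print(ToyCuriositySignal().evaluate(info, info))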

ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py

Lines changed: 4 additions & 5 deletions
@@ -36,11 +36,6 @@ def check_config(
         param_keys = ["strength", "gamma"]
         super().check_config(config_dict, param_keys)
 
-    def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
-        env_rews = mini_batch["environment_rewards"]
-
-        return RewardSignalResult(self.strength * env_rews, env_rews)
-
     def evaluate(
         self, current_info: BrainInfo, next_info: BrainInfo
     ) -> RewardSignalResult:
@@ -53,3 +48,7 @@ def evaluate(
         unscaled_reward = np.array(next_info.rewards)
         scaled_reward = self.strength * unscaled_reward
         return RewardSignalResult(scaled_reward, unscaled_reward)
+
+    def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
+        env_rews = np.array(mini_batch["environment_rewards"])
+        return RewardSignalResult(self.strength * env_rews, env_rews)
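A quick usage sketch (hypothetical values, not part of the commit) of what the extrinsic evaluate_batch computes: the reward comes straight from the "environment_rewards" column of a sampled buffer minibatch and is scaled by the signal's strength.

import numpy as np

strength = 2.0
mini_batch = {"environment_rewards": np.array([0.0, 1.0, -0.5])}  # as stored in the update buffer

env_rews = np.array(mini_batch["environment_rewards"])
scaled, unscaled = strength * env_rews, env_rews  # what RewardSignalResult would carry
print(scaled, unscaled)  # [ 0.  2. -1.] [ 0.  1. -0.5]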

ml-agents/mlagents/trainers/components/reward_signals/gail/model.py

Lines changed: 2 additions & 0 deletions
@@ -224,6 +224,8 @@ def create_network(self) -> None:
             self.done_policy,
             reuse=True,
         )
+        self.mean_policy_estimate = tf.reduce_mean(self.policy_estimate)
+        self.mean_expert_estimate = tf.reduce_mean(self.expert_estimate)
         self.discriminator_score = tf.reshape(
             self.policy_estimate, [-1], name="gail_reward"
         )
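The discriminator estimates are per-sample tensors; reducing them to their mean gives a single scalar per update that can be logged under the new stats keys. A NumPy analogue of the added tf.reduce_mean calls (values are hypothetical):

import numpy as np

policy_estimate = np.array([[0.71], [0.40], [0.66]])  # hypothetical per-sample discriminator outputs
mean_policy_estimate = float(np.mean(policy_estimate))
print(round(mean_policy_estimate, 2))  # 0.59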

ml-agents/mlagents/trainers/components/reward_signals/gail/signal.py

Lines changed: 38 additions & 19 deletions
@@ -52,8 +52,8 @@ def __init__(
         self.update_dict: Dict[str, tf.Tensor] = {
             "gail_loss": self.model.loss,
             "gail_update_batch": self.model.update_batch,
-            "gail_policy_estimate": self.model.policy_estimate,
-            "gail_expert_estimate": self.model.expert_estimate,
+            "gail_policy_estimate": self.model.mean_policy_estimate,
+            "gail_expert_estimate": self.model.mean_expert_estimate,
         }
         if self.model.use_vail:
             self.update_dict["kl_loss"] = self.model.kl_loss
@@ -62,31 +62,51 @@ def __init__(
             self.update_dict["z_mean_policy"] = self.model.z_mean_policy
             self.update_dict["beta_update"] = self.model.update_beta
 
-        self.stats_name_to_update_name = {"Losses/GAIL Loss": "gail_loss"}
+        self.stats_name_to_update_name = {
+            "Losses/GAIL Loss": "gail_loss",
+            "Policy/GAIL Policy Estimate": "gail_policy_estimate",
+            "Policy/GAIL Expert Estimate": "gail_expert_estimate",
+        }
 
     def evaluate(
         self, current_info: BrainInfo, next_info: BrainInfo
     ) -> RewardSignalResult:
         if len(current_info.agents) == 0:
-            return []
+            return RewardSignalResult([], [])
+        mini_batch: Dict[str, np.array] = {}
+        # Construct the batch
+        mini_batch["actions"] = next_info.previous_vector_actions
+        mini_batch["done"] = np.reshape(next_info.local_done, [-1, 1])
+        for i, obs in enumerate(current_info.visual_observations):
+            mini_batch["visual_obs%d" % i] = obs
+        if self.policy.use_vec_obs:
+            mini_batch["vector_obs"] = current_info.vector_observations
+
+        result = self.evaluate_batch(mini_batch)
+        return result
 
+    def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
         feed_dict: Dict[tf.Tensor, Any] = {
-            self.policy.model.batch_size: len(next_info.vector_observations),
-            self.policy.model.sequence_length: 1,
+            self.policy.model.batch_size: len(mini_batch["actions"]),
+            self.policy.model.sequence_length: self.policy.sequence_length,
         }
         if self.model.use_vail:
             feed_dict[self.model.use_noise] = [0]
 
-        feed_dict = self.policy.fill_eval_dict(feed_dict, brain_info=current_info)
-        feed_dict[self.model.done_policy] = np.reshape(next_info.local_done, [-1, 1])
+        if self.policy.use_vec_obs:
+            feed_dict[self.policy.model.vector_in] = mini_batch["vector_obs"]
+        if self.policy.model.vis_obs_size > 0:
+            for i in range(len(self.policy.model.visual_in)):
+                _obs = mini_batch["visual_obs%d" % i]
+                feed_dict[self.policy.model.visual_in[i]] = _obs
+
         if self.policy.use_continuous_act:
-            feed_dict[
-                self.policy.model.selected_actions
-            ] = next_info.previous_vector_actions
+            feed_dict[self.policy.model.selected_actions] = mini_batch["actions"]
         else:
-            feed_dict[
-                self.policy.model.action_holder
-            ] = next_info.previous_vector_actions
+            feed_dict[self.policy.model.action_holder] = mini_batch["actions"]
+        feed_dict[self.model.done_policy_holder] = np.array(
+            mini_batch["done"]
+        ).flatten()
         unscaled_reward = self.policy.sess.run(
             self.model.intrinsic_reward, feed_dict=feed_dict
         )
@@ -123,11 +143,10 @@ def prepare_update(
         # If num_sequences is less, we need to shorten the input batch.
         for key, element in mini_batch_policy.items():
             mini_batch_policy[key] = element[:max_num_experiences]
-        # Get demo buffer
-        self.demonstration_buffer.update_buffer.shuffle(1)
-        # TODO: Replace with SAC sample method
-        mini_batch_demo = self.demonstration_buffer.update_buffer.make_mini_batch(
-            0, len(mini_batch_policy["actions"])
+
+        # Get batch from demo buffer
+        mini_batch_demo = self.demonstration_buffer.update_buffer.sample_mini_batch(
+            len(mini_batch_policy["actions"]), 1
        )
 
         feed_dict: Dict[tf.Tensor, Any] = {
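A toy sketch (not the actual Buffer implementation) of the behavioural change in prepare_update: rather than shuffling the whole demonstration buffer and slicing off the first N entries, sample_mini_batch is assumed to draw a random minibatch of the requested size (here with sequence length 1).

import numpy as np

def sample_mini_batch_like(data: np.ndarray, batch_size: int) -> np.ndarray:
    # Draw batch_size random rows without replacement, analogous to sampling
    # len(mini_batch_policy["actions"]) demo experiences with sequence length 1.
    idx = np.random.choice(len(data), size=batch_size, replace=False)
    return data[idx]

demo_actions = np.arange(100).reshape(100, 1)
print(sample_mini_batch_like(demo_actions, batch_size=8).shape)  # (8, 1)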

ml-agents/mlagents/trainers/components/reward_signals/reward_signal.py

Lines changed: 15 additions & 0 deletions
@@ -63,6 +63,21 @@ def evaluate(
             np.zeros(len(current_info.agents)),
         )
 
+    def evaluate_batch(self, mini_batch: Dict[str, np.array]) -> RewardSignalResult:
+        """
+        Evaluates the reward for the data present in the Dict mini_batch. Note the distinction between
+        this and evaluate(), which takes in two BrainInfos. This reflects the different data formats
+        (i.e. from the Buffer vs. before being placed into the Buffer). Use this when evaluating a reward
+        function drawn straight from a Buffer.
+        :param mini_batch: A Dict of numpy arrays (the format used by our Buffer)
+            when drawing from the update buffer.
+        :return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator
+        """
+        mini_batch_len = len(next(iter(mini_batch.values())))
+        return RewardSignalResult(
+            self.strength * np.zeros(mini_batch_len), np.zeros(mini_batch_len)
+        )
+
     def prepare_update(
         self,
         policy_model: LearningModel,
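A short, self-contained sketch (not part of the commit) of the base-class default added above: every column in a buffer-style mini_batch shares the same leading length, so the batch size can be read off any one entry and a zero reward of that length is returned; subclasses override this. The dict keys shown are the ones used elsewhere in this commit.

from typing import Dict, NamedTuple
import numpy as np


class RewardSignalResult(NamedTuple):
    scaled_reward: np.ndarray
    unscaled_reward: np.ndarray


def default_evaluate_batch(mini_batch: Dict[str, np.ndarray], strength: float = 1.0) -> RewardSignalResult:
    # Any entry works: all buffer columns share the same leading dimension.
    mini_batch_len = len(next(iter(mini_batch.values())))
    return RewardSignalResult(strength * np.zeros(mini_batch_len), np.zeros(mini_batch_len))


mini_batch = {
    "actions": np.zeros((6, 2)),
    "environment_rewards": np.zeros(6),
    "vector_obs": np.zeros((6, 8)),
}
print(default_evaluate_batch(mini_batch).scaled_reward.shape)  # (6,)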

ml-agents/mlagents/trainers/ppo/trainer.py

Lines changed: 5 additions & 17 deletions
@@ -13,7 +13,7 @@
 from mlagents.trainers.ppo.policy import PPOPolicy
 from mlagents.trainers.ppo.multi_gpu_policy import MultiGpuPPOPolicy, get_devices
 from mlagents.trainers.trainer import UnityTrainerException
-from mlagents.trainers.rl_trainer import RLTrainer
+from mlagents.trainers.rl_trainer import RLTrainer, AllRewardsOutput
 from mlagents.trainers.components.reward_signals import RewardSignalResult
 from mlagents.envs.action_info import ActionInfoOutputs
 
@@ -193,36 +193,24 @@ def add_policy_outputs(
 
     def add_rewards_outputs(
         self,
-        value: Dict[str, Any],
-        rewards_dict: Dict[str, RewardSignalResult],
+        rewards_out: AllRewardsOutput,
+        values: Dict[str, np.ndarray],
         agent_id: str,
         agent_idx: int,
         agent_next_idx: int,
     ) -> None:
         """
         Takes the value output of the last action and store it into the training buffer.
         """
-        for name, reward_result in rewards_dict.items():
+        for name, reward_result in rewards_out.reward_signals.items():
             # 0 because we use the scaled reward to train the agent
             self.training_buffer[agent_id]["{}_rewards".format(name)].append(
                 reward_result.scaled_reward[agent_idx]
             )
             self.training_buffer[agent_id]["{}_value_estimates".format(name)].append(
-                value[name][agent_next_idx][0]
+                values[name][agent_next_idx][0]
             )
 
-    def end_episode(self):
-        """
-        A signal that the Episode has ended. The buffer must be reset.
-        Get only called when the academy resets.
-        """
-        self.training_buffer.reset_local_buffers()
-        for agent_id in self.episode_steps:
-            self.episode_steps[agent_id] = 0
-        for rewards in self.collected_rewards.values():
-            for agent_id in rewards:
-                rewards[agent_id] = 0
-
     def is_ready_update(self):
         """
         Returns whether or not the trainer has enough elements to run update model
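AllRewardsOutput is defined in rl_trainer.py, which is not part of the diffs shown here; from its use above we only know it exposes a reward_signals dict keyed by signal name. A hypothetical sketch of its shape, with the environment-reward field name assumed from the commit message:

from typing import Dict, NamedTuple
import numpy as np


class RewardSignalResult(NamedTuple):
    scaled_reward: np.ndarray
    unscaled_reward: np.ndarray


class AllRewardsOutput(NamedTuple):  # hypothetical reconstruction, not the actual definition
    reward_signals: Dict[str, RewardSignalResult]  # e.g. "extrinsic", "curiosity", "gail"
    environment: np.ndarray  # assumed: the raw, unprocessed environment reward reported separately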
