
Commit 4261e24

Author: Ervin T
Fix bug in add_rewards_output and add test (#2442)
1 parent 27cd6dd commit 4261e24

File tree

2 files changed: +33 -2 lines changed

ml-agents/mlagents/trainers/ppo/trainer.py
ml-agents/mlagents/trainers/tests/test_ppo.py


ml-agents/mlagents/trainers/ppo/trainer.py

Lines changed: 2 additions & 2 deletions
@@ -205,10 +205,10 @@ def add_rewards_outputs(
         for name, reward_result in rewards_out.reward_signals.items():
             # 0 because we use the scaled reward to train the agent
             self.training_buffer[agent_id]["{}_rewards".format(name)].append(
-                reward_result.scaled_reward[agent_idx]
+                reward_result.scaled_reward[agent_next_idx]
             )
             self.training_buffer[agent_id]["{}_value_estimates".format(name)].append(
-                values[name][agent_next_idx][0]
+                values[name][agent_idx][0]
             )
 
     def is_ready_update(self):
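
The fix is a straight swap of the two indices: the scaled reward appended to the buffer is now read at agent_next_idx, and the value estimate at agent_idx (presumably because the reward arrives with the next step, while the value estimate belongs to the state the agent acted from). A minimal sketch of that convention, using a plain dict in place of the trainer's training buffer (the names below are illustrative, not ML-Agents API):

import numpy as np
from collections import defaultdict

# Stand-ins for one reward signal's output and the trainer's training buffer.
training_buffer = defaultdict(list)
scaled_reward = np.array([1.0, 5.0])                        # reward recorded at each buffered step
value_estimates = {"extrinsic": np.array([[2.0], [3.0]])}   # value estimate for each buffered state

agent_idx, agent_next_idx = 0, 1

# Reward observed after acting: taken from the next step's slot.
training_buffer["extrinsic_rewards"].append(scaled_reward[agent_next_idx])
# Value estimate of the state acted from: taken from the current step's slot.
training_buffer["extrinsic_value_estimates"].append(value_estimates["extrinsic"][agent_idx][0])

assert training_buffer["extrinsic_rewards"][0] == 5.0
assert training_buffer["extrinsic_value_estimates"][0] == 2.0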

ml-agents/mlagents/trainers/tests/test_ppo.py

Lines changed: 31 additions & 0 deletions
@@ -8,6 +8,8 @@
 from mlagents.trainers.ppo.models import PPOModel
 from mlagents.trainers.ppo.trainer import PPOTrainer, discount_rewards
 from mlagents.trainers.ppo.policy import PPOPolicy
+from mlagents.trainers.rl_trainer import AllRewardsOutput
+from mlagents.trainers.components.reward_signals import RewardSignalResult
 from mlagents.envs import UnityEnvironment, BrainParameters
 from mlagents.envs.mock_communicator import MockCommunicator
 
@@ -355,5 +357,34 @@ def test_trainer_increment_step():
     assert trainer.step == 10
 
 
+def test_add_rewards_output(dummy_config):
+    brain_params = BrainParameters("test_brain", 1, 1, [], [2], [], 0)
+    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
+    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
+    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0", False)
+    rewardsout = AllRewardsOutput(
+        reward_signals={
+            "extrinsic": RewardSignalResult(
+                scaled_reward=np.array([1.0, 1.0]), unscaled_reward=np.array([1.0, 1.0])
+            )
+        },
+        environment=np.array([1.0, 1.0]),
+    )
+    values = {"extrinsic": np.array([[2.0]])}
+    agent_id = "123"
+    idx = 0
+    # make sure that we're grabbing from the next_idx for rewards. If we're not, the test will fail.
+    next_idx = 1
+    trainer.add_rewards_outputs(
+        rewardsout,
+        values=values,
+        agent_id=agent_id,
+        agent_idx=idx,
+        agent_next_idx=next_idx,
+    )
+    assert trainer.training_buffer[agent_id]["extrinsic_value_estimates"][0] == 2.0
+    assert trainer.training_buffer[agent_id]["extrinsic_rewards"][0] == 1.0
+
+
 if __name__ == "__main__":
     pytest.main()
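
Assuming a standard ml-agents checkout with the test dependencies installed (and that dummy_config is the existing fixture the other tests in this file already use), the new test can be run on its own with something like:

pytest ml-agents/mlagents/trainers/tests/test_ppo.py -k test_add_rewards_output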
