@@ -8,6 +8,8 @@
 from mlagents.trainers.ppo.models import PPOModel
 from mlagents.trainers.ppo.trainer import PPOTrainer, discount_rewards
 from mlagents.trainers.ppo.policy import PPOPolicy
+from mlagents.trainers.rl_trainer import AllRewardsOutput
+from mlagents.trainers.components.reward_signals import RewardSignalResult
 from mlagents.envs import UnityEnvironment, BrainParameters
 from mlagents.envs.mock_communicator import MockCommunicator
 
@@ -355,5 +357,34 @@ def test_trainer_increment_step():
     assert trainer.step == 10
 
 
+def test_add_rewards_output(dummy_config):
+    brain_params = BrainParameters("test_brain", 1, 1, [], [2], [], 0)
+    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
+    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
+    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0", False)
+    rewardsout = AllRewardsOutput(
+        reward_signals={
+            "extrinsic": RewardSignalResult(
+                scaled_reward=np.array([1.0, 1.0]), unscaled_reward=np.array([1.0, 1.0])
+            )
+        },
+        environment=np.array([1.0, 1.0]),
+    )
+    values = {"extrinsic": np.array([[2.0]])}
+    agent_id = "123"
+    idx = 0
+    # make sure that we're grabbing from the next_idx for rewards. If we're not, the test will fail.
+    next_idx = 1
+    trainer.add_rewards_outputs(
+        rewardsout,
+        values=values,
+        agent_id=agent_id,
+        agent_idx=idx,
+        agent_next_idx=next_idx,
+    )
+    assert trainer.training_buffer[agent_id]["extrinsic_value_estimates"][0] == 2.0
+    assert trainer.training_buffer[agent_id]["extrinsic_rewards"][0] == 1.0
+
+
 if __name__ == "__main__":
     pytest.main()
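
For readers skimming the diff, here is a minimal, self-contained sketch of the indexing behaviour the new test pins down. The stand-in namedtuples, the add_rewards_outputs_sketch helper, and the defaultdict buffer are illustrative assumptions, not the trainer's actual implementation; the real PPOTrainer.add_rewards_outputs writes into training_buffer[agent_id] internally. Whether the scaled or unscaled reward ends up in the buffer is not distinguished by this test input, since both arrays are identical, so the sketch arbitrarily uses scaled_reward.

from collections import defaultdict, namedtuple

import numpy as np

# Lightweight stand-ins for RewardSignalResult / AllRewardsOutput so the sketch
# runs without ml-agents installed (the fields match how the test uses them).
RewardSignalResult = namedtuple("RewardSignalResult", ["scaled_reward", "unscaled_reward"])
AllRewardsOutput = namedtuple("AllRewardsOutput", ["reward_signals", "environment"])


def add_rewards_outputs_sketch(agent_buffer, rewards_out, values, agent_idx, agent_next_idx):
    # agent_buffer is assumed to map string keys to lists, standing in for
    # training_buffer[agent_id] in the real trainer.
    for name, result in rewards_out.reward_signals.items():
        # Value estimates are read at the current slot (agent_idx) ...
        agent_buffer[name + "_value_estimates"].append(values[name][agent_idx][0])
        # ... while rewards are read at the next slot (agent_next_idx), which is
        # exactly the indexing the new assertions check.
        agent_buffer[name + "_rewards"].append(result.scaled_reward[agent_next_idx])


# Mirroring the test's inputs and assertions:
buffer = defaultdict(list)
rewards_out = AllRewardsOutput(
    reward_signals={
        "extrinsic": RewardSignalResult(
            scaled_reward=np.array([1.0, 1.0]), unscaled_reward=np.array([1.0, 1.0])
        )
    },
    environment=np.array([1.0, 1.0]),
)
add_rewards_outputs_sketch(buffer, rewards_out, {"extrinsic": np.array([[2.0]])}, 0, 1)
assert buffer["extrinsic_value_estimates"][0] == 2.0
assert buffer["extrinsic_rewards"][0] == 1.0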