 import copy
 import random
+from typing import Any

 import numpy as np
 import torch
@@ -408,3 +409,92 @@ def fusion_reward(
                     int(data[i]['beta'][j])]

         return data, estimate_cnt
+
+
+@REWARD_MODEL_REGISTRY.register('ngu-reward')
+class NGURewardModel(BaseRewardModel):
+    r"""
+    Overview:
+        The unified NGU reward model, which combines the RND-NGU and episodic reward models.
+        The corresponding paper is `Never Give Up: Learning Directed Exploration Strategies`.
| 420 | + """ |
| 421 | + config = dict( |
| 422 | + type='ngu-reward', |
| 423 | + policy_nstep=5, |
| 424 | + collect_env_num=8, |
| 425 | + rnd_reward_model=dict( |
| 426 | + intrinsic_reward_type='add', |
| 427 | + learning_rate=5e-4, |
| 428 | + obs_shape=4, |
| 429 | + action_shape=2, |
| 430 | + batch_size=128, # transitions |
| 431 | + update_per_collect=10, |
| 432 | + only_use_last_five_frames_for_icm_rnd=False, |
| 433 | + clear_buffer_per_iters=10, |
| 434 | + nstep=5, |
| 435 | + hidden_size_list=[128, 128, 64], |
| 436 | + type='rnd-ngu', |
| 437 | + ), |
| 438 | + episodic_reward_model=dict( |
| 439 | + last_nonzero_reward_rescale=False, |
| 440 | + last_nonzero_reward_weight=1, |
| 441 | + intrinsic_reward_type='add', |
| 442 | + learning_rate=5e-4, |
| 443 | + obs_shape=4, |
| 444 | + action_shape=2, |
| 445 | + batch_size=128, # transitions |
| 446 | + update_per_collect=10, |
| 447 | + only_use_last_five_frames_for_icm_rnd=False, |
| 448 | + clear_buffer_per_iters=10, |
| 449 | + nstep=5, |
| 450 | + hidden_size_list=[128, 128, 64], |
| 451 | + type='episodic', |
| 452 | + ), |
| 453 | + ) |
| 454 | + |
+    def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> None:
+        super().__init__()
+        self.cfg = config
+        self.tb_logger = tb_logger
+        self.estimate_cnt = 0
+        self.rnd_reward_model = RndNGURewardModel(config.rnd_reward_model, device, tb_logger)
+        self.episodic_reward_model = EpisodicNGURewardModel(config.episodic_reward_model, device, tb_logger)
+
+    def train(self) -> None:
+        self.rnd_reward_model.train()
+        self.episodic_reward_model.train()
+
+    def estimate(self, data: list) -> list:
+
+        # estimate the episodic and RND (lifelong) intrinsic rewards separately
+        rnd_reward = self.rnd_reward_model.estimate(data)
+        episodic_reward = self.episodic_reward_model.estimate(data)
+
+        # combine the two intrinsic rewards and fold the result into the training data
+        train_data_augmented, self.estimate_cnt = self.episodic_reward_model.fusion_reward(
+            data,
+            episodic_reward,
+            rnd_reward,
+            nstep=self.cfg.policy_nstep,
+            collector_env_num=self.cfg.collect_env_num,
+            tb_logger=self.tb_logger,
+            estimate_cnt=self.estimate_cnt
+        )
+
+        return train_data_augmented
+
+    def collect_data(self, data: list) -> None:
+        self.rnd_reward_model.collect_data(data)
+        self.episodic_reward_model.collect_data(data)
+
+    def clear_data(self, iter: int) -> None:
+        assert hasattr(
+            self.cfg.rnd_reward_model, 'clear_buffer_per_iters'
+        ), "The RND reward model config has no `clear_buffer_per_iters`; clearing failed."
+        assert hasattr(
+            self.cfg.episodic_reward_model, 'clear_buffer_per_iters'
+        ), "The episodic reward model config has no `clear_buffer_per_iters`; clearing failed."
+        if iter % self.cfg.rnd_reward_model.clear_buffer_per_iters == 0:
+            self.rnd_reward_model.clear_data()
+        if iter % self.cfg.episodic_reward_model.clear_buffer_per_iters == 0:
+            self.episodic_reward_model.clear_data()
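
A minimal usage sketch for the new class (not part of the commit): it wraps the default `config` above in an `EasyDict` and runs one collect/train/estimate/clear cycle. `train_data` (a list of transition dicts from the collector), `train_iter`, the log directory, and the CPU device are assumptions for illustration.

# Sketch only; assumes `train_data` and `train_iter` come from the surrounding training loop.
from easydict import EasyDict
from torch.utils.tensorboard import SummaryWriter

cfg = EasyDict(NGURewardModel.config)
reward_model = NGURewardModel(cfg, device='cpu', tb_logger=SummaryWriter('./reward_log'))

reward_model.collect_data(train_data)            # feed collected transitions to both sub-models
reward_model.train()                             # update the RND and episodic networks
train_data = reward_model.estimate(train_data)   # attach the fused intrinsic reward to each transition
reward_model.clear_data(iter=train_iter)         # buffers are cleared every `clear_buffer_per_iters`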