Lbtq main #1311
File 1 (DDPG algorithm):

@@ -42,7 +42,7 @@
 DdpgInfo = namedtuple(
     "DdpgInfo", [
         "reward", "step_type", "discount", "action", "action_distribution",
-        "actor_loss", "critic", "discounted_return"
+        "actor_loss", "critic", "discounted_return", "future_distance", "her"
     ],
     default_value=())
 DdpgLossInfo = namedtuple('DdpgLossInfo', ('actor', 'critic'))
@@ -237,10 +237,12 @@ def _sample(a, ou):
             noisy_action, self._action_spec)
         state = empty_state._replace(
             actor=DdpgActorState(actor=state, critics=()))
+        # action_distribution is not supported for continuous actions for now.
+        # Returns empty action_distribution to fail early.
         return AlgStep(
             output=noisy_action,
             state=state,
-            info=DdpgInfo(action=noisy_action, action_distribution=action))
+            info=DdpgInfo(action=noisy_action, action_distribution=()))
Collaborator: Why do you need this change? By default we could think of a deterministic action distribution as an action tensor.

Contributor (Author): This is to fail early and clearly. `action` is a tensor, not a distribution; putting it directly there could cause confusion when debugging. See the comment three lines above.

Contributor: It is intended to return

Contributor (Author): Yes, when used with e.g. Retrace, a distribution is needed, but an action is not a distribution.
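A hypothetical sketch (not code from this PR) of the "fail early" argument above: anything downstream that needs a real distribution, e.g. Retrace-style off-policy corrections, hits an explicit error when the field is `()`, instead of quietly carrying an action tensor around under the name `action_distribution`.

```python
import torch.distributions as td

def log_prob_for_retrace(action_distribution, action):
    # Hypothetical downstream consumer. With action_distribution=(), the
    # problem surfaces immediately with a clear message; a raw action tensor
    # stored under that name would only fail later, and less obviously,
    # wherever a real Distribution is expected.
    if not isinstance(action_distribution, td.Distribution):
        raise ValueError(
            "Expected a Distribution for Retrace-style corrections, got "
            f"{type(action_distribution)}; DDPG's policy is deterministic.")
    return action_distribution.log_prob(action)
```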
 
     def rollout_step(self, time_step: TimeStep, state=None):
         if self.need_full_rollout_state():
@@ -330,7 +332,8 @@ def train_step(self, inputs: TimeStep, state: DdpgState,
                 reward=inputs.reward,
                 step_type=inputs.step_type,
                 discount=inputs.discount,
-                action_distribution=policy_step.output,
+                action=policy_step.output,
+                action_distribution=(),
                 critic=critic_info,
                 actor_loss=policy_step.info,
                 discounted_return=rollout_info.discounted_return))
@@ -355,6 +358,19 @@ def calc_loss(self, info: DdpgInfo):
 
         actor_loss = info.actor_loss
 
+        # The current implementation is hacky: Instead of using OneStepTD
+        # and pulling additionally a few timesteps from the future to compute
+        # bootstrap values, we here piggyback on n-step TDLoss, but masking
+        # out losses from the 2nd to n-1-th steps.
+        # If this hacky use pattern is to be used frequently in the future,
+        # we should consider refactoring it.
+        if hasattr(self._critic_losses[0],
+                   "_improve_w_nstep_bootstrap") and \
+                self._critic_losses[0]._improve_w_nstep_bootstrap:
+            # Ignore 2nd - nth step actor losses.
+            actor_loss.loss[1:] = 0
+            actor_loss.extra[1:] = 0
+
         return LossInfo(
             loss=critic_loss + actor_loss.loss,
             priority=priority,
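To make the masking described in the comment above concrete, here is a toy illustration (mine, not from the PR), assuming the per-step losses are laid out as `[T, B]` over a mini-batch of temporally contiguous steps:

```python
import torch

# Toy per-step actor losses for T=4 unrolled steps and B=2 environments.
actor_loss = torch.tensor([[0.5, 0.7],
                           [0.4, 0.6],
                           [0.3, 0.2],
                           [0.1, 0.9]])

# Zeroing steps 1..T-1 keeps only the first step's contribution, so the
# actor update behaves like a one-step update, while the critic target
# computed by the n-step TDLoss still bootstraps from n steps ahead.
actor_loss[1:] = 0
# actor_loss is now
# [[0.5, 0.7],
#  [0.0, 0.0],
#  [0.0, 0.0],
#  [0.0, 0.0]]
```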
File 2 (OneStepTDLoss / TD loss):
@@ -16,12 +16,12 @@
 from typing import Union, List, Callable
 
 import alf
-from alf.algorithms.td_loss import TDLoss, TDQRLoss
+from alf.algorithms.td_loss import LowerBoundedTDLoss, TDQRLoss
 from alf.utils import losses
 
 
 @alf.configurable
-class OneStepTDLoss(TDLoss):
+class OneStepTDLoss(LowerBoundedTDLoss):
Collaborator: Conceptually, OneStepTDLoss is not a child (special case) of LowerBoundedTDLoss. To me, LowerBoundedTDLoss is far more specialized and should be either a completely new class or a child of OneStepTDLoss.

Contributor (Author): LowerBoundedTDLoss defaults to TDLoss behavior and can be configured to enable lower bounding, so it is more general than TDLoss. Maybe I should name it something else?
 
     def __init__(self,
                  gamma: Union[float, List[float]] = 0.99,
                  td_error_loss_fn: Callable = losses.element_wise_squared_loss,
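For context on the thread above, here is my reading of the lower-bounding idea (a sketch under assumptions, not the actual `LowerBoundedTDLoss` implementation; the helper name and signature are hypothetical): the one-step TD target can be lower-bounded by the n-step bootstrapped return computed from the same sampled sub-trajectory.

```python
import torch

def lower_bounded_td_target(rewards, gamma, v_next, v_nstep):
    """Illustrative sketch only.

    rewards: tensor [n] with rewards r_0..r_{n-1} along the sampled steps
    v_next:  bootstrap value one step ahead, V(s_1)
    v_nstep: bootstrap value n steps ahead, V(s_n)
    """
    n = rewards.shape[0]
    one_step = rewards[0] + gamma * v_next
    disc = gamma ** torch.arange(n, dtype=rewards.dtype)
    n_step = torch.sum(disc * rewards) + gamma ** n * v_nstep
    # Take the larger of the two, i.e. use the n-step bootstrapped return
    # as a lower bound on the one-step target.
    return torch.max(one_step, n_step)
```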
File 3 (SAC algorithm):
@@ -34,6 +34,7 @@
 import alf.nest.utils as nest_utils
 from alf.networks import ActorDistributionNetwork, CriticNetwork
 from alf.networks import QNetwork, QRNNNetwork
+from alf.summary import render
 from alf.tensor_specs import TensorSpec, BoundedTensorSpec
 from alf.utils import losses, common, dist_utils, math_ops
 from alf.utils.normalizers import ScalarAdaptiveNormalizer
@@ -56,7 +57,8 @@
 SacInfo = namedtuple(
     "SacInfo", [
         "reward", "step_type", "discount", "action", "action_distribution",
-        "actor", "critic", "alpha", "log_pi", "discounted_return"
+        "actor", "critic", "alpha", "log_pi", "discounted_return",
+        "future_distance", "her"
     ],
     default_value=())
@@ -152,6 +154,9 @@ def __init__(self,
                  q_network_cls=QNetwork,
                  reward_weights=None,
                  epsilon_greedy=None,
+                 rollout_epsilon_greedy=1.0,
+                 use_epsilon_schedule=0,
+                 max_target_action=False,
                  use_entropy_reward=True,
                  normalize_entropy_reward=False,
                  calculate_priority=False,
@@ -203,6 +208,14 @@ def __init__(self,
                 Breakout. Only used for evaluation. If None, its value is taken
                 from ``config.epsilon_greedy`` and then
                 ``alf.get_config_value(TrainerConfig.epsilon_greedy)``.
+            rollout_epsilon_greedy (float): epsilon greedy policy for rollout.
+                Together with the following three parameters, the Sac algorithm
+                can be converted to a DQN algorithm.
+            use_epsilon_schedule (float): training schedule for
+                rollout_epsilon_greedy.
+            max_target_action (bool): whether to use the action with the highest
+                target value as the target action for computing bootstrapped value.
+                To mimic the DQN algorithm, set this to True.
             use_entropy_reward (bool): whether to include entropy as reward
             normalize_entropy_reward (bool): if True, normalize entropy reward
                 to reduce bias in episodic cases. Only used if
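The docstring above describes turning SAC into a DQN-style learner. A hypothetical configuration sketch (assuming ALF's `alf.config` API; the particular values are illustrative, not from the PR):

```python
import alf
from alf.algorithms.sac_algorithm import SacAlgorithm

alf.config(
    'SacAlgorithm',
    rollout_epsilon_greedy=0.1,  # final exploration rate for rollouts
    use_epsilon_schedule=0.1,    # anneal eps from 1.0 to 0.1 over the first 10% of training
    max_target_action=True,      # DQN-style max over target Q-values
    use_entropy_reward=False)    # dropping the entropy bonus brings it closer to DQN
```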
@@ -261,6 +274,9 @@ def __init__(self,
         if epsilon_greedy is None:
             epsilon_greedy = alf.utils.common.get_epsilon_greedy(config)
         self._epsilon_greedy = epsilon_greedy
+        self._rollout_epsilon_greedy = rollout_epsilon_greedy
+        self._use_epsilon_schedule = use_epsilon_schedule
+        self._max_target_action = max_target_action
 
         critic_networks, actor_network, self._act_type = self._make_networks(
             observation_spec, action_spec, reward_spec, actor_network_cls,
@@ -274,7 +290,10 @@ def __init__(self,
         )
 
         def _init_log_alpha():
-            return nn.Parameter(torch.tensor(float(initial_log_alpha)))
+            alpha = torch.tensor(float(initial_log_alpha))
+            if alpha_optimizer is None:
+                return alpha
+            return nn.Parameter(alpha)
 
         if self._act_type == ActionType.Mixed:
             # separate alphas for discrete and continuous actions
@@ -314,7 +333,7 @@ def _init_log_alpha():
             self.add_optimizer(alpha_optimizer, nest.flatten(log_alpha))
 
         self._log_alpha = log_alpha
-        if self._act_type == ActionType.Mixed:
+        if self._act_type == ActionType.Mixed and alpha_optimizer is not None:
             self._log_alpha_paralist = nn.ParameterList(
                 nest.flatten(log_alpha))
@@ -376,6 +395,8 @@ def _init_log_alpha():
             target_models=[self._target_critic_networks],
             tau=target_update_tau,
             period=target_update_period)
+        # initial q value range for rendering; adjusted as playing progresses
+        self._q_range = (0, 3)
 
     def _make_networks(self, observation_spec, action_spec, reward_spec,
                        continuous_actor_network_cls, critic_network_cls,
@@ -531,26 +552,52 @@ def _predict_action(self,
         return action_dist, action, q_values, new_state
 
     def predict_step(self, inputs: TimeStep, state: SacState):
-        action_dist, action, _, action_state = self._predict_action(
+        action_dist, action, q_values, action_state = self._predict_action(
             inputs.observation,
             state=state.action,
             epsilon_greedy=self._epsilon_greedy,
             eps_greedy_sampling=True)
+        info = SacInfo(action_distribution=action_dist)
+        if (alf.summary.render.is_rendering_enabled()
+                and self._act_type == ActionType.Discrete):
+            num_acts = q_values.shape[-1]
+            self._q_range = (min(self._q_range[0], int(q_values.min())),
+                             max(self._q_range[1],
+                                 int(q_values.max()) + 1))
+            info = dict(
+                sac=info,
+                action_img=render.render_action("action", action,
+                                                self._action_spec),
+                action_dist_img=render.render_bar(
+                    "action_dist",
+                    action_dist.probs,
+                    y_range=(0, 1),
+                    annotate_format="%.2f",
+                    x_ticks=range(num_acts)),
+                q_img=render.render_bar(
+                    "q_values",
+                    q_values,
+                    y_range=self._q_range,
+                    annotate_format="%.2f",
+                    x_ticks=range(num_acts)))
         return AlgStep(
-            output=action,
-            state=SacState(action=action_state),
-            info=SacInfo(action_distribution=action_dist))
+            output=action, state=SacState(action=action_state), info=info)
 
     def rollout_step(self, inputs: TimeStep, state: SacState):
         """``rollout_step()`` basically predicts actions like what is done by
         ``predict_step()``. Additionally, if states are to be stored a in replay
         buffer, then this function also call ``_critic_networks`` and
         ``_target_critic_networks`` to maintain their states.
         """
+        eps = self._rollout_epsilon_greedy
+        if self._use_epsilon_schedule > 0:
+            progress = alf.trainers.policy_trainer.Trainer.progress()
+            if progress < self._use_epsilon_schedule:
+                eps = 1.0 - (1.0 - eps) * progress / self._use_epsilon_schedule
         action_dist, action, _, action_state = self._predict_action(
             inputs.observation,
             state=state.action,
-            epsilon_greedy=1.0,
+            epsilon_greedy=eps,
             eps_greedy_sampling=True,
             rollout=True)
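As the code above shows, `use_epsilon_schedule` is the fraction of total training progress over which the rollout epsilon is annealed linearly from 1.0 down to `rollout_epsilon_greedy`; after that it stays constant. A small standalone sketch (the helper name and default values are mine, not from the PR):

```python
def rollout_epsilon(progress, rollout_epsilon_greedy=0.1, use_epsilon_schedule=0.2):
    """progress is the training progress in [0, 1]."""
    eps = rollout_epsilon_greedy
    if use_epsilon_schedule > 0 and progress < use_epsilon_schedule:
        # Linear interpolation from 1.0 (at progress=0) down to
        # rollout_epsilon_greedy (at progress=use_epsilon_schedule).
        eps = 1.0 - (1.0 - eps) * progress / use_epsilon_schedule
    return eps

# With the defaults above:
#   rollout_epsilon(0.0) -> 1.0
#   rollout_epsilon(0.1) -> 0.55
#   rollout_epsilon(0.2) -> 0.1   (schedule finished)
#   rollout_epsilon(0.9) -> 0.1
```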
@@ -717,7 +764,10 @@ def _critic_train_step(self, inputs: TimeStep, state: SacCriticState,
             probs = common.expand_dims_as(action_distribution.probs,
                                           target_critics)
             # [B, reward_dim]
-            target_critics = torch.sum(probs * target_critics, dim=1)
+            if self._max_target_action:
+                target_critics = torch.max(target_critics, dim=1)[0]
+            else:
+                target_critics = torch.sum(probs * target_critics, dim=1)
         elif self._act_type == ActionType.Mixed:
             critics = self._select_q_value(rollout_info.action[0], critics)
             discrete_act_dist = action_distribution[0]
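A small numeric illustration (mine, not from the PR) of the two bootstrap targets that `max_target_action` switches between in the discrete-action case:

```python
import torch

# Target Q-values for 3 discrete actions, batch size 1 (made-up numbers).
target_q = torch.tensor([[1.0, 3.0, 2.0]])
probs = torch.tensor([[0.2, 0.5, 0.3]])  # current policy's action probabilities

soft_target = torch.sum(probs * target_q, dim=1)  # SAC-style expectation: 2.3
dqn_target = torch.max(target_q, dim=1)[0]        # max_target_action=True: 3.0
```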
@@ -797,6 +847,20 @@ def calc_loss(self, info: SacInfo):
         alpha_loss = info.alpha
         actor_loss = info.actor
 
+        # The current implementation is hacky: Instead of using OneStepTD
+        # and pulling additionally a few timesteps from the future to compute
+        # bootstrap values, we here piggyback on n-step TDLoss, but masking
+        # out losses from the 2nd to n-1-th steps.
+        # If this hacky use pattern is to be used frequently in the future,
+        # we should consider refactoring it.
+        if hasattr(self._critic_losses[0],
+                   "_improve_w_nstep_bootstrap") and \
+                self._critic_losses[0]._improve_w_nstep_bootstrap:
+            # Ignore 2nd - n-th step losses in this mode.
+            alpha_loss[1:] = 0
+            if actor_loss.loss != ():
+                actor_loss.loss[1:] = 0
+
         if self._debug_summaries and alf.summary.should_record_summaries():
             with alf.summary.scope(self._name):
                 if self._act_type == ActionType.Mixed:
@@ -850,6 +914,11 @@ def _calc_critic_loss(self, info: SacInfo):
         if self._use_entropy_reward:
             with torch.no_grad():
                 log_pi = info.log_pi
+                if hasattr(self._critic_losses[0],
+                           "_improve_w_nstep_bootstrap") and \
+                        self._critic_losses[0]._improve_w_nstep_bootstrap:
+                    # Ignore 2nd - n-th step entropy in this mode.
+                    log_pi[1:] = 0
                 if self._entropy_normalizer is not None:
                     log_pi = self._entropy_normalizer.normalize(log_pi)
                 entropy_reward = nest.map_structure(
Review comment: Better not to put DDPG-irrelevant concepts into this Info structure. HER and DDPG are two orthogonal concepts and should be disentangled in the code; otherwise, when we have a third algorithm in the future, more fields will keep being added to this Info structure.
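One possible direction along the lines of this comment (a sketch only; `HerInfo`, the `her_info` field, and the nesting scheme are my assumptions, not code from the PR) is to keep the HER-specific fields in their own structure, so that each algorithm's Info needs at most one generic slot:

```python
# Assumes ALF's namedtuple helper with `default_value`, as used in the diff.
from alf.data_structures import namedtuple

# HER-specific fields live in their own structure ...
HerInfo = namedtuple("HerInfo", ["future_distance", "her"], default_value=())

# ... and the algorithm Info stays algorithm-specific, with one generic slot,
# so a third algorithm would not need its own copies of the HER fields.
DdpgInfo = namedtuple(
    "DdpgInfo", [
        "reward", "step_type", "discount", "action", "action_distribution",
        "actor_loss", "critic", "discounted_return", "her_info"
    ],
    default_value=())

info = DdpgInfo(her_info=HerInfo(future_distance=1.5, her=True))
```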