222 changes: 176 additions & 46 deletions alf/algorithms/data_transformer.py

Large diffs are not rendered by default.

22 changes: 19 additions & 3 deletions alf/algorithms/ddpg_algorithm.py
@@ -42,7 +42,7 @@
DdpgInfo = namedtuple(
"DdpgInfo", [
"reward", "step_type", "discount", "action", "action_distribution",
"actor_loss", "critic", "discounted_return"
"actor_loss", "critic", "discounted_return", "future_distance", "her"
Collaborator:
Better not to put DDPG-irrelevant concepts into this Info structure. HER and DDPG are two orthogonal concepts and should be disentangled in the code. Otherwise, when we have a third algorithm in the future, this Info structure will keep accumulating more fields.

],
default_value=())
DdpgLossInfo = namedtuple('DdpgLossInfo', ('actor', 'critic'))
@@ -237,10 +237,12 @@ def _sample(a, ou):
noisy_action, self._action_spec)
state = empty_state._replace(
actor=DdpgActorState(actor=state, critics=()))
# action_distribution is not supported for continuous actions for now.
# Return an empty action_distribution to fail early.
return AlgStep(
output=noisy_action,
state=state,
info=DdpgInfo(action=noisy_action, action_distribution=action))
info=DdpgInfo(action=noisy_action, action_distribution=()))
Collaborator:
Why do you need this change? By default we could think of a deterministic action distribution as an action tensor.

Contributor Author:
This is to fail early and clearly. action is a tensor, not a distribution; putting it directly there could cause confusion when debugging. See the comment three lines above.

Contributor:
It is intended to return action_distribution here so that some other algorithm can use it (e.g. TracAlgorithm). Do you find it causing a problem?

Contributor Author:
Yes, when used with e.g. Retrace, a distribution is needed, but action is not a distribution.


def rollout_step(self, time_step: TimeStep, state=None):
if self.need_full_rollout_state():
@@ -330,7 +332,8 @@ def train_step(self, inputs: TimeStep, state: DdpgState,
reward=inputs.reward,
step_type=inputs.step_type,
discount=inputs.discount,
action_distribution=policy_step.output,
action=policy_step.output,
action_distribution=(),
critic=critic_info,
actor_loss=policy_step.info,
discounted_return=rollout_info.discounted_return))
@@ -355,6 +358,19 @@ def calc_loss(self, info: DdpgInfo):

actor_loss = info.actor_loss

# The current implementation is hacky: instead of using OneStepTD
# and pulling a few additional timesteps from the future to compute
# bootstrap values, we piggyback on the n-step TDLoss and mask out
# the losses from the 2nd to the n-th step.
# If this hacky usage pattern becomes common in the future,
# we should consider refactoring it.
if hasattr(self._critic_losses[0],
"_improve_w_nstep_bootstrap") and \
self._critic_losses[0]._improve_w_nstep_bootstrap:
# Ignore 2nd - nth step actor losses.
actor_loss.loss[1:] = 0
Contributor:
Why do you need to ignore the actor loss for those steps? It seems to me it doesn't hurt to keep them.

Contributor Author:
Good point. Since we compare with n-step return methods anyway, it may be fairer to keep the other steps' action and alpha losses.

actor_loss.extra[1:] = 0

return LossInfo(
loss=critic_loss + actor_loss.loss,
priority=priority,
4 changes: 2 additions & 2 deletions alf/algorithms/one_step_loss.py
@@ -16,12 +16,12 @@
from typing import Union, List, Callable

import alf
from alf.algorithms.td_loss import TDLoss, TDQRLoss
from alf.algorithms.td_loss import LowerBoundedTDLoss, TDQRLoss
from alf.utils import losses


@alf.configurable
class OneStepTDLoss(TDLoss):
class OneStepTDLoss(LowerBoundedTDLoss):
Collaborator:
Conceptually OneStepTDLoss is not a child (special case) of LowerBoundedTDLoss. For me, LowerBoundedTDLoss is far more special and should be a completely new class, or a child of OneStepTDLoss.

Contributor Author:
LowerBoundedTDLoss defaults to plain TDLoss behavior and can be configured to enable lower bounding, so it's more general than TDLoss. Maybe I should name it something else?
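To make the point about defaulting concrete, here is a purely illustrative sketch (not code from this PR; all names below are hypothetical) of a TD target that behaves like plain TDLoss unless lower bounding is explicitly enabled:

```python
import torch


def td_target(reward, discount, next_value, observed_return=None,
              lower_bound_with_return=False):
    """Bootstrapped TD target, optionally lower-bounded by the observed return."""
    target = reward + discount * next_value
    if lower_bound_with_return and observed_return is not None:
        # Lower bounding: the target never falls below the return actually
        # observed along the trajectory.
        target = torch.maximum(target, observed_return)
    return target
```

With the flag off, this is exactly the ordinary TD target, which is the sense in which the new class generalizes TDLoss rather than specializing it.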

def __init__(self,
gamma: Union[float, List[float]] = 0.99,
td_error_loss_fn: Callable = losses.element_wise_squared_loss,
87 changes: 78 additions & 9 deletions alf/algorithms/sac_algorithm.py
@@ -34,6 +34,7 @@
import alf.nest.utils as nest_utils
from alf.networks import ActorDistributionNetwork, CriticNetwork
from alf.networks import QNetwork, QRNNNetwork
from alf.summary import render
from alf.tensor_specs import TensorSpec, BoundedTensorSpec
from alf.utils import losses, common, dist_utils, math_ops
from alf.utils.normalizers import ScalarAdaptiveNormalizer
@@ -56,7 +57,8 @@
SacInfo = namedtuple(
"SacInfo", [
"reward", "step_type", "discount", "action", "action_distribution",
"actor", "critic", "alpha", "log_pi", "discounted_return"
"actor", "critic", "alpha", "log_pi", "discounted_return",
"future_distance", "her"
Collaborator:
"future_distance" and "her" are not essential to SAC and thus they shouldn't appear here.

Contributor Author:
Where do you think is a good place? I could group discounted_return, her, future_distance into a batch_info field. Or should it be somewhere else?

Collaborator:
> Where do you think is a good place? I could group discounted_return, her, future_distance into a batch_info field. Or should it be somewhere else?

One ideal treatment is to have a wrapper algorithm maybe called SACHERAlgorithm that wraps an SAC and uses HER. Its info will contain fields ['sac', 'her']. Actually it might be possible to write a general algorithm wrapper/container that can wrap any off-policy algorithm with HER.

Contributor Author:
Good point. For now, I added a comment that these three fields are optional, and a TODO for future work to add the HER algorithm wrapper.
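For reference, a rough sketch of the wrapper idea discussed above, not code from this PR: the class, the abbreviated train_step signature, and the field contents are hypothetical; only the ['sac', 'her'] split comes from the comment, and the namedtuple helper is assumed to be the ALF one (with default_value) used for SacInfo above.

```python
from alf.data_structures import namedtuple

# HER-specific fields live in their own structure instead of being appended
# to SacInfo/DdpgInfo.
SacHerInfo = namedtuple("SacHerInfo", ["sac", "her"], default_value=())


class SacHerAlgorithmSketch(object):
    """Wraps an off-policy algorithm and keeps HER bookkeeping separate."""

    def __init__(self, wrapped_algorithm):
        self._alg = wrapped_algorithm

    def train_step(self, inputs, state, rollout_info):
        # Delegate to the wrapped algorithm with its own info, then repack the
        # HER fields (e.g. future_distance, her mask) alongside it.
        alg_step = self._alg.train_step(inputs, state, rollout_info.sac)
        return alg_step._replace(
            info=SacHerInfo(sac=alg_step.info, her=rollout_info.her))
```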

],
default_value=())

@@ -152,6 +154,9 @@ def __init__(self,
q_network_cls=QNetwork,
reward_weights=None,
epsilon_greedy=None,
rollout_epsilon_greedy=1.0,
use_epsilon_schedule=0,
Contributor:
It's better to change epsilon_greedy to accept both a float and a Scheduler. For a float, you can use schedulers.as_scheduler to convert it to a scheduler, similar to TrainerConfig.priority_replay_alpha.

Contributor Author:
Nice!
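A minimal sketch of the suggestion, not the PR's code: accept either a float or a Scheduler and normalize it once. It assumes as_scheduler lives in alf.utils.schedulers, wraps a plain float into a constant scheduler, and that schedulers are callables returning the current value (following the TrainerConfig.priority_replay_alpha precedent cited above); the class and method names are illustrative.

```python
from alf.utils import schedulers


class _RolloutEpsilonExample(object):
    def __init__(self, rollout_epsilon_greedy=1.0):
        # Accept either a plain float or a Scheduler and normalize it here,
        # instead of hand-rolling a linear decay with use_epsilon_schedule.
        self._rollout_epsilon_greedy = schedulers.as_scheduler(
            rollout_epsilon_greedy)

    def current_epsilon(self):
        # Query the scheduled value whenever it is needed during rollout.
        return self._rollout_epsilon_greedy()
```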

max_target_action=False,
use_entropy_reward=True,
normalize_entropy_reward=False,
calculate_priority=False,
@@ -203,6 +208,14 @@ def __init__(self,
Breakout. Only used for evaluation. If None, its value is taken
from ``config.epsilon_greedy`` and then
``alf.get_config_value(TrainerConfig.epsilon_greedy)``.
rollout_epsilon_greedy (float): epsilon-greedy policy for rollout.
Together with the following parameters, the SAC algorithm can be
configured to behave like a DQN algorithm.
use_epsilon_schedule (float): training schedule for
rollout_epsilon_greedy.
max_target_action (bool): whether to use the action with the highest
target value as the target action for computing the bootstrapped
value. To mimic the DQN algorithm, set this to True.
Collaborator:
Feels a little strange to include these arguments here because there is usually no good reason to convert SAC to DQN. Maybe create a subclass of SAC for this purpose?

Contributor Author:
Yes, it does feel hacky. The code is quite intertwined, so a subclass probably wouldn't separate the two classes neatly. Do you prefer to just remove this part for now?

Collaborator:
> Yes, it does feel hacky. The code is quite intertwined, so a subclass probably wouldn't separate the two classes neatly. Do you prefer to just remove this part for now?

I think it's definitely possible, and clearer, to have a subclass. You only need to include these additional arguments and their code in the subclass implementation.

Contributor Author:
Gave it a try; how about this new dqn_algorithm.py?
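For readers of this thread, a sketch of what such a subclass could look like against the PR as diffed here, where the DQN-flavored options are still SacAlgorithm constructor arguments. The class name, the pinned values, and the assumption that observation_spec and action_spec are the leading positional arguments are illustrative; this is not the content of the actual dqn_algorithm.py.

```python
from alf.algorithms.sac_algorithm import SacAlgorithm


class DqnAlgorithmSketch(SacAlgorithm):
    """SAC pinned to DQN-like behavior: greedy targets, epsilon-greedy rollout."""

    def __init__(self, observation_spec, action_spec, **kwargs):
        super().__init__(
            observation_spec,
            action_spec,
            max_target_action=True,      # bootstrap with the argmax-Q action
            rollout_epsilon_greedy=0.1,  # epsilon-greedy exploration
            use_epsilon_schedule=0.5,    # anneal epsilon over the first half of training
            **kwargs)
```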

use_entropy_reward (bool): whether to include entropy as reward
normalize_entropy_reward (bool): if True, normalize entropy reward
to reduce bias in episodic cases. Only used if
@@ -261,6 +274,9 @@ def __init__(self,
if epsilon_greedy is None:
epsilon_greedy = alf.utils.common.get_epsilon_greedy(config)
self._epsilon_greedy = epsilon_greedy
self._rollout_epsilon_greedy = rollout_epsilon_greedy
self._use_epsilon_schedule = use_epsilon_schedule
self._max_target_action = max_target_action

critic_networks, actor_network, self._act_type = self._make_networks(
observation_spec, action_spec, reward_spec, actor_network_cls,
Expand All @@ -274,7 +290,10 @@ def __init__(self,
)

def _init_log_alpha():
return nn.Parameter(torch.tensor(float(initial_log_alpha)))
alpha = torch.tensor(float(initial_log_alpha))
if alpha_optimizer is None:
Contributor:
I think if you want to fix alpha, you can simply set the learning rate for it to zero. ???_optimizer==None usually means using a default optimizer.

Contributor Author:
Oh, I misunderstood the param. lr=0 sounds good. Removed.
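For reference, a tiny pure-PyTorch sketch of the suggestion (the call site is hypothetical, not ALF code): keep log_alpha a trainable Parameter but freeze it by giving its dedicated optimizer a zero learning rate.

```python
import torch

# log_alpha stays a Parameter, so existing code paths are untouched, but the
# zero learning rate means its value is never actually updated.
log_alpha = torch.nn.Parameter(torch.tensor(0.0))
alpha_optimizer = torch.optim.Adam([log_alpha], lr=0.0)
```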

return alpha
return nn.Parameter(alpha)
Collaborator:
If alpha_optimizer is None, can't we still create it as a parameter?

Contributor Author:
Could an overall optimizer accidentally include the alpha parameter and optimize it?

Collaborator:
> Could an overall optimizer accidentally include the alpha parameter and optimize it?

I think the original intent of having alpha_optimizer=None is indeed to let the parent algorithm decide which optimizer to use for alpha, instead of not optimizing alpha at all.

Contributor Author:
Got it. Removed.


if self._act_type == ActionType.Mixed:
# separate alphas for discrete and continuous actions
@@ -314,7 +333,7 @@ def _init_log_alpha():
self.add_optimizer(alpha_optimizer, nest.flatten(log_alpha))

self._log_alpha = log_alpha
if self._act_type == ActionType.Mixed:
if self._act_type == ActionType.Mixed and alpha_optimizer is not None:
self._log_alpha_paralist = nn.ParameterList(
nest.flatten(log_alpha))

@@ -376,6 +395,8 @@ def _init_log_alpha():
target_models=[self._target_critic_networks],
tau=target_update_tau,
period=target_update_period)
# initial q value range for rendering; adjusted as playing progresses
self._q_range = (0, 3)

def _make_networks(self, observation_spec, action_spec, reward_spec,
continuous_actor_network_cls, critic_network_cls,
@@ -531,26 +552,52 @@ def _predict_action(self,
return action_dist, action, q_values, new_state

def predict_step(self, inputs: TimeStep, state: SacState):
action_dist, action, _, action_state = self._predict_action(
action_dist, action, q_values, action_state = self._predict_action(
inputs.observation,
state=state.action,
epsilon_greedy=self._epsilon_greedy,
eps_greedy_sampling=True)
info = SacInfo(action_distribution=action_dist)
if (alf.summary.render.is_rendering_enabled()
Collaborator:
This block of rendering code should be kept as a local change as it's not essential to SAC.

Contributor Author:
How about I extract this into a separate file for debug rendering?

Contributor:
Agree with @hynu. The current situation is that different people want to render different things, so it's hard to have an agreed-upon set of figures for rendering. This is different from tensorboard summaries, where space is not that precious.

Contributor Author:
Fair enough, removed.
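One possible shape for the extraction floated above (the module and function name are hypothetical; the render calls themselves are the ones from the diff), keeping the debug rendering out of SacAlgorithm.predict_step:

```python
from alf.summary import render


def render_discrete_sac_debug(action, action_dist, q_values, action_spec,
                              q_range):
    """Return rendered debug images for a discrete-action SAC predict step."""
    num_acts = q_values.shape[-1]
    return dict(
        action_img=render.render_action("action", action, action_spec),
        action_dist_img=render.render_bar(
            "action_dist",
            action_dist.probs,
            y_range=(0, 1),
            annotate_format="%.2f",
            x_ticks=range(num_acts)),
        q_img=render.render_bar(
            "q_values",
            q_values,
            y_range=q_range,
            annotate_format="%.2f",
            x_ticks=range(num_acts)))
```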

and self._act_type == ActionType.Discrete):
num_acts = q_values.shape[-1]
self._q_range = (min(self._q_range[0], int(q_values.min())),
max(self._q_range[1],
int(q_values.max()) + 1))
info = dict(
sac=info,
action_img=render.render_action("action", action,
self._action_spec),
action_dist_img=render.render_bar(
"action_dist",
action_dist.probs,
y_range=(0, 1),
annotate_format="%.2f",
x_ticks=range(num_acts)),
q_img=render.render_bar(
"q_values",
q_values,
y_range=self._q_range,
annotate_format="%.2f",
x_ticks=range(num_acts)))
return AlgStep(
output=action,
state=SacState(action=action_state),
info=SacInfo(action_distribution=action_dist))
output=action, state=SacState(action=action_state), info=info)

def rollout_step(self, inputs: TimeStep, state: SacState):
"""``rollout_step()`` basically predicts actions like what is done by
``predict_step()``. Additionally, if states are to be stored in a replay
buffer, then this function also calls ``_critic_networks`` and
``_target_critic_networks`` to maintain their states.
"""
eps = self._rollout_epsilon_greedy
if self._use_epsilon_schedule > 0:
progress = alf.trainers.policy_trainer.Trainer.progress()
if progress < self._use_epsilon_schedule:
eps = 1.0 - (1.0 - eps) * progress / self._use_epsilon_schedule
action_dist, action, _, action_state = self._predict_action(
inputs.observation,
state=state.action,
epsilon_greedy=1.0,
epsilon_greedy=eps,
eps_greedy_sampling=True,
rollout=True)

@@ -717,7 +764,10 @@ def _critic_train_step(self, inputs: TimeStep, state: SacCriticState,
probs = common.expand_dims_as(action_distribution.probs,
target_critics)
# [B, reward_dim]
target_critics = torch.sum(probs * target_critics, dim=1)
if self._max_target_action:
target_critics = torch.max(target_critics, dim=1)[0]
else:
target_critics = torch.sum(probs * target_critics, dim=1)
elif self._act_type == ActionType.Mixed:
critics = self._select_q_value(rollout_info.action[0], critics)
discrete_act_dist = action_distribution[0]
@@ -797,6 +847,20 @@ def calc_loss(self, info: SacInfo):
alpha_loss = info.alpha
actor_loss = info.actor

# The current implementation is hacky: instead of using OneStepTD
# and pulling a few additional timesteps from the future to compute
# bootstrap values, we piggyback on the n-step TDLoss and mask out
# the losses from the 2nd to the n-th step.
# If this hacky usage pattern becomes common in the future,
# we should consider refactoring it.
if hasattr(self._critic_losses[0],
"_improve_w_nstep_bootstrap") and \
self._critic_losses[0]._improve_w_nstep_bootstrap:
# Ignore 2nd - n-th step losses in this mode.
alpha_loss[1:] = 0
if actor_loss.loss != ():
actor_loss.loss[1:] = 0

if self._debug_summaries and alf.summary.should_record_summaries():
with alf.summary.scope(self._name):
if self._act_type == ActionType.Mixed:
@@ -850,6 +914,11 @@ def _calc_critic_loss(self, info: SacInfo):
if self._use_entropy_reward:
with torch.no_grad():
log_pi = info.log_pi
if hasattr(self._critic_losses[0],
"_improve_w_nstep_bootstrap") and \
self._critic_losses[0]._improve_w_nstep_bootstrap:
# Ignore 2nd - n-th step entropy in this mode.
log_pi[1:] = 0
if self._entropy_normalizer is not None:
log_pi = self._entropy_normalizer.normalize(log_pi)
entropy_reward = nest.map_structure(