|
46 | 46 | from torchrl._utils import _standardize
|
47 | 47 | from torchrl.data import Bounded, Categorical, Composite, MultiOneHot, OneHot, Unbounded
|
48 | 48 | from torchrl.data.postprocs.postprocs import MultiStep
|
49 |
| -from torchrl.envs import EnvBase |
| 49 | +from torchrl.envs import EnvBase, GymEnv, InitTracker, SerialEnv |
| 50 | +from torchrl.envs.libs.gym import _has_gym |
50 | 51 | from torchrl.envs.model_based.dreamer import DreamerEnv
|
51 | 52 | from torchrl.envs.transforms import TensorDictPrimer, TransformedEnv
|
52 | 53 | from torchrl.envs.utils import exploration_type, ExplorationType, set_exploration_type
|
53 | 54 | from torchrl.modules import (
|
54 | 55 | DistributionalQValueActor,
|
| 56 | + GRUModule, |
| 57 | + LSTMModule, |
55 | 58 | OneHotCategorical,
|
56 | 59 | QValueActor,
|
57 | 60 | recurrent_mode,
|
58 | 61 | SafeSequential,
|
| 62 | + set_recurrent_mode, |
59 | 63 | WorldModelWrapper,
|
60 | 64 | )
|
61 | 65 | from torchrl.modules.distributions.continuous import TanhDelta, TanhNormal
|
|
146 | 150 | dtype_fixture,
|
147 | 151 | get_available_devices,
|
148 | 152 | get_default_devices,
|
| 153 | + PENDULUM_VERSIONED, |
149 | 154 | )
|
150 | 155 | from pytorch.rl.test.mocking_classes import ContinuousActionConvMockEnv
|
151 | 156 | else:
|
|
154 | 159 | dtype_fixture,
|
155 | 160 | get_available_devices,
|
156 | 161 | get_default_devices,
|
| 162 | + PENDULUM_VERSIONED, |
157 | 163 | )
|
158 | 164 | from mocking_classes import ContinuousActionConvMockEnv
|
159 | 165 |
|
@@ -13755,6 +13761,79 @@ def _forward_value_estimator_keys(self, **kwargs) -> None:
|
13755 | 13761 |
|
13756 | 13762 |
|
13757 | 13763 | class TestValues:
|
    @pytest.mark.skipif(not _has_gym, reason="requires gym")
    @pytest.mark.parametrize("module", ["lstm", "gru"])
    def test_gae_recurrent(self, module):
        # Checks that shifted=True and False provide the same result in GAE when an LSTM is used
        #
        # Purpose: GAE computes values either by calling the value network once
        # on a "shifted" copy of the data (shifted=True) or separately on the
        # root and "next" entries (shifted=False). With a stateful recurrent
        # value network the two paths must still produce identical advantages.
        #
        # Args:
        #     module: which recurrent cell to place in front of the value/policy
        #         heads; one of "lstm" or "gru" (parametrized above).

        # Batched env (2 workers); InitTracker adds the reset-flag entry that
        # the recurrent modules consume to know when to zero their carry.
        env = SerialEnv(
            2,
            [
                functools.partial(
                    TransformedEnv, GymEnv(PENDULUM_VERSIONED()), InitTracker()
                )
                for _ in range(2)
            ],
        )
        # Seed env and torch RNG *before* building modules: parameter init
        # below draws from the global RNG, so statement order matters here.
        env.set_seed(0)
        torch.manual_seed(0)
        if module == "lstm":
            # LSTM carries two recurrent states (hidden + cell).
            # NOTE(review): python_based=True presumably selects the
            # pure-python cell implementation — confirm against LSTMModule docs.
            recurrent_module = LSTMModule(
                input_size=env.observation_spec["observation"].shape[-1],
                hidden_size=64,
                in_keys=["observation", "rs_h", "rs_c"],
                out_keys=["intermediate", ("next", "rs_h"), ("next", "rs_c")],
                python_based=True,
                dropout=0,  # no dropout: keeps both GAE passes deterministic
            )
        elif module == "gru":
            # GRU carries a single recurrent state.
            recurrent_module = GRUModule(
                input_size=env.observation_spec["observation"].shape[-1],
                hidden_size=64,
                in_keys=["observation", "rs_h"],
                out_keys=["intermediate", ("next", "rs_h")],
                python_based=True,
                dropout=0,  # no dropout: keeps both GAE passes deterministic
            )
        else:
            raise NotImplementedError
        # eval() so train-only behaviors (e.g. dropout, were it nonzero) are off.
        recurrent_module.eval()
        mlp_value = MLP(num_cells=[64], out_features=1)
        # Value net: shared recurrent trunk -> value head ("state_value").
        value_net = Seq(
            recurrent_module,
            Mod(mlp_value, in_keys=["intermediate"], out_keys=["state_value"]),
        )
        mlp_policy = MLP(num_cells=[64], out_features=1)
        # Policy net reuses the *same* recurrent_module instance as the value
        # net, so both consume identical recurrent-state entries.
        policy_net = Seq(
            recurrent_module,
            Mod(mlp_policy, in_keys=["intermediate"], out_keys=["action"]),
        )
        # Primer pre-populates the recurrent-state entries ("rs_h"/"rs_c") in
        # the env's tensordicts so the rollout can thread them through steps.
        env = env.append_transform(recurrent_module.make_tensordict_primer())
        # Long rollout without stopping at done boundaries, so the data
        # contains mid-trajectory resets the recurrent modules must handle.
        vals = env.rollout(1000, policy_net, break_when_any_done=False)
        # One forward pass on a copy — presumably to materialize the lazy
        # layers of mlp_value before GAE queries it; output is discarded.
        # TODO(review): confirm this is only for lazy-module initialization.
        value_net(vals.copy())

        # Shifted
        # Path 1: single value-net call on the time-shifted data.
        gae_shifted = GAE(
            gamma=0.9,
            lmbda=0.99,
            value_network=value_net,
            shifted=True,
        )
        # Recurrent mode makes the modules process whole sequences rather
        # than single steps.
        with set_recurrent_mode(True):
            r0 = gae_shifted(vals.copy())
            a0 = r0["advantage"]

        # Path 2: separate root/"next" value calls; vmap is deactivated —
        # NOTE(review): presumably because stateful recurrent modules are
        # incompatible with vmap batching — confirm.
        gae = GAE(
            gamma=0.9,
            lmbda=0.99,
            value_network=value_net,
            shifted=False,
            deactivate_vmap=True,
        )
        with set_recurrent_mode(True):
            r1 = gae(vals.copy())
            a1 = r1["advantage"]
        # Both computation paths must agree on the advantage estimate.
        torch.testing.assert_close(a0, a1)
13758 | 13837 | @pytest.mark.parametrize("device", get_default_devices())
|
13759 | 13838 | @pytest.mark.parametrize("gamma", [0.1, 0.5, 0.99])
|
13760 | 13839 | @pytest.mark.parametrize("lmbda", [0.1, 0.5, 0.99])
|
|
0 commit comments