[BugFix] Fixed broken SACLoss when there is more than one qvalue_network (#3500)

ParamThakkar123 · web-flow · commit 4160c9a7dbd0 · 2026-02-14T00:28:04.000Z
diff --git a/test/test_objectives.py b/test/test_objectives.py
@@ -4398,6 +4398,28 @@ def test_reset_parameters_recursive(self, version):
         )
         self.reset_parameters_recursive_test(loss_fn)
 
+    def test_sac_list_qvalue_networks(self, version):
+        torch.manual_seed(self.seed)
+        td = self._create_mock_data_sac()
+        actor = self._create_mock_actor()
+        qvalue1 = self._create_mock_qvalue()
+        qvalue2 = self._create_mock_qvalue()
+        if version == 1:
+            value = self._create_mock_value()
+        else:
+            value = None
+        loss_fn = SACLoss(
+            actor_network=actor,
+            qvalue_network=[qvalue1, qvalue2],
+            value_network=value,
+            num_qvalue_nets=2,
+        )
+        with pytest.warns(
+            UserWarning, match="No target network updater has been associated"
+        ) if rl_warnings() else contextlib.nullcontext():
+            loss = loss_fn(td)
+        assert "loss_qvalue" in loss.keys()
+
     @pytest.mark.parametrize("delay_value", (True, False))
     @pytest.mark.parametrize("delay_actor", (True, False))
     @pytest.mark.parametrize("delay_qvalue", (True, False))
diff --git a/torchrl/objectives/common.py b/torchrl/objectives/common.py
@@ -348,6 +348,8 @@ def convert_to_functional(
             params = TensorDict.from_modules(
                 *module, as_module=True, expand_identical=True
             )
+            # Use the first module as the functional forward reference.
+            module = module[0]
         else:
             params = TensorDict.from_module(module, as_module=True)
 
diff --git a/torchrl/objectives/sac.py b/torchrl/objectives/sac.py
@@ -73,12 +73,15 @@ class SACLoss(LossModule):
 
     Args:
         actor_network (ProbabilisticTensorDictSequential): stochastic actor
-        qvalue_network (TensorDictModule): Q(s, a) parametric model.
+        qvalue_network (TensorDictModule | list[TensorDictModule]): Q(s, a) parametric model.
             This module typically outputs a ``"state_action_value"`` entry.
             If a single instance of `qvalue_network` is provided, it will be duplicated ``num_qvalue_nets``
             times. If a list of modules is passed, their
             parameters will be stacked unless they share the same identity (in which case
             the original parameter will be expanded).
+            When a list is provided, the first module is used as the functional forward
+            reference (its ``in_keys``/``out_keys`` are used), so all modules must share
+            the same signature.
 
             .. warning:: When a list of parameters is passed, it will **not** be compared against the policy parameters
               and all the parameters will be considered as untied.

Original file line number	Diff line number	Diff line change
`@@ -348,6 +348,8 @@ def convert_to_functional(`
`348`	`348`	`params = TensorDict.from_modules(`
`349`	`349`	`*module, as_module=True, expand_identical=True`
`350`	`350`	`)`
	`351`	`+ # Use the first module as the functional forward reference.`
	`352`	`+ module = module[0]`
`351`	`353`	`else:`
`352`	`354`	`params = TensorDict.from_module(module, as_module=True)`
`353`	`355`