
Commit 227b5fc

[Performance] Avoid computing log-probs when retrieving dist (#3081)
1 parent 130ed3f commit 227b5fc
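
The change rests on a simple observation: building the distribution returned by `get_dist` only needs the logits, while the per-token log-probabilities produced by the log-prob code path can be derived lazily from the distribution itself. The sketch below is a minimal illustration in plain PyTorch, independent of torchrl's wrapper classes (all shapes and names are made up for the example):

```python
import torch
from torch import distributions as D

# Toy "model output": logits over a vocabulary for a batch of token positions.
batch, seq_len, vocab = 2, 8, 1000
logits = torch.randn(batch, seq_len, vocab)

# Constructing the distribution requires only the logits...
dist = D.Categorical(logits=logits)

# ...and per-token log-probs can still be obtained on demand when needed:
tokens = torch.randint(vocab, (batch, seq_len))
log_probs = dist.log_prob(tokens)

# Precomputing log_softmax + gather during the forward pass does the same
# work a second time, which is exactly what the new logits_only path skips.
precomputed = logits.log_softmax(-1).gather(-1, tokens.unsqueeze(-1)).squeeze(-1)
assert torch.allclose(log_probs, precomputed, atol=1e-6)
```

With this commit, `get_dist` asks the wrappers for exactly that reduced amount of work by calling `forward(..., logits_only=True)`.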

File tree (3 files changed, +124 additions, -57 deletions):

- torchrl/modules/llm/policies/common.py
- torchrl/modules/llm/policies/transformers_wrapper.py
- torchrl/modules/llm/policies/vllm_wrapper.py

torchrl/modules/llm/policies/common.py

Lines changed: 24 additions & 8 deletions
@@ -10,7 +10,7 @@
 
 import torch
 from tensordict import lazy_stack, NestedKey, TensorDictBase
-from tensordict.nn import TensorDictModuleBase, TensorDictSequential
+from tensordict.nn import TensorDictModuleBase
 from tensordict.tensorclass import TensorClass
 from tensordict.utils import _zip_strict
 from torch import distributions as D
@@ -488,7 +488,7 @@ def get_dist(
                 "You can create a new version of this wrapper using the `get_new_version` method."
             )
 
-        td_out = self(tensordict.copy())
+        td_out = self.forward(tensordict.copy(), logits_only=True)
 
         # Get logits/log-probs
         if as_padded_tensor is None:
@@ -563,7 +563,7 @@ def _get_dist_with_prompt_mask(
                 "get_dist_with_prompt_mask is not implemented for generate=True. "
                 "You can create a new version of this wrapper using the `get_new_version` method."
             )
-        td_out = self(tensordict.copy())
+        td_out = self.forward(tensordict.copy(), logits_only=True)
 
         # Try to get prompt tokens first
         if self.pad_output:
@@ -674,7 +674,7 @@ def _get_dist_with_assistant_mask(
                 "get_dist_with_assistant_mask is not implemented for generate=True. "
                 "You can create a new version of this wrapper using the `get_new_version` method."
             )
-        td_out = self(tensordict.copy())
+        td_out = self.forward(tensordict.copy(), logits_only=True)
         # Update the tokens key to reflect the tokenized history when querying the log-probs
         tensordict.update(
             td_out,
@@ -743,7 +743,7 @@ def _get_dist_with_attention_mask(
                 "get_dist_with_attention_mask is not implemented for generate=True. "
                 "You can create a new version of this wrapper using the `get_new_version` method."
             )
-        td_out = self(tensordict.copy())
+        td_out = self.forward(tensordict.copy(), logits_only=True)
         if self.pad_output:
             logits = td_out.get(logits_key)
             attention_mask = td_out.get(attention_mask_key)
@@ -800,7 +800,7 @@ def _get_dist_with_custom_mask(
                 "get_dist_with_custom_mask is not implemented for generate=True. "
                 "You can create a new version of this wrapper using the `get_new_version` method."
             )
-        td_out = self(tensordict.copy())
+        td_out = self.forward(tensordict.copy(), logits_only=True)
         if self.pad_output:
             logits = td_out.get(logits_key)
         else:
@@ -847,8 +847,24 @@ def _get_generic_dist(self, tensordict: TensorDictBase, **kwargs) -> D.Distribut
         """
         return self._get_dist_with_attention_mask(tensordict, **kwargs)
 
-    # Sampling is taken care of by the sub-modules
-    forward = TensorDictSequential.forward
+    def forward(
+        self,
+        tensordict: TensorDictBase,
+        *,
+        tensordict_out: TensorDictBase | None = None,
+        logits_only: bool = False,
+        **kwargs,
+    ) -> TensorDictBase:  # noqa: D417
+        """Forward pass for the LLM policy.
+
+        Args:
+            tensordict (TensorDictBase): The input tensordict.
+
+        Keyword Args:
+            tensordict_out (TensorDictBase | None): The output tensordict.
+            logits_only (bool): Whether to return only the logits. Only effective if generate=False. Defaults to `False`.
+        """
+        raise NotImplementedError
 
     def _check_padded(self, val: torch.Tensor) -> torch.Tensor:
         """Check that a value is a padded tensor."""

torchrl/modules/llm/policies/transformers_wrapper.py

Lines changed: 84 additions & 47 deletions
@@ -469,15 +469,29 @@ def get_new_version(self, **kwargs):
     def forward(
         self,
         tensordict: TensorDictBase,
+        *,
         tensordict_out: TensorDictBase | None = None,
+        logits_only: bool = False,
         **kwargs,
     ) -> TensorDictBase:
         tensordict_orig = tensordict
         if not tensordict.ndim:
+            if tensordict_out is not None:
+                raise ValueError(
+                    "tensordict_out must not be provided when tensordict.ndim == 0. If this is needed, "
+                    "please submit an issue on github."
+                )
             # unsqueeze - squeeze the input
-            return self(lazy_stack([tensordict]))[0]
+            return self.forward(lazy_stack([tensordict]), logits_only=logits_only)[0]
         elif tensordict.ndim > 1:
-            return self(tensordict.reshape(-1)).view(tensordict.shape)
+            if tensordict_out is not None:
+                raise ValueError(
+                    "tensordict_out must not be provided when tensordict.ndim > 1. If this is needed, "
+                    "please submit an issue on github."
+                )
+            return self.forward(tensordict.reshape(-1), logits_only=logits_only).view(
+                tensordict.shape
+            )
 
         if not isinstance(tensordict, LazyStackedTensorDict):
             tensordict = tensordict.to_lazystack(0)
@@ -517,17 +531,23 @@ def forward(
             if self.generate:
                 out = self._from_transformers_generate_history(tensordict, cfg, out)
             else:
-                out = self._from_transformers_logprobs_history(tensordict, cfg, out)
+                out = self._from_transformers_logprobs_history(
+                    tensordict, cfg, out, logits_only=logits_only
+                )
         elif self.input_mode == "text":
             if self.generate:
                 out = self._from_transformers_generate_text(tensordict, cfg, out)
             else:
-                out = self._from_transformers_logprobs_text(tensordict, cfg, out)
+                out = self._from_transformers_logprobs_text(
+                    tensordict, cfg, out, logits_only=logits_only
+                )
         elif self.input_mode == "tokens":
             if self.generate:
                 out = self._from_transformers_generate_tokens(tensordict, cfg, out)
             else:
-                out = self._from_transformers_logprobs_tokens(tensordict, cfg, out)
+                out = self._from_transformers_logprobs_tokens(
+                    tensordict, cfg, out, logits_only=logits_only
+                )
 
         if _source_device:
             out = out.to(_source_device)
@@ -690,7 +710,7 @@ def _from_transformers_generate_history(self, td, cfg, out) -> TensorDictBase:
         result.set(self.history_key, history_chat)
         return result
 
-    def _from_transformers_logprobs_history(self, td, cfg, out):
+    def _from_transformers_logprobs_history(self, td, cfg, out, logits_only=False):
         """Compute log-probs from history input."""
         from torchrl.data.llm import History
 
@@ -731,7 +751,9 @@ def _from_transformers_logprobs_history(self, td, cfg, out):
             raise ValueError(
                 f"Expected TensorDictBase for history input, got {type(response_tokens)}"
             )
-        result = self._logprobs_from_history_tokens(response_tokens, cfg, out)
+        result = self._logprobs_from_history_tokens(
+            response_tokens, cfg, out, logits_only=logits_only
+        )
         text_result = Text._from_tensordict(result.empty())
         result.set(self.text_key, text_result)
         result[self.text_key, "full"] = text_full
@@ -952,7 +974,9 @@ def _cat_tensors(
             result = result.to(cast)
         return result
 
-    def _logprobs_from_history_tokens(self, response_tokens, cfg, out):
+    def _logprobs_from_history_tokens(
+        self, response_tokens, cfg, out, logits_only=False
+    ):
         """Compute log-probs from history tokens."""
         pad_val = self.tokenizer.pad_token_id
 
@@ -996,6 +1020,7 @@ def _logprobs_from_history_tokens(self, response_tokens, cfg, out):
             tokens_full_padded,
             attention_mask_full_padded,
             pad_val,
+            logits_only=logits_only,
         )
 
         # Build output TensorClass objects
@@ -1051,19 +1076,20 @@ def _logprobs_from_history_tokens(self, response_tokens, cfg, out):
         tokens_obj.padded = MetaData(self.pad_output)
         out.set(self.tokens_key, tokens_obj)
 
-        log_probs_obj = LogProbs._from_tensordict(
-            TensorDict(batch_size=out.batch_size).to_lazystack(0)
-        )
-        if self.pad_output:
-            log_probs_obj.full = log_probs_full_padded
-        else:
-            log_probs_full_unpadded = _unpad_tensors(
-                log_probs_full_padded, attention_mask_full_padded, as_nested=False
+        if not logits_only:
+            log_probs_obj = LogProbs._from_tensordict(
+                TensorDict(batch_size=out.batch_size).to_lazystack(0)
             )
-            log_probs_obj.full = log_probs_full_unpadded
-        log_probs_obj.response = None
-        log_probs_obj.padded = MetaData(self.pad_output)
-        out.set(self.log_probs_key, log_probs_obj)
+            if self.pad_output:
+                log_probs_obj.full = log_probs_full_padded
+            else:
+                log_probs_full_unpadded = _unpad_tensors(
+                    log_probs_full_padded, attention_mask_full_padded, as_nested=False
+                )
+                log_probs_obj.full = log_probs_full_unpadded
+            log_probs_obj.response = None
+            log_probs_obj.padded = MetaData(self.pad_output)
+            out.set(self.log_probs_key, log_probs_obj)
 
         # Add logits to output if we're in a get_dist call
         if self._in_get_dist_call:
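
The `LogProbs` construction above is now gated behind `if not logits_only:`, so a `get_dist` call no longer pays for the log-softmax, gather, and unpadding over the full `[batch, seq, vocab]` logits. A rough back-of-the-envelope of what that skips, using hypothetical sizes:

```python
import torch  # only used to make the dtype size explicit

# Hypothetical workload: 8 sequences of 2048 tokens, 128k-token vocabulary, float32.
batch, seq_len, vocab = 8, 2048, 128_000
numel = batch * seq_len * vocab
gib = numel * torch.finfo(torch.float32).bits // 8 / 2**30
print(f"log_softmax input/output: ~{gib:.1f} GiB each")  # ~7.8 GiB
# Skipping the log-prob branch avoids allocating and traversing an extra
# vocabulary-sized tensor when only the logits are needed for the distribution.
```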
@@ -1095,7 +1121,7 @@ def _from_transformers_generate_text(self, td, cfg, out) -> TensorDictBase:
             raise ValueError(f"Expected list of text for text input, got {type(text)}")
         return self._generate_from_text(text, cfg, out)
 
-    def _from_transformers_logprobs_text(self, td, cfg, out):
+    def _from_transformers_logprobs_text(self, td, cfg, out, logits_only=False):
         """Compute log-probs from text input."""
         # Validate input
         if self.input_key not in td:
@@ -1168,6 +1194,7 @@ def _from_transformers_logprobs_text(self, td, cfg, out):
             input_ids_full_padded,
             attention_mask_full_padded,
             self.tokenizer.pad_token_id,
+            logits_only=logits_only,
         )
 
         # Build output TensorClass objects
@@ -1212,19 +1239,20 @@ def _from_transformers_logprobs_text(self, td, cfg, out):
         masks_obj.padded = MetaData(self.pad_output)
         out.set(self.masks_key, masks_obj)
 
-        log_probs_obj = LogProbs._from_tensordict(
-            TensorDict(batch_size=out.batch_size).to_lazystack(0)
-        )
-        if self.pad_output:
-            log_probs_obj.full = log_probs_full_padded
-        else:
-            log_probs_full_unpadded = _unpad_tensors(
-                log_probs_full_padded, attention_mask_full_padded, as_nested=False
+        if not logits_only:
+            log_probs_obj = LogProbs._from_tensordict(
+                TensorDict(batch_size=out.batch_size).to_lazystack(0)
             )
-            log_probs_obj.full = log_probs_full_unpadded
-        log_probs_obj.response = None
-        log_probs_obj.padded = MetaData(self.pad_output)
-        out.set(self.log_probs_key, log_probs_obj)
+            if self.pad_output:
+                log_probs_obj.full = log_probs_full_padded
+            else:
+                log_probs_full_unpadded = _unpad_tensors(
+                    log_probs_full_padded, attention_mask_full_padded, as_nested=False
+                )
+                log_probs_obj.full = log_probs_full_unpadded
+            log_probs_obj.response = None
+            log_probs_obj.padded = MetaData(self.pad_output)
+            out.set(self.log_probs_key, log_probs_obj)
 
         # Add logits to output if we're in a get_dist call
         if self._in_get_dist_call:
@@ -1416,7 +1444,11 @@ def _generate_from_tokens(
         return out
 
     def _from_transformers_logprobs_tokens(
-        self, td: TensorDictBase, cfg: dict | None, out: TensorDictBase
+        self,
+        td: TensorDictBase,
+        cfg: dict | None,
+        out: TensorDictBase,
+        logits_only=False,
     ) -> TensorDictBase:
         """Compute log-probs from tokens input."""
         # Validate input
@@ -1470,6 +1502,7 @@ def _from_transformers_logprobs_tokens(
             input_ids_full_padded,
             attention_mask_full_padded,
             self.tokenizer.pad_token_id,
+            logits_only=logits_only,
        )
 
         # Build output TensorClass objects
@@ -1514,19 +1547,20 @@ def _from_transformers_logprobs_tokens(
         masks_obj.padded = MetaData(self.pad_output)
         out.set(self.masks_key, masks_obj)
 
-        log_probs_obj = LogProbs._from_tensordict(
-            TensorDict(batch_size=out.batch_size).to_lazystack(0)
-        )
-        if self.pad_output:
-            log_probs_obj.full = log_probs_full_padded
-        else:
-            log_probs_full_unpadded = _unpad_tensors(
-                log_probs_full_padded, attention_mask_full_padded, as_nested=False
+        if not logits_only:
+            log_probs_obj = LogProbs._from_tensordict(
+                TensorDict(batch_size=out.batch_size).to_lazystack(0)
            )
-            log_probs_obj.full = log_probs_full_unpadded
-        log_probs_obj.response = None
-        log_probs_obj.padded = MetaData(self.pad_output)
-        out.set(self.log_probs_key, log_probs_obj)
+            if self.pad_output:
+                log_probs_obj.full = log_probs_full_padded
+            else:
+                log_probs_full_unpadded = _unpad_tensors(
+                    log_probs_full_padded, attention_mask_full_padded, as_nested=False
+                )
+                log_probs_obj.full = log_probs_full_unpadded
+            log_probs_obj.response = None
+            log_probs_obj.padded = MetaData(self.pad_output)
+            out.set(self.log_probs_key, log_probs_obj)
 
         # Add logits to output if we're in a get_dist call
         if self._in_get_dist_call:
@@ -1567,7 +1601,7 @@ def _log_probs_generate(cls, tokens, logits, pad_val=-100, pad: bool = True):
         return log_probs, logits
 
     def _compute_log_probs_from_model_output(
-        self, model_output, input_ids, attention_mask, pad_val
+        self, model_output, input_ids, attention_mask, pad_val, logits_only=False
     ):
         """Compute log-probs from model output without modifying original tensors.
 
@@ -1576,6 +1610,7 @@ def _compute_log_probs_from_model_output(
             input_ids: Original input token ids
             attention_mask: Original attention mask
             pad_val: Padding token value to ignore in loss computation
+            logits_only: Whether to return only the logits.
 
         Returns:
             tuple: (log_probs, shifted_logits) where log_probs are the computed log probabilities
@@ -1600,6 +1635,8 @@ def _compute_log_probs_from_model_output(
             raise ValueError(
                 f"The logits shape {shifted_logits.shape} does not match the input ids shape {shifted_input_ids.shape}"
             )
+        if logits_only:
+            return None, shifted_logits
 
         # Compute log-probs
         td = TensorDict(
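
The final hunks add the actual short-circuit: `_compute_log_probs_from_model_output` accepts `logits_only` and returns `(None, shifted_logits)` before any log-prob computation. Below is an illustrative, self-contained sketch of that kind of next-token log-prob routine with the same early exit; it is not torchrl's helper, and the function name and masking convention are assumptions made for the example:

```python
import torch

def compute_token_log_probs(logits, input_ids, pad_val, logits_only=False):
    """Sketch of a causal-LM log-prob computation with an early logits-only exit."""
    # Logits at position t predict the token at position t + 1.
    shifted_logits = logits[:, :-1, :]
    shifted_input_ids = input_ids[:, 1:]
    if logits_only:
        # Early return: the caller only needs logits to build a distribution.
        return None, shifted_logits
    log_probs = (
        shifted_logits.log_softmax(-1)
        .gather(-1, shifted_input_ids.unsqueeze(-1))
        .squeeze(-1)
    )
    # Zero out padding positions (illustrative convention).
    log_probs = log_probs.masked_fill(shifted_input_ids == pad_val, 0.0)
    return log_probs, shifted_logits

logits = torch.randn(2, 6, 50)
ids = torch.randint(1, 50, (2, 6))
log_probs, _ = compute_token_log_probs(logits, ids, pad_val=0)
print(log_probs.shape)  # torch.Size([2, 5])
```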

torchrl/modules/llm/policies/vllm_wrapper.py

Lines changed: 16 additions & 2 deletions
@@ -501,15 +501,29 @@ def get_new_version(self, **kwargs):
     def forward(
         self,
         tensordict: TensorDictBase,
+        *,
         tensordict_out: TensorDictBase | None = None,
+        logits_only: bool = False,
         **kwargs,
     ) -> TensorDictBase:
         tensordict_orig = tensordict
         if not tensordict.ndim:
+            if tensordict_out is not None:
+                raise ValueError(
+                    "tensordict_out must not be provided when tensordict.ndim == 0. If this is needed, "
+                    "please submit an issue on github."
+                )
             # unsqueeze - squeeze the input
-            return self(lazy_stack([tensordict]))[0]
+            return self.forward(lazy_stack([tensordict]), logits_only=logits_only)[0]
         elif tensordict.ndim > 1:
-            return self(tensordict.reshape(-1)).view(tensordict.shape)
+            if tensordict_out is not None:
+                raise ValueError(
+                    "tensordict_out must not be provided when tensordict.ndim > 1. If this is needed, "
+                    "please submit an issue on github."
+                )
+            return self.forward(tensordict.reshape(-1), logits_only=logits_only).view(
+                tensordict.shape
+            )
 
         if not isinstance(tensordict, LazyStackedTensorDict):
             tensordict = tensordict.to_lazystack(0)
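
The vLLM wrapper gets the same `forward` signature change as the transformers wrapper: `tensordict_out` and the new `logits_only` flag become keyword-only, `tensordict_out` is rejected in the reshaping branches, and `logits_only` is threaded through the recursive calls that normalize the batch shape. A toy sketch of that normalization pattern with plain tensors (`process_flat` and the dimension analogy are illustrative, not torchrl code):

```python
import torch

def process_flat(batch: torch.Tensor, *, logits_only: bool = False) -> torch.Tensor:
    # Hypothetical per-batch work standing in for the generation / log-prob paths;
    # it only supports a single batch dimension.
    return batch * 2 if logits_only else batch * 2 + 1

def forward_any_ndim(x: torch.Tensor, *, logits_only: bool = False) -> torch.Tensor:
    """Batch-shape handling in the spirit of the wrappers' forward():
    unbatched inputs are stacked and unstacked, multi-dim batches are flattened
    and reshaped back, and keyword options ride along through the recursion."""
    if x.ndim == 1:  # analogous to tensordict.ndim == 0 (a single, unbatched item)
        return forward_any_ndim(x.unsqueeze(0), logits_only=logits_only)[0]
    if x.ndim > 2:  # analogous to tensordict.ndim > 1
        flat = forward_any_ndim(x.reshape(-1, x.shape[-1]), logits_only=logits_only)
        return flat.view(x.shape)
    return process_flat(x, logits_only=logits_only)

print(forward_any_ndim(torch.ones(2, 3, 5), logits_only=True).shape)  # torch.Size([2, 3, 5])
```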
