Skip to content

Commit 7931cb0

Browse files
committed
add topp topk in LogprobsPostProcessor
Signed-off-by: Yuki Huang <yukih@nvidia.com>
1 parent 32fbfbe commit 7931cb0

File tree

4 files changed

+59
-13
lines changed

4 files changed

+59
-13
lines changed

nemo_rl/algorithms/loss/loss_functions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ def __call__(
278278
if self.reference_policy_kl_penalty != 0:
279279
# When top-k/top-p filtering is enabled, we need special handling for KL:
280280
# - reference_policy_logprobs is computed **without** filtering (see use_reference_model)
281-
# - curr_logprobs is computed **with** filtering (for actor loss compatibility)
281+
# - curr_logprobs/prev_logprobs are computed **with** filtering (for actor loss compatibility)
282282
# - For KL, we need curr_logprobs **without** filtering to be consistent with ref logprobs
283283
# - For importance weights, we also use unfiltered curr_logprobs_for_kl since we're
284284
# reweighting samples from π_gen_filtered to π_curr_unfiltered

nemo_rl/models/automodel/train.py

Lines changed: 51 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,11 @@
3232
from torch import nn
3333
from torch.distributed.tensor import DTensor, Shard
3434

35-
from nemo_rl.algorithms.logits_sampling_utils import TrainingSamplingParams
35+
from nemo_rl.algorithms.logits_sampling_utils import (
36+
TrainingSamplingParams,
37+
apply_top_k_top_p,
38+
need_top_k_or_top_p_filtering,
39+
)
3640
from nemo_rl.algorithms.loss import SequencePackingLossWrapper, prepare_loss_input
3741
from nemo_rl.algorithms.loss.interfaces import LossFunction
3842
from nemo_rl.distributed.batched_data_dict import BatchedDataDict
@@ -124,20 +128,42 @@ def extract_logits(
124128

125129

126130
def apply_temperature_scaling(
127-
logits: torch.Tensor,
128-
cfg: PolicyConfig,
131+
logits: torch.Tensor, sampling_params: Optional[TrainingSamplingParams]
129132
) -> torch.Tensor:
130133
"""Apply temperature scaling to logits.
131134
132135
Args:
133136
logits: Logits tensor to scale
134-
cfg: Configuration dictionary containing generation settings
137+
sampling_params: Sampling parameters
135138
136139
Returns:
137140
torch.Tensor: Temperature-scaled logits
138141
"""
139-
if "generation" in cfg and cfg["generation"] is not None:
140-
logits.div_(cfg["generation"]["temperature"])
142+
if sampling_params is not None and sampling_params.temperature != 1.0:
143+
logits.div_(sampling_params.temperature)
144+
return logits
145+
146+
147+
def apply_top_k_top_p_filtering_for_local_logits(
148+
logits: torch.Tensor, sampling_params: Optional[TrainingSamplingParams]
149+
) -> torch.Tensor:
150+
"""Apply top-k and top-p filtering to the non-distributed logits.
151+
152+
Args:
153+
logits: Logits tensor to filter
154+
sampling_params: Sampling parameters
155+
156+
Returns:
157+
torch.Tensor: Filtered logits
158+
"""
159+
if sampling_params is not None and need_top_k_or_top_p_filtering(
160+
sampling_params.top_k, sampling_params.top_p
161+
):
162+
logits, _ = apply_top_k_top_p(
163+
logits,
164+
top_k=sampling_params.top_k,
165+
top_p=sampling_params.top_p,
166+
)
141167
return logits
142168

143169

@@ -233,7 +259,7 @@ def prepare_data_for_cp(
233259

234260
def forward_with_post_processing_fn(
235261
model: nn.Module,
236-
cfg: PolicyConfig,
262+
sampling_params: TrainingSamplingParams,
237263
post_processing_fn: PostProcessingFunction,
238264
processed_mb: ProcessedMicrobatch,
239265
is_reward_model: bool = False,
@@ -253,7 +279,7 @@ def forward_with_post_processing_fn(
253279
254280
Args:
255281
model: The model to run forward pass on
256-
cfg: Configuration dictionary
282+
sampling_params: Sampling parameters
257283
post_processing_fn: Post-processing function to apply to the logits
258284
processed_mb: Pre-fetched ProcessedMicrobatch containing data and processed inputs
259285
is_reward_model: Whether this is a reward model
@@ -290,7 +316,10 @@ def forward_with_post_processing_fn(
290316
post_processing_fn,
291317
(LossPostProcessor, LogprobsPostProcessor, TopkLogitsPostProcessor),
292318
):
293-
logits = apply_temperature_scaling(logits, cfg)
319+
# Temperature scaling is element-wise, directly applying it here.
320+
# Other sampling parameters like top-k and top-p need the logits from the whole vocabulary,
321+
# so they are applied when gathering logits from vocab parallel (called in LossPostProcessor and LogprobsPostProcessor).
322+
logits = apply_temperature_scaling(logits, sampling_params)
294323

295324
# Apply the post-processing function directly based on type
296325
if isinstance(post_processing_fn, LossPostProcessor):
@@ -558,6 +587,7 @@ def __init__(
558587
tp_mesh: Any,
559588
cp_size: int,
560589
enable_seq_packing: bool = False,
590+
sampling_params: Optional[TrainingSamplingParams] = None,
561591
):
562592
"""Initialize LogprobsPostProcessor.
563593
@@ -568,13 +598,15 @@ def __init__(
568598
tp_mesh: Tensor parallel mesh
569599
cp_size: Context parallel size
570600
enable_seq_packing: Whether sequence packing is enabled
601+
sampling_params: Sampling parameters
571602
"""
572603
self.cfg = cfg
573604
self.device_mesh = device_mesh
574605
self.cp_mesh = cp_mesh
575606
self.tp_mesh = tp_mesh
576607
self.cp_size = cp_size
577608
self.enable_seq_packing = enable_seq_packing
609+
self.sampling_params = sampling_params
578610
self.logprob_chunk_size = cfg.get("logprob_chunk_size", None)
579611

580612
def __call__(
@@ -627,17 +659,21 @@ def __call__(
627659
input_ids_dtensor,
628660
seq_index_tensor,
629661
chunk_size=self.logprob_chunk_size,
662+
sampling_params=self.sampling_params, # top-k and top-p filtering
630663
)
631664

632665
assert token_logprobs.shape[1] == seq_len - 1
633666
else:
634667
if isinstance(logits, DTensor):
668+
# DTensor path with TP sharding
635669
token_logprobs = get_logprobs_from_vocab_parallel_logits(
636670
logits,
637671
processed_inputs.input_ids,
638672
chunk_size=self.logprob_chunk_size,
673+
sampling_params=self.sampling_params, # top-k and top-p filtering
639674
)
640675
else:
676+
# Non-DTensor path (no TP sharding)
641677
token_logprobs = self._compute_local_logprobs(
642678
logits, processed_inputs.input_ids
643679
)
@@ -703,12 +739,18 @@ def _compute_local_logprobs(
703739
(chunk_idx + 1) * self.logprob_chunk_size,
704740
)
705741
chunk_logits = logits[:, chunk_start:chunk_end, :].to(torch.float32)
742+
chunk_logits = apply_top_k_top_p_filtering_for_local_logits(
743+
chunk_logits, self.sampling_params
744+
)
706745
log_probs = torch.nn.functional.log_softmax(chunk_logits, dim=-1)
707746
chunked_log_probs.append(log_probs)
708747
log_probs = torch.cat(chunked_log_probs, dim=1)
709748
del chunked_log_probs
710749
else:
711750
logits = logits.to(torch.float32)
751+
logits = apply_top_k_top_p_filtering_for_local_logits(
752+
logits, self.sampling_params
753+
)
712754
log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
713755

714756
# Extract logprobs for each token in the sequence by gathering the logprob

nemo_rl/models/megatron/train.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,9 @@ def forward_with_post_processing_fn(
184184
post_processing_fn,
185185
(LossPostProcessor, LogprobsPostProcessor, TopkLogitsPostProcessor),
186186
):
187+
# Temperature scaling is element-wise, directly applying it here.
188+
# Other sampling parameters like top-k and top-p need the logits from the whole vocabulary,
189+
# so they are applied when gathering logits from vocab parallel (called in LossPostProcessor and LogprobsPostProcessor).
187190
apply_temperature_scaling(output_tensor, cfg)
188191

189192
# Use type checking to dispatch to the correct post-processing method

nemo_rl/models/policy/workers/dtensor_policy_worker_v2.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,7 @@ def get_logprobs(
574574
tp_mesh=self.tp_mesh,
575575
cp_size=self.cp_size,
576576
enable_seq_packing=self.enable_seq_packing,
577+
sampling_params=self.sampling_params,
577578
)
578579

579580
with torch.no_grad():
@@ -602,7 +603,7 @@ def get_logprobs(
602603
# Use forward_with_post_processing_fn for forward pass and post-processing
603604
token_logprobs, _metrics, _ = forward_with_post_processing_fn(
604605
model=self.model,
605-
cfg=self.cfg,
606+
sampling_params=self.sampling_params,
606607
post_processing_fn=logprobs_post_processor,
607608
processed_mb=processed_mb,
608609
is_reward_model=False,
@@ -671,7 +672,7 @@ def score(self, data: BatchedDataDict) -> BatchedDataDict[ScoreOutputSpec]:
671672
# Use forward_with_post_processing_fn for forward pass and post-processing
672673
rm_scores, _metrics, _ = forward_with_post_processing_fn(
673674
model=self.model,
674-
cfg=self.cfg,
675+
sampling_params=self.sampling_params,
675676
post_processing_fn=score_post_processor,
676677
processed_mb=processed_mb,
677678
is_reward_model=True,
@@ -761,7 +762,7 @@ def get_topk_logits(
761762
# Use forward_with_post_processing_fn for forward pass and post-processing
762763
(vals, idx), _metrics, _ = forward_with_post_processing_fn(
763764
model=self.model,
764-
cfg=self.cfg,
765+
sampling_params=self.sampling_params,
765766
post_processing_fn=topk_post_processor,
766767
processed_mb=processed_mb,
767768
is_reward_model=False,

0 commit comments

Comments
 (0)