
Commit 2aea5ad

fix: fix temperature-related issues (#935)
Signed-off-by: Zhanda <zhandazhu@gmail.com>
Signed-off-by: Zhanda Zhu <49645678+zhandaz@users.noreply.github.com>
Co-authored-by: Shang Wang <samshang.wang@mail.utoronto.ca>
1 parent 989f177 commit 2aea5ad

7 files changed (+135, -16 lines)


nemo_rl/models/generation/vllm/vllm_generation.py

Lines changed: 33 additions & 0 deletions
@@ -37,6 +37,12 @@
 )
 from nemo_rl.models.generation.vllm.config import VllmConfig
 
+# Global thresholds for top_k and top_p validation.
+# Although top_k/top_p sampling is not supported, values at or above these thresholds allow token filtering while keeping the returned logprobs compatible.
+# See https://github.com/NVIDIA-NeMo/RL/issues/69 and https://github.com/NVIDIA-NeMo/RL/issues/237 for more details.
+TOP_K_THRESHOLD = 8000  # Allow top_k >= 8000 (effectively no filtering)
+TOP_P_THRESHOLD = 0.99  # Allow top_p >= 0.99 (close to 1.0)
+
 
 class VllmGeneration(GenerationInterface):
     def __init__(
@@ -55,6 +61,33 @@ def __init__(
                 "You can enable it by adding `policy.generation.vllm_cfg.async_engine=true` to your command."
             )
 
+        # Validate sampling parameters early to avoid allocating resources for unsupported configs.
+        # The vLLM sampler patch only supports temperature scaling and does not handle top_p/top_k correctly.
+        # However, values at or above the thresholds are allowed for token filtering purposes.
+        top_k: int | None = self.cfg.get("top_k")
+        if top_k is not None and top_k != -1 and top_k < TOP_K_THRESHOLD:
+            raise ValueError(
+                (
+                    f"top_k sampling with values < {TOP_K_THRESHOLD} is not supported because the vLLM V1 engine "
+                    f"does not return logprobs after top_k filtering. Values >= {TOP_K_THRESHOLD} are allowed "
+                    "for token filtering purposes. If you understand the implications and still want to use "
+                    f"a lower top_k value, please manually comment out this check. Got top_k={top_k}. "
+                    "See https://github.com/NVIDIA-NeMo/RL/issues/69 for more details."
+                )
+            )
+
+        top_p: float = self.cfg.get("top_p", 1.0)
+        if top_p < TOP_P_THRESHOLD:
+            raise ValueError(
+                (
+                    f"top_p sampling with values < {TOP_P_THRESHOLD} is not supported because the vLLM V1 engine "
+                    f"does not return logprobs after top_p filtering. Values >= {TOP_P_THRESHOLD} are allowed "
+                    "for token filtering purposes. If you understand the implications and still want to use "
+                    f"a lower top_p value, please manually comment out this check. Got top_p={top_p}. "
+                    "See https://github.com/NVIDIA-NeMo/RL/issues/69 for more details."
+                )
+            )
+
         # Ensure all required VllmConfig fields are present
         missing_keys = [
             key for key in VllmConfig.__required_keys__ if key not in self.cfg
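The threshold logic above can be summarized in isolation. The following is a minimal sketch, not the repository's API: the helper name is hypothetical and the real check lives inline in VllmGeneration.__init__, but the pass/fail behavior for example configs is the same.

    TOP_K_THRESHOLD = 8000
    TOP_P_THRESHOLD = 0.99


    def validate_sampling_cfg(cfg: dict) -> None:
        # Hypothetical helper mirroring the inline check: None/-1 disables top_k entirely.
        top_k = cfg.get("top_k")
        if top_k is not None and top_k != -1 and top_k < TOP_K_THRESHOLD:
            raise ValueError(f"top_k={top_k} is below the supported threshold {TOP_K_THRESHOLD}")
        top_p = cfg.get("top_p", 1.0)
        if top_p < TOP_P_THRESHOLD:
            raise ValueError(f"top_p={top_p} is below the supported threshold {TOP_P_THRESHOLD}")


    validate_sampling_cfg({"top_k": 8000, "top_p": 1.0})  # passes: at/above thresholds
    validate_sampling_cfg({"top_k": -1, "top_p": 0.99})   # passes: top_k disabled, top_p at threshold
    # validate_sampling_cfg({"top_p": 0.9})               # would raise ValueError

Note that the check uses strict less-than, so values exactly at the thresholds are accepted.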

nemo_rl/models/generation/vllm/vllm_worker.py

Lines changed: 37 additions & 0 deletions
@@ -250,6 +250,43 @@ def _patch_vllm_init_workers_ray():
     _patch_vllm_init_workers_ray()
     logger.info("Successfully patched vllm _init_workers_ray.")
 
+    # Patch the vLLM sampler.py file to modify the logprobs computation with respect to temperature.
+    # This replaces `raw_logprobs = self.compute_logprobs(logits)` with temperature-applied logprobs.
+    # TODO(zhanda): This is only a temporary fix to address the issue of incorrect logprobs returned by vLLM
+    # and should be removed or improved once vLLM's new logprobs option is released. Currently, other
+    # sampling parameters such as top_p and top_k are not supported.
+    # See https://github.com/NVIDIA-NeMo/RL/issues/69 for more details.
+    def _patch_vllm_sampler():
+        try:
+            import vllm.v1.sample.sampler as sampler_module
+
+            file_to_patch = sampler_module.__file__
+
+            with open(file_to_patch, "r") as f:
+                content = f.read()
+
+            old_line = "raw_logprobs = self.compute_logprobs(logits)"
+            new_lines = "raw_logprobs = self.compute_logprobs(self.apply_temperature(logits.to(torch.float32), sampling_metadata.temperature) if sampling_metadata.temperature is not None else logits)"
+
+            if new_lines in content:
+                return
+
+            if old_line not in content:
+                return
+
+            # Replace all instances of the old line with the new lines
+            patched_content = content.replace(old_line, new_lines)
+
+            # Write back the patched content
+            with open(file_to_patch, "w") as f:
+                f.write(patched_content)
+
+        except (ImportError, FileNotFoundError, PermissionError):
+            # Allow failures gracefully
+            pass
+
+    _patch_vllm_sampler()
+
 except (ImportError, AttributeError):
     # vllm not installed or has a different structure, skipping patch.
     pass
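The patch above edits vLLM's installed sampler source in place via string replacement. The following is a minimal, generic sketch of that idempotent pattern, under the assumption that the exact line to replace is known; the helper name and arguments are placeholders rather than the worker's real code (which targets vllm.v1.sample.sampler specifically).

    import importlib


    def patch_module_source(module_name: str, old_line: str, new_line: str) -> None:
        """Rewrite one line of an installed module's source file, idempotently."""
        try:
            module = importlib.import_module(module_name)
            path = module.__file__
            if path is None:
                return
            with open(path, "r") as f:
                content = f.read()
            # Skip if already patched or if the expected line is absent.
            if new_line in content or old_line not in content:
                return
            with open(path, "w") as f:
                f.write(content.replace(old_line, new_line))
        except (ImportError, FileNotFoundError, PermissionError):
            # Fail gracefully, as the worker's patch does.
            pass

Because an already-patched file is detected up front, re-running the patch on worker restart is a no-op.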

nemo_rl/models/megatron/common.py

Lines changed: 11 additions & 0 deletions
@@ -260,6 +260,7 @@ def forward_step_arbitrary_loss(
     pad_individual_seqs_to_multiple_of: int = 1,
     pad_full_seq_to: Optional[int] = None,
     cp_normalize: bool = True,
+    policy_cfg: Optional[dict] = None,
 ):
     """Forward training step with support for packed sequences and context parallelism.
 
@@ -273,6 +274,7 @@
         pack_sequences (bool): Whether to pack sequences for efficiency
         seq_length_key (Optional[str]): Key in data_dict containing actual sequence lengths
         cp_normalize (bool): Whether to normalize the loss by the cp_size
+        policy_cfg (Optional[dict]): Policy configuration containing generation parameters
 
     Notes on packed sequences with context parallelism (CP):
     - When CP > 1, each sequence is padded to a multiple of (cp_size * 2)
@@ -342,6 +344,15 @@
         packed_seq_params=packed_seq_params,
     )
 
+    # Apply temperature scaling to logits for training.
+    # This matches the dtensor worker's _apply_temperature_scaling in the train method.
+    if (
+        policy_cfg is not None
+        and "generation" in policy_cfg
+        and policy_cfg["generation"] is not None
+    ):
+        output_tensor.div_(policy_cfg["generation"]["temperature"])
+
     # Unpack the output tensor if we did packed sequences
     if pack_sequences and packed_seq_params is not None:
         # remove padding
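For context on why the training-side logits are divided by the temperature: the patched vLLM sampler now computes its returned logprobs from temperature-scaled logits, and dividing logits by the temperature before log_softmax changes the resulting distribution whenever the temperature is not 1.0, so the training side must apply the same scaling to stay consistent. A minimal sketch with plain torch (illustrative tensors only, not the repository's code):

    import torch

    temperature = 0.7
    logits = torch.randn(2, 5)

    scaled_logprobs = torch.log_softmax(logits / temperature, dim=-1)
    raw_logprobs = torch.log_softmax(logits, dim=-1)

    # The two only agree when temperature == 1.0.
    print(torch.allclose(scaled_logprobs, raw_logprobs))  # False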

nemo_rl/models/policy/dtensor_policy_worker.py

Lines changed: 1 addition & 7 deletions
@@ -69,7 +69,6 @@
     get_handle_from_tensor,
     get_runtime_env_for_policy_worker,
     import_class_from_path,
-    is_vllm_v1_engine_enabled,
     resolve_model_class,
     sliding_window_overwrite,
 )
@@ -471,13 +470,8 @@ def create_context_parallel_ctx(
         # based on https://github.com/pytorch/torchtitan/blob/cddd7dc809f36fe0ed51cdaaea0671c084d75442/torchtitan/distributed/utils.py#L178
 
     def _apply_temperature_scaling(self, logits: torch.Tensor) -> torch.Tensor:
-        # Apply temperature scaling to logits if configured and not using V1 engine.
         if "generation" in self.cfg and self.cfg["generation"] is not None:
-            # The V1 engine returns raw logits before temperature scaling.
-            # The V0 engine returns scaled logits.
-            # Therefore, we only divide if we are not using the V1 engine.
-            if not is_vllm_v1_engine_enabled():
-                logits.div_(self.cfg["generation"]["temperature"])
+            logits.div_(self.cfg["generation"]["temperature"])
         return logits
 
     @staticmethod
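With the V1-engine special case removed, the helper unconditionally divides the logits in place whenever a generation config is present. A minimal standalone sketch of that behavior (names are illustrative; the real method lives on the worker class and reads self.cfg):

    import torch


    def apply_temperature_scaling(logits: torch.Tensor, cfg: dict) -> torch.Tensor:
        # Illustrative stand-in for the worker method; cfg mimics self.cfg.
        if "generation" in cfg and cfg["generation"] is not None:
            logits.div_(cfg["generation"]["temperature"])  # in-place, like the worker
        return logits


    logits = torch.tensor([[2.0, 4.0]])
    apply_temperature_scaling(logits, {"generation": {"temperature": 2.0}})
    print(logits)  # tensor([[1., 2.]]) -- the caller's tensor is modified in place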

nemo_rl/models/policy/dtensor_policy_worker_v2.py

Lines changed: 1 addition & 7 deletions
@@ -79,7 +79,6 @@
     get_handle_from_tensor,
     get_runtime_env_for_policy_worker,
     import_class_from_path,
-    is_vllm_v1_engine_enabled,
 )
 from nemo_rl.utils.native_checkpoint import (
     load_checkpoint,
@@ -420,13 +419,8 @@ def __init__(
         self._held_streamed_param_reference: Optional[dict[str, torch.Tensor]] = None
 
     def _apply_temperature_scaling(self, logits: torch.Tensor) -> torch.Tensor:
-        # Apply temperature scaling to logits if configured and not using V1 engine.
         if "generation" in self.cfg and self.cfg["generation"] is not None:
-            # The V1 engine returns raw logits before temperature scaling.
-            # The V0 engine returns scaled logits.
-            # Therefore, we only divide if we are not using the V1 engine.
-            if not is_vllm_v1_engine_enabled():
-                logits.div_(self.cfg["generation"]["temperature"])
+            logits.div_(self.cfg["generation"]["temperature"])
         return logits
 
     def init_collective(self, ip: str, port: int, world_size: int) -> None:

nemo_rl/models/policy/megatron_policy_worker.py

Lines changed: 8 additions & 1 deletion
@@ -820,7 +820,9 @@ def train(
                     f"Dim 1 must be the sequence dim, expected dim 1={seq_dim_size} but got shape {v.shape}"
                 )
 
-        forward_step = partial(forward_step_arbitrary_loss, loss_fn=loss_fn)
+        forward_step = partial(
+            forward_step_arbitrary_loss, loss_fn=loss_fn, policy_cfg=self.cfg
+        )
         all_mb_metrics = []
         losses = []
         for gb_idx in range(num_global_batches):
@@ -1111,6 +1113,11 @@ def forward_step_fn(
                 packed_seq_params=packed_seq_params,
             )
 
+            # Apply temperature scaling to logits for training.
+            # This matches the dtensor worker's _apply_temperature_scaling in the train method.
+            if "generation" in self.cfg and self.cfg["generation"] is not None:
+                output_tensor.div_(self.cfg["generation"]["temperature"])
+
         def collection_fn(output_tensor):
             stc = time.time()
             tp_grp = get_tensor_model_parallel_group()
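The train-path change binds policy_cfg into the forward step once via functools.partial, so downstream code keeps calling the forward step with only its batch-level arguments. A minimal sketch of that binding pattern with hypothetical names (not the worker's real signature):

    from functools import partial

    import torch


    def forward_step(batch: torch.Tensor, loss_fn, policy_cfg: dict | None = None):
        # Hypothetical forward step: the model call is replaced by a simple cast.
        logits = batch.float()  # stand-in for a model forward pass
        if policy_cfg is not None and policy_cfg.get("generation") is not None:
            logits = logits / policy_cfg["generation"]["temperature"]
        return loss_fn(logits)


    bound_step = partial(
        forward_step,
        loss_fn=lambda logits: logits.mean(),
        policy_cfg={"generation": {"temperature": 0.5}},
    )
    print(bound_step(torch.tensor([1.0, 2.0, 3.0])))  # tensor(4.)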

tests/unit/models/generation/test_vllm_generation.py

Lines changed: 44 additions & 1 deletion
@@ -39,7 +39,13 @@
     },
     "dtype": "bfloat16",
     "max_new_tokens": 5,  # Small number of tokens for testing
-    "temperature": 0.8,
+    # Set temperature=1.0 to ensure consistent probability scaling when comparing vLLM and HF policy outputs.
+    # Note: greedy=True is only used in tests for deterministic behavior and is not used in real training.
+    # In vLLM, enabling greedy=True disables temperature scaling (temperature is overridden to None).
+    # The HF policy worker does not currently support greedy=True for get_logprobs.
+    # Using temperature=1.0 lets us meaningfully test the average multiplicative error in probabilities between the two implementations while still keeping the behavior deterministic.
+    "temperature": 1.0,
     "top_p": 1.0,
     "top_k": None,
     "stop_token_ids": None,
@@ -326,6 +332,43 @@ def test_vllm_missing_required_config_key(cluster):
     print(f"Successfully caught missing config key with error: {error_message}")
 
 
+def test_vllm_top_p_top_k_validation(cluster):
+    """Test that top_p and top_k validation works correctly with threshold-based logic."""
+    # Test that values at or above the thresholds are allowed.
+    config_above_thresholds = deepcopy(basic_vllm_test_config)
+    config_above_thresholds["top_p"] = 0.99  # At TOP_P_THRESHOLD (allowed)
+    config_above_thresholds["top_k"] = 8000  # At TOP_K_THRESHOLD (allowed)
+
+    # Should not raise an error.
+    try:
+        VllmGeneration(cluster, config_above_thresholds)
+        print("Successfully initialized with top_p=0.99 and top_k=8000")
+    except Exception as e:
+        pytest.fail(f"Should not raise error with values at or above thresholds: {e}")
+
+    # Test that top_p values below the threshold are rejected.
+    config_below_thresholds = deepcopy(basic_vllm_test_config)
+    config_below_thresholds["top_p"] = 0.9  # Below TOP_P_THRESHOLD
+
+    with pytest.raises(ValueError) as excinfo:
+        VllmGeneration(cluster, config_below_thresholds)
+
+    error_message = str(excinfo.value)
+    assert "top_p sampling with values < 0.99 is not supported" in error_message
+    print(f"Successfully caught low top_p value with error: {error_message}")
+
+    # Test that low top_k values are rejected.
+    config_low_top_k = deepcopy(basic_vllm_test_config)
+    config_low_top_k["top_k"] = 7999  # Below TOP_K_THRESHOLD
+
+    with pytest.raises(ValueError) as excinfo:
+        VllmGeneration(cluster, config_low_top_k)
+
+    error_message = str(excinfo.value)
+    assert "top_k sampling with values < 8000 is not supported" in error_message
+    print(f"Successfully caught low top_k value with error: {error_message}")
+
+
 def test_vllm_policy_generation(policy, test_input_data, tokenizer):
     """Test vLLM policy generation capabilities."""
     # Test generation
