Skip to content

Commit f2de476

Browse files
authored
feat: support truncated importance sampling (#1348)
Signed-off-by: Yuki Huang <[email protected]>
1 parent: f286857 · commit: f2de476

File tree

9 files changed

+239
-157
lines changed

9 files changed

+239
-157
lines changed

examples/configs/grpo_math_1B.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ loss_fn:
4343
# Async GRPO requires importance sampling correction enabled
4444
# Set to true when async_grpo.enabled is true
4545
use_importance_sampling_correction: false
46+
truncated_importance_sampling_ratio: null
4647
sequence_level_importance_ratios: false
4748
token_level_loss: true
4849

examples/configs/vlm_grpo_3B.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ loss_fn:
3939
# (default off) loss formulation improvements (docs/guides/grpo.md#loss)
4040
use_on_policy_kl_approximation: false
4141
use_importance_sampling_correction: false
42+
truncated_importance_sampling_ratio: null
4243
token_level_loss: true
4344

4445
checkpointing:

examples/configs/vlm_grpo_3B_megatron.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ loss_fn:
3535
ratio_clip_c: null
3636
use_on_policy_kl_approximation: false
3737
use_importance_sampling_correction: false
38+
truncated_importance_sampling_ratio: null
3839
token_level_loss: true
3940
checkpointing:
4041
enabled: true

nemo_rl/algorithms/loss_functions.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ class ClippedPGLossConfig(TypedDict):
4242
ratio_clip_c: float
4343
use_on_policy_kl_approximation: bool
4444
use_importance_sampling_correction: bool
45+
truncated_importance_sampling_ratio: float | None
4546
token_level_loss: bool
4647
# If True, apply the off-policy importance-sampling correction at the
4748
# sequence level (one weight per generated sample), as in GSPO.
@@ -113,6 +114,9 @@ def __init__(self, cfg: ClippedPGLossConfig):
113114
self.use_importance_sampling_correction = cfg[
114115
"use_importance_sampling_correction"
115116
]
117+
self.truncated_importance_sampling_ratio = cfg[
118+
"truncated_importance_sampling_ratio"
119+
]
116120
# Whether to compute importance weights per-sequence instead of per-token.
117121
self.sequence_level_importance_ratios = cfg.get(
118122
"sequence_level_importance_ratios",
@@ -125,6 +129,13 @@ def __init__(self, cfg: ClippedPGLossConfig):
125129
assert self.loss_type == LossType.SEQUENCE_LEVEL, (
126130
"sequence-level importance sampling (e.g. GSPO) is mutually exclusive with token-level loss"
127131
)
132+
if self.truncated_importance_sampling_ratio is not None:
133+
assert self.use_importance_sampling_correction, (
134+
"truncated_importance_sampling_ratio is only supported when use_importance_sampling_correction is True"
135+
)
136+
assert self.truncated_importance_sampling_ratio > 0, (
137+
"truncated_importance_sampling_ratio should be positive"
138+
)
128139

129140
def __call__(
130141
self,
@@ -280,6 +291,12 @@ def __call__(
280291
actor_importance_weights_expanded = torch.nan_to_num(
281292
actor_importance_weights_expanded, nan=0.0, posinf=0.0, neginf=0.0
282293
)
294+
# TIS see https://fengyao.notion.site/off-policy-rl
295+
if self.truncated_importance_sampling_ratio is not None:
296+
actor_importance_weights_expanded = torch.clamp(
297+
actor_importance_weights_expanded,
298+
max=self.truncated_importance_sampling_ratio,
299+
)
283300
actor_importance_weights = actor_importance_weights_expanded
284301
del actor_importance_weights_expanded
285302
if self.use_importance_sampling_correction:

tests/unit/algorithms/test_grpo.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -889,6 +889,8 @@ def val_iter(self):
889889
"ratio_clip_c": 1.0,
890890
"use_on_policy_kl_approximation": False,
891891
"use_importance_sampling_correction": False,
892+
"truncated_importance_sampling_ratio": None,
893+
"sequence_level_importance_ratios": False,
892894
"token_level_loss": True,
893895
}
894896
)

0 commit comments

Comments (0)