Support a configurable spiky loss threshold in the rerun state machine

ananthsub · ananthsub · commit 014391bd99fb · 2026-01-28T17:27:00.000-08:00
Signed-off-by: Ananth Subramaniam <ansubramania@nvidia.com>
diff --git a/docs/training/resiliency.md b/docs/training/resiliency.md
@@ -441,6 +441,7 @@ config.rerun_state_machine = RerunStateMachineConfig(
     rerun_mode="validate_results",  # or "report_determinism_stats" or "disabled"
     check_for_nan_in_loss=True,
     check_for_spiky_loss=False,
+    spiky_loss_factor=10.0,  # Adjust for your model architecture
     error_injection_rate=0,  # For testing only
     error_injection_type="transient_error",
 )
@@ -453,6 +454,7 @@ config.rerun_state_machine = RerunStateMachineConfig(
 | `rerun_mode` | `str` | `"disabled"` | Operating mode: `"disabled"`, `"validate_results"`, or `"report_determinism_stats"` |
 | `check_for_nan_in_loss` | `bool` | `True` | Check for NaN values in loss |
 | `check_for_spiky_loss` | `bool` | `False` | Check for unexpectedly large loss values |
+| `spiky_loss_factor` | `float` | `10.0` | Factor for spiky loss detection. A loss is flagged as spiky if it exceeds this multiple of the max observed loss over the sample window. Larger models may need higher values (e.g., 15-20 for 70B+). |
 | `error_injection_rate` | `int` | `0` | Rate for injecting test errors (testing only) |
 | `error_injection_type` | `str` | `"transient_error"` | Type of error to inject for testing |
 
diff --git a/src/megatron/bridge/training/config.py b/src/megatron/bridge/training/config.py
@@ -223,6 +223,10 @@ class RerunStateMachineConfig:
     check_for_spiky_loss: bool = False
     """Check for spiky loss."""
 
+    spiky_loss_factor: float = 10.0
+    """Factor for detecting spiky loss. A loss is considered spiky if it exceeds
+    this multiple of the max observed loss over the sample window."""
+
 
 @dataclass(kw_only=True)
 class DataloaderConfig:
diff --git a/src/megatron/bridge/training/initialize.py b/src/megatron/bridge/training/initialize.py
@@ -222,6 +222,7 @@ def init_rerun_state(rerun_state_machine_config: RerunStateMachineConfig) -> Non
         RerunDiagnostic,
         RerunErrorInjector,
         RerunMode,
+        get_rerun_state_machine,
         initialize_rerun_state_machine,
     )
 
@@ -242,6 +243,10 @@ def state_restore_func(state_dict):
         ),
     )
 
+    # Store the spiky loss factor on the rerun state machine singleton so loss validation can read it
+    rsm = get_rerun_state_machine()
+    rsm.spiky_loss_factor = rerun_state_machine_config.spiky_loss_factor
+
 
 def set_jit_fusion_options(model_config: GPTModelProvider | T5ModelProvider, micro_batch_size: int) -> None:
     """Set PyTorch JIT layer fusion options and warmup JIT functions.
diff --git a/src/megatron/bridge/training/losses.py b/src/megatron/bridge/training/losses.py
@@ -19,7 +19,7 @@
 from megatron.core.rerun_state_machine import get_rerun_state_machine
 
 
-SPIKY_LOSS_FACTOR: int = 10
+_DEFAULT_SPIKY_LOSS_FACTOR: float = 10.0
 
 
 def create_masked_next_token_loss_function(
@@ -86,11 +86,12 @@ def masked_next_token_loss(
         )
     # Check for spiky loss
     if check_for_spiky_loss:
+        spiky_loss_factor = getattr(rerun_state_machine, "spiky_loss_factor", _DEFAULT_SPIKY_LOSS_FACTOR)
         rerun_state_machine.validate_result(
             result=loss,
             rejection_func=partial(
                 rerun_state_machine.is_unexpectedly_large,
-                threshold=SPIKY_LOSS_FACTOR,
+                threshold=spiky_loss_factor,
                 context="loss",
             ),
             message="Spiky loss",
diff --git a/tests/unit_tests/training/test_losses.py b/tests/unit_tests/training/test_losses.py
@@ -20,7 +20,7 @@
 import torch
 
 from megatron.bridge.training.losses import (
-    SPIKY_LOSS_FACTOR,
+    _DEFAULT_SPIKY_LOSS_FACTOR,
     create_masked_next_token_loss_function,
     masked_next_token_loss,
 )
@@ -312,7 +312,7 @@ def test_partial_function_execution(self):
 class TestConstants:
     """Test module constants."""
 
-    def test_spiky_loss_factor(self):
-        """Test that SPIKY_LOSS_FACTOR has expected value."""
-        assert SPIKY_LOSS_FACTOR == 10, "SPIKY_LOSS_FACTOR should be 10"
-        assert isinstance(SPIKY_LOSS_FACTOR, int), "SPIKY_LOSS_FACTOR should be an integer"
+    def test_default_spiky_loss_factor(self):
+        """Test that _DEFAULT_SPIKY_LOSS_FACTOR has expected value."""
+        assert _DEFAULT_SPIKY_LOSS_FACTOR == 10.0, "_DEFAULT_SPIKY_LOSS_FACTOR should be 10.0"
+        assert isinstance(_DEFAULT_SPIKY_LOSS_FACTOR, float), "_DEFAULT_SPIKY_LOSS_FACTOR should be a float"