Fix reuse_grad_buf_for_mxfp8_param_ag for mxfp8 (#14445)

guyueh1 · web-flow · commit b56af44e043a · 2025-08-13T21:30:16.000+05:30
Signed-off-by: Guyue Huang <guyueh@nvidia.com>
diff --git a/nemo/collections/llm/recipes/precision/mixed_precision.py b/nemo/collections/llm/recipes/precision/mixed_precision.py
@@ -99,6 +99,7 @@ def bf16_with_mxfp8_mixed() -> run.Config[MegatronMixedPrecision]:
     cfg.fp8 = 'hybrid'
     cfg.fp8_recipe = "mxfp8"
     cfg.fp8_param_gather = True
+    cfg.reuse_grad_buf_for_mxfp8_param_ag = True
     return cfg
 
 
@@ -112,6 +113,7 @@ def fp16_with_mxfp8_mixed() -> run.Config[MegatronMixedPrecision]:
     cfg.fp8 = 'hybrid'
     cfg.fp8_recipe = "mxfp8"
     cfg.fp8_param_gather = True
+    cfg.reuse_grad_buf_for_mxfp8_param_ag = True
     return cfg
 
 
diff --git a/nemo/lightning/fabric/plugins.py b/nemo/lightning/fabric/plugins.py
@@ -60,6 +60,7 @@ def __init__(
         first_last_layers_bf16: bool = False,
         num_layers_at_start_in_bf16: int = 0,
         num_layers_at_end_in_bf16: int = 0,
+        reuse_grad_buf_for_mxfp8_param_ag: bool = False,
         fp8_margin: int = 0,
         fp8_amax_history_len: int = 1,
         fp8_amax_compute_algo: str = "most_recent",
@@ -104,6 +105,7 @@ def __init__(
             first_last_layers_bf16=first_last_layers_bf16,
             num_layers_at_start_in_bf16=num_layers_at_start_in_bf16,
             num_layers_at_end_in_bf16=num_layers_at_end_in_bf16,
+            reuse_grad_buf_for_mxfp8_param_ag=reuse_grad_buf_for_mxfp8_param_ag,
             fp8_margin=fp8_margin,
             fp8_amax_history_len=fp8_amax_history_len,
             fp8_amax_compute_algo=fp8_amax_compute_algo,
diff --git a/nemo/lightning/pytorch/plugins/mixed_precision.py b/nemo/lightning/pytorch/plugins/mixed_precision.py
@@ -86,6 +86,7 @@ class DtypeConfig:
     hysteresis: float = (None,)
     num_layers_at_start_in_bf16: int = 0
     num_layers_at_end_in_bf16: int = 0
+    reuse_grad_buf_for_mxfp8_param_ag: bool = False
 
 
 class MegatronMixedPrecision(Precision):
@@ -122,6 +123,7 @@ def __init__(
         fp16_hysteresis: int = 2,
         num_layers_at_start_in_bf16: int = 0,
         num_layers_at_end_in_bf16: int = 0,
+        reuse_grad_buf_for_mxfp8_param_ag: bool = False,
     ) -> None:
         if fp8_params is not None:
             logging.warning(
@@ -161,6 +163,7 @@ def __init__(
             fp8_param_gather=fp8_param_gather,
             num_layers_at_start_in_bf16=num_layers_at_start_in_bf16,
             num_layers_at_end_in_bf16=num_layers_at_end_in_bf16,
+            reuse_grad_buf_for_mxfp8_param_ag=reuse_grad_buf_for_mxfp8_param_ag,
             # fp16 loss scale
             loss_scale=fp16_loss_scale,
             initial_loss_scale=fp16_initial_loss_scale,
diff --git a/scripts/performance/helpers.py b/scripts/performance/helpers.py
@@ -226,8 +226,6 @@ def set_precision_configs(recipe, compute_dtype: str, fp8_recipe: str | None = N
     # Enable reuse_grad_buf_for_mxfp8_param_ag for MXFP8 and disable AG overlap
     # because it is not supported with reuse_grad_buf_for_mxfp8_param_ag
     if compute_dtype.lower() == "fp8" and fp8_recipe.lower() == "mxfp8":
-        recipe.trainer.strategy.ddp.reuse_grad_buf_for_mxfp8_param_ag = True
-        recipe.optim.config.reuse_grad_buf_for_mxfp8_param_ag = True
         comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks)
         if comm_overlap_callback_idx is not None:
             recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather = False
diff --git a/tests/collections/llm/recipes/test_mixed_precision.py b/tests/collections/llm/recipes/test_mixed_precision.py
@@ -87,6 +87,7 @@ def test_bf16_with_mxfp8_mixed_config():
     assert config.fp8 == "hybrid"
     assert config.fp8_recipe == "mxfp8"
     assert config.fp8_param_gather is True
+    assert config.reuse_grad_buf_for_mxfp8_param_ag is True
 
 
 def test_fp16_with_mxfp8_mixed_config():
@@ -99,6 +100,7 @@ def test_fp16_with_mxfp8_mixed_config():
     assert config.fp8 == "hybrid"
     assert config.fp8_recipe == "mxfp8"
     assert config.fp8_param_gather is True
+    assert config.reuse_grad_buf_for_mxfp8_param_ag is True
 
 
 def test_bf16_with_fp8_current_scaling_mixed_config():