@@ -183,6 +183,10 @@ def __init__(
         # - Packed sequence requires FA2 and CP must be 1
         # - CP > 1 requires SDPA
         cp_size_cfg = self.cfg["dtensor_cfg"]["context_parallel_size"]
+
+        # NeMoAutoModelForCausalLM uses flash_attention_2 by default
+        # so we need to set it to None if sequence packing is disabled
+        # https://github.com/NVIDIA-NeMo/Automodel/blob/7e748be260651349307862426c0c168cebdeeec3/nemo_automodel/components/_transformers/auto_model.py#L180
         attn_impl = (
             "flash_attention_2"
             if (self.enable_seq_packing and cp_size_cfg == 1)
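For reference, a minimal sketch (not part of this diff) of how an attn_impl value like the one computed above is typically forwarded to a from_pretrained call. It uses the plain Hugging Face AutoModelForCausalLM and placeholder flags and checkpoint name, since NeMoAutoModelForCausalLM's exact call site is not shown here.

    from transformers import AutoModelForCausalLM

    # Placeholder config values, mirroring the selection rule above.
    enable_seq_packing = True
    cp_size_cfg = 1

    attn_impl = (
        "flash_attention_2"
        if (enable_seq_packing and cp_size_cfg == 1)
        else None  # None lets the library fall back to its own default (sdpa/eager)
    )

    # "meta-llama/Llama-3.1-8B" is a placeholder checkpoint, not one from this PR.
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-3.1-8B",
        attn_implementation=attn_impl,
    )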
@@ -273,23 +277,26 @@ def __init__(
             automodel_kwargs["use_liger_kernel"] = False
 
         with init_empty_weights():
-            # NeMoAutoModelForCausalLM uses flash_attention_2 by default
-            # so we need to set it to None if sequence packing is disabled
-            # https://github.com/NVIDIA-NeMo/Automodel/blob/7e748be260651349307862426c0c168cebdeeec3/nemo_automodel/components/_transformers/auto_model.py#L180
-            if cp_size > 1 or self.cfg["dtensor_cfg"]["activation_checkpointing"]:
-                # For cp, match Automodel's `get_train_context` in `cp_utils.py` where only
+            from torch.nn.attention import SDPBackend
+
+            if cp_size > 1:
+                # Match Automodel's `get_train_context` in `cp_utils.py` where only
                 # flash and efficient backends are supported
                 # Ref: https://github.com/NVIDIA-NeMo/Automodel/blob/81788d6f4848f5f066c4a6a2bece4689a6a83687/nemo_automodel/components/distributed/cp_utils.py#L57
-
-                # For activation_checkpointing, CUDNN_ATTENTION must be excluded
-                # since it results in an error:
-                # "Recomputed values for the following tensors have different metadata than during the forward pass."
-                from torch.nn.attention import SDPBackend
-
                 sdpa_method = [
                     SDPBackend.FLASH_ATTENTION,
                     SDPBackend.EFFICIENT_ATTENTION,
                 ]
+            elif self.cfg["dtensor_cfg"]["activation_checkpointing"]:
+                # For activation checkpointing, we must disable the cudnn SDPA backend because
+                # it may not be selected during recomputation.
+                # In that case, we will get the following error:
+                # "Recomputed values have different metadata than during forward pass."
+                sdpa_method = [
+                    SDPBackend.FLASH_ATTENTION,
+                    SDPBackend.EFFICIENT_ATTENTION,
+                    SDPBackend.MATH,
+                ]
             else:
                 sdpa_method = None
 
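For reference, a minimal sketch (not part of this diff) of how a restricted sdpa_method list like the one above is applied via torch.nn.attention.sdpa_kernel; the tensor shapes are arbitrary and a CUDA device is assumed.

    import torch
    import torch.nn.functional as F
    from torch.nn.attention import SDPBackend, sdpa_kernel

    # Arbitrary (batch, heads, seq, head_dim) tensors; assumes a CUDA device.
    q = k = v = torch.randn(1, 8, 128, 64, device="cuda", dtype=torch.bfloat16)

    # Only the flash and memory-efficient backends may be selected inside this
    # context, mirroring the cp_size > 1 branch above; cuDNN and math are excluded.
    with sdpa_kernel([SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION]):
        out = F.scaled_dot_product_attention(q, k, v)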
@@ -305,6 +312,13 @@ def __init__(
         if self.lora_enabled:
             apply_lora_to_linear_modules(self.model, self.peft_config)
 
+        # For activation checkpointing, we also must globally disable the cudnn SDPA backend
+        # to ensure that cudnn does not get selected during recomputation.
+        if self.cfg["dtensor_cfg"]["activation_checkpointing"]:
+            from torch.backends import cuda
+
+            cuda.enable_cudnn_sdp(False)
+
         # Hold a copy of model state_dict keys before any parallelization
         self.model_state_dict_keys = list(self.model.state_dict().keys())
 
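For reference, a minimal sketch (not part of this diff) of the global toggle used above: torch.backends.cuda.enable_cudnn_sdp(False) switches off only the cuDNN SDPA backend, and the *_sdp_enabled helpers report which backends remain active.

    from torch.backends import cuda

    # Globally disable only the cuDNN SDPA backend, as the diff does for
    # activation checkpointing; the other backends stay enabled.
    cuda.enable_cudnn_sdp(False)

    print(cuda.cudnn_sdp_enabled())          # False
    print(cuda.flash_sdp_enabled())          # True
    print(cuda.mem_efficient_sdp_enabled())  # True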