
Commit 9eaaf4c

Enable AITER V3 kernels by default (#372)
Parent: c95f9db

File tree
6 files changed: +47 -22 lines

README.rst

Lines changed: 7 additions & 7 deletions
@@ -264,15 +264,15 @@ Note that when using `THD` format tensors with CK Fused Attention, one should pa
 to indicate that there is no padding between sequences. Otherwise, passing proper tensors will indicate padding between sequences. This is the case
 for both the `FusedAttention` and `DotProductAttention` modules.
 
-FA v3 Kernels in CK Backend
+AITER FA v3 Kernels
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-ROCm TE provides experimental support for flash-attention v3 fwd/bwd kernels using the ck backend for limited fused attention configs.
-To enable FA v3 kernels, the following environment variables can be used:
+ROCm TE supports flash-attention v3 fwd/bwd kernels on gfx942 and gfx950 using the AITER backend.
+This functionality can be controlled by the following environment variables:
 
-* NVTE_CK_USES_FWD_V3 - by default 0, if set to 1, some cases will call the fwd v3 kernel, only applicable to the gfx942 architecture;
-* NVTE_CK_USES_BWD_V3 - by default 0, if set to 1, some cases will call the bwd v3 dqdkdv kernel;
-* NVTE_CK_IS_V3_ATOMIC_FP32 - by default 1, if set to 0 will use atomic fp16/bf16(w/o convert_dq kernel) in bwd pass when NVTE_CK_USES_BWD_V3 is set to 1;
-* NVTE_CK_HOW_V3_BF16_CVT - by default 1, float to bf16 convert type when bwd_v3 is set to 1, 0:RTNE; 1:RTNA; 2:RTZ, only applicable to the gfx942 architecture.
+* NVTE_CK_USES_FWD_V3 - by default 1; if set to 0, v3 kernels will not be used for the fwd pass;
+* NVTE_CK_USES_BWD_V3 - by default 1; if set to 0, v3 kernels will not be used for the bwd pass;
+* NVTE_CK_IS_V3_ATOMIC_FP32 - by default 1; if set to 0, uses atomic fp16/bf16 (without the convert_dq kernel) in the bwd pass when v3 is enabled;
+* NVTE_CK_HOW_V3_BF16_CVT - by default 1; the float-to-bf16 conversion type when v3 is enabled (0: RTNE, 1: RTNA, 2: RTZ); only applicable to the gfx942 architecture.
 
 Float to BFloat16 Conversion in CK Backend (gfx942 only)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
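
The environment variables above are read by the CK fused-attention implementation when the kernels are dispatched (see the getenv calls in fused_attn_ck.cpp later in this commit), so they can be set in the shell or from Python before the attention call runs. A minimal sketch, assuming a PyTorch workload, of opting back into the v2 path now that v3 is the default:

    import os

    # With this commit the AITER/CK v3 kernels default to "1" (enabled);
    # setting these to "0" falls back to the v2 fwd/bwd kernels.
    os.environ["NVTE_CK_USES_FWD_V3"] = "0"
    os.environ["NVTE_CK_USES_BWD_V3"] = "0"

    # Optional bwd-pass knobs, shown at their defaults:
    os.environ["NVTE_CK_IS_V3_ATOMIC_FP32"] = "1"  # 0 -> atomic fp16/bf16 without the convert_dq kernel
    os.environ["NVTE_CK_HOW_V3_BF16_CVT"] = "1"    # 0: RTNE, 1: RTNA, 2: RTZ (gfx942 only)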

benchmarks/attention/benchmark_attention_rocm.py

Lines changed: 6 additions & 7 deletions
@@ -104,19 +104,18 @@ def cleanup_env():
     for var in ATTENTION_ENV_VARS:
         os.environ[var] = "0"
 
-def setup_backend_env(backend_name, use_ck_bwd_v3=False, use_ck_fwd_v3=False, use_ck_v3_a16=False):
+def setup_backend_env(backend_name, use_ck_bwd_v3=True, use_ck_fwd_v3=True, use_ck_v3_a16=False):
     cleanup_env()
 
     if backend_name == "flash":
         os.environ["NVTE_FLASH_ATTN"] = "1"
     elif backend_name == "fused_ck":
         os.environ["NVTE_FUSED_ATTN"] = "1"
         os.environ["NVTE_FUSED_ATTN_CK"] = "1"
+        os.environ["NVTE_CK_USES_BWD_V3"] = "1" if use_ck_bwd_v3 else "0"
         if use_ck_bwd_v3:
-            os.environ["NVTE_CK_USES_BWD_V3"] = "1"
             os.environ["NVTE_CK_IS_V3_ATOMIC_FP32"] = "0" if use_ck_v3_a16 else "1"
-        if use_ck_fwd_v3:
-            os.environ["NVTE_CK_USES_FWD_V3"] = "1"
+        os.environ["NVTE_CK_USES_FWD_V3"] = "1" if use_ck_fwd_v3 else "0"
     elif backend_name == "fused_aotriton":
         os.environ["NVTE_FUSED_ATTN"] = "1"
         os.environ["NVTE_FUSED_ATTN_AOTRITON"] = "1"
@@ -359,7 +358,7 @@ def main(args):
         print(
             f"Device {device_id}: "
             f"{device_properties.name} GPU, "
-            f"sm{device_properties.major}{device_properties.minor} compute capability, "
+            f"{device_properties.gcnArchName.split(':')[0]} architecture, "
             f"{device_properties.total_memory/1024**3:.1f}GB memory"
         )
     # Benchmarking starts..
@@ -438,8 +437,8 @@ def main(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--use_ck_bwd_v3", action="store_true", help="Use NVTE_CK_USES_BWD_V3=1 for CK bwd kernels")
-    parser.add_argument("--use_ck_fwd_v3", action="store_true", help="Use NVTE_CK_USES_FWD_V3=1 for CK fwd kernels")
+    parser.add_argument("--no_ck_bwd_v3", action="store_false", dest="use_ck_bwd_v3", help="Set NVTE_CK_USES_BWD_V3=0 for CK bwd kernels")
+    parser.add_argument("--no_ck_fwd_v3", action="store_false", dest="use_ck_fwd_v3", help="Set NVTE_CK_USES_FWD_V3=0 for CK fwd kernels")
     parser.add_argument("--use_ck_v3_a16", action="store_true", help="Use NVTE_CK_IS_V3_ATOMIC_FP32=0 for atomic16. Default is 1")
     parser.add_argument("--run_sanity_checks", action="store_true", help="After benchmarking, verify profiler outputs.")
     args = parser.parse_args()
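
The switch from --use_ck_*_v3 with store_true to --no_ck_*_v3 with store_false and an explicit dest keeps the parsed attribute names unchanged while flipping their default to True, matching the new kernel defaults. A small standalone sketch of that argparse pattern (not taken from the benchmark script itself):

    import argparse

    parser = argparse.ArgumentParser()
    # store_false + dest: the attribute stays `use_ck_bwd_v3`, defaults to True,
    # and only passing --no_ck_bwd_v3 turns it off.
    parser.add_argument("--no_ck_bwd_v3", action="store_false", dest="use_ck_bwd_v3")
    parser.add_argument("--no_ck_fwd_v3", action="store_false", dest="use_ck_fwd_v3")

    args = parser.parse_args(["--no_ck_bwd_v3"])
    print(args.use_ck_bwd_v3, args.use_ck_fwd_v3)  # False True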

ci/jax.sh

Lines changed: 7 additions & 1 deletion
@@ -50,12 +50,18 @@ run_lbl() {
     _test_label=""
 }
 
+run_default_fa_lbl() {
+    if [ $_fus_attn = "$_DEFAULT_FUSED_ATTN" ]; then
+        run_lbl "$@"
+    fi
+}
+
 run_test_config() {
     echo ==== Run with Fused attention backend: $_fus_attn ====
     run_default_fa 1 test_custom_call_compute.py
     run_default_fa 1 test_functions.py
     run 1 test_fused_attn.py
-    NVTE_CK_USES_FWD_V3=1 NVTE_CK_USES_BWD_V3=1 run_lbl "v3" 1 test_fused_attn.py # Using FAv3 for forward and backward pass
+    NVTE_CK_USES_FWD_V3=0 NVTE_CK_USES_BWD_V3=0 run_default_fa_lbl "v2" 3 test_fused_attn.py # Using FAv2 for forward and backward pass
     run_default_fa 1 test_helper.py
     run_default_fa 1 test_layer.py #it effectevly always uses unfused attention
     run_default_fa 1 test_sanity_import.py

ci/pytorch.sh

Lines changed: 1 addition & 2 deletions
@@ -72,7 +72,6 @@ run_test_config(){
     run 1 test_sanity.py
     run_default_fa 1 test_sanity_import.py
     run_default_fa 1 fused_attn/test_fused_attn.py # Backend selection is controlled by the test
-    NVTE_CK_USES_FWD_V3=1 NVTE_CK_USES_BWD_V3=1 run_default_fa_lbl "v3" 1 fused_attn/test_fused_attn.py # Using FAv3 for forward and backward pass
     run_default_fa 1 triton_kernels/test_cast.py
     run_default_fa 1 triton_kernels/test_cast_mxfp8.py
     run_default_fa 1 triton_kernels/test_norm_common.py
@@ -113,7 +112,7 @@ run_benchmark() {
         return
     fi
 
-    python "$BENCH_SCRIPT" --use_ck_fwd_v3 --use_ck_bwd_v3 --run_sanity_checks || test_run_error $BENCH_SCRIPT
+    python "$BENCH_SCRIPT" --run_sanity_checks || test_run_error $BENCH_SCRIPT
 }
 
 # Single config mode, run it and return result

tests/pytorch/fused_attn/test_fused_attn.py

Lines changed: 24 additions & 3 deletions
@@ -92,7 +92,8 @@ def __del__(self):
 @pytest.fixture(autouse=True)
 def reset_attn_backend():
     env = EnvVarCleaner(["NVTE_FLASH_ATTN", "NVTE_FUSED_ATTN", "NVTE_UNFUSED_ATTN",
-                         "NVTE_FUSED_ATTN_CK", "NVTE_FUSED_ATTN_AOTRITON"])
+                         "NVTE_FUSED_ATTN_CK", "NVTE_FUSED_ATTN_AOTRITON",
+                         "NVTE_CK_USES_FWD_V3", "NVTE_CK_USES_BWD_V3"])
     yield
 
 
@@ -421,6 +422,8 @@ def test_dot_product_attention(
         os.environ["NVTE_FUSED_ATTN_BACKEND"] = "1"
         os.environ["NVTE_FUSED_ATTN_CK"] = "1"
         os.environ["NVTE_FUSED_ATTN_AOTRITON"] = "0"
+        os.environ["NVTE_CK_USES_FWD_V3"] = "1"
+        os.environ["NVTE_CK_USES_BWD_V3"] = "1"
         fused_attn_fwd_1, fused_attn_bwd_1 = _run_dot_product_attention(
             dtype,
             config,
@@ -432,8 +435,21 @@ def test_dot_product_attention(
             is_training,
             share_cu_seqlens_ref,
         )
-        del os.environ["NVTE_FUSED_ATTN_CK"]
-        del os.environ["NVTE_FUSED_ATTN_AOTRITON"]
+        if IS_HIP_EXTENSION:
+            os.environ["NVTE_CK_USES_FWD_V3"] = "0"
+            os.environ["NVTE_CK_USES_BWD_V3"] = "0"
+            fused_attn_fwd_2, fused_attn_bwd_2 = _run_dot_product_attention(
+                dtype,
+                config,
+                "FusedAttention",
+                ckpt_attn,
+                qkv_layout,
+                workspace_opt,
+                pad_between_seqs,
+                is_training,
+                share_cu_seqlens_ref,
+            )
+
 
     # FlashAttention backend
     if flash_attn_supported:
@@ -469,6 +485,11 @@ def test_dot_product_attention(
         torch.testing.assert_close(fused_attn_fwd, fused_attn_fwd_1, **tols)
         for i, _ in enumerate(fused_attn_bwd):
             torch.testing.assert_close(fused_attn_bwd[i], fused_attn_bwd_1[i], **tols)
+        if IS_HIP_EXTENSION:
+            logging.info("[test_dot_product_attention]: fused attn backend 0 vs 2")
+            torch.testing.assert_close(fused_attn_fwd, fused_attn_fwd_2, **tols)
+            for i, _ in enumerate(fused_attn_bwd):
+                torch.testing.assert_close(fused_attn_bwd[i], fused_attn_bwd_2[i], **tols)
 
 
 @pytest.mark.skipif(get_cudnn_version() < (8, 9, 1), reason="cuDNN 8.9.1+ is required.")
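
The test change above runs the CK fused-attention path twice on ROCm, first with the v3 kernels enabled and then with them disabled, and checks both results against the reference backend within the same tolerances. A condensed sketch of that toggle-and-compare idea, with a hypothetical run_attention() helper standing in for _run_dot_product_attention and its arguments:

    import os
    import torch

    def compare_v3_vs_v2(run_attention, tols):
        """run_attention() is assumed to return (fwd_output, bwd_grads)."""
        # First pass: AITER/CK v3 kernels enabled (the new default).
        os.environ["NVTE_CK_USES_FWD_V3"] = "1"
        os.environ["NVTE_CK_USES_BWD_V3"] = "1"
        fwd_v3, bwd_v3 = run_attention()

        # Second pass: fall back to the v2 kernels.
        os.environ["NVTE_CK_USES_FWD_V3"] = "0"
        os.environ["NVTE_CK_USES_BWD_V3"] = "0"
        fwd_v2, bwd_v2 = run_attention()

        # Both kernel generations are expected to agree within the usual tolerances.
        torch.testing.assert_close(fwd_v3, fwd_v2, **tols)
        for g_v3, g_v2 in zip(bwd_v3, bwd_v2):
            torch.testing.assert_close(g_v3, g_v2, **tols)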

transformer_engine/common/fused_attn_rocm/fused_attn_ck.cpp

Lines changed: 2 additions & 2 deletions
@@ -556,7 +556,7 @@ void fused_attn_ck_fwd_impl(
     if (env_p != nullptr && std::string(env_p) == "1")
       nvte_log_ck_config = true;
   }
-  bool nvte_ck_uses_fwd_v3 = getenv<int>("NVTE_CK_USES_FWD_V3", 0);
+  bool nvte_ck_uses_fwd_v3 = getenv<int>("NVTE_CK_USES_FWD_V3", 1);
 
   bool is_ragged = nvte_get_qkv_format(layout)==NVTE_QKV_Format::NVTE_THD;
 
@@ -1037,7 +1037,7 @@ void fused_attn_ck_bwd_impl(
 
   // bwd v3 is optional by enabling the following envs
   // default values follows the ck example setting
-  bool nvte_ck_uses_bwd_v3 = getenv<int>("NVTE_CK_USES_BWD_V3", 0);
+  bool nvte_ck_uses_bwd_v3 = getenv<int>("NVTE_CK_USES_BWD_V3", 1);
   bool nvte_ck_is_v3_atomic_fp32 = getenv<int>("NVTE_CK_IS_V3_ATOMIC_FP32", 1);
   int nvte_ck_how_v3_bf16_cvt = getenv<int>("NVTE_CK_HOW_V3_BF16_CVT", 1);
   if (nvte_log_ck_config) {
