Commit 87546b5

Fix JAX and Pytorch UT, code cleanup, ROCm 7.2 w/a (#404)
1 parent: 2025475

File tree: 6 files changed (+36, -9 lines)


ci/jax.sh

Lines changed: 4 additions & 1 deletion
@@ -1,5 +1,5 @@
 #!/bin/sh
-# Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 #
 # See LICENSE for license information.
 
@@ -54,6 +54,7 @@ run_default_fa_lbl() {
 
 run_test_config() {
     echo ==== Run with Fused attention backend: $_fus_attn ====
+    export NVTE_JAX_UNITTEST_LEVEL=L0 # this env variable controls parameters set for some tests
     run_default_fa 1 test_custom_call_compute.py
     run_default_fa 1 test_functions.py
     run 1 test_fused_attn.py
@@ -75,8 +76,10 @@ run_test_config_mgpu() {
 
     if [ $_fus_attn = $_DEFAULT_FUSED_ATTN ]; then
        _dfa_level=2
+       export NVTE_JAX_UNITTEST_LEVEL=L1
    else
        _dfa_level=3
+       export NVTE_JAX_UNITTEST_LEVEL=L2
    fi
    run $_dfa_level test_distributed_fused_attn.py $_timeout_args
    run_default_fa 3 test_distributed_layernorm.py
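
The new NVTE_JAX_UNITTEST_LEVEL export gives the CI script a single knob for how broad the JAX test parameterization should be (L0 for the single-GPU run, L1/L2 for the multi-GPU configurations). Below is a minimal sketch of how a test module could consume such a level variable; the level tables and the test itself are hypothetical, not taken from the JAX test suite:

```python
import os

import pytest

# Hypothetical level-to-parameters tables; the real tests define their own sets.
_SEQLENS_BY_LEVEL = {
    "L0": [128],              # small, fast sweep for single-GPU CI
    "L1": [128, 512],         # wider sweep for the default fused-attention backend
    "L2": [128, 512, 2048],   # fullest sweep for the non-default backend
}

# Pick up the level exported by ci/jax.sh, defaulting to the smallest set.
_LEVEL = os.environ.get("NVTE_JAX_UNITTEST_LEVEL", "L0")


@pytest.mark.parametrize("seqlen", _SEQLENS_BY_LEVEL[_LEVEL])
def test_example(seqlen):
    assert seqlen > 0
```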

tests/pytorch/attention/test_attention.py

Lines changed: 1 addition & 5 deletions
@@ -193,7 +193,7 @@ def test_dot_product_attention(
         config.window_size = [2, 2]
     config.window_size = check_set_window_size(config.attn_mask_type, config.window_size)
 
-    is_training = True #PIV TODO: config.head_dim_qk <= 192 and config.head_dim_v <= 128
+    is_training = True
     available_backends, _, fused_attn_backends = get_available_attention_backends(
         config,
         qkv_dtype=dtype,
@@ -375,10 +375,6 @@ def test_dpa_checkpoint(dtype, model_configs, model):
     "mla_3_2": ModelConfig(8, 1, 16, 192, max_seqlen_kv=2048, head_dim_v=128),  # inference
     "mla_3_3": ModelConfig(8, 1, 16, 160, max_seqlen_kv=2048, head_dim_v=128),  # inference
     "mla_3_4": ModelConfig(8, 1, 16, 160, max_seqlen_kv=2048, head_dim_v=160),  # inference
-    #"mla_4_0": ModelConfig(  #PIV TODO: do cross 0 and cross 1 cover it
-    #    10, 4096, 16, 192, max_seqlen_kv=4096, attn_mask_type="causal", head_dim_v=128
-    #),
-    #"mla_4_1": ModelConfig(10, 4096, 16, 192, max_seqlen_kv=4096, head_dim_v=128),
 }
 
 
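
The remaining mla_3_* entries keep head_dim_qk and head_dim_v distinct, which is what makes them MLA configs and why backend availability has to be re-checked per config. A standalone PyTorch sketch (not the test's code path) of the resulting shape asymmetry, using the 192/128 head sizes from mla_3_2; the other dimensions below are arbitrary example values:

```python
import torch

# Q and K share head_dim_qk; V (and therefore the output) uses head_dim_v.
batch, heads, seqlen_q, seqlen_kv = 8, 16, 1, 2048
head_dim_qk, head_dim_v = 192, 128  # head sizes from the mla_3_2 config above

q = torch.randn(batch, heads, seqlen_q, head_dim_qk)
k = torch.randn(batch, heads, seqlen_kv, head_dim_qk)
v = torch.randn(batch, heads, seqlen_kv, head_dim_v)

scores = torch.softmax(q @ k.transpose(-2, -1) / head_dim_qk**0.5, dim=-1)
out = scores @ v
assert out.shape == (batch, heads, seqlen_q, head_dim_v)  # output width follows head_dim_v
```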

tests/pytorch/attention/test_attention_with_cp.py

Lines changed: 4 additions & 1 deletion
@@ -1,5 +1,5 @@
 # This file was modified for portability to AMDGPU
-# Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 # Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # See LICENSE for license information.
@@ -92,6 +92,9 @@ def test_cp_with_flash_attention(dtype, model, qkv_format, cp_comm_type):
     )
     if "p2p" not in cp_comm_type and config.head_dim_qk != config.head_dim_v:
         pytest.skip("MLA CP currently only support KV P2P!")
+    if IS_HIP_EXTENSION:
+        if config.head_dim_qk != config.head_dim_v and not FlashAttentionUtils.v3_is_installed:
+            pytest.skip("MLA FlashAttention requires v3+!")
 
     subprocess.run(
         get_bash_arguments(
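
The added guard matters because each parameter combination runs out of process: the test builds a command line via get_bash_arguments and hands it to subprocess.run, so an unsupported MLA/FlashAttention combination has to be skipped before launch. A rough, hypothetical sketch of that launch pattern follows; the worker script name and flags are illustrative, not the suite's actual arguments:

```python
import os
import subprocess
import sys


def run_cp_case(dtype: str, qkv_format: str, cp_comm_type: str) -> None:
    # Launch a fresh multi-rank process group per parameter combination.
    cmd = [
        sys.executable, "-m", "torch.distributed.run", "--nproc_per_node=2",
        "run_attention_with_cp.py",  # hypothetical worker script
        f"--dtype={dtype}",
        f"--qkv-format={qkv_format}",
        f"--cp-comm-type={cp_comm_type}",
    ]
    subprocess.run(cmd, check=True, env=os.environ.copy())
```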

tests/pytorch/attention/test_kv_cache.py

Lines changed: 6 additions & 0 deletions
@@ -386,6 +386,12 @@ def get_tols(config, module, backend, dtype):
         torch.half: (1e-2, 1e-2),
         torch.bfloat16: (8e-2, 7e-2),
     }
+    # With FA on ROCm it may not fit default tolerance
+    if IS_HIP_EXTENSION and backend == "FlashAttention":
+        tols = {
+            torch.half: (1e-2, 1e-2),
+            torch.bfloat16: (1e-1, 1e-1),
+        }
     if module == "DotProductAttention":
         tols = {
             torch.half: (1e-3, 1e-3),
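
get_tols returns per-dtype tolerance pairs, and the ROCm branch simply swaps in a looser table when the FlashAttention backend is selected. A sketch of how such a table is typically consumed downstream; the (atol, rtol) ordering and the helper name are assumptions for illustration, not the suite's own comparison code:

```python
import torch


def assert_within_tols(out: torch.Tensor, ref: torch.Tensor, tols: dict) -> None:
    atol, rtol = tols[out.dtype]  # assumed ordering of the tuple
    torch.testing.assert_close(out.float(), ref.float(), atol=atol, rtol=rtol)


# Relaxed ROCm FlashAttention tolerances from the diff above.
rocm_fa_tols = {torch.half: (1e-2, 1e-2), torch.bfloat16: (1e-1, 1e-1)}
ref = torch.randn(4, 4, dtype=torch.bfloat16)
assert_within_tols(ref + 5e-2, ref, rocm_fa_tols)  # within atol=1e-1
```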

tests/pytorch/test_numerics.py

Lines changed: 15 additions & 0 deletions
@@ -656,6 +656,9 @@ def _test_e2e_selective_recompute(
 def test_gpt_selective_activation_recompute(dtype, bs, model, fp8, recipe, fp8_model_params):
     if fp8_model_params and NVTE_TEST_NVINSPECT_ENABLED:
         pytest.skip("FP8 parameters are not supported in debug mode.")
+    if (IS_HIP_EXTENSION and get_device_compute_capability() == (9, 5) and
+            dtype in (torch.float16, torch.bfloat16) and rocm_attn_backend()[2]):
+        pytest.skip("Test is not supported on GFX950 with current parameters and CK fused attention backend and non-zero dropout.")
 
     config = model_configs[model]
 
@@ -775,6 +778,8 @@ def test_gpt_full_activation_recompute(
         and recipe.float8_per_tensor_scaling()
     ):
         pytest.skip("hipBLASLt does not provide suitable algorithms on GFX950 for this config.")
+    if (dtype in (torch.float16, torch.bfloat16) and rocm_attn_backend()[2]):
+        pytest.skip("Test is not supported on GFX950 with current parameters and CK fused attention backend and non-zero dropout.")
 
     config = model_configs[model]
     torch.compiler.reset()  # avoid cache size limit overflow
@@ -926,6 +931,10 @@ def test_gpt_checkpointing(dtype, bs, model):
     config = model_configs[model]
     if not is_fused_attn_available(config, dtype, deterministic=True):
         pytest.skip("No attention backend available.")
+    if (IS_HIP_EXTENSION and get_device_compute_capability() == (9, 5) and
+            dtype in (torch.float16, torch.bfloat16) and rocm_attn_backend()[2]):
+        pytest.skip("Test is not supported on GFX950 with current parameters and CK fused attention backend and non-zero dropout.")
+
     outputs = _test_e2e_checkpointing(bs, dtype, config, checkpoint=False)
     outputs_checkpoint = _test_e2e_checkpointing(bs, dtype, config, checkpoint=True)
 
@@ -2685,6 +2694,9 @@ def _test_gpt_fp8_parameters(bs, dtype, config, fp8_model_params, recipe):
 def test_gpt_fp8_parameters(dtype, bs, model, recipe):
     if NVTE_TEST_NVINSPECT_ENABLED:
         pytest.skip("FP8 parameters are not supported in debug mode.")
+    if (IS_HIP_EXTENSION and get_device_compute_capability() == (9, 5) and
+            dtype in (torch.float16, torch.bfloat16) and rocm_attn_backend()[2]):
+        pytest.skip("Test is not supported on GFX950 with current parameters and CK fused attention backend and non-zero dropout.")
 
     config = model_configs[model]
 
@@ -2972,6 +2984,9 @@ def test_fp8gemm_with_unfused_quantization(N, datatype, input_quantizer, out_qua
         pytest.skip(reason_for_no_fp8)
     if is_mxfp8_needed and not mxfp8_available:
         pytest.skip(reason_for_no_mxfp8)
+    if IS_HIP_EXTENSION and get_device_compute_capability() == (9, 5):
+        if isinstance(out_quantizer, Float8Quantizer):
+            pytest.skip("hipBLASLt does not provide suitable algorithms on GFX950 for this config.")
     inp_fp8 = input_quantizer(torch.randn(N, N, device="cuda", dtype=datatype))
     weight_fp8 = input_quantizer(torch.randn(N, N, device="cuda", dtype=datatype))
     outp_type = torch.float32
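
The same GFX950/CK-dropout guard is now repeated in four tests, each combining IS_HIP_EXTENSION, the (9, 5) compute capability, a half-precision dtype, and the dropout flag from rocm_attn_backend(). One possible consolidation is sketched below; the helper name is hypothetical, and ck_attn_with_dropout stands in for rocm_attn_backend()[2], which the test module resolves itself:

```python
import pytest
import torch
from torch.utils.cpp_extension import IS_HIP_EXTENSION


def skip_if_gfx950_ck_dropout(dtype, compute_capability, ck_attn_with_dropout) -> None:
    """Skip on GFX950 when the CK fused-attention backend is used with non-zero dropout."""
    if (IS_HIP_EXTENSION and compute_capability == (9, 5)
            and dtype in (torch.float16, torch.bfloat16)
            and ck_attn_with_dropout):
        pytest.skip(
            "Test is not supported on GFX950 with current parameters and "
            "CK fused attention backend and non-zero dropout."
        )
```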

transformer_engine/pytorch/tensor/mxfp8_tensor.py

Lines changed: 6 additions & 2 deletions
@@ -1,3 +1,5 @@
+# This file was modified for portability to AMDGPU
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
 # Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # See LICENSE for license information.
@@ -110,7 +112,8 @@ def make_empty(
 
         # Allocate FP8 data
         data = torch.empty(shape, dtype=torch.uint8, device=device)
-        scale_inv = torch.empty(
+        # ROCm TE does not implement fuse padding zeros so use zero tensor here
+        scale_inv = torch.zeros(
            round_up_to_nearest_multiple(math.prod(shape[:-1]), 128),
            round_up_to_nearest_multiple(shape[-1] // MXFP8_BLOCK_SCALING_SIZE, 4),
            dtype=torch.uint8,
@@ -122,7 +125,8 @@ def make_empty(
         columnwise_scale_inv = None
         if self.columnwise_usage:
            columnwise_data = torch.empty_like(data)
-            columnwise_scale_inv = torch.empty(
+            # ROCm TE does not implement fuse padding zeros so use zero tensor here
+            columnwise_scale_inv = torch.zeros(
                round_up_to_nearest_multiple(math.prod(shape[:-1]) // MXFP8_BLOCK_SCALING_SIZE, 4),
                round_up_to_nearest_multiple(shape[-1], 128),
                dtype=torch.uint8,
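
The scale-inverse buffers are padded up to alignment boundaries (multiples of 128 rows and 4 columns for the rowwise layout), and switching from torch.empty to torch.zeros makes that padding deterministic because the ROCm path does not fuse zero-padding into the quantization kernel. A small worked example of the rowwise allocation, assuming the MX block size MXFP8_BLOCK_SCALING_SIZE is 32:

```python
import math

import torch

MXFP8_BLOCK_SCALING_SIZE = 32  # assumed MX block size


def round_up_to_nearest_multiple(value: int, multiple: int) -> int:
    # Same rounding the allocation above relies on.
    return ((value + multiple - 1) // multiple) * multiple


shape = (3, 1024, 4096)  # arbitrary activation shape for illustration

rows = round_up_to_nearest_multiple(math.prod(shape[:-1]), 128)                # 3072
cols = round_up_to_nearest_multiple(shape[-1] // MXFP8_BLOCK_SCALING_SIZE, 4)  # 128

# torch.zeros guarantees the padded tail of the buffer holds zeros rather than
# uninitialized memory, which is what the ROCm change relies on.
scale_inv = torch.zeros(rows, cols, dtype=torch.uint8)
print(scale_inv.shape)  # torch.Size([3072, 128])
```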
