|
13 | 13 | import transformer_engine |
14 | 14 | from transformer_engine_torch import NVTE_Fused_Attn_Backend |
15 | 15 |
|
16 | | -# Add test_fused_attn to the sys path |
| 16 | +# Add paths tests/pytorch/ and tests/pytorch/attention to the sys path |
17 | 17 | tests_path = os.path.abspath( |
18 | | - os.path.join(os.path.dirname(__file__), "../../tests/pytorch/fused_attn") |
| 18 | + os.path.join(os.path.dirname(__file__), "../../tests") |
19 | 19 | ) |
20 | | -sys.path.append(tests_path) |
| 20 | +sys.path.append(tests_path + "/pytorch") |
| 21 | +sys.path.append(tests_path + "/pytorch/attention") |
21 | 22 |
|
22 | | -from test_fused_attn import ( |
| 23 | +# Import shared test helpers from tests/pytorch/utils.py
| 24 | +from utils import ( |
23 | 25 | ModelConfig, |
24 | | - _get_attention_backends, |
| 26 | + get_available_attention_backends, |
| 27 | +) |
| 28 | +from test_attention import ( |
25 | 29 | _run_dot_product_attention, |
26 | 30 | ) |
27 | 31 |
|
|
46 | 50 | is_training = True |
47 | 51 |
|
48 | 52 | model_configs = { |
49 | | - # test: b, h, hg, d, sq, skv, p, mask, bias |
50 | | - "test_0": ModelConfig(2, 16, 16, 64, 512, 512, 0.0, "no_mask", "no_bias"), # short seq |
51 | | - "test_1": ModelConfig(2, 16, 16, 128, 2048, 2048, 0.0, "causal", "no_bias"), # longer seq, mask |
52 | | - "test_2": ModelConfig(2, 16, 16, 128, 2048, 2048, 0.0, "causal", "post_scale_bias"), # bias |
53 | | - "test_3": ModelConfig(2, 32, 4, 128, 8192, 8192, 0.0, "causal", "no_bias"), # GQA |
54 | | - "test_4": ModelConfig(2, 128, 8, 128, 8192, 8192, 0.0, "causal_bottom_right", "no_bias") |
| 53 | + # test: b, sq, h, d |
| 54 | + "test_0": ModelConfig(2, 512, 16, 64), # short seq |
| 55 | + "test_1": ModelConfig(2, 2048, 16, 128, attn_mask_type="causal"), # longer seq, mask |
| 56 | + "test_2": ModelConfig(2, 2048, 16, 128, attn_mask_type="causal", attn_bias_type="post_scale_bias"), # bias |
| 57 | + "test_3": ModelConfig(2, 8192, 32, 128, num_gqa_groups=8, attn_mask_type="causal"), # GQA |
| 58 | + "test_4": ModelConfig(2, 8192, 128, 128, num_gqa_groups=16, attn_mask_type="causal_bottom_right") |
55 | 59 | } |
56 | 60 |
|
57 | 61 | # DataFrame indices and columns for results |
@@ -303,7 +307,7 @@ def sanity_checks( |
303 | 307 | } |
304 | 308 |
|
305 | 309 | for model, cfg in model_configs.items(): |
306 | | - avail, _, fused_bes = _get_attention_backends( |
| 310 | + avail, _, fused_bes = get_available_attention_backends( |
307 | 311 | cfg, |
308 | 312 | qkv_dtype=dtype, |
309 | 313 | qkv_layout=qkv_layout, |
@@ -364,7 +368,7 @@ def main(args): |
364 | 368 | # Benchmarking starts.. |
365 | 369 | for model in model_configs.keys(): |
366 | 370 | config = model_configs[model] |
367 | | - available_backends, _, fused_attn_backends = _get_attention_backends( |
| 371 | + available_backends, _, fused_attn_backends = get_available_attention_backends( |
368 | 372 | config, |
369 | 373 | qkv_dtype=dtype, |
370 | 374 | qkv_layout=qkv_layout, |
|