|
375 | 375 | "\n", |
376 | 376 | "Our [unit tests](https://github.com/NVIDIA/TransformerEngine/tree/main/tests) demonstrate the use of Transformer Engine dot product attention APIs. Users are encouraged to use them as a template when integrating Transformer Engine to their ML workflows.\n", |
377 | 377 | "\n", |
378 | | - "For example, in PyTorch, [test_dot_product_attention](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py) offers a variety of use cases of `pytorch.DotProductAttention`, from data types, model configs, checkpointing, to QKV layouts." |
| 378 | + "For example, in PyTorch, [test_dot_product_attention](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/attention/test_attention.py) offers a variety of use cases of `pytorch.DotProductAttention`, covering data types, model configs, checkpointing, and QKV layouts." |
379 | 379 | ] |
380 | 380 | }, |
381 | 381 | { |
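As a quick orientation before diving into those tests, here is a minimal sketch (not part of the original notebook) of a basic `pytorch.DotProductAttention` call. It assumes the default `sbhd` layout and a causal mask; argument names follow the public API docs but should be verified against `test_dot_product_attention`.

```python
# Minimal sketch: a standalone DotProductAttention forward pass.
# Assumes the default "sbhd" layout (sequence, batch, heads, head_dim).
import torch
from transformer_engine.pytorch import DotProductAttention

seq_len, batch, heads, head_dim = 128, 2, 16, 64

attn = DotProductAttention(
    num_attention_heads=heads,
    kv_channels=head_dim,
    attn_mask_type="causal",
).cuda()

# Separate q/k/v tensors in "sbhd" format.
q, k, v = (
    torch.randn(seq_len, batch, heads, head_dim, dtype=torch.bfloat16, device="cuda")
    for _ in range(3)
)

with torch.no_grad():
    out = attn(q, k, v)   # expected shape: [seq_len, batch, heads * head_dim]
print(out.shape)
```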
|
394 | 394 | "| Framework-native attention | BF16, FP16, FP32 | Any | No, unless used as a mask | Yes | Yes (PyTorch only) | No | Yes |\n", |
395 | 395 | "\n", |
396 | 396 | "Some unit tests are provided to serve as a starting point for integrating such features into users' models. For example,\n", |
397 | | - "- sliding window attention: [test_dpa_swa](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py)\n", |
398 | | - "- MQA/GQA: [test_te_layer_mqa_gqa](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py)\n", |
399 | | - "- Multi-Latent Attention: [test_dpa_mla](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py)\n", |
400 | | - "- context parallelism: [test_cp_with_fused_attention](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn_with_cp.py), [test_cp_with_flash_attention](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn_with_cp.py)" |
| 397 | + "- sliding window attention: [test_dpa_swa](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/attention/test_attention.py)\n", |
| 398 | + "- MQA/GQA: [test_te_layer_mqa_gqa](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/attention/test_attention.py)\n", |
| 399 | + "- Multi-Latent Attention: [test_dpa_mla](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/attention/test_attention.py)\n", |
| 400 | + "- context parallelism: [test_cp_with_fused_attention](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/attention/test_attention_with_cp.py), [test_cp_with_flash_attention](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/attention/test_attention_with_cp.py)" |
401 | 401 | ] |
402 | 402 | }, |
403 | 403 | { |
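For illustration, here is a hedged sketch (editor-added, not from the notebook) combining two of the features listed above: grouped-query attention and sliding window attention. The `num_gqa_groups` and `window_size` arguments are taken from the Transformer Engine docs; treat their exact placement as an assumption and cross-check against the tests linked above.

```python
# Sketch: GQA (fewer KV heads than query heads) plus a causal sliding window.
import torch
from transformer_engine.pytorch import DotProductAttention

seq_len, batch, q_heads, kv_heads, head_dim = 256, 2, 32, 8, 64

attn = DotProductAttention(
    num_attention_heads=q_heads,
    kv_channels=head_dim,
    num_gqa_groups=kv_heads,     # GQA: 32 query heads share 8 KV heads
    attn_mask_type="causal",
    window_size=(64, 0),         # attend to at most 64 previous tokens
).cuda()

# Query has q_heads heads; key/value have kv_heads heads.
q = torch.randn(seq_len, batch, q_heads, head_dim, dtype=torch.bfloat16, device="cuda")
k = torch.randn(seq_len, batch, kv_heads, head_dim, dtype=torch.bfloat16, device="cuda")
v = torch.randn(seq_len, batch, kv_heads, head_dim, dtype=torch.bfloat16, device="cuda")

out = attn(q, k, v)
```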
|
458 | 458 | " </tr>\n", |
459 | 459 | "</table>\n", |
460 | 460 | "\n", |
461 | | - "Some example usage of the different layouts can be found at [test_dpa_qkv_layout](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py) and [test_dpa_qkv_layout_thd](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py). Transformer Engine also provides a utility function [transformer_engine.pytorch.attention.dot_product_attention.utils.get_qkv_layout](https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/attention.py) to help determine which layout a set of `q`, `k`, `v` tensors have (PyTorch only).\n", |
| 461 | + "Some examples of using the different layouts can be found at [test_dpa_qkv_layout](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/attention/test_attention.py) and [test_dpa_qkv_layout_thd](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/attention/test_attention.py). Transformer Engine also provides a utility function [transformer_engine.pytorch.attention.dot_product_attention.utils.get_qkv_layout](https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/attention.py) to help determine which layout a given set of `q`, `k`, `v` tensors has (PyTorch only).\n", |
462 | 462 | "\n", |
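To make the layout discussion concrete, here is a small sketch (editor-added) showing the same module consuming batch-first `bshd` tensors; it assumes `qkv_format` is accepted at construction time as described in the Transformer Engine docs.

```python
# Sketch: feeding batch-first ("bshd") q/k/v tensors by setting qkv_format.
# Three separate tensors correspond to a "bshd_bshd_bshd"-style layout.
import torch
from transformer_engine.pytorch import DotProductAttention

batch, seq_len, heads, head_dim = 2, 128, 16, 64

attn = DotProductAttention(
    num_attention_heads=heads,
    kv_channels=head_dim,
    qkv_format="bshd",           # batch, sequence, heads, head_dim
    attn_mask_type="causal",
).cuda()

q, k, v = (
    torch.randn(batch, seq_len, heads, head_dim, dtype=torch.bfloat16, device="cuda")
    for _ in range(3)
)
out = attn(q, k, v)
```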
463 | 463 | "<div class=\"alert alert-info\">\n", |
464 | 464 | "<b>Note</b>\n", |
|
548 | 548 | "id": "dda4a589", |
549 | 549 | "metadata": {}, |
550 | 550 | "source": [ |
551 | | - "Some more examples of running Transformer Engine with different attention masks can be found at [test_dpa_mask](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py).\n", |
| 551 | + "Some more examples of running Transformer Engine with different attention masks can be found at [test_dpa_mask](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/attention/test_attention.py).\n", |
552 | 552 | "\n", |
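As a worked illustration (editor-added, hedged), the sketch below shows a padding mask. It assumes Transformer Engine's convention that `True` entries in `attention_mask` are masked out and that a `[batch, 1, 1, seqlen]` boolean tensor is accepted for self-attention; `test_dpa_mask` above is the authoritative reference.

```python
# Sketch: attn_mask_type="padding" with an explicit boolean attention_mask,
# where True is assumed to mean "exclude this position".
import torch
from transformer_engine.pytorch import DotProductAttention

seq_len, batch, heads, head_dim = 128, 2, 16, 64

attn = DotProductAttention(
    num_attention_heads=heads,
    kv_channels=head_dim,
    attn_mask_type="padding",
).cuda()

q, k, v = (
    torch.randn(seq_len, batch, heads, head_dim, dtype=torch.bfloat16, device="cuda")
    for _ in range(3)
)

# Mask out the second half of every sequence in the batch.
pad_mask = torch.zeros(batch, 1, 1, seq_len, dtype=torch.bool, device="cuda")
pad_mask[..., seq_len // 2 :] = True

out = attn(q, k, v, attention_mask=pad_mask)
```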
553 | 553 | "### 3.3 Attention Bias\n", |
554 | 554 | "\n", |
|
594 | 594 | "\n", |
595 | 595 | "The framework-native backends do not explicitly support `ALiBi`, but users can convert `ALiBi` to a regular `post_scale_bias` bias to achieve the same effect. In PyTorch, this utility function, `transformer_engine.pytorch.attention.get_alibi`, can be used to help with the conversion.\n", |
596 | 596 | "\n", |
597 | | - "More examples of how to use the various attention biases are at [test_dpa_bias](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py)." |
| 597 | + "More examples of how to use the various attention biases are at [test_dpa_bias](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/attention/test_attention.py)." |
598 | 598 | ] |
599 | 599 | }, |
600 | 600 | { |
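For example, here is a sketch (editor-added) of passing an explicit `post_scale_bias`, the form ALiBi can be converted to for the framework-native backends. The `core_attention_bias_type` and `core_attention_bias` argument names come from the Transformer Engine docs; the `[1, heads, seq_q, seq_kv]` bias shape is an assumption to verify against `test_dpa_bias`.

```python
# Sketch: supplying an additive bias applied after the softmax scaling.
import torch
from transformer_engine.pytorch import DotProductAttention

seq_len, batch, heads, head_dim = 128, 2, 16, 64

attn = DotProductAttention(
    num_attention_heads=heads,
    kv_channels=head_dim,
    attn_mask_type="causal",
).cuda()

q, k, v = (
    torch.randn(seq_len, batch, heads, head_dim, dtype=torch.bfloat16, device="cuda")
    for _ in range(3)
)

# Assumed bias shape: [1, heads, seq_q, seq_kv], broadcast over the batch.
bias = torch.randn(1, heads, seq_len, seq_len, dtype=torch.bfloat16, device="cuda")
out = attn(
    q, k, v,
    core_attention_bias_type="post_scale_bias",
    core_attention_bias=bias,
)
```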
|
612 | 612 | "\n", |
613 | 613 | "- `DelayedScaling.fp8_mha=True (default=False)`: This option, on top of `fp8_dpa=True`, removes the casting operations at the beginning and end of the `FusedAttention` module. This feature is experimental. \n", |
614 | 614 | "\n", |
615 | | - "Examples of using the two features are available at [test_dpa_fp8_vs_f16](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py) and [test_mha_fp8_vs_f16](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py). To disable FP8 attention for backward and only use it for forward, users can also set `NVTE_FP8_DPA_BWD=0 (default=1)`." |
| 615 | + "Examples of using the two features are available at [test_dpa_fp8_vs_f16](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/attention/test_attention.py) and [test_mha_fp8_vs_f16](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/attention/test_attention.py). To disable FP8 attention for backward and only use it for forward, users can also set `NVTE_FP8_DPA_BWD=0 (default=1)`." |
616 | 616 | ] |
617 | 617 | } |
618 | 618 | ], |
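A hedged sketch (editor-added) of wiring these recipe flags together follows. It assumes `fp8_dpa` and `fp8_mha` are fields of `DelayedScaling` as the text above indicates; the module choice and the environment-variable handling are illustrative only and should be checked against `test_dpa_fp8_vs_f16` and `test_mha_fp8_vs_f16`.

```python
# Sketch: enabling FP8 attention via the DelayedScaling recipe flags.
import os
import torch
import transformer_engine.pytorch as te
from transformer_engine.common.recipe import DelayedScaling

# Optional: keep the backward pass in higher precision (set before the backward runs).
os.environ["NVTE_FP8_DPA_BWD"] = "0"

fp8_recipe = DelayedScaling(fp8_dpa=True, fp8_mha=True)  # fp8_mha is experimental

layer = te.TransformerLayer(
    hidden_size=1024,
    ffn_hidden_size=4096,
    num_attention_heads=16,
    params_dtype=torch.bfloat16,
).cuda()

# Default TransformerLayer input layout: [seq, batch, hidden].
x = torch.randn(128, 2, 1024, dtype=torch.bfloat16, device="cuda")
with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
    y = layer(x)
```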
|