From f17de5438a962842bfca4e51492bbef9e62b2de9 Mon Sep 17 00:00:00 2001 From: Anh Minh Nguyen Hoang Date: Tue, 4 Mar 2025 01:07:02 +0000 Subject: [PATCH 1/6] Add new output --- docs/examples/attention/attention.ipynb | 132 ++++++++++++++++++------ 1 file changed, 101 insertions(+), 31 deletions(-) diff --git a/docs/examples/attention/attention.ipynb b/docs/examples/attention/attention.ipynb index 27017b477..e0903d74f 100644 --- a/docs/examples/attention/attention.ipynb +++ b/docs/examples/attention/attention.ipynb @@ -184,7 +184,64 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, + "id": "b95a2a20", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting nvtx\n", + " Downloading nvtx-0.2.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.2 kB)\n", + "Downloading nvtx-0.2.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (474 kB)\n", + "Installing collected packages: nvtx\n", + "Successfully installed nvtx-0.2.11\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install nvtx" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37821a48", + "metadata": {}, + "outputs": [], + "source": [ + "# Make sure tests is importable\n", + "!touch ../../../tests/__init__.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2793ef00", + "metadata": {}, + "outputs": [], + "source": [ + "# Exclude unimportable function\n", + "!sed -i '27s/^/#/' ../../../tests/pytorch/fused_attn/test_fused_attn.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "398e045d", + "metadata": {}, + "outputs": [], + "source": [ + "# Avoid importing transformer_engine directory and use the installed package instead\n", + "!mv ../../../transformer_engine/ ../../../transformer_engine_save/" + ] + }, + { + "cell_type": "code", + "execution_count": 28, "id": "50852cb5", "metadata": {}, "outputs": [ @@ -192,22 +249,36 @@ "name": "stdout", "output_type": "stream", "text": [ - "Device 0: NVIDIA H100 80GB HBM3 GPU, sm90 compute capability, 79.1GB memory\n", - "Running test_0 with cuDNN attention and flash-attention...\n", - "Running test_1 with cuDNN attention and flash-attention...\n", - "Running test_2 with cuDNN attention...\n", - "Running test_3 with cuDNN attention and flash-attention...\n", - "\n", - " cuDNN fwd+bwd (ms) flash-attn fwd+bwd (ms) cuDNN vs flash speedup\n", - "test_0 0.0340 0.0468 1.3786\n", - "test_1 0.3664 0.5850 1.5968\n", - "test_2 0.9332 0.0000 0.0000\n", - "test_3 7.4875 11.8879 1.5877\n" + "Device 0: AMD Instinct MI300X GPU, sm94 compute capability, 192.0GB memory\n", + "Running test_0 with cuDNN attention...\n", + "Error: results.stats.csv not found!\n", + "Error: results.stats.csv not found!\n", + "Error: results.stats.csv not found!\n", + "Traceback (most recent call last):\n", + " File \"/root/code/TransformerEngine/benchmarks/attention/benchmark_attention_rocm.py\", line 273, in \n", + " main()\n", + " File \"/root/code/TransformerEngine/benchmarks/attention/benchmark_attention_rocm.py\", line 255, in main\n", + " parse_results(model, 
df_times, filename_flash_attn, filename_fused_attn, filename_fused_ck, filename_fused_aotriton)\n", + " File \"/root/code/TransformerEngine/benchmarks/attention/benchmark_attention_rocm.py\", line 161, in parse_results\n", + " parse_helper(model, filename_fused_attn, \"FmhaFwd\", \"FmhaBwd\", \"FusedAttention\", df_times)\n", + " File \"/root/code/TransformerEngine/benchmarks/attention/benchmark_attention_rocm.py\", line 141, in parse_helper\n", + " df = pd.read_csv(filename)\n", + " File \"/opt/conda/envs/py_3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py\", line 912, in read_csv\n", + " return _read(filepath_or_buffer, kwds)\n", + " File \"/opt/conda/envs/py_3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py\", line 577, in _read\n", + " parser = TextFileReader(filepath_or_buffer, **kwds)\n", + " File \"/opt/conda/envs/py_3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py\", line 1407, in __init__\n", + " self._engine = self._make_engine(f, self.engine)\n", + " File \"/opt/conda/envs/py_3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py\", line 1661, in _make_engine\n", + " self.handles = get_handle(\n", + " File \"/opt/conda/envs/py_3.10/lib/python3.10/site-packages/pandas/io/common.py\", line 859, in get_handle\n", + " handle = open(\n", + "FileNotFoundError: [Errno 2] No such file or directory: 'profiler_outputs/prof_fused_test_0.csv'\n" ] } ], "source": [ - "!cd ../../../benchmarks/attention/ && python benchmark_attention.py" + "!cd ../../../ && python benchmarks/attention/benchmark_attention_rocm.py" ] }, { @@ -280,7 +351,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "id": "906b8cf1", "metadata": {}, "outputs": [ @@ -292,15 +363,12 @@ "Run cuDNN attention...\n", "[INFO | DotProductAttention]: Running with FusedAttention backend (sub-backend 1)\n", "\n", - "Run flash-attention...\n", - "[INFO | DotProductAttention]: Running with FlashAttention backend\n", - "\n", "Test passed.\n" ] } ], "source": [ - "!NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=1 python example_attention.py" + "!cd ../../../ && NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=1 python docs/examples/attention/example_attention.py" ] }, { @@ -313,7 +381,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 32, "id": "d3637094", "metadata": {}, "outputs": [ @@ -323,25 +391,18 @@ "text": [ "\n", "Run cuDNN attention...\n", - "[DEBUG | DotProductAttention]: Running with config={'transformer_engine_version': '1.10.0.dev0+ee85a91', 'compute_capability': 'sm90', 'flash_attn_version': , 'cudnn_version': '9.3.0', 'qkv_type': , 'qkv_dtype': torch.bfloat16, 'qkv_layout': 'bshd_bshd_bshd', 'batch_size': 2, 'num_heads': 16, 'num_gqa_groups': 16, 'max_seqlen_q': 512, 'max_seqlen_kv': 512, 'head_dim_qk': 64, 'head_dim_v': 64, 'attn_mask_type': 'no_mask', 'window_size': (-1, -1), 'alibi_slopes_shape': None, 'core_attention_bias_type': 'no_bias', 'core_attention_bias_shape': None, 'core_attention_bias_requires_grad': False, 'pad_between_seqs': False, 'attention_dropout': 0.0, 'context_parallel': False, 'deterministic': False, 'is_training': True, 'fp8': False, 'fp8_meta': {'fp8_checkpoint': False, 'fp8_group': None, 'recipe': margin=0, format=HYBRID, amax_history_len=1024, wgrad_override=False, fp8_dpa=False, fp8_mha=False}}\n", + "[DEBUG | DotProductAttention]: Running with config={'transformer_engine_version': '1.11.0+18ee57c', 'compute_capability': 'sm94', 'flash_attn_version': , 'cudnn_version': '99.0.0', 'qkv_type': , 'qkv_dtype': torch.bfloat16, 
'qkv_layout': 'bshd_bshd_bshd', 'batch_size': 2, 'num_heads': 16, 'num_gqa_groups': 16, 'max_seqlen_q': 512, 'max_seqlen_kv': 512, 'head_dim_qk': 64, 'head_dim_v': 64, 'attn_mask_type': 'no_mask', 'window_size': (-1, -1), 'alibi_slopes_shape': None, 'core_attention_bias_type': 'no_bias', 'core_attention_bias_shape': None, 'core_attention_bias_requires_grad': False, 'pad_between_seqs': False, 'attention_dropout': 0.0, 'context_parallel': False, 'deterministic': False, 'is_training': True, 'fp8': False, 'fp8_meta': {'fp8_checkpoint': False, 'fp8_group': None, 'recipe': margin=0, format=HYBRID, amax_history_len=1024, wgrad_override=False, fp8_dpa=False, fp8_mha=False}}\n", "[DEBUG | DotProductAttention]: Disabling FlashAttention due to NVTE_FLASH_ATTN=0\n", "[DEBUG | DotProductAttention]: Available backends = {FlashAttention=False, FusedAttention=True (sub-backend 1), UnfusedDotProductAttention=True}\n", "[DEBUG | DotProductAttention]: Selected backend = FusedAttention (sub-backend 1)\n", "[INFO | DotProductAttention]: Running with FusedAttention backend (sub-backend 1)\n", "\n", - "Run flash-attention...\n", - "[DEBUG | DotProductAttention]: Running with config={'transformer_engine_version': '1.10.0.dev0+ee85a91', 'compute_capability': 'sm90', 'flash_attn_version': , 'cudnn_version': '9.3.0', 'qkv_type': , 'qkv_dtype': torch.bfloat16, 'qkv_layout': 'bshd_bshd_bshd', 'batch_size': 2, 'num_heads': 16, 'num_gqa_groups': 16, 'max_seqlen_q': 512, 'max_seqlen_kv': 512, 'head_dim_qk': 64, 'head_dim_v': 64, 'attn_mask_type': 'no_mask', 'window_size': (-1, -1), 'alibi_slopes_shape': None, 'core_attention_bias_type': 'no_bias', 'core_attention_bias_shape': None, 'core_attention_bias_requires_grad': False, 'pad_between_seqs': False, 'attention_dropout': 0.0, 'context_parallel': False, 'deterministic': False, 'is_training': True, 'fp8': False, 'fp8_meta': {'fp8_checkpoint': False, 'fp8_group': None, 'recipe': margin=0, format=HYBRID, amax_history_len=1024, wgrad_override=False, fp8_dpa=False, fp8_mha=False}}\n", - "[DEBUG | DotProductAttention]: Disabling FusedAttention due to NVTE_FUSED_ATTN=0\n", - "[DEBUG | DotProductAttention]: Available backends = {FlashAttention=True, FusedAttention=False, UnfusedDotProductAttention=True}\n", - "[DEBUG | DotProductAttention]: Selected backend = FlashAttention\n", - "[INFO | DotProductAttention]: Running with FlashAttention backend\n", - "\n", "Test passed.\n" ] } ], "source": [ - "!NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=2 python example_attention.py" + "!cd ../../../ && NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=2 python docs/examples/attention/example_attention.py" ] }, { @@ -544,9 +605,18 @@ "output_type": "stream", "text": [ "Run with post_scale_bias:\n", + "[DEBUG | DotProductAttention]: Running with config={'transformer_engine_version': '1.11.0+18ee57c', 'compute_capability': 'sm94', 'flash_attn_version': , 'cudnn_version': '99.0.0', 'qkv_type': , 'qkv_dtype': torch.bfloat16, 'qkv_layout': 'bs3hd', 'batch_size': 4, 'num_heads': 16, 'num_gqa_groups': 16, 'max_seqlen_q': 2048, 'max_seqlen_kv': 2048, 'head_dim_qk': 64, 'head_dim_v': 64, 'attn_mask_type': 'no_mask', 'window_size': (-1, -1), 'alibi_slopes_shape': None, 'core_attention_bias_type': 'post_scale_bias', 'core_attention_bias_shape': 'bhss', 'core_attention_bias_requires_grad': False, 'pad_between_seqs': False, 'attention_dropout': 0.0, 'context_parallel': False, 'deterministic': False, 'is_training': True, 'fp8': False, 'fp8_meta': {'fp8_checkpoint': False, 'fp8_group': None, 'recipe': margin=0, format=HYBRID, 
amax_history_len=1024, wgrad_override=False, fp8_dpa=False, fp8_mha=False}}\n", + "[DEBUG | DotProductAttention]: Disabling FlashAttention due to NVTE_FLASH_ATTN=0\n", + "[DEBUG | DotProductAttention]: Available backends = {FlashAttention=False, FusedAttention=True (sub-backend 1), UnfusedDotProductAttention=True}\n", + "[DEBUG | DotProductAttention]: Selected backend = FusedAttention (sub-backend 1)\n", "[INFO | DotProductAttention]: Running with FusedAttention backend (sub-backend 1)\n", "\n", "Run with arbitrary mask:\n", + "[DEBUG | DotProductAttention]: Running with config={'transformer_engine_version': '1.11.0+18ee57c', 'compute_capability': 'sm94', 'flash_attn_version': , 'cudnn_version': '99.0.0', 'qkv_type': , 'qkv_dtype': torch.bfloat16, 'qkv_layout': 'bs3hd', 'batch_size': 4, 'num_heads': 16, 'num_gqa_groups': 16, 'max_seqlen_q': 2048, 'max_seqlen_kv': 2048, 'head_dim_qk': 64, 'head_dim_v': 64, 'attn_mask_type': 'arbitrary', 'window_size': (-1, -1), 'alibi_slopes_shape': None, 'core_attention_bias_type': 'no_bias', 'core_attention_bias_shape': None, 'core_attention_bias_requires_grad': False, 'pad_between_seqs': False, 'attention_dropout': 0.0, 'context_parallel': False, 'deterministic': False, 'is_training': True, 'fp8': False, 'fp8_meta': {'fp8_checkpoint': False, 'fp8_group': None, 'recipe': margin=0, format=HYBRID, amax_history_len=1024, wgrad_override=False, fp8_dpa=False, fp8_mha=False}}\n", + "[DEBUG | DotProductAttention]: Disabling FlashAttention due to NVTE_FLASH_ATTN=0\n", + "[DEBUG | DotProductAttention]: Disabling FusedAttention for arbitrary mask\n", + "[DEBUG | DotProductAttention]: Available backends = {FlashAttention=False, FusedAttention=False, UnfusedDotProductAttention=True}\n", + "[DEBUG | DotProductAttention]: Selected backend = UnfusedDotProductAttention\n", "[INFO | DotProductAttention]: Running with UnfusedDotProductAttention backend\n", "\n", "Test passed!\n" @@ -554,7 +624,7 @@ } ], "source": [ - "!NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=1 python arbitrary_mask_to_post_scale_bias.py" + "!cd ../../../ && NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=2 python docs/examples/attention/arbitrary_mask_to_post_scale_bias.py" ] }, { @@ -632,7 +702,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "py_3.10", "language": "python", "name": "python3" }, @@ -646,7 +716,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.15" } }, "nbformat": 4, From 7f51aa88d68145511d1c224f8297df02ef8112f4 Mon Sep 17 00:00:00 2001 From: Anh Minh Nguyen Hoang Date: Tue, 4 Mar 2025 11:54:02 +0200 Subject: [PATCH 2/6] Add ROCm version of example ipynb --- docs/examples/attention/attention_rocm.ipynb | 685 +++++++++++++++++++ 1 file changed, 685 insertions(+) create mode 100644 docs/examples/attention/attention_rocm.ipynb diff --git a/docs/examples/attention/attention_rocm.ipynb b/docs/examples/attention/attention_rocm.ipynb new file mode 100644 index 000000000..3df948a25 --- /dev/null +++ b/docs/examples/attention/attention_rocm.ipynb @@ -0,0 +1,685 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "040f466a", + "metadata": {}, + "source": [ + "# Attention Is All You Need!\n", + "\n", + "The core idea behind Transformer models is the attention mechanism [[1]](https://arxiv.org/abs/1706.03762). It identifies the correlation between words, selects the most important parts of the sentence to focus on, and captures meaningful patterns and dependencies in the data. 
Figure 1 shows a typical attention mechanism, where pre-softmax operations can be a combination of scaling, bias and masking while the post-softmax operation is often just dropout.\n", + "\n", + "
\n", + "\n", + "
Figure 1: Dot product attention.
\n", + "
\n", + "\n", + "[Transformer Engine](https://github.com/NVIDIA/TransformerEngine.git) supports the calculation of dot product attention in three frameworks, [PyTorch](https://github.com/pytorch/pytorch), [JAX](https://github.com/google/jax) and [PaddlePaddle](https://github.com/PaddlePaddle/Paddle). The API for each framework is\n", + "\n", + "- [transformer_engine.pytorch.DotProductAttention](../../api/pytorch.rst#transformer_engine.pytorch.DotProductAttention)\n", + "- [transformer_engine.jax.flax.DotProductAttention](../../api/jax.rst#transformer_engine.jax.flax.DotProductAttention)\n", + "- [transformer_engine.paddle.DotProductAttention](../../api/paddle.rst#transformer_engine.paddle.DotProductAttention)" + ] + }, + { + "cell_type": "markdown", + "id": "89a7d849", + "metadata": {}, + "source": [ + "## 1. Attention Backends\n", + "\n", + "Transformer Engine provides multiple attention backends for each supported framework. The framework-native backends provide a robust baseline, while the fused, GPU-optimized implementations offer more performance. For example, the flash-attention and cuDNN attention backends in PyTorch. The framework-native backends are often named with \"unfused\", while the more optimized backends are \"fused\" or \"flash\".\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FrameworkBackend (Module Name)Module Location
PyTorchROCm/cuDNN attention (`FusedAttention`) [transformer_engine.pytorch.attention](https://github.com/ROCm/TransformerEngine/blob/dev/transformer_engine/pytorch/attention.py)
flash-attention (`FlashAttention`)
\n", + " PyTorch-native attention (`UnfusedDotProductAttention`)\n", + "
JAXROCm/cuDNN attention (`_FusedDotProductAttention`)[transformer_engine.jax.flax.transformer](https://github.com/ROCm/TransformerEngine/blob/dev/transformer_engine/jax/flax/transformer.py)
JAX-native attention (`_UnfusedDotProductAttention`)
PaddlePaddle ROCm/cuDNN attention (`_te_forward`) [transformer_engine.paddle.layer.attention](https://github.com/ROCm/TransformerEngine/blob/dev/transformer_engine/paddle/layer/attention.py)\n", + "
PaddlePaddle-native attention (`_pd_forward`)
" + ] + }, + { + "cell_type": "markdown", + "id": "c90a2573", + "metadata": {}, + "source": [ + "### 1.1 Flash vs. Non-Flash\n", + "\n", + "The attention calculation has quadratic computational and memory complexities to the sequence length. Its runtime and memory requirements quadruple, when the sequence length doubles. This presents a significant challenge to scale Transformer models up for longer contexts, in order to achieve higher model quality.\n", + "\n", + "Compared to the standard, non-flash algorithm, the flash algorithm [[2]](https://arxiv.org/abs/2205.14135) was proposed to reduce the memory scaling to linear and improve the computational efficiency through optimized memory accesses. It employs the following two distinctive techniques.\n", + "\n", + "- **Tiling:** The non-flash algorithm tries to process the query, key, value tensors in one single step, requiring large amounts of global memory and incurring high volumes of reads/writes between global memory and shared memory. The flash algorithm decomposes the input into several tiles, based on the available shared memory and register size, and it computes the softmax one tile at a time.\n", + "\n", + "- **Recomputation:** The non-flash algorithm stores the softmax matrix (quadratic to sequence length) to global memory for the backward pass, while the flash algorithm only saves the softmax normalization factors (linear to sequence length). This reduces the amount of memory required as well as the bandwidth utilization between global memory and shared memory. Even though there is extra computation incurred in order to recalculate the attention in the backward pass, the bandwidth savings still provide significant improvement in efficiency.\n", + "\n", + "
\n", + "Note: \n", + " \n", + "Transformer Engine's flash-attention backend, available in PyTorch, and cuDNN attention backend (sub-backends 1 and 2), available in PyTorch, JAX and PaddlePaddle, are both based on the flash algorithm.\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "id": "b5ce567d", + "metadata": {}, + "source": [ + "### 1.2 flash-attention\n", + "\n", + "The flash-attention backend, available only in PyTorch, is a module wrapped around the public `flash-attn` package [[3]](https://github.com/Dao-AILab/flash-attention). \n", + "\n", + "The flash-attention backend supports `flash-attn`'s features as well as a few extra functionalities to facilitate the use of `flash-attn`, such as converting the `attention_mask` to cumulative sequence lengths `cu_seqlens` for `padding` mask use cases. Please see `transformer_engine.pytorch.attention.FlashAttention` for details.\n", + "\n", + "The `flash-attn` dependency is regularly updated in Transformer Engine. As of v1.10, Transformer Engine supports `flash-attn` 2.0.6+ (see [setup.py](https://github.com/NVIDIA/TransformerEngine/blob/main/setup.py)).\n", + "\n", + "To understand `flash-attn`'s performance, please refer to their benchmarks [here](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#performance).\n", + "\n", + "### 1.3 Attention backends\n", + "\n", + "The cuDNN attention backend, available in PyTorch, JAX and PaddlePaddle, offers another high-performance solution to the attention calculation. It requires [cuDNN](https://developer.nvidia.com/cudnn) to run, and has several sub-backends to support the different precisions and sequence lengths.\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BackendSub-BackendAlgorithmPrecisionSequence LengthArchitectureAdditional info
ROCmAOTritonFlashBF16/FP16 Any gfx942, gfx90a
ROCmCKFlashBF16/FP16 Any gfx942, gfx90a
cuDNN0Non-FlashBF16/FP16 ≤512 sm80, 90 [cuDNN](https://docs.nvidia.com/deeplearning/cudnn/latest/developer/graph-api.html#fused-attention-fprop)
cuDNN1FlashBF16/FP16 Any sm80+ [cuDNN](https://docs.nvidia.com/deeplearning/cudnn/latest/developer/graph-api.html#fused-flash-attention-fprop),\n", + " [cudnn-frontend](https://github.com/NVIDIA/cudnn-frontend/blob/main/docs/operations/Attention.md#scaled-dot-product-attention)\n", + "
cuDNN2FlashFP8 cuDNN pre-9.0: ≤512 cuDNN pre-9.0: sm90
cuDNN 9.0+: Any cuDNN 9.0+: sm90+ cuDNN 9.0+: [cudnn-frontend](https://github.com/NVIDIA/cudnn-frontend/blob/main/docs/operations/Attention.md#scaled-dot-product-attention-fp8)\n", + "
\n", + "\n", + "The ROCm attention, cuDNN attention, and flash-attention backends have several notable differences. As of Transformer Engine 1.10, cuDNN 9.3 and `flash-attn` 2.4.2,\n", + "\n", + "- flash-attention only supports the PyTorch framework while The ROCm attention and cuDNN attention supports PyTorch, JAX and PaddlePaddle.\n", + "- flash-attention supports BF16, FP16 precisions while The ROCm attention and cuDNN attention also supports FP8 (ROCm through its [UnfusedAttention](https://github.com/ROCm/TransformerEngine/blob/8898b7db2289793fe51632a7c5f1eb742bac47c7/transformer_engine/pytorch/attention.py#L5665) backend and CuDNN through its sub-backend 2).\n", + "- flash-attention supports `bshd`, `thd` input formats, without any transposes, and `sbhd` format, with transposes, while cuDNN attention supports all three formats without transposes (see Section 3.1 for more details).\n", + "- flash-attention does not support `post_scale_bias`, and cuDNN attention does.\n", + "- flash-attention supports KV-caching and paged attention, and cuDNN attention does not.\n", + "- flash-attention uses bottom right diagonal for `causal` mask in cross attention (see [change log](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#21-change-behavior-of-causal-flag)), and cuDNN attention supports both top left and bottom right.\n", + "- flash-attention outperforms cuDNN attention on Ampere architectures, and cuDNN attention has 20-50% advantages on Hopper architectures, based on our benchmarks for a number of commonly-used model configurations.\n", + "\n", + "To compare cuDNN attention and flash-attention, users can modify the `model_configs` dictionary in [benchmarks/attention/benchmark_attention.py](https://github.com/NVIDIA/TransformerEngine/blob/main/benchmarks/attention/benchmark_attention.py) to collect performance numbers. The script runs each entry in `model_configs` for `num_iters` times, each time with one forward pass and one backward pass. Both backends are tried, and if one backend does not have support for the specific user input, the runtimes and speedups in the final table would be 0." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5b8e3d7", + "metadata": {}, + "outputs": [], + "source": [ + "model_configs = {\n", + " # test: b, h, hg, d, sq, skv, p, mask, bias\n", + " \"test_0\": ModelConfig(2, 16, 16, 64, 512, 512, 0.0, \"no_mask\", \"no_bias\"), # short seq\n", + " \"test_1\": ModelConfig(2, 16, 16, 128, 2048, 2048, 0.0, \"causal\", \"no_bias\"), # longer seq, mask\n", + " \"test_2\": ModelConfig(2, 16, 16, 128, 2048, 2048, 0.0, \"causal\", \"post_scale_bias\"), # bias\n", + " \"test_3\": ModelConfig(2, 32, 4, 128, 8192, 8192, 0.0, \"causal\", \"no_bias\"), # GQA\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b95a2a20", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting nvtx\n", + " Downloading nvtx-0.2.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.2 kB)\n", + "Downloading nvtx-0.2.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (474 kB)\n", + "Installing collected packages: nvtx\n", + "Successfully installed nvtx-0.2.11\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. 
Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install nvtx" + ] + }, + { + "cell_type": "markdown", + "id": "9a615119", + "metadata": {}, + "source": [ + "## 2. Backend Selection\n", + "\n", + "Given the various attention backends, Transformer Engine has a selection logic in place to choose the most appropriate backend for a particular set of user inputs and runtime environment. The selection logic is based on both backend availability and backend performance.\n", + "\n", + "Backend availability is determined by factors such as model configuration, training hyper-parameters, software versions, and the GPU architecture in question. For example, some considerations are the sequence length, number of attention heads, head size, attention mask type, attention bias type, training or inference mode, self or cross attention, MHA or MQA/GQA, `flash-attn`/cuDNN library versions, and the compute capability of the GPU.\n", + "\n", + "When there are multiple backends available, Transformer Engine makes backend selection based on performance. In general, there are a few rules being followed in our selection logic (see table below). As we monitor the performance of different backends, the selection logic may change.\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FrameworkSelection Order
PyTorchsm90: cuDNN attention > flash-attention > PyTorch-native attention
sm80: flash-attention > cuDNN attention > PyTorch-native attention
\n", + " cuDNN attention: sub-backend 1 > sub-backend 0\n", + "
JAXcuDNN attention > JAX-native attention
PaddlePaddle cuDNN attention > PaddlePaddle-native attention
" + ] + }, + { + "cell_type": "markdown", + "id": "e6c0f3f0", + "metadata": {}, + "source": [ + "### 2.1 Debug Information\n", + "\n", + "To find out which backend is being used during runtime, we have the following two debugging flags. Logging is done by using the `logging` package.\n", + "```\n", + "NVTE_DEBUG = 0/1 # disables/enables debugging\n", + "NVTE_DEBUG_LEVEL = 0/1/2 # enables logging.WARNING/INFO/DEBUG-level messages\n", + "```\n", + "
\n", + "Note:\n", + " \n", + "These flags are supported in PyTorch only as of Transformer Engine 1.10. JAX and PaddlePaddle support is expected to be added in the future.\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "16660323", + "metadata": {}, + "source": [ + "The example script [example_attention.py](https://raw.githubusercontent.com/NVIDIA/TransformerEngine/main/docs/examples/attention/example_attention.py) runs a very basic model with two attention backends, cuDNN attention and flash-attention. Here `NVTE_DEBUG_LEVEL=1` allows us to find out which backend/sub-backend is used in runtime." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "906b8cf1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Run cuDNN attention...\n", + "[INFO | DotProductAttention]: Running with FusedAttention backend (sub-backend 1)\n", + "\n", + "Run flash-attention...\n", + "[INFO | DotProductAttention]: Running with FlashAttention backend\n", + "\n", + "Test passed.\n" + ] + } + ], + "source": [ + "!cd ../../../ && NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=1 python docs/examples/attention/example_attention.py" + ] + }, + { + "cell_type": "markdown", + "id": "8ca99461", + "metadata": {}, + "source": [ + "`NVTE_DEBUG_LEVEL=2` allows us to find out more about the backend selection logic. Users are encouraged to double check the `config` and provide it to the Transformer Engine team if they would like to file a bug. " + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "d3637094", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Run cuDNN attention...\n", + "[DEBUG | DotProductAttention]: Running with config={'transformer_engine_version': '1.11.0+18ee57c', 'compute_capability': 'sm94', 'flash_attn_version': , 'cudnn_version': '99.0.0', 'qkv_type': , 'qkv_dtype': torch.bfloat16, 'qkv_layout': 'bshd_bshd_bshd', 'batch_size': 2, 'num_heads': 16, 'num_gqa_groups': 16, 'max_seqlen_q': 512, 'max_seqlen_kv': 512, 'head_dim_qk': 64, 'head_dim_v': 64, 'attn_mask_type': 'no_mask', 'window_size': (-1, -1), 'alibi_slopes_shape': None, 'core_attention_bias_type': 'no_bias', 'core_attention_bias_shape': None, 'core_attention_bias_requires_grad': False, 'pad_between_seqs': False, 'attention_dropout': 0.0, 'context_parallel': False, 'deterministic': False, 'is_training': True, 'fp8': False, 'fp8_meta': {'fp8_checkpoint': False, 'fp8_group': None, 'recipe': margin=0, format=HYBRID, amax_history_len=1024, wgrad_override=False, fp8_dpa=False, fp8_mha=False}}\n", + "[DEBUG | DotProductAttention]: Disabling FlashAttention due to NVTE_FLASH_ATTN=0\n", + "[DEBUG | DotProductAttention]: Available backends = {FlashAttention=False, FusedAttention=True (sub-backend 1), UnfusedDotProductAttention=True}\n", + "[DEBUG | DotProductAttention]: Selected backend = FusedAttention (sub-backend 1)\n", + "[INFO | DotProductAttention]: Running with FusedAttention backend (sub-backend 1)\n", + "\n", + "Test passed.\n" + ] + } + ], + "source": [ + "!cd ../../../ && NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=2 python docs/examples/attention/example_attention.py" + ] + }, + { + "cell_type": "markdown", + "id": "611d8fdb", + "metadata": {}, + "source": [ + "### 2.2 User Control\n", + "\n", + "Users usually do not need to worry about the backend selection. 
However, if there is a convergence or performance issue encountered, Transformer Engine provides a few other environment variables for users to experiment with different backends.\n", + "\n", + "**flash-attention or cuDNN attention:**\n", + "Users can enable/disable the flash-attention backend or cuDNN attention backend via the following two environment variables in PyTorch.\n", + "```\n", + "NVTE_FLASH_ATTN = 0 # disables flash-attention; default = 1\n", + "NVTE_FUSED_ATTN = 0 # disables cuDNN attention; default = 1\n", + "```\n", + "\n", + "**cuDNN attention sub-backends:**\n", + "This environment variable allows users to express their preference of cuDNN attention sub-backends. However, the elected sub-backend will only be used *if* it is eligible, i.e. if it has support for the provided inputs and runtime environment.\n", + "```\n", + "NVTE_FUSED_ATTN_BACKEND = 0/1/2 # user preference of cuDNN sub-backend\n", + "```\n", + "\n", + "**Execution paths of cuDNN sub-backend 1:**\n", + "cuDNN attention sub-backend 1 also offers two execution paths: workspace optimization path and non-workspace optimization path. The workspace optimization path requires a larger amount of global memory, provides determinism, and offers bias gradient support. Before cuDNN 9.0, it also has 20-30% performance advantage over the non-workspace optimization path. But after cuDNN 9.0, it is 20-30% slower than the non-workspace optimization path.\n", + "\n", + "Users can experiment with these two paths through the following environment variable. However, please be aware of the possible Out-Of-Memory risks.\n", + "```\n", + "Before cuDNN 9.0:\n", + " NVTE_FUSED_ATTN_FORCE_WORKSPACE_OPT = 0 # disables workspace optimization path\n", + " NVTE_FUSED_ATTN_FORCE_WORKSPACE_OPT = 1 # enables workspace optimization path\n", + "\n", + "After cuDNN 9.0:\n", + " NVTE_ALLOW_NONDETERMINISTIC_ALGO = 1 # disables workspace optimization path\n", + " NVTE_ALLOW_NONDETERMINISTIC_ALGO = 0 # enables workspace optimization path\n", + "```\n", + "
\n", + "Note\n", + " \n", + "Environment variables NVTE_FLASH_ATTN, NVTE_FUSED_ATTN, NVTE_FUSED_ATTN_FORCE_WORKSPACE_OPT and NVTE_ALLOW_NONDETERMINISTIC_ALGO are only supported in PyTorch, and will be added to JAX and PaddlePaddle in the future.\n", + "
\n", + "\n", + "### 2.3 Example Tests\n", + "\n", + "Our [unit tests](https://github.com/NVIDIA/TransformerEngine/tree/main/tests) demonstrate the use of Transformer Engine dot product attention APIs. Users are encouraged to use them as a template when integrating Transformer Engine to their ML workflows.\n", + "\n", + "For example, in PyTorch, [test_dot_product_attention](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py) offers a variety of use cases of `pytorch.DotProductAttention`, from data types, model configs, checkpointing, to QKV layouts." + ] + }, + { + "cell_type": "markdown", + "id": "e60a2a3e", + "metadata": {}, + "source": [ + "## 3. Backend Support\n", + "\n", + "Transformer Engine supports commonly-used features such as self and cross attention, FP16/BF16 precisions, dropout, and checkpointing. But it also offers a range of other features. As of v1.10, Transformer Engine's attention backends have the following support matrix.\n", + "\n", + "| Attention Backend | Precision | Architecture | Sliding Window Attention | MQA/GQA | Multi-Latent Attention | Context Parallelism | Determinism Possible |\n", + "| :---------------- | :-------- | :----------- | :----------------------- | :------ | :--------------------- | :------------------ | :------------ |\n", + "| ROCm attention (all frameworks) | BF16, FP16 | gfx90a, gfx942 | No | Yes | Yes | Yes (`bshd`,`sbhd`) | Yes [CK backend, with extra dq_acc buffer](https://github.com/ROCm/TransformerEngine/blob/8898b7db2289793fe51632a7c5f1eb742bac47c7/transformer_engine/pytorch/attention.py#L882-L887|\n", + "| cuDNN attention (all frameworks) | BF16, FP16, FP8 (PyTorch only) | sm80+ | No | Yes | Yes | Yes (`bshd`,`sbhd`, `thd`) | Yes |\n", + "| flash-attention (PyTorch) | BF16, FP16 | sm80+ | Yes | Yes | No | Yes (`bshd`,`thd`) | Yes |\n", + "| Framework-native attention | BF16, FP16, FP32 | Any | No, unless used as a mask | Yes | Yes (PyTorch only) | No | Yes |\n", + "\n", + "Some unit tests are provided to serve as a starting point for integrating such features into users' models. For example,\n", + "- sliding window attention: [test_dpa_swa](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py)\n", + "- MQA/GQA: [test_te_layer_mqa_gqa](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py)\n", + "- Multi-Latent Attention: [test_dpa_mla](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py)\n", + "- context parallelism: [test_cp_with_fused_attention](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn_with_cp.py), [test_cp_with_flash_attention](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn_with_cp.py)" + ] + }, + { + "cell_type": "markdown", + "id": "fbdcb327", + "metadata": {}, + "source": [ + "### 3.1 QKV Layout\n", + "\n", + "Transformer Engine supports various layouts of the query `q`, key `k`, value `v` tensors. It has defined 15 QKV layouts, which are grouped into 3 QKV formats and 5 QKV layout groups to help with similar memory/computational operations across different layouts. 
The mapping relationships of these layouts and groups are,\n", + "\n", + "| `qkv_layout`         | `qkv_layout_group`=`3hd` | `h3d` | `hd_2hd` | `hd_h2d` | `hd_hd_hd` |\n", + "| ----------: | -----------: | -----: | ----------: | ----------: | -------------: |\n", + "| `qkv_format`=`sbhd` | `sb3hd` | `sbh3d` | `sbhd_sb2hd` | `sbhd_sbh2d` | `sbhd_sbhd_sbhd` |\n", + "| `bshd` | `bs3hd` | `bsh3d` | `bshd_bs2hd` | `bshd_bsh2d` | `bshd_bshd_bshd` |\n", + "| `thd` | `t3hd` | `th3d` | `thd_t2hd` | `thd_th2d` | `thd_thd_thd` |\n", + "\n", + "The notation system is that `b` stands for the batch size, `s` sequence length, `h` number of attention heads, `d` head dimension, and `t` the total number of tokens in the batch, i.e. `t = sum(s_i) for i in 0,...,b-1`. Here are a few examples of the layouts and their explanations to help clarify the definition.\n", + "\n", + "**qkv_layout=sb3hd:**\n", + "`q`, `k`, `v` are sequence first, i.e. `s` is the leading dimension in each tensor. They are different slices of one tensor `qkv`: `q, k, v = [qkv[:,:,i,:,:] for i in range(3)]`. They are interleaved at the `h * d` dimension.\n", + "\n", + "**qkv_layout=bshd_bsh2d:**\n", + "`q`, `k`, `v` are batch first, i.e. `b` is the leading dimension in each tensor. `q` is contiguous, and `k`, `v` are different slices of tensor `kv`: `k, v = [kv[:,:,:,i,:] for i in range(2)]`. `k`, `v` are interleaved at the `d` dimension.\n", + "\n", + "The `s` and `h` in `bsh2d` are the max sequence length and number of heads for `k`, `v`, which can be different from the `s` and `h` in `bshd` for `q`. We denoted them as the same for brevity reasons. Transformer Engine does differentiate their values for actual execution.\n", + "\n", + "**qkv_layout=thd_thd_thd:**\n", + "`q`, `k`, `v` have variable sequence lengths in a batch. They are all contiguous and have no interleaving.\n", + "\n", + "As of v1.10, Transformer Engine has the following support matrix.\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BackendSupported QKV FormatsNotes
flash-attention`bshd`, `sbhd`, `thd`PyTorch: 3 formats, i.e. 15 layouts
ROCm attention`bshd`, `sbhd`PyTorch: 2 formats, i.e. 10 layouts [1](https://github.com/ROCm/TransformerEngine/blob/8898b7db2289793fe51632a7c5f1eb742bac47c7/transformer_engine/common/fused_attn_rocm/fused_attn_ck.cpp#L108-L114) [2](https://github.com/ROCm/TransformerEngine/blob/8898b7db2289793fe51632a7c5f1eb742bac47c7/transformer_engine/common/fused_attn_rocm/fused_attn_aotriton.cpp#L90-L95)
\n", + " JAX, PaddlePaddle: `bs3hd`, `bshd_bs2hd`, `bshd_bshd_bshd` layouts\n", + "
cuDNN attention`bshd`, `sbhd`, `thd`PyTorch: 3 formats, i.e. 15 layouts
\n", + " JAX, PaddlePaddle: `bs3hd`, `bshd_bs2hd`, `bshd_bshd_bshd` layouts\n", + "
Framework-native attention`bshd`, `sbhd`PyTorch, JAX, PaddlePaddle: 2 formats, i.e. 10 layouts
\n", + "\n", + "Some example usage of the different layouts can be found at [test_dpa_qkv_layout](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py) and [test_dpa_qkv_layout_thd](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py). Transformer Engine also provides a utility function [transformer_engine.pytorch.attention.get_qkv_layout](https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/attention.py) to help determine which layout a set of `q`, `k`, `v` tensors have (PyTorch only).\n", + "\n", + "
\n", + "Note\n", + " \n", + "When RoPE is employed, the qkv_layout may change in Transformer Engine PyTorch through [get_qkv_layout](https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/attention.py). This is due to the in-place nature of our RoPE implementations. We convert `q`, `k`, `v` tensors from their initial layout to the corresponding hd_hd_hd layout. For example, from sbh3d in pytorch.MultiHeadAttention before RoPE, to sbhd_sbhd_sbhd in pytorch.DotProductAttention after RoPE.\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "id": "855d9616", + "metadata": {}, + "source": [ + "### 3.2 Attention Mask\n", + "\n", + "Transformer Engine supports 7 mask types, and all the masks are defined as `True` masking out the corresponding element and `False` including the corresponding element in attention calculation.\n", + "\n", + "- `no_mask`, `padding`, `causal`, `causal_bottom_right`, `padding_causal`, `padding_causal_bottom_right`, `arbitrary`\n", + "\n", + "Different backends offer different support for attention mask. As of Transformer Engine 1.10,\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BackendSupported Mask TypesRequires `attention_mask`
flash-attention
  • `no_mask`, `causal` (self-attention),
  • `padding`, `padding_causal` (self-attention),
  • `causal_bottom_right`, `padding_causal_bottom_right`
  • `no_mask`, `causal`, `causal_bottom_right`: No
  • `padding`, `padding_causal`, `padding_causal_bottom_right`: Yes if `cu_seqlens` not provided
  • `arbitrary`: Yes
  • cuDNN attention
  • `no_mask`, `causal`,
  • `padding`, `padding_causal`,
  • `causal_bottom_right`, `padding_causal_bottom_right`
  • Framework-native attention
  • All (PyTorch)
  • `no_mask`, `causal`, `padding` (Jax, PaddlePaddle)
  • \n", + "\n", + "**Padding masks:** For `padding`, `padding_causal`, `padding_causal_bottom_right` mask types, users need to provide sequence length information to help Transformer Engine figure out where each sequence ends in a batch. As of Transformer Engine 1.10, there are two options to do so in PyTorch and one in JAX and PaddlePaddle.\n", + "\n", + "* PyTorch: When both options are provided by the user, `cu_seqlens` is preferred as there is no extra conversion needed.\n", + " - `cu_seqlens`: Users can provide cumulative sequence length tensors `cu_seqlens_q` and `cu_seqlens_kv` for `q` and `k`/`v` to the flash-attention or cuDNN attention backend. An example of `cu_seqlens` is `[0, 2, 6, 7]` for a batch of 3 `[aa000, bbbb0, c0000]`.\n", + " - `attention_mask`: Users can also provide `attention_mask` as an alternative, which will then be converted to `cu_seqlens`. For self-attention, `attention_mask` should be one single tensor in shape `[batch_size, 1, 1, seqlen_q]`, and for cross-attention, `attention_mask` should be a list of two tensors in shapes `[batch_size, 1, 1, seqlen_q]` and `[batch_size, 1, 1, seqlen_kv]`, respectively.\n", + "\n", + "\n", + "* JAX and PaddlePaddle: Users should provide the `attention_mask` tensor in shape `[batch_size, 1, seqlen_q, seqlen_kv]`.\n", + "\n", + "**qkv_format=thd:** Transformer Engine extracts the max sequence length information from `q`, `k`, `v` if `max_seqlen_q` and `max_seqlen_kv` are not provided. This requires GPU-CPU copy and synchronization operations. For performance reasons, please set `max_seqlen_q` and `max_seqlen_kv` to their appropriate values for `thd` QKV format.\n", + "\n", + "**Arbitrary mask:** cuDNN does not support `Arbitrary` mask type as of v9.3. However, users can convert the mask to a regular `post_scale_bias` bias and achieve the same functionality. 
An example script for this conversion is [arbitrary_mask_to_post_scale_bias.py](https://raw.githubusercontent.com/NVIDIA/TransformerEngine/main/docs/examples/attention/arbitrary_mask_to_post_scale_bias.py).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1f25a9b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Run with post_scale_bias:\n", + "[DEBUG | DotProductAttention]: Running with config={'transformer_engine_version': '1.11.0+18ee57c', 'compute_capability': 'sm94', 'flash_attn_version': , 'cudnn_version': '99.0.0', 'qkv_type': , 'qkv_dtype': torch.bfloat16, 'qkv_layout': 'bs3hd', 'batch_size': 4, 'num_heads': 16, 'num_gqa_groups': 16, 'max_seqlen_q': 2048, 'max_seqlen_kv': 2048, 'head_dim_qk': 64, 'head_dim_v': 64, 'attn_mask_type': 'no_mask', 'window_size': (-1, -1), 'alibi_slopes_shape': None, 'core_attention_bias_type': 'post_scale_bias', 'core_attention_bias_shape': 'bhss', 'core_attention_bias_requires_grad': False, 'pad_between_seqs': False, 'attention_dropout': 0.0, 'context_parallel': False, 'deterministic': False, 'is_training': True, 'fp8': False, 'fp8_meta': {'fp8_checkpoint': False, 'fp8_group': None, 'recipe': margin=0, format=HYBRID, amax_history_len=1024, wgrad_override=False, fp8_dpa=False, fp8_mha=False}}\n", + "[DEBUG | DotProductAttention]: Disabling FlashAttention due to NVTE_FLASH_ATTN=0\n", + "[DEBUG | DotProductAttention]: Available backends = {FlashAttention=False, FusedAttention=True (sub-backend 1), UnfusedDotProductAttention=True}\n", + "[DEBUG | DotProductAttention]: Selected backend = FusedAttention (sub-backend 1)\n", + "[INFO | DotProductAttention]: Running with FusedAttention backend (sub-backend 1)\n", + "\n", + "Run with arbitrary mask:\n", + "[DEBUG | DotProductAttention]: Running with config={'transformer_engine_version': '1.11.0+18ee57c', 'compute_capability': 'sm94', 'flash_attn_version': , 'cudnn_version': '99.0.0', 'qkv_type': , 'qkv_dtype': torch.bfloat16, 'qkv_layout': 'bs3hd', 'batch_size': 4, 'num_heads': 16, 'num_gqa_groups': 16, 'max_seqlen_q': 2048, 'max_seqlen_kv': 2048, 'head_dim_qk': 64, 'head_dim_v': 64, 'attn_mask_type': 'arbitrary', 'window_size': (-1, -1), 'alibi_slopes_shape': None, 'core_attention_bias_type': 'no_bias', 'core_attention_bias_shape': None, 'core_attention_bias_requires_grad': False, 'pad_between_seqs': False, 'attention_dropout': 0.0, 'context_parallel': False, 'deterministic': False, 'is_training': True, 'fp8': False, 'fp8_meta': {'fp8_checkpoint': False, 'fp8_group': None, 'recipe': margin=0, format=HYBRID, amax_history_len=1024, wgrad_override=False, fp8_dpa=False, fp8_mha=False}}\n", + "[DEBUG | DotProductAttention]: Disabling FlashAttention due to NVTE_FLASH_ATTN=0\n", + "[DEBUG | DotProductAttention]: Disabling FusedAttention for arbitrary mask\n", + "[DEBUG | DotProductAttention]: Available backends = {FlashAttention=False, FusedAttention=False, UnfusedDotProductAttention=True}\n", + "[DEBUG | DotProductAttention]: Selected backend = UnfusedDotProductAttention\n", + "[INFO | DotProductAttention]: Running with UnfusedDotProductAttention backend\n", + "\n", + "Test passed!\n" + ] + } + ], + "source": [ + "!cd ../../../ && NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=2 python docs/examples/attention/arbitrary_mask_to_post_scale_bias.py" + ] + }, + { + "cell_type": "markdown", + "id": "dda4a589", + "metadata": {}, + "source": [ + "Some more examples of running Transformer Engine with different attention masks can be found at 
[test_dpa_mask](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py).\n", + "\n", + "### 3.3 Attention Bias\n", + "\n", + "Transformer Engine supports 4 attention bias types, `no_bias`, `pre_scale_bias`, `post_scale_bias`, and `ALiBi` (with/without custom slopes). As of Transformer Engine 1.10, their support matrix is as follows.\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    BackendBias TypeBias ShapeBias Data TypeArchitecture
    flash-attention`no_bias`, `ALiBi` (with slopes)N/AALiBi slopes: FP32sm80+
    cuDNN attentionPyTorch: `no_bias`, `post_scale_bias`, `ALiBi` (without slopes)`post_scale_bias`: BHSS, 1HSS, B1SS, 11SS for forward, 1HSS for backward`post_scale_bias`: same as QKV typecuDNN 8.9.6+: sm90
    JAX, PaddlePaddle: `no_bias`, `post_scale_bias`ALiBi slopes: FP32cuDNN 9.0+: sm80+
    Framework-native attention`no_bias`, `pre_scale_bias`, `post_scale_bias``post_scale_bias`: BHSS, 1HSS, B1SS, 11SS `post_scale_bias`: same as QKV typesm80+
    \n", + "\n", + "The flash-attention backend enables `ALiBi` by asking user to pass in an `alibi_slopes` tensor, which can be the default slopes of vanilla ALiBi, or user-defined slopes. On the other hand, cuDNN attention supports `ALiBi` by taking in a `Boolean` flag, and it only supports vanilla ALiBi as of cuDNN 9.0.\n", + "\n", + "The framework-native backends do not explicitly support `ALiBi`, but users can convert `ALiBi` to a regular `post_scale_bias` bias to achieve the same effect. In PyTorch, this utility function, `transformer_engine.pytorch.attention.get_alibi`, can be used to help with the conversion.\n", + "\n", + "More examples of how to use the various attention biases are at [test_dpa_bias](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py)." + ] + }, + { + "cell_type": "markdown", + "id": "a0702339", + "metadata": {}, + "source": [ + "### 3.4 FP8 Attention\n", + "\n", + "A unique feature of Transformer Engine is its FP8 support, not only for the `Linear` layers but also for dot product attention. Transformer Engine's FP8 attention support is through its cuDNN attention sub-backend 2. Recall Figure 1: the two `MatMul` operations are performed in FP8 for computational efficiency, and the `SoftMax` operation is performed in FP32 for numerical accuracy.\n", + "\n", + "Transformer Engine supports FP8 attention through its [C APIs](../../api/c/fused_attn.rst), and [PyTorch API](../../api/pytorch.rst#transformer_engine.pytorch.DotProductAttention), as of v1.10. Its PyTorch API offers two options, both controlled through the FP8 recipe definition, `transformer_engine.common.recipe.DelayedScaling`.\n", + "\n", + "- `DelayedScaling.fp8_dpa=True (default=False)`: This enables the use of cuDNN attention sub-backend 2, when it does support the provided user inputs. The `FusedAttention` module for cuDNN attention takes FP16 or BF16 tensors as inputs, performs dot product attention in FP8, and returns attention logits in FP16 or BF16 (same as the input type). Casting operations are required to cast tensors to FP8 at the beginning, and back to FP16/BF16 at the end of the module.\n", + "\n", + "- `DelayedScaling.fp8_mha=True (default=False)`: This option, on top of `fp8_dpa=True`, removes the casting operations at the beginning and end of the `FusedAttention` module. This feature is experimental. \n", + "\n", + "Examples of using the two features are available at [test_dpa_fp8_vs_f16](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py) and [test_mha_fp8_vs_f16](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py). To disable FP8 attention for backward and only use it for forward, users can also set `NVTE_FP8_DPA_BWD=0 (default=1)`." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py_3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 5484445d9c48f80c8f850fa1b75984e1e098bf37 Mon Sep 17 00:00:00 2001 From: Anh Minh Nguyen Hoang Date: Tue, 4 Mar 2025 11:56:05 +0200 Subject: [PATCH 3/6] revert changes on NVDA example ipynb --- docs/examples/attention/attention.ipynb | 132 ++++++------------------ 1 file changed, 31 insertions(+), 101 deletions(-) diff --git a/docs/examples/attention/attention.ipynb b/docs/examples/attention/attention.ipynb index e0903d74f..27017b477 100644 --- a/docs/examples/attention/attention.ipynb +++ b/docs/examples/attention/attention.ipynb @@ -184,64 +184,7 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "b95a2a20", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting nvtx\n", - " Downloading nvtx-0.2.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.2 kB)\n", - "Downloading nvtx-0.2.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (474 kB)\n", - "Installing collected packages: nvtx\n", - "Successfully installed nvtx-0.2.11\n", - "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!pip install nvtx" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "37821a48", - "metadata": {}, - "outputs": [], - "source": [ - "# Make sure tests is importable\n", - "!touch ../../../tests/__init__.py" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2793ef00", - "metadata": {}, - "outputs": [], - "source": [ - "# Exclude unimportable function\n", - "!sed -i '27s/^/#/' ../../../tests/pytorch/fused_attn/test_fused_attn.py" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "398e045d", - "metadata": {}, - "outputs": [], - "source": [ - "# Avoid importing transformer_engine directory and use the installed package instead\n", - "!mv ../../../transformer_engine/ ../../../transformer_engine_save/" - ] - }, - { - "cell_type": "code", - "execution_count": 28, + "execution_count": 1, "id": "50852cb5", "metadata": {}, "outputs": [ @@ -249,36 +192,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "Device 0: AMD Instinct MI300X GPU, sm94 compute capability, 192.0GB memory\n", - "Running test_0 with cuDNN attention...\n", - "Error: results.stats.csv not found!\n", - "Error: results.stats.csv not found!\n", - "Error: results.stats.csv not found!\n", - "Traceback (most recent call last):\n", - " File \"/root/code/TransformerEngine/benchmarks/attention/benchmark_attention_rocm.py\", line 273, in \n", - " main()\n", - " File \"/root/code/TransformerEngine/benchmarks/attention/benchmark_attention_rocm.py\", line 255, in main\n", - " parse_results(model, df_times, filename_flash_attn, filename_fused_attn, filename_fused_ck, filename_fused_aotriton)\n", 
- " File \"/root/code/TransformerEngine/benchmarks/attention/benchmark_attention_rocm.py\", line 161, in parse_results\n", - " parse_helper(model, filename_fused_attn, \"FmhaFwd\", \"FmhaBwd\", \"FusedAttention\", df_times)\n", - " File \"/root/code/TransformerEngine/benchmarks/attention/benchmark_attention_rocm.py\", line 141, in parse_helper\n", - " df = pd.read_csv(filename)\n", - " File \"/opt/conda/envs/py_3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py\", line 912, in read_csv\n", - " return _read(filepath_or_buffer, kwds)\n", - " File \"/opt/conda/envs/py_3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py\", line 577, in _read\n", - " parser = TextFileReader(filepath_or_buffer, **kwds)\n", - " File \"/opt/conda/envs/py_3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py\", line 1407, in __init__\n", - " self._engine = self._make_engine(f, self.engine)\n", - " File \"/opt/conda/envs/py_3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py\", line 1661, in _make_engine\n", - " self.handles = get_handle(\n", - " File \"/opt/conda/envs/py_3.10/lib/python3.10/site-packages/pandas/io/common.py\", line 859, in get_handle\n", - " handle = open(\n", - "FileNotFoundError: [Errno 2] No such file or directory: 'profiler_outputs/prof_fused_test_0.csv'\n" + "Device 0: NVIDIA H100 80GB HBM3 GPU, sm90 compute capability, 79.1GB memory\n", + "Running test_0 with cuDNN attention and flash-attention...\n", + "Running test_1 with cuDNN attention and flash-attention...\n", + "Running test_2 with cuDNN attention...\n", + "Running test_3 with cuDNN attention and flash-attention...\n", + "\n", + " cuDNN fwd+bwd (ms) flash-attn fwd+bwd (ms) cuDNN vs flash speedup\n", + "test_0 0.0340 0.0468 1.3786\n", + "test_1 0.3664 0.5850 1.5968\n", + "test_2 0.9332 0.0000 0.0000\n", + "test_3 7.4875 11.8879 1.5877\n" ] } ], "source": [ - "!cd ../../../ && python benchmarks/attention/benchmark_attention_rocm.py" + "!cd ../../../benchmarks/attention/ && python benchmark_attention.py" ] }, { @@ -351,7 +280,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "906b8cf1", "metadata": {}, "outputs": [ @@ -363,12 +292,15 @@ "Run cuDNN attention...\n", "[INFO | DotProductAttention]: Running with FusedAttention backend (sub-backend 1)\n", "\n", + "Run flash-attention...\n", + "[INFO | DotProductAttention]: Running with FlashAttention backend\n", + "\n", "Test passed.\n" ] } ], "source": [ - "!cd ../../../ && NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=1 python docs/examples/attention/example_attention.py" + "!NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=1 python example_attention.py" ] }, { @@ -381,7 +313,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 23, "id": "d3637094", "metadata": {}, "outputs": [ @@ -391,18 +323,25 @@ "text": [ "\n", "Run cuDNN attention...\n", - "[DEBUG | DotProductAttention]: Running with config={'transformer_engine_version': '1.11.0+18ee57c', 'compute_capability': 'sm94', 'flash_attn_version': , 'cudnn_version': '99.0.0', 'qkv_type': , 'qkv_dtype': torch.bfloat16, 'qkv_layout': 'bshd_bshd_bshd', 'batch_size': 2, 'num_heads': 16, 'num_gqa_groups': 16, 'max_seqlen_q': 512, 'max_seqlen_kv': 512, 'head_dim_qk': 64, 'head_dim_v': 64, 'attn_mask_type': 'no_mask', 'window_size': (-1, -1), 'alibi_slopes_shape': None, 'core_attention_bias_type': 'no_bias', 'core_attention_bias_shape': None, 'core_attention_bias_requires_grad': False, 'pad_between_seqs': False, 'attention_dropout': 0.0, 'context_parallel': False, 'deterministic': 
False, 'is_training': True, 'fp8': False, 'fp8_meta': {'fp8_checkpoint': False, 'fp8_group': None, 'recipe': margin=0, format=HYBRID, amax_history_len=1024, wgrad_override=False, fp8_dpa=False, fp8_mha=False}}\n", + "[DEBUG | DotProductAttention]: Running with config={'transformer_engine_version': '1.10.0.dev0+ee85a91', 'compute_capability': 'sm90', 'flash_attn_version': , 'cudnn_version': '9.3.0', 'qkv_type': , 'qkv_dtype': torch.bfloat16, 'qkv_layout': 'bshd_bshd_bshd', 'batch_size': 2, 'num_heads': 16, 'num_gqa_groups': 16, 'max_seqlen_q': 512, 'max_seqlen_kv': 512, 'head_dim_qk': 64, 'head_dim_v': 64, 'attn_mask_type': 'no_mask', 'window_size': (-1, -1), 'alibi_slopes_shape': None, 'core_attention_bias_type': 'no_bias', 'core_attention_bias_shape': None, 'core_attention_bias_requires_grad': False, 'pad_between_seqs': False, 'attention_dropout': 0.0, 'context_parallel': False, 'deterministic': False, 'is_training': True, 'fp8': False, 'fp8_meta': {'fp8_checkpoint': False, 'fp8_group': None, 'recipe': margin=0, format=HYBRID, amax_history_len=1024, wgrad_override=False, fp8_dpa=False, fp8_mha=False}}\n", "[DEBUG | DotProductAttention]: Disabling FlashAttention due to NVTE_FLASH_ATTN=0\n", "[DEBUG | DotProductAttention]: Available backends = {FlashAttention=False, FusedAttention=True (sub-backend 1), UnfusedDotProductAttention=True}\n", "[DEBUG | DotProductAttention]: Selected backend = FusedAttention (sub-backend 1)\n", "[INFO | DotProductAttention]: Running with FusedAttention backend (sub-backend 1)\n", "\n", + "Run flash-attention...\n", + "[DEBUG | DotProductAttention]: Running with config={'transformer_engine_version': '1.10.0.dev0+ee85a91', 'compute_capability': 'sm90', 'flash_attn_version': , 'cudnn_version': '9.3.0', 'qkv_type': , 'qkv_dtype': torch.bfloat16, 'qkv_layout': 'bshd_bshd_bshd', 'batch_size': 2, 'num_heads': 16, 'num_gqa_groups': 16, 'max_seqlen_q': 512, 'max_seqlen_kv': 512, 'head_dim_qk': 64, 'head_dim_v': 64, 'attn_mask_type': 'no_mask', 'window_size': (-1, -1), 'alibi_slopes_shape': None, 'core_attention_bias_type': 'no_bias', 'core_attention_bias_shape': None, 'core_attention_bias_requires_grad': False, 'pad_between_seqs': False, 'attention_dropout': 0.0, 'context_parallel': False, 'deterministic': False, 'is_training': True, 'fp8': False, 'fp8_meta': {'fp8_checkpoint': False, 'fp8_group': None, 'recipe': margin=0, format=HYBRID, amax_history_len=1024, wgrad_override=False, fp8_dpa=False, fp8_mha=False}}\n", + "[DEBUG | DotProductAttention]: Disabling FusedAttention due to NVTE_FUSED_ATTN=0\n", + "[DEBUG | DotProductAttention]: Available backends = {FlashAttention=True, FusedAttention=False, UnfusedDotProductAttention=True}\n", + "[DEBUG | DotProductAttention]: Selected backend = FlashAttention\n", + "[INFO | DotProductAttention]: Running with FlashAttention backend\n", + "\n", "Test passed.\n" ] } ], "source": [ - "!cd ../../../ && NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=2 python docs/examples/attention/example_attention.py" + "!NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=2 python example_attention.py" ] }, { @@ -605,18 +544,9 @@ "output_type": "stream", "text": [ "Run with post_scale_bias:\n", - "[DEBUG | DotProductAttention]: Running with config={'transformer_engine_version': '1.11.0+18ee57c', 'compute_capability': 'sm94', 'flash_attn_version': , 'cudnn_version': '99.0.0', 'qkv_type': , 'qkv_dtype': torch.bfloat16, 'qkv_layout': 'bs3hd', 'batch_size': 4, 'num_heads': 16, 'num_gqa_groups': 16, 'max_seqlen_q': 2048, 'max_seqlen_kv': 2048, 'head_dim_qk': 64, 'head_dim_v': 64, 
'attn_mask_type': 'no_mask', 'window_size': (-1, -1), 'alibi_slopes_shape': None, 'core_attention_bias_type': 'post_scale_bias', 'core_attention_bias_shape': 'bhss', 'core_attention_bias_requires_grad': False, 'pad_between_seqs': False, 'attention_dropout': 0.0, 'context_parallel': False, 'deterministic': False, 'is_training': True, 'fp8': False, 'fp8_meta': {'fp8_checkpoint': False, 'fp8_group': None, 'recipe': margin=0, format=HYBRID, amax_history_len=1024, wgrad_override=False, fp8_dpa=False, fp8_mha=False}}\n", - "[DEBUG | DotProductAttention]: Disabling FlashAttention due to NVTE_FLASH_ATTN=0\n", - "[DEBUG | DotProductAttention]: Available backends = {FlashAttention=False, FusedAttention=True (sub-backend 1), UnfusedDotProductAttention=True}\n", - "[DEBUG | DotProductAttention]: Selected backend = FusedAttention (sub-backend 1)\n", "[INFO | DotProductAttention]: Running with FusedAttention backend (sub-backend 1)\n", "\n", "Run with arbitrary mask:\n", - "[DEBUG | DotProductAttention]: Running with config={'transformer_engine_version': '1.11.0+18ee57c', 'compute_capability': 'sm94', 'flash_attn_version': , 'cudnn_version': '99.0.0', 'qkv_type': , 'qkv_dtype': torch.bfloat16, 'qkv_layout': 'bs3hd', 'batch_size': 4, 'num_heads': 16, 'num_gqa_groups': 16, 'max_seqlen_q': 2048, 'max_seqlen_kv': 2048, 'head_dim_qk': 64, 'head_dim_v': 64, 'attn_mask_type': 'arbitrary', 'window_size': (-1, -1), 'alibi_slopes_shape': None, 'core_attention_bias_type': 'no_bias', 'core_attention_bias_shape': None, 'core_attention_bias_requires_grad': False, 'pad_between_seqs': False, 'attention_dropout': 0.0, 'context_parallel': False, 'deterministic': False, 'is_training': True, 'fp8': False, 'fp8_meta': {'fp8_checkpoint': False, 'fp8_group': None, 'recipe': margin=0, format=HYBRID, amax_history_len=1024, wgrad_override=False, fp8_dpa=False, fp8_mha=False}}\n", - "[DEBUG | DotProductAttention]: Disabling FlashAttention due to NVTE_FLASH_ATTN=0\n", - "[DEBUG | DotProductAttention]: Disabling FusedAttention for arbitrary mask\n", - "[DEBUG | DotProductAttention]: Available backends = {FlashAttention=False, FusedAttention=False, UnfusedDotProductAttention=True}\n", - "[DEBUG | DotProductAttention]: Selected backend = UnfusedDotProductAttention\n", "[INFO | DotProductAttention]: Running with UnfusedDotProductAttention backend\n", "\n", "Test passed!\n" @@ -624,7 +554,7 @@ } ], "source": [ - "!cd ../../../ && NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=2 python docs/examples/attention/arbitrary_mask_to_post_scale_bias.py" + "!NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=1 python arbitrary_mask_to_post_scale_bias.py" ] }, { @@ -702,7 +632,7 @@ ], "metadata": { "kernelspec": { - "display_name": "py_3.10", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -716,7 +646,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.15" + "version": "3.10.12" } }, "nbformat": 4, From fec4facabcbd4fb69ea5dc2ba0f0d2b0da86be1a Mon Sep 17 00:00:00 2001 From: Anh Minh Nguyen Hoang Date: Tue, 4 Mar 2025 12:11:46 +0200 Subject: [PATCH 4/6] Add missings --- docs/examples/attention/attention_rocm.ipynb | 78 +++++++++++++++++++- 1 file changed, 75 insertions(+), 3 deletions(-) diff --git a/docs/examples/attention/attention_rocm.ipynb b/docs/examples/attention/attention_rocm.ipynb index 3df948a25..9e3c323c6 100644 --- a/docs/examples/attention/attention_rocm.ipynb +++ b/docs/examples/attention/attention_rocm.ipynb @@ -228,6 +228,81 @@ "!pip install nvtx" ] }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "37821a48", + "metadata": {}, + "outputs": [], + "source": [ + "# Make sure tests is importable\n", + "!touch ../../../tests/__init__.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2793ef00", + "metadata": {}, + "outputs": [], + "source": [ + "# Exclude unimportable function\n", + "!sed -i '27s/^/#/' ../../../tests/pytorch/fused_attn/test_fused_attn.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "398e045d", + "metadata": {}, + "outputs": [], + "source": [ + "# Avoid importing transformer_engine directory and use the installed package instead\n", + "!mv ../../../transformer_engine/ ../../../transformer_engine_save/" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "50852cb5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Device 0: AMD Instinct MI300X GPU, sm94 compute capability, 192.0GB memory\n", + "Running test_0 with cuDNN attention...\n", + "Error: results.stats.csv not found!\n", + "Error: results.stats.csv not found!\n", + "Error: results.stats.csv not found!\n", + "Traceback (most recent call last):\n", + " File \"/root/code/TransformerEngine/benchmarks/attention/benchmark_attention_rocm.py\", line 273, in \n", + " main()\n", + " File \"/root/code/TransformerEngine/benchmarks/attention/benchmark_attention_rocm.py\", line 255, in main\n", + " parse_results(model, df_times, filename_flash_attn, filename_fused_attn, filename_fused_ck, filename_fused_aotriton)\n", + " File \"/root/code/TransformerEngine/benchmarks/attention/benchmark_attention_rocm.py\", line 161, in parse_results\n", + " parse_helper(model, filename_fused_attn, \"FmhaFwd\", \"FmhaBwd\", \"FusedAttention\", df_times)\n", + " File \"/root/code/TransformerEngine/benchmarks/attention/benchmark_attention_rocm.py\", line 141, in parse_helper\n", + " df = pd.read_csv(filename)\n", + " File \"/opt/conda/envs/py_3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py\", line 912, in read_csv\n", + " return _read(filepath_or_buffer, kwds)\n", + " File \"/opt/conda/envs/py_3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py\", line 577, in _read\n", + " parser = TextFileReader(filepath_or_buffer, **kwds)\n", + " File \"/opt/conda/envs/py_3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py\", line 1407, in __init__\n", + " self._engine = self._make_engine(f, self.engine)\n", + " File \"/opt/conda/envs/py_3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py\", line 1661, in _make_engine\n", + " self.handles = get_handle(\n", + " File \"/opt/conda/envs/py_3.10/lib/python3.10/site-packages/pandas/io/common.py\", line 859, in get_handle\n", + " handle = open(\n", + "FileNotFoundError: [Errno 2] No such file or directory: 'profiler_outputs/prof_fused_test_0.csv'\n" + ] + } + ], + "source": [ + "!cd ../../../ && python benchmarks/attention/benchmark_attention_rocm.py" + ] + }, { "cell_type": "markdown", "id": "9a615119", @@ -310,9 +385,6 @@ "Run cuDNN attention...\n", "[INFO | DotProductAttention]: Running with FusedAttention backend (sub-backend 1)\n", "\n", - "Run flash-attention...\n", - "[INFO | DotProductAttention]: Running with FlashAttention backend\n", - "\n", "Test passed.\n" ] } From 069d5870aed0cbedc404556f980422af8a54562f Mon Sep 17 00:00:00 2001 From: Anh Minh Nguyen Hoang Date: Fri, 14 Mar 2025 13:57:55 +0000 Subject: [PATCH 5/6] doc: Update contents with more findings and ROCm backend run output --- 
docs/examples/attention/attention_rocm.ipynb | 172 +++++++++++++------ 1 file changed, 118 insertions(+), 54 deletions(-) diff --git a/docs/examples/attention/attention_rocm.ipynb b/docs/examples/attention/attention_rocm.ipynb index 9e3c323c6..7e17785f3 100644 --- a/docs/examples/attention/attention_rocm.ipynb +++ b/docs/examples/attention/attention_rocm.ipynb @@ -126,7 +126,7 @@ " Flash\n", " BF16/FP16\n", " Any \n", - " gfx942, gfx90a \n", + " gfx942 \n", " \n", " \n", " \n", @@ -135,7 +135,7 @@ " Flash\n", " BF16/FP16\n", " Any \n", - " gfx942, gfx90a \n", + " gfx942 \n", " \n", " \n", " \n", @@ -206,7 +206,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 1, "id": "b95a2a20", "metadata": {}, "outputs": [ @@ -220,7 +220,9 @@ "Installing collected packages: nvtx\n", "Successfully installed nvtx-0.2.11\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n", - "\u001b[0m" + "\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" ] } ], @@ -252,7 +254,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "398e045d", "metadata": {}, "outputs": [], @@ -263,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 9, "id": "50852cb5", "metadata": {}, "outputs": [ @@ -273,34 +275,74 @@ "text": [ "Device 0: AMD Instinct MI300X GPU, sm94 compute capability, 192.0GB memory\n", "Running test_0 with cuDNN attention...\n", - "Error: results.stats.csv not found!\n", - "Error: results.stats.csv not found!\n", - "Error: results.stats.csv not found!\n", - "Traceback (most recent call last):\n", - " File \"/root/code/TransformerEngine/benchmarks/attention/benchmark_attention_rocm.py\", line 273, in \n", - " main()\n", - " File \"/root/code/TransformerEngine/benchmarks/attention/benchmark_attention_rocm.py\", line 255, in main\n", - " parse_results(model, df_times, filename_flash_attn, filename_fused_attn, filename_fused_ck, filename_fused_aotriton)\n", - " File \"/root/code/TransformerEngine/benchmarks/attention/benchmark_attention_rocm.py\", line 161, in parse_results\n", - " parse_helper(model, filename_fused_attn, \"FmhaFwd\", \"FmhaBwd\", \"FusedAttention\", df_times)\n", - " File \"/root/code/TransformerEngine/benchmarks/attention/benchmark_attention_rocm.py\", line 141, in parse_helper\n", - " df = pd.read_csv(filename)\n", - " File \"/opt/conda/envs/py_3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py\", line 912, in read_csv\n", - " return _read(filepath_or_buffer, kwds)\n", - " File \"/opt/conda/envs/py_3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py\", line 577, in _read\n", - " parser = TextFileReader(filepath_or_buffer, **kwds)\n", - " File \"/opt/conda/envs/py_3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py\", line 1407, in __init__\n", - " self._engine = self._make_engine(f, 
self.engine)\n", - " File \"/opt/conda/envs/py_3.10/lib/python3.10/site-packages/pandas/io/parsers/readers.py\", line 1661, in _make_engine\n", - " self.handles = get_handle(\n", - " File \"/opt/conda/envs/py_3.10/lib/python3.10/site-packages/pandas/io/common.py\", line 859, in get_handle\n", - " handle = open(\n", - "FileNotFoundError: [Errno 2] No such file or directory: 'profiler_outputs/prof_fused_test_0.csv'\n" + "Running CK Backend\n", + "Running AOTriton Backend\n", + "environ({'SHELL': '/bin/bash', 'NVTE_FRAMEWORK': 'pytorch', 'INSTALLED_DB': 'yes', 'PYTORCH_TESTING_DEVICE_ONLY_FOR': 'cuda', 'PYTHONUNBUFFERED': '1', 'CONDA_EXE': '/opt/conda/bin/conda', '_CE_M': '', 'BUILD_ENVIRONMENT': 'pytorch-linux-jammy-rocm6.3-py3.10', 'HOSTNAME': 'tw036', 'SSH_AUTH_SOCK': '/tmp/vscode-ssh-auth-b542e04c-02b5-4292-98e2-bbad291bc1f4.sock', 'PYTHON_FROZEN_MODULES': 'on', 'ELECTRON_RUN_AS_NODE': '1', 'REMOTE_CONTAINERS_IPC': '/tmp/vscode-remote-containers-ipc-b542e04c-02b5-4292-98e2-bbad291bc1f4.sock', 'NCCL_SOCKET_IFNAME': 'ens51np0', 'PWD': '/workspace/code/TransformerEngine/benchmarks/attention', 'CONDA_ROOT': '/opt/conda', 'CONDA_PREFIX': '/opt/conda/envs/py_3.10', 'MEGATRON_LM_PATH': '/workspace/Megatron-LM', 'VSCODE_ESM_ENTRYPOINT': 'vs/workbench/api/node/extensionHostProcess', 'PYDEVD_IPYTHON_COMPATIBLE_DEBUGGING': '1', 'NCCL_IB_HCA': 'rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7', 'MPI_HOME': '/opt/ompi', 'HOME': '/root', 'LANG': 'C.UTF-8', 'LS_COLORS': '', 'PYTORCH_TEST_WITH_ROCM': '1', 'REMOTE_CONTAINERS': 'true', 'UCX_HOME': '/opt/ucx', 'FORCE_COLOR': '1', 'CONDA_PROMPT_MODIFIER': '(py_3.10) ', 'PYDEVD_USE_FRAME_EVAL': 'NO', 'MAGMA_HOME': '/opt/rocm/magma', 'CLICOLOR': '1', 'VSCODE_L10N_BUNDLE_LOCATION': '', 'DATA_DIR_ROOT': '/workspace/data', 'CLICOLOR_FORCE': '1', 'NVTE_ROCM_ARCH': 'gfx942', 'HIP_FORCE_DEV_KERNARG': '1', 'VSCODE_HANDLES_SIGPIPE': 'true', 'ANACONDA_PYTHON_VERSION': '3.10', 'PYTHONPATH': ':', 'TERM': 'xterm-color', '_CE_CONDA': '', 'INSTALLED_VISION': 'yes', 'REMOTE_CONTAINERS_SOCKETS': '[\"/tmp/vscode-ssh-auth-b542e04c-02b5-4292-98e2-bbad291bc1f4.sock\",\"/tmp/.X11-unix/X4\",\"/root/.gnupg/S.gpg-agent\"]', 'GIT_PAGER': 'cat', 'PYTHONIOENCODING': 'utf-8', 'CONDA_SHLVL': '2', 'PYTORCH_ROCM_ARCH': 'gfx942', 'DISPLAY': ':4', 'NVTE_CK_V3_BF16_CVT': '2', 'SHLVL': '2', 'MAX_JOBS': '32', 'PAGER': 'cat', 'VSCODE_CWD': '/root/.vscode-server/bin/e54c774e0add60467559eb0d1e229c6452cf8447', 'LLVM_SYMBOLIZER_PATH': '/opt/rocm/llvm/bin/llvm-symbolizer', 'ROCM_PATH': '/opt/rocm', 'MPLBACKEND': 'module://matplotlib_inline.backend_inline', 'CONDA_PYTHON_EXE': '/opt/conda/bin/python', 'LD_LIBRARY_PATH': '/opt/ompi/lib:/opt/rocm/lib:/usr/local/lib::/opt/rocm/lib/:', 'CONDA_DEFAULT_ENV': 'py_3.10', 'DEBUGINFOD_URLS': '', 'LC_ALL': 'C.UTF-8', 'NVTE_USE_CAST_TRANSPOSE_TRITON': '0', 'BROWSER': '/root/.vscode-server/bin/e54c774e0add60467559eb0d1e229c6452cf8447/bin/helpers/browser.sh', 'PATH': '/opt/conda/envs/py_3.10/bin:/opt/conda/condabin:/root/.vscode-server/bin/e54c774e0add60467559eb0d1e229c6452cf8447/bin/remote-cli:/opt/ompi/bin:/opt/ucx/bin:/opt/cache/bin:/opt/rocm/llvm/bin:/opt/rocm/opencl/bin:/opt/rocm/hip/bin:/opt/rocm/hcc/bin:/opt/rocm/bin:/opt/conda/envs/py_3.10/bin:/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/rocm/bin:.', 'CI': '1', 'HSA_FORCE_FINE_GRAIN_PCIE': '1', 'NVTE_USE_HIPBLASLT': '1', 'VSCODE_NLS_CONFIG': 
'{\"userLocale\":\"en\",\"osLocale\":\"en\",\"resolvedLanguage\":\"en\",\"defaultMessagesFile\":\"/root/.vscode-server/bin/e54c774e0add60467559eb0d1e229c6452cf8447/out/nls.messages.json\",\"locale\":\"en\",\"availableLanguages\":{}}', 'INSTALLED_PROTOBUF': 'yes', 'CONDA_PREFIX_1': '/opt/conda', 'GPU_ARCHS': 'gfx942', 'DEBIAN_FRONTEND': 'noninteractive', 'MODELS_DIR_ROOT': '/workspace/models', 'VSCODE_HANDLES_UNCAUGHT_ERRORS': 'true', 'REMOTE_CONTAINERS_DISPLAY_SOCK': '/tmp/.X11-unix/X4', 'OLDPWD': '/workspace/code/TransformerEngine/docs/examples/attention', 'VSCODE_IPC_HOOK_CLI': '/tmp/vscode-ipc-3cbb5860-fc6c-4cad-a32a-079cd8f8fe81.sock', '_': '/opt/conda/envs/py_3.10/bin/python', 'TORCHINDUCTOR_CACHE_DIR': '/tmp/torchinductor_root', 'CUDA_MODULE_LOADING': 'LAZY', 'NVTE_FLASH_ATTN': '0', 'NVTE_FUSED_ATTN': '1', 'NVTE_UNFUSED_ATTN': '1', 'NVTE_FUSED_ATTN_FORCE_WORKSPACE_OPT': '1', 'NVTE_FUSED_ATTN_AOTRITON': '1', 'NVTE_FUSED_ATTN_CK': '0', 'NVTE_FUSED_ATTN_BACKEND': '1', 'NVTE_LOG_AOTRITON_CONFIG': '1', 'NVTE_DEBUG': '1', 'NVTE_DEBUG_LEVEL': '2'})\n", + "\n", + "attn_fwd(aotriton): q_shape: (2, 16, 512, 64), q_stride: (524288, 64, 1024, 1), kv_shape: (2, 16, 512, 64), k_stride: (524288, 64, 1024, 1), v_stride: (524288, 64, 1024, 1), scaling_factor: 0.125, M_shape: (32, 512), M_stride: (512, 1), o_shape: (2, 16, 512, 64), o_stride: (524288, 64, 1024, 1), is_training: 1, dropout_p: 0, philox_seed: 4294967297, philox_offset: 140517004526856, causal mask: 0\n", + "\n", + "attn_bwd(aotriton): q_shape: (2, 16, 512, 64), q_stride: (524288, 64, 1024, 1), kv_shape: (2, 16, 512, 64), k_stride: (524288, 64, 1024, 1), v_stride: (524288, 64, 1024, 1), scaling_factor: 0.125, M_shape: (32, 512), M_stride: (512, 1), o_shape: (2, 16, 512, 64), o_stride: (524288, 64, 1024, 1), dropout_p: 0, philox_seed: 140240993893488, philox_offset: 140240882807152, causal mask: 0\n", + "\n", + "attn_fwd(aotriton): q_shape: (2, 16, 512, 64), q_stride: (524288, 64, 1024, 1), kv_shape: (2, 16, 512, 64), k_stride: (524288, 64, 1024, 1), v_stride: (524288, 64, 1024, 1), scaling_factor: 0.125, M_shape: (32, 512), M_stride: (512, 1), o_shape: (2, 16, 512, 64), o_stride: (524288, 64, 1024, 1), is_training: 1, dropout_p: 0, philox_seed: 4294967297, philox_offset: 140517004526856, causal mask: 0\n", + "\n", + "attn_bwd(aotriton): q_shape: (2, 16, 512, 64), q_stride: (524288, 64, 1024, 1), kv_shape: (2, 16, 512, 64), k_stride: (524288, 64, 1024, 1), v_stride: (524288, 64, 1024, 1), scaling_factor: 0.125, M_shape: (32, 512), M_stride: (512, 1), o_shape: (2, 16, 512, 64), o_stride: (524288, 64, 1024, 1), dropout_p: 0, philox_seed: 140240993893488, philox_offset: 140240882807152, causal mask: 0\n", + "\n", + "attn_fwd(aotriton): q_shape: (2, 16, 512, 64), q_stride: (524288, 64, 1024, 1), kv_shape: (2, 16, 512, 64), k_stride: (524288, 64, 1024, 1), v_stride: (524288, 64, 1024, 1), scaling_factor: 0.125, M_shape: (32, 512), M_stride: (512, 1), o_shape: (2, 16, 512, 64), o_stride: (524288, 64, 1024, 1), is_training: 1, dropout_p: 0, philox_seed: 4294967297, philox_offset: 140517004526856, causal mask: 0\n", + "\n", + "attn_bwd(aotriton): q_shape: (2, 16, 512, 64), q_stride: (524288, 64, 1024, 1), kv_shape: (2, 16, 512, 64), k_stride: (524288, 64, 1024, 1), v_stride: (524288, 64, 1024, 1), scaling_factor: 0.125, M_shape: (32, 512), M_stride: (512, 1), o_shape: (2, 16, 512, 64), o_stride: (524288, 64, 1024, 1), dropout_p: 0, philox_seed: 140240993893488, philox_offset: 140240882807152, causal mask: 0\n", + " FusedAttention Module ... 
FusedAttention AOTriton Kernels (fwd+bwd)\n", + "Model ... \n", + "test_0 54.2266 ... 0.0\n", + "test_1 0.0000 ... 0.0\n", + "test_2 0.0000 ... 0.0\n", + "test_3 0.0000 ... 0.0\n", + "\n", + "[4 rows x 17 columns]\n", + "Running test_1 with cuDNN attention...\n", + "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/transformer_engine/pytorch/attention.py:4832: UserWarning: window_size should be (-1, 0) or (>=0, 0) for attn_mask_type=causal\n", + " warnings.warn(\n", + "Running CK Backend\n", + "Running AOTriton Backend\n", + "environ({'SHELL': '/bin/bash', 'NVTE_FRAMEWORK': 'pytorch', 'INSTALLED_DB': 'yes', 'PYTORCH_TESTING_DEVICE_ONLY_FOR': 'cuda', 'PYTHONUNBUFFERED': '1', 'CONDA_EXE': '/opt/conda/bin/conda', '_CE_M': '', 'BUILD_ENVIRONMENT': 'pytorch-linux-jammy-rocm6.3-py3.10', 'HOSTNAME': 'tw036', 'SSH_AUTH_SOCK': '/tmp/vscode-ssh-auth-b542e04c-02b5-4292-98e2-bbad291bc1f4.sock', 'PYTHON_FROZEN_MODULES': 'on', 'ELECTRON_RUN_AS_NODE': '1', 'REMOTE_CONTAINERS_IPC': '/tmp/vscode-remote-containers-ipc-b542e04c-02b5-4292-98e2-bbad291bc1f4.sock', 'NCCL_SOCKET_IFNAME': 'ens51np0', 'PWD': '/workspace/code/TransformerEngine/benchmarks/attention', 'CONDA_ROOT': '/opt/conda', 'CONDA_PREFIX': '/opt/conda/envs/py_3.10', 'MEGATRON_LM_PATH': '/workspace/Megatron-LM', 'VSCODE_ESM_ENTRYPOINT': 'vs/workbench/api/node/extensionHostProcess', 'PYDEVD_IPYTHON_COMPATIBLE_DEBUGGING': '1', 'NCCL_IB_HCA': 'rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7', 'MPI_HOME': '/opt/ompi', 'HOME': '/root', 'LANG': 'C.UTF-8', 'LS_COLORS': '', 'PYTORCH_TEST_WITH_ROCM': '1', 'REMOTE_CONTAINERS': 'true', 'UCX_HOME': '/opt/ucx', 'FORCE_COLOR': '1', 'CONDA_PROMPT_MODIFIER': '(py_3.10) ', 'PYDEVD_USE_FRAME_EVAL': 'NO', 'MAGMA_HOME': '/opt/rocm/magma', 'CLICOLOR': '1', 'VSCODE_L10N_BUNDLE_LOCATION': '', 'DATA_DIR_ROOT': '/workspace/data', 'CLICOLOR_FORCE': '1', 'NVTE_ROCM_ARCH': 'gfx942', 'HIP_FORCE_DEV_KERNARG': '1', 'VSCODE_HANDLES_SIGPIPE': 'true', 'ANACONDA_PYTHON_VERSION': '3.10', 'PYTHONPATH': ':', 'TERM': 'xterm-color', '_CE_CONDA': '', 'INSTALLED_VISION': 'yes', 'REMOTE_CONTAINERS_SOCKETS': '[\"/tmp/vscode-ssh-auth-b542e04c-02b5-4292-98e2-bbad291bc1f4.sock\",\"/tmp/.X11-unix/X4\",\"/root/.gnupg/S.gpg-agent\"]', 'GIT_PAGER': 'cat', 'PYTHONIOENCODING': 'utf-8', 'CONDA_SHLVL': '2', 'PYTORCH_ROCM_ARCH': 'gfx942', 'DISPLAY': ':4', 'NVTE_CK_V3_BF16_CVT': '2', 'SHLVL': '2', 'MAX_JOBS': '32', 'PAGER': 'cat', 'VSCODE_CWD': '/root/.vscode-server/bin/e54c774e0add60467559eb0d1e229c6452cf8447', 'LLVM_SYMBOLIZER_PATH': '/opt/rocm/llvm/bin/llvm-symbolizer', 'ROCM_PATH': '/opt/rocm', 'MPLBACKEND': 'module://matplotlib_inline.backend_inline', 'CONDA_PYTHON_EXE': '/opt/conda/bin/python', 'LD_LIBRARY_PATH': '/opt/ompi/lib:/opt/rocm/lib:/usr/local/lib::/opt/rocm/lib/:', 'CONDA_DEFAULT_ENV': 'py_3.10', 'DEBUGINFOD_URLS': '', 'LC_ALL': 'C.UTF-8', 'NVTE_USE_CAST_TRANSPOSE_TRITON': '0', 'BROWSER': '/root/.vscode-server/bin/e54c774e0add60467559eb0d1e229c6452cf8447/bin/helpers/browser.sh', 'PATH': '/opt/conda/envs/py_3.10/bin:/opt/conda/condabin:/root/.vscode-server/bin/e54c774e0add60467559eb0d1e229c6452cf8447/bin/remote-cli:/opt/ompi/bin:/opt/ucx/bin:/opt/cache/bin:/opt/rocm/llvm/bin:/opt/rocm/opencl/bin:/opt/rocm/hip/bin:/opt/rocm/hcc/bin:/opt/rocm/bin:/opt/conda/envs/py_3.10/bin:/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/rocm/bin:.', 'CI': '1', 'HSA_FORCE_FINE_GRAIN_PCIE': '1', 'NVTE_USE_HIPBLASLT': '1', 'VSCODE_NLS_CONFIG': 
'{\"userLocale\":\"en\",\"osLocale\":\"en\",\"resolvedLanguage\":\"en\",\"defaultMessagesFile\":\"/root/.vscode-server/bin/e54c774e0add60467559eb0d1e229c6452cf8447/out/nls.messages.json\",\"locale\":\"en\",\"availableLanguages\":{}}', 'INSTALLED_PROTOBUF': 'yes', 'CONDA_PREFIX_1': '/opt/conda', 'GPU_ARCHS': 'gfx942', 'DEBIAN_FRONTEND': 'noninteractive', 'MODELS_DIR_ROOT': '/workspace/models', 'VSCODE_HANDLES_UNCAUGHT_ERRORS': 'true', 'REMOTE_CONTAINERS_DISPLAY_SOCK': '/tmp/.X11-unix/X4', 'OLDPWD': '/workspace/code/TransformerEngine/docs/examples/attention', 'VSCODE_IPC_HOOK_CLI': '/tmp/vscode-ipc-3cbb5860-fc6c-4cad-a32a-079cd8f8fe81.sock', '_': '/opt/conda/envs/py_3.10/bin/python', 'TORCHINDUCTOR_CACHE_DIR': '/tmp/torchinductor_root', 'CUDA_MODULE_LOADING': 'LAZY', 'NVTE_FLASH_ATTN': '0', 'NVTE_UNFUSED_ATTN': '1', 'NVTE_FUSED_ATTN_FORCE_WORKSPACE_OPT': '1', 'NVTE_FUSED_ATTN_BACKEND': '1', 'NVTE_LOG_AOTRITON_CONFIG': '1', 'NVTE_DEBUG': '1', 'NVTE_DEBUG_LEVEL': '2', 'NVTE_FUSED_ATTN': '1', 'NVTE_FUSED_ATTN_AOTRITON': '1', 'NVTE_FUSED_ATTN_CK': '0'})\n", + "\n", + "attn_fwd(aotriton): q_shape: (2, 16, 2048, 128), q_stride: (4194304, 128, 2048, 1), kv_shape: (2, 16, 2048, 128), k_stride: (4194304, 128, 2048, 1), v_stride: (4194304, 128, 2048, 1), scaling_factor: 0.0883883, M_shape: (32, 2048), M_stride: (2048, 1), o_shape: (2, 16, 2048, 128), o_stride: (4194304, 128, 2048, 1), is_training: 1, dropout_p: 0, philox_seed: 4294967297, philox_offset: 140517004526856, causal mask: 1\n", + "\n", + "attn_bwd(aotriton): q_shape: (2, 16, 2048, 128), q_stride: (4194304, 128, 2048, 1), kv_shape: (2, 16, 2048, 128), k_stride: (4194304, 128, 2048, 1), v_stride: (4194304, 128, 2048, 1), scaling_factor: 0.0883883, M_shape: (32, 2048), M_stride: (2048, 1), o_shape: (2, 16, 2048, 128), o_stride: (4194304, 128, 2048, 1), dropout_p: 0, philox_seed: 140240993893488, philox_offset: 140240882807152, causal mask: 1\n", + "\n", + "attn_fwd(aotriton): q_shape: (2, 16, 2048, 128), q_stride: (4194304, 128, 2048, 1), kv_shape: (2, 16, 2048, 128), k_stride: (4194304, 128, 2048, 1), v_stride: (4194304, 128, 2048, 1), scaling_factor: 0.0883883, M_shape: (32, 2048), M_stride: (2048, 1), o_shape: (2, 16, 2048, 128), o_stride: (4194304, 128, 2048, 1), is_training: 1, dropout_p: 0, philox_seed: 4294967297, philox_offset: 140517004526856, causal mask: 1\n", + "\n", + "attn_bwd(aotriton): q_shape: (2, 16, 2048, 128), q_stride: (4194304, 128, 2048, 1), kv_shape: (2, 16, 2048, 128), k_stride: (4194304, 128, 2048, 1), v_stride: (4194304, 128, 2048, 1), scaling_factor: 0.0883883, M_shape: (32, 2048), M_stride: (2048, 1), o_shape: (2, 16, 2048, 128), o_stride: (4194304, 128, 2048, 1), dropout_p: 0, philox_seed: 140240993893488, philox_offset: 140240883216272, causal mask: 1\n", + "\n", + "attn_fwd(aotriton): q_shape: (2, 16, 2048, 128), q_stride: (4194304, 128, 2048, 1), kv_shape: (2, 16, 2048, 128), k_stride: (4194304, 128, 2048, 1), v_stride: (4194304, 128, 2048, 1), scaling_factor: 0.0883883, M_shape: (32, 2048), M_stride: (2048, 1), o_shape: (2, 16, 2048, 128), o_stride: (4194304, 128, 2048, 1), is_training: 1, dropout_p: 0, philox_seed: 4294967297, philox_offset: 140517004526856, causal mask: 1\n", + "\n", + "attn_bwd(aotriton): q_shape: (2, 16, 2048, 128), q_stride: (4194304, 128, 2048, 1), kv_shape: (2, 16, 2048, 128), k_stride: (4194304, 128, 2048, 1), v_stride: (4194304, 128, 2048, 1), scaling_factor: 0.0883883, M_shape: (32, 2048), M_stride: (2048, 1), o_shape: (2, 16, 2048, 128), o_stride: (4194304, 128, 2048, 1), dropout_p: 
0, philox_seed: 140240993893488, philox_offset: 140240883216272, causal mask: 1\n", + " FusedAttention Module ... FusedAttention AOTriton Kernels (fwd+bwd)\n", + "Model ... \n", + "test_0 54.2266 ... 0.7426\n", + "test_1 56.3781 ... 0.0000\n", + "test_2 0.0000 ... 0.0000\n", + "test_3 0.0000 ... 0.0000\n", + "\n", + "[4 rows x 17 columns]\n", + "Running test_2 with cuDNN attention...\n", + "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/transformer_engine/pytorch/attention.py:4832: UserWarning: window_size should be (-1, 0) or (>=0, 0) for attn_mask_type=causal\n", + " warnings.warn(\n", + "Running CK Backend\n", + "Running test_3 with cuDNN attention...\n", + "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/transformer_engine/pytorch/attention.py:4832: UserWarning: window_size should be (-1, 0) or (>=0, 0) for attn_mask_type=causal\n", + " warnings.warn(\n", + "Running CK Backend\n", + "\n", + " cuDNN fwd+bwd (ms) flash-attn fwd+bwd (ms) cuDNN vs flash speedup\n", + "test_0 0.0833 0.0 0.0\n", + "test_1 0.7294 0.0 0.0\n", + "test_2 1.2165 0.0 0.0\n", + "test_3 15.9134 0.0 0.0\n" ] } ], "source": [ - "!cd ../../../ && python benchmarks/attention/benchmark_attention_rocm.py" + "!cd ../../../benchmarks/attention/ && python benchmark_attention_rocm.py" ] }, { @@ -312,7 +354,7 @@ "\n", "Given the various attention backends, Transformer Engine has a selection logic in place to choose the most appropriate backend for a particular set of user inputs and runtime environment. The selection logic is based on both backend availability and backend performance.\n", "\n", - "Backend availability is determined by factors such as model configuration, training hyper-parameters, software versions, and the GPU architecture in question. For example, some considerations are the sequence length, number of attention heads, head size, attention mask type, attention bias type, training or inference mode, self or cross attention, MHA or MQA/GQA, `flash-attn`/cuDNN library versions, and the compute capability of the GPU.\n", + "Backend availability is determined by factors such as model configuration, training hyper-parameters, software versions, and the GPU architecture in question. For example, some considerations are the sequence length, number of attention heads, head size, attention mask type, attention bias type, training or inference mode, self or cross attention, MHA or MQA/GQA, `flash-attn`/cuDNN/ROCm library versions, and the compute capability of the GPU.\n", "\n", "When there are multiple backends available, Transformer Engine makes backend selection based on performance. In general, there are a few rules being followed in our selection logic (see table below). As we monitor the performance of different backends, the selection logic may change.\n", "\n", @@ -322,7 +364,10 @@ " Selection Order\n", " \n", " \n", - " PyTorch\n", + " PyTorch\n", + " gfx942: ROCm attention > flash-attention > PyTorch-native attention\n", + " \n", + " \n", " sm90: cuDNN attention > flash-attention > PyTorch-native attention\n", " \n", " \n", @@ -437,16 +482,18 @@ "Users usually do not need to worry about the backend selection. 
However, if there is a convergence or performance issue encountered, Transformer Engine provides a few other environment variables for users to experiment with different backends.\n",
     "\n",
     "**flash-attention or cuDNN attention:**\n",
-    "Users can enable/disable the flash-attention backend or cuDNN attention backend via the following two environment variables in PyTorch.\n",
+    "Users can enable/disable the flash-attention backend or cuDNN/ROCm attention backend via the following two environment variables in PyTorch.\n",
     "```\n",
     "NVTE_FLASH_ATTN = 0 # disables flash-attention; default = 1\n",
     "NVTE_FUSED_ATTN = 0 # disables cuDNN attention; default = 1\n",
     "```\n",
     "\n",
-    "**cuDNN attention sub-backends:**\n",
+    "**cuDNN/ROCm attention sub-backends:**\n",
     "This environment variable allows users to express their preference of cuDNN attention sub-backends. However, the elected sub-backend will only be used *if* it is eligible, i.e. if it has support for the provided inputs and runtime environment.\n",
     "```\n",
-    "NVTE_FUSED_ATTN_BACKEND = 0/1/2 # user preference of cuDNN sub-backend\n",
+    "NVTE_FUSED_ATTN_BACKEND = 0/1/2 # user preference of cuDNN sub-backend\n",
+    "NVTE_FUSED_ATTN_CK = 0/1 # enable the ROCm CK sub-backend\n",
+    "NVTE_FUSED_ATTN_AOTRITON = 0/1 # enable the ROCm AOTriton sub-backend\n",
     "```\n",
     "\n",
     "**Execution paths of cuDNN sub-backend 1:**\n",
@@ -470,9 +517,22 @@
     "\n",
     "### 2.3 Example Tests\n",
     "\n",
-    "Our [unit tests](https://github.com/NVIDIA/TransformerEngine/tree/main/tests) demonstrate the use of Transformer Engine dot product attention APIs. Users are encouraged to use them as a template when integrating Transformer Engine to their ML workflows.\n",
+    "Our [unit tests](https://github.com/ROCm/TransformerEngine/tree/dev/tests) demonstrate the use of Transformer Engine dot product attention APIs. Users are encouraged to use them as a template when integrating Transformer Engine to their ML workflows.\n",
     "\n",
-    "For example, in PyTorch, [test_dot_product_attention](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py) offers a variety of use cases of `pytorch.DotProductAttention`, from data types, model configs, checkpointing, to QKV layouts."
+    "For example, in PyTorch, [test_dot_product_attention](https://github.com/ROCm/TransformerEngine/blob/dev/tests/pytorch/fused_attn/test_fused_attn.py) offers a variety of use cases of `pytorch.DotProductAttention`, from data types, model configs, checkpointing, to QKV layouts.\n",
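+    "\n",
+    "As a quick illustration (a sketch, not part of the unit tests), the backend-selection environment variables from the previous section can also be set from Python before running the examples; the variable names are the ones documented above, and the values chosen here are only an example:\n",
+    "\n",
+    "```\n",
+    "import os\n",
+    "\n",
+    "# Example values: prefer the ROCm CK sub-backend and disable flash-attention.\n",
+    "# Exporting the same variables in the shell (as in the cells above) works equally well.\n",
+    "os.environ[\"NVTE_FLASH_ATTN\"] = \"0\"\n",
+    "os.environ[\"NVTE_FUSED_ATTN\"] = \"1\"\n",
+    "os.environ[\"NVTE_FUSED_ATTN_CK\"] = \"1\"\n",
+    "os.environ[\"NVTE_FUSED_ATTN_AOTRITON\"] = \"0\"\n",
+    "```"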
] }, { @@ -486,16 +533,16 @@ "\n", "| Attention Backend | Precision | Architecture | Sliding Window Attention | MQA/GQA | Multi-Latent Attention | Context Parallelism | Determinism Possible |\n", "| :---------------- | :-------- | :----------- | :----------------------- | :------ | :--------------------- | :------------------ | :------------ |\n", - "| ROCm attention (all frameworks) | BF16, FP16 | gfx90a, gfx942 | No | Yes | Yes | Yes (`bshd`,`sbhd`) | Yes [CK backend, with extra dq_acc buffer](https://github.com/ROCm/TransformerEngine/blob/8898b7db2289793fe51632a7c5f1eb742bac47c7/transformer_engine/pytorch/attention.py#L882-L887|\n", + "| ROCm attention (all frameworks) | BF16, FP16 | gfx942 | No | Yes | Yes | Yes (`bshd`,`sbhd`) | Yes [CK backend to be supported](https://github.com/ROCm/TransformerEngine/blob/8898b7db2289793fe51632a7c5f1eb742bac47c7/transformer_engine/pytorch/attention.py#L882-L887|\n", "| cuDNN attention (all frameworks) | BF16, FP16, FP8 (PyTorch only) | sm80+ | No | Yes | Yes | Yes (`bshd`,`sbhd`, `thd`) | Yes |\n", "| flash-attention (PyTorch) | BF16, FP16 | sm80+ | Yes | Yes | No | Yes (`bshd`,`thd`) | Yes |\n", "| Framework-native attention | BF16, FP16, FP32 | Any | No, unless used as a mask | Yes | Yes (PyTorch only) | No | Yes |\n", "\n", "Some unit tests are provided to serve as a starting point for integrating such features into users' models. For example,\n", - "- sliding window attention: [test_dpa_swa](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py)\n", - "- MQA/GQA: [test_te_layer_mqa_gqa](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py)\n", - "- Multi-Latent Attention: [test_dpa_mla](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py)\n", - "- context parallelism: [test_cp_with_fused_attention](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn_with_cp.py), [test_cp_with_flash_attention](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn_with_cp.py)" + "- sliding window attention: [test_dpa_swa](https://github.com/ROCM/TransformerEngine/blob/dev/tests/pytorch/fused_attn/test_fused_attn.py)\n", + "- MQA/GQA: [test_te_layer_mqa_gqa](https://github.com/ROCM/TransformerEngine/blob/dev/tests/pytorch/fused_attn/test_fused_attn.py)\n", + "- Multi-Latent Attention: [test_dpa_mla](https://github.com/ROCM/TransformerEngine/blob/dev/tests/pytorch/fused_attn/test_fused_attn.py)\n", + "- context parallelism: [test_cp_with_fused_attention](https://github.com/ROCM/TransformerEngine/blob/dev/tests/pytorch/fused_attn/test_fused_attn_with_cp.py), [test_cp_with_flash_attention](https://github.com/ROCM/TransformerEngine/blob/dev/tests/pytorch/fused_attn/test_fused_attn_with_cp.py)" ] }, { @@ -542,7 +589,11 @@ " \n", " ROCm attention\n", " `bshd`, `sbhd`\n", - " PyTorch: 2 formats, i.e. 10 layouts [1](https://github.com/ROCm/TransformerEngine/blob/8898b7db2289793fe51632a7c5f1eb742bac47c7/transformer_engine/common/fused_attn_rocm/fused_attn_ck.cpp#L108-L114) [2](https://github.com/ROCm/TransformerEngine/blob/8898b7db2289793fe51632a7c5f1eb742bac47c7/transformer_engine/common/fused_attn_rocm/fused_attn_aotriton.cpp#L90-L95)\n", + " \n", + " PyTorch: 2 formats, i.e. 
10 layouts \n", + " [1](https://github.com/ROCm/TransformerEngine/blob/8898b7db2289793fe51632a7c5f1eb742bac47c7/transformer_engine/common/fused_attn_rocm/fused_attn_ck.cpp#L108-L114)\n", + " [2](https://github.com/ROCm/TransformerEngine/blob/8898b7db2289793fe51632a7c5f1eb742bac47c7/transformer_engine/common/fused_attn_rocm/fused_attn_aotriton.cpp#L90-L95)\n", + " \n", " \n", " \n", " \n", @@ -566,12 +617,12 @@ " \n", "\n", "\n", - "Some example usage of the different layouts can be found at [test_dpa_qkv_layout](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py) and [test_dpa_qkv_layout_thd](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py). Transformer Engine also provides a utility function [transformer_engine.pytorch.attention.get_qkv_layout](https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/attention.py) to help determine which layout a set of `q`, `k`, `v` tensors have (PyTorch only).\n", + "Some example usage of the different layouts can be found at [test_dpa_qkv_layout](https://github.com/ROCM/TransformerEngine/blob/dev/tests/pytorch/fused_attn/test_fused_attn.py) and [test_dpa_qkv_layout_thd](https://github.com/ROCM/TransformerEngine/blob/dev/tests/pytorch/fused_attn/test_fused_attn.py). Transformer Engine also provides a utility function [transformer_engine.pytorch.attention.get_qkv_layout](https://github.com/ROCM/TransformerEngine/blob/dev/transformer_engine/pytorch/attention.py) to help determine which layout a set of `q`, `k`, `v` tensors have (PyTorch only).\n", "\n", "
    \n", "Note\n", " \n", - "When RoPE is employed, the qkv_layout may change in Transformer Engine PyTorch through [get_qkv_layout](https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/attention.py). This is due to the in-place nature of our RoPE implementations. We convert `q`, `k`, `v` tensors from their initial layout to the corresponding hd_hd_hd layout. For example, from sbh3d in pytorch.MultiHeadAttention before RoPE, to sbhd_sbhd_sbhd in pytorch.DotProductAttention after RoPE.\n", + "When RoPE is employed, the qkv_layout may change in Transformer Engine PyTorch through [get_qkv_layout](https://github.com/ROCM/TransformerEngine/blob/dev/transformer_engine/pytorch/attention.py). This is due to the in-place nature of our RoPE implementations. We convert `q`, `k`, `v` tensors from their initial layout to the corresponding hd_hd_hd layout. For example, from sbh3d in pytorch.MultiHeadAttention before RoPE, to sbhd_sbhd_sbhd in pytorch.DotProductAttention after RoPE.\n", "
    \n" ] }, @@ -597,20 +648,20 @@ " \n", " flash-attention\n", "
  • `no_mask`, `causal` (self-attention),
  • `padding`, `padding_causal` (self-attention),
  • `causal_bottom_right`, `padding_causal_bottom_right`
  • \n", - "
  • `no_mask`, `causal` `causal_bottom_right`: No
  • `padding`, `padding_causal`, `padding_causal_bottom_right`: Yes if `cu_seqlens` not provided
  • `arbitrary`: Yes
  • \n", + "
  • `no_mask`, `causal` `causal_bottom_right`: No
  • `padding`, `padding_causal`, `padding_causal_bottom_right`: Yes if `cu_seqlens` not provided
  • `arbitrary`: Yes
  • \n", " \n", " \n", " cuDNN attention\n", "
  • `no_mask`, `causal`,
  • `padding`, `padding_causal`,
  • `causal_bottom_right`, `padding_causal_bottom_right`
  • \n", - " \n", + " \n", + " \n", + " ROCm attention\n", + "
  • no_mask, causal_mask, causal_bottom_right_mask (Only CK)
  • \n", " \n", " \n", " Framework-native attention\n", "
  • All (PyTorch)
  • `no_mask`, `causal`, `padding` (Jax, PaddlePaddle)
  • \n", " \n", - " \n", - " \n", - " \n", "\n", "\n", "**Padding masks:** For `padding`, `padding_causal`, `padding_causal_bottom_right` mask types, users need to provide sequence length information to help Transformer Engine figure out where each sequence ends in a batch. As of Transformer Engine 1.10, there are two options to do so in PyTorch and one in JAX and PaddlePaddle.\n", @@ -624,7 +675,7 @@ "\n", "**qkv_format=thd:** Transformer Engine extracts the max sequence length information from `q`, `k`, `v` if `max_seqlen_q` and `max_seqlen_kv` are not provided. This requires GPU-CPU copy and synchronization operations. For performance reasons, please set `max_seqlen_q` and `max_seqlen_kv` to their appropriate values for `thd` QKV format.\n", "\n", - "**Arbitrary mask:** cuDNN does not support `Arbitrary` mask type as of v9.3. However, users can convert the mask to a regular `post_scale_bias` bias and achieve the same functionality. An example script for this conversion is [arbitrary_mask_to_post_scale_bias.py](https://raw.githubusercontent.com/NVIDIA/TransformerEngine/main/docs/examples/attention/arbitrary_mask_to_post_scale_bias.py).\n" + "**Arbitrary mask:** cuDNN does not support `Arbitrary` mask type as of v9.3. However, users can convert the mask to a regular `post_scale_bias` bias and achieve the same functionality. An example script for this conversion is [arbitrary_mask_to_post_scale_bias.py](https://raw.githubusercontent.com/ROCm/TransformerEngine/refs/heads/dev/docs/examples/attention/arbitrary_mask_to_post_scale_bias.py).\n" ] }, { @@ -665,14 +716,14 @@ "id": "dda4a589", "metadata": {}, "source": [ - "Some more examples of running Transformer Engine with different attention masks can be found at [test_dpa_mask](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py).\n", + "Some more examples of running Transformer Engine with different attention masks can be found at [test_dpa_mask](https://github.com/ROCM/TransformerEngine/blob/dev/tests/pytorch/fused_attn/test_fused_attn.py).\n", "\n", "### 3.3 Attention Bias\n", "\n", "Transformer Engine supports 4 attention bias types, `no_bias`, `pre_scale_bias`, `post_scale_bias`, and `ALiBi` (with/without custom slopes). As of Transformer Engine 1.10, their support matrix is as follows.\n", "\n", "\n", - " \n", + " \n", " \n", " \n", " \n", @@ -699,6 +750,19 @@ " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", " \n", " \n", " \n", @@ -707,7 +771,7 @@ " \n", "
    BackendBias TypeBias ShapecuDNN 9.0+: sm80+
    ROCm attentionPyTorch: `no_bias`, `post_scale_bias`, `ALiBi`(with standard slope)`post_scale_bias`: BHSS(rquired for dbias), 1HSS, B1SS, 11SS, 1HSS`post_scale_bias`: same as QKV typeROCm 3.0.0: gfx942
    JAX, PaddlePaddle: `no bias`, `pre_scale_bias`, `post_scale_bias`ALiBi slopes: FP32
    Framework-native attention`no_bias`, `pre_scale_bias`, `post_scale_bias``post_scale_bias`: BHSS, 1HSS, B1SS, 11SS
    \n", "\n", - "The flash-attention backend enables `ALiBi` by asking user to pass in an `alibi_slopes` tensor, which can be the default slopes of vanilla ALiBi, or user-defined slopes. On the other hand, cuDNN attention supports `ALiBi` by taking in a `Boolean` flag, and it only supports vanilla ALiBi as of cuDNN 9.0.\n", + "The flash-attention backend enables `ALiBi` by asking user to pass in an `alibi_slopes` tensor, which can be the default slopes of vanilla ALiBi, or user-defined slopes. On the other hand, cuDNN attention supports `ALiBi` by taking in a `Boolean` flag, and it only supports vanilla ALiBi as of cuDNN 9.0, while ROCm CK attention requires an `ALiBi` slope array even if it is in standard (vanilla) mode [ref.](https://github.com/ROCm/TransformerEngine/blob/fec4facabcbd4fb69ea5dc2ba0f0d2b0da86be1a/transformer_engine/common/fused_attn_rocm/fused_attn_ck.cpp#L238)\n", "\n", "The framework-native backends do not explicitly support `ALiBi`, but users can convert `ALiBi` to a regular `post_scale_bias` bias to achieve the same effect. In PyTorch, this utility function, `transformer_engine.pytorch.attention.get_alibi`, can be used to help with the conversion.\n", "\n", From 66e12ba9ad6c9fe700ee949ca589f99598c1e10d Mon Sep 17 00:00:00 2001 From: Anh Minh Nguyen Hoang Date: Wed, 19 Mar 2025 07:44:40 +0000 Subject: [PATCH 6/6] Update details on thd layout, mask, and bias --- .../attention/benchmark_attention_rocm.py | 8 +- .../arbitrary_mask_to_post_scale_bias.py | 10 +- docs/examples/attention/attention_rocm.ipynb | 120 ++++-------------- docs/examples/attention/example_attention.py | 9 +- 4 files changed, 51 insertions(+), 96 deletions(-) diff --git a/benchmarks/attention/benchmark_attention_rocm.py b/benchmarks/attention/benchmark_attention_rocm.py index b1cefcad6..bb2146641 100644 --- a/benchmarks/attention/benchmark_attention_rocm.py +++ b/benchmarks/attention/benchmark_attention_rocm.py @@ -20,7 +20,13 @@ trimmed_path = cwd[:index] sys.path.append(trimmed_path) -from tests.pytorch.fused_attn.test_fused_attn import ( +# Add path to tests directory +tests_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), "../../tests/pytorch/fused_attn") +) +sys.path.append(tests_path) + +from test_fused_attn import ( ModelConfig, _get_attention_backends, _run_dot_product_attention, diff --git a/docs/examples/attention/arbitrary_mask_to_post_scale_bias.py b/docs/examples/attention/arbitrary_mask_to_post_scale_bias.py index 85ce01079..bd74318e1 100644 --- a/docs/examples/attention/arbitrary_mask_to_post_scale_bias.py +++ b/docs/examples/attention/arbitrary_mask_to_post_scale_bias.py @@ -3,9 +3,17 @@ # See LICENSE for license information. 
import os +import sys import torch from typing import Tuple -from tests.pytorch.fused_attn.test_fused_attn import ModelConfig + +# Add path to tests directory +tests_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), "../../../tests/pytorch/fused_attn") +) +sys.path.append(tests_path) + +from test_fused_attn import ModelConfig from transformer_engine.pytorch.attention import DotProductAttention # Initialize RNG state diff --git a/docs/examples/attention/attention_rocm.ipynb b/docs/examples/attention/attention_rocm.ipynb index 7e17785f3..0d7f9bb66 100644 --- a/docs/examples/attention/attention_rocm.ipynb +++ b/docs/examples/attention/attention_rocm.ipynb @@ -206,7 +206,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "b95a2a20", "metadata": {}, "outputs": [ @@ -230,17 +230,6 @@ "!pip install nvtx" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "37821a48", - "metadata": {}, - "outputs": [], - "source": [ - "# Make sure tests is importable\n", - "!touch ../../../tests/__init__.py" - ] - }, { "cell_type": "code", "execution_count": null, @@ -248,24 +237,13 @@ "metadata": {}, "outputs": [], "source": [ - "# Exclude unimportable function\n", + "# fix for failing to import transformer_engine.pytorch.attention._flash_attn_3_is_installed on transformer_engine 1.11.0+2081355\n", "!sed -i '27s/^/#/' ../../../tests/pytorch/fused_attn/test_fused_attn.py" ] }, { "cell_type": "code", - "execution_count": 3, - "id": "398e045d", - "metadata": {}, - "outputs": [], - "source": [ - "# Avoid importing transformer_engine directory and use the installed package instead\n", - "!mv ../../../transformer_engine/ ../../../transformer_engine_save/" - ] - }, - { - "cell_type": "code", - "execution_count": 9, + "execution_count": 13, "id": "50852cb5", "metadata": {}, "outputs": [ @@ -275,69 +253,21 @@ "text": [ "Device 0: AMD Instinct MI300X GPU, sm94 compute capability, 192.0GB memory\n", "Running test_0 with cuDNN attention...\n", - "Running CK Backend\n", - "Running AOTriton Backend\n", - "environ({'SHELL': '/bin/bash', 'NVTE_FRAMEWORK': 'pytorch', 'INSTALLED_DB': 'yes', 'PYTORCH_TESTING_DEVICE_ONLY_FOR': 'cuda', 'PYTHONUNBUFFERED': '1', 'CONDA_EXE': '/opt/conda/bin/conda', '_CE_M': '', 'BUILD_ENVIRONMENT': 'pytorch-linux-jammy-rocm6.3-py3.10', 'HOSTNAME': 'tw036', 'SSH_AUTH_SOCK': '/tmp/vscode-ssh-auth-b542e04c-02b5-4292-98e2-bbad291bc1f4.sock', 'PYTHON_FROZEN_MODULES': 'on', 'ELECTRON_RUN_AS_NODE': '1', 'REMOTE_CONTAINERS_IPC': '/tmp/vscode-remote-containers-ipc-b542e04c-02b5-4292-98e2-bbad291bc1f4.sock', 'NCCL_SOCKET_IFNAME': 'ens51np0', 'PWD': '/workspace/code/TransformerEngine/benchmarks/attention', 'CONDA_ROOT': '/opt/conda', 'CONDA_PREFIX': '/opt/conda/envs/py_3.10', 'MEGATRON_LM_PATH': '/workspace/Megatron-LM', 'VSCODE_ESM_ENTRYPOINT': 'vs/workbench/api/node/extensionHostProcess', 'PYDEVD_IPYTHON_COMPATIBLE_DEBUGGING': '1', 'NCCL_IB_HCA': 'rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7', 'MPI_HOME': '/opt/ompi', 'HOME': '/root', 'LANG': 'C.UTF-8', 'LS_COLORS': '', 'PYTORCH_TEST_WITH_ROCM': '1', 'REMOTE_CONTAINERS': 'true', 'UCX_HOME': '/opt/ucx', 'FORCE_COLOR': '1', 'CONDA_PROMPT_MODIFIER': '(py_3.10) ', 'PYDEVD_USE_FRAME_EVAL': 'NO', 'MAGMA_HOME': '/opt/rocm/magma', 'CLICOLOR': '1', 'VSCODE_L10N_BUNDLE_LOCATION': '', 'DATA_DIR_ROOT': '/workspace/data', 'CLICOLOR_FORCE': '1', 'NVTE_ROCM_ARCH': 'gfx942', 'HIP_FORCE_DEV_KERNARG': '1', 'VSCODE_HANDLES_SIGPIPE': 'true', 'ANACONDA_PYTHON_VERSION': '3.10', 'PYTHONPATH': ':', 'TERM': 
'xterm-color', '_CE_CONDA': '', 'INSTALLED_VISION': 'yes', 'REMOTE_CONTAINERS_SOCKETS': '[\"/tmp/vscode-ssh-auth-b542e04c-02b5-4292-98e2-bbad291bc1f4.sock\",\"/tmp/.X11-unix/X4\",\"/root/.gnupg/S.gpg-agent\"]', 'GIT_PAGER': 'cat', 'PYTHONIOENCODING': 'utf-8', 'CONDA_SHLVL': '2', 'PYTORCH_ROCM_ARCH': 'gfx942', 'DISPLAY': ':4', 'NVTE_CK_V3_BF16_CVT': '2', 'SHLVL': '2', 'MAX_JOBS': '32', 'PAGER': 'cat', 'VSCODE_CWD': '/root/.vscode-server/bin/e54c774e0add60467559eb0d1e229c6452cf8447', 'LLVM_SYMBOLIZER_PATH': '/opt/rocm/llvm/bin/llvm-symbolizer', 'ROCM_PATH': '/opt/rocm', 'MPLBACKEND': 'module://matplotlib_inline.backend_inline', 'CONDA_PYTHON_EXE': '/opt/conda/bin/python', 'LD_LIBRARY_PATH': '/opt/ompi/lib:/opt/rocm/lib:/usr/local/lib::/opt/rocm/lib/:', 'CONDA_DEFAULT_ENV': 'py_3.10', 'DEBUGINFOD_URLS': '', 'LC_ALL': 'C.UTF-8', 'NVTE_USE_CAST_TRANSPOSE_TRITON': '0', 'BROWSER': '/root/.vscode-server/bin/e54c774e0add60467559eb0d1e229c6452cf8447/bin/helpers/browser.sh', 'PATH': '/opt/conda/envs/py_3.10/bin:/opt/conda/condabin:/root/.vscode-server/bin/e54c774e0add60467559eb0d1e229c6452cf8447/bin/remote-cli:/opt/ompi/bin:/opt/ucx/bin:/opt/cache/bin:/opt/rocm/llvm/bin:/opt/rocm/opencl/bin:/opt/rocm/hip/bin:/opt/rocm/hcc/bin:/opt/rocm/bin:/opt/conda/envs/py_3.10/bin:/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/rocm/bin:.', 'CI': '1', 'HSA_FORCE_FINE_GRAIN_PCIE': '1', 'NVTE_USE_HIPBLASLT': '1', 'VSCODE_NLS_CONFIG': '{\"userLocale\":\"en\",\"osLocale\":\"en\",\"resolvedLanguage\":\"en\",\"defaultMessagesFile\":\"/root/.vscode-server/bin/e54c774e0add60467559eb0d1e229c6452cf8447/out/nls.messages.json\",\"locale\":\"en\",\"availableLanguages\":{}}', 'INSTALLED_PROTOBUF': 'yes', 'CONDA_PREFIX_1': '/opt/conda', 'GPU_ARCHS': 'gfx942', 'DEBIAN_FRONTEND': 'noninteractive', 'MODELS_DIR_ROOT': '/workspace/models', 'VSCODE_HANDLES_UNCAUGHT_ERRORS': 'true', 'REMOTE_CONTAINERS_DISPLAY_SOCK': '/tmp/.X11-unix/X4', 'OLDPWD': '/workspace/code/TransformerEngine/docs/examples/attention', 'VSCODE_IPC_HOOK_CLI': '/tmp/vscode-ipc-3cbb5860-fc6c-4cad-a32a-079cd8f8fe81.sock', '_': '/opt/conda/envs/py_3.10/bin/python', 'TORCHINDUCTOR_CACHE_DIR': '/tmp/torchinductor_root', 'CUDA_MODULE_LOADING': 'LAZY', 'NVTE_FLASH_ATTN': '0', 'NVTE_FUSED_ATTN': '1', 'NVTE_UNFUSED_ATTN': '1', 'NVTE_FUSED_ATTN_FORCE_WORKSPACE_OPT': '1', 'NVTE_FUSED_ATTN_AOTRITON': '1', 'NVTE_FUSED_ATTN_CK': '0', 'NVTE_FUSED_ATTN_BACKEND': '1', 'NVTE_LOG_AOTRITON_CONFIG': '1', 'NVTE_DEBUG': '1', 'NVTE_DEBUG_LEVEL': '2'})\n", - "\n", - "attn_fwd(aotriton): q_shape: (2, 16, 512, 64), q_stride: (524288, 64, 1024, 1), kv_shape: (2, 16, 512, 64), k_stride: (524288, 64, 1024, 1), v_stride: (524288, 64, 1024, 1), scaling_factor: 0.125, M_shape: (32, 512), M_stride: (512, 1), o_shape: (2, 16, 512, 64), o_stride: (524288, 64, 1024, 1), is_training: 1, dropout_p: 0, philox_seed: 4294967297, philox_offset: 140517004526856, causal mask: 0\n", - "\n", - "attn_bwd(aotriton): q_shape: (2, 16, 512, 64), q_stride: (524288, 64, 1024, 1), kv_shape: (2, 16, 512, 64), k_stride: (524288, 64, 1024, 1), v_stride: (524288, 64, 1024, 1), scaling_factor: 0.125, M_shape: (32, 512), M_stride: (512, 1), o_shape: (2, 16, 512, 64), o_stride: (524288, 64, 1024, 1), dropout_p: 0, philox_seed: 140240993893488, philox_offset: 140240882807152, causal mask: 0\n", - "\n", - "attn_fwd(aotriton): q_shape: (2, 16, 512, 64), q_stride: (524288, 64, 1024, 1), kv_shape: (2, 16, 512, 64), k_stride: (524288, 64, 1024, 1), v_stride: (524288, 64, 1024, 1), scaling_factor: 
0.125, M_shape: (32, 512), M_stride: (512, 1), o_shape: (2, 16, 512, 64), o_stride: (524288, 64, 1024, 1), is_training: 1, dropout_p: 0, philox_seed: 4294967297, philox_offset: 140517004526856, causal mask: 0\n", - "\n", - "attn_bwd(aotriton): q_shape: (2, 16, 512, 64), q_stride: (524288, 64, 1024, 1), kv_shape: (2, 16, 512, 64), k_stride: (524288, 64, 1024, 1), v_stride: (524288, 64, 1024, 1), scaling_factor: 0.125, M_shape: (32, 512), M_stride: (512, 1), o_shape: (2, 16, 512, 64), o_stride: (524288, 64, 1024, 1), dropout_p: 0, philox_seed: 140240993893488, philox_offset: 140240882807152, causal mask: 0\n", - "\n", - "attn_fwd(aotriton): q_shape: (2, 16, 512, 64), q_stride: (524288, 64, 1024, 1), kv_shape: (2, 16, 512, 64), k_stride: (524288, 64, 1024, 1), v_stride: (524288, 64, 1024, 1), scaling_factor: 0.125, M_shape: (32, 512), M_stride: (512, 1), o_shape: (2, 16, 512, 64), o_stride: (524288, 64, 1024, 1), is_training: 1, dropout_p: 0, philox_seed: 4294967297, philox_offset: 140517004526856, causal mask: 0\n", - "\n", - "attn_bwd(aotriton): q_shape: (2, 16, 512, 64), q_stride: (524288, 64, 1024, 1), kv_shape: (2, 16, 512, 64), k_stride: (524288, 64, 1024, 1), v_stride: (524288, 64, 1024, 1), scaling_factor: 0.125, M_shape: (32, 512), M_stride: (512, 1), o_shape: (2, 16, 512, 64), o_stride: (524288, 64, 1024, 1), dropout_p: 0, philox_seed: 140240993893488, philox_offset: 140240882807152, causal mask: 0\n", - " FusedAttention Module ... FusedAttention AOTriton Kernels (fwd+bwd)\n", - "Model ... \n", - "test_0 54.2266 ... 0.0\n", - "test_1 0.0000 ... 0.0\n", - "test_2 0.0000 ... 0.0\n", - "test_3 0.0000 ... 0.0\n", - "\n", - "[4 rows x 17 columns]\n", "Running test_1 with cuDNN attention...\n", "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/transformer_engine/pytorch/attention.py:4832: UserWarning: window_size should be (-1, 0) or (>=0, 0) for attn_mask_type=causal\n", " warnings.warn(\n", - "Running CK Backend\n", - "Running AOTriton Backend\n", - "environ({'SHELL': '/bin/bash', 'NVTE_FRAMEWORK': 'pytorch', 'INSTALLED_DB': 'yes', 'PYTORCH_TESTING_DEVICE_ONLY_FOR': 'cuda', 'PYTHONUNBUFFERED': '1', 'CONDA_EXE': '/opt/conda/bin/conda', '_CE_M': '', 'BUILD_ENVIRONMENT': 'pytorch-linux-jammy-rocm6.3-py3.10', 'HOSTNAME': 'tw036', 'SSH_AUTH_SOCK': '/tmp/vscode-ssh-auth-b542e04c-02b5-4292-98e2-bbad291bc1f4.sock', 'PYTHON_FROZEN_MODULES': 'on', 'ELECTRON_RUN_AS_NODE': '1', 'REMOTE_CONTAINERS_IPC': '/tmp/vscode-remote-containers-ipc-b542e04c-02b5-4292-98e2-bbad291bc1f4.sock', 'NCCL_SOCKET_IFNAME': 'ens51np0', 'PWD': '/workspace/code/TransformerEngine/benchmarks/attention', 'CONDA_ROOT': '/opt/conda', 'CONDA_PREFIX': '/opt/conda/envs/py_3.10', 'MEGATRON_LM_PATH': '/workspace/Megatron-LM', 'VSCODE_ESM_ENTRYPOINT': 'vs/workbench/api/node/extensionHostProcess', 'PYDEVD_IPYTHON_COMPATIBLE_DEBUGGING': '1', 'NCCL_IB_HCA': 'rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7', 'MPI_HOME': '/opt/ompi', 'HOME': '/root', 'LANG': 'C.UTF-8', 'LS_COLORS': '', 'PYTORCH_TEST_WITH_ROCM': '1', 'REMOTE_CONTAINERS': 'true', 'UCX_HOME': '/opt/ucx', 'FORCE_COLOR': '1', 'CONDA_PROMPT_MODIFIER': '(py_3.10) ', 'PYDEVD_USE_FRAME_EVAL': 'NO', 'MAGMA_HOME': '/opt/rocm/magma', 'CLICOLOR': '1', 'VSCODE_L10N_BUNDLE_LOCATION': '', 'DATA_DIR_ROOT': '/workspace/data', 'CLICOLOR_FORCE': '1', 'NVTE_ROCM_ARCH': 'gfx942', 'HIP_FORCE_DEV_KERNARG': '1', 'VSCODE_HANDLES_SIGPIPE': 'true', 'ANACONDA_PYTHON_VERSION': '3.10', 'PYTHONPATH': ':', 'TERM': 'xterm-color', '_CE_CONDA': '', 'INSTALLED_VISION': 'yes', 'REMOTE_CONTAINERS_SOCKETS': 
'[\"/tmp/vscode-ssh-auth-b542e04c-02b5-4292-98e2-bbad291bc1f4.sock\",\"/tmp/.X11-unix/X4\",\"/root/.gnupg/S.gpg-agent\"]', 'GIT_PAGER': 'cat', 'PYTHONIOENCODING': 'utf-8', 'CONDA_SHLVL': '2', 'PYTORCH_ROCM_ARCH': 'gfx942', 'DISPLAY': ':4', 'NVTE_CK_V3_BF16_CVT': '2', 'SHLVL': '2', 'MAX_JOBS': '32', 'PAGER': 'cat', 'VSCODE_CWD': '/root/.vscode-server/bin/e54c774e0add60467559eb0d1e229c6452cf8447', 'LLVM_SYMBOLIZER_PATH': '/opt/rocm/llvm/bin/llvm-symbolizer', 'ROCM_PATH': '/opt/rocm', 'MPLBACKEND': 'module://matplotlib_inline.backend_inline', 'CONDA_PYTHON_EXE': '/opt/conda/bin/python', 'LD_LIBRARY_PATH': '/opt/ompi/lib:/opt/rocm/lib:/usr/local/lib::/opt/rocm/lib/:', 'CONDA_DEFAULT_ENV': 'py_3.10', 'DEBUGINFOD_URLS': '', 'LC_ALL': 'C.UTF-8', 'NVTE_USE_CAST_TRANSPOSE_TRITON': '0', 'BROWSER': '/root/.vscode-server/bin/e54c774e0add60467559eb0d1e229c6452cf8447/bin/helpers/browser.sh', 'PATH': '/opt/conda/envs/py_3.10/bin:/opt/conda/condabin:/root/.vscode-server/bin/e54c774e0add60467559eb0d1e229c6452cf8447/bin/remote-cli:/opt/ompi/bin:/opt/ucx/bin:/opt/cache/bin:/opt/rocm/llvm/bin:/opt/rocm/opencl/bin:/opt/rocm/hip/bin:/opt/rocm/hcc/bin:/opt/rocm/bin:/opt/conda/envs/py_3.10/bin:/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/rocm/bin:.', 'CI': '1', 'HSA_FORCE_FINE_GRAIN_PCIE': '1', 'NVTE_USE_HIPBLASLT': '1', 'VSCODE_NLS_CONFIG': '{\"userLocale\":\"en\",\"osLocale\":\"en\",\"resolvedLanguage\":\"en\",\"defaultMessagesFile\":\"/root/.vscode-server/bin/e54c774e0add60467559eb0d1e229c6452cf8447/out/nls.messages.json\",\"locale\":\"en\",\"availableLanguages\":{}}', 'INSTALLED_PROTOBUF': 'yes', 'CONDA_PREFIX_1': '/opt/conda', 'GPU_ARCHS': 'gfx942', 'DEBIAN_FRONTEND': 'noninteractive', 'MODELS_DIR_ROOT': '/workspace/models', 'VSCODE_HANDLES_UNCAUGHT_ERRORS': 'true', 'REMOTE_CONTAINERS_DISPLAY_SOCK': '/tmp/.X11-unix/X4', 'OLDPWD': '/workspace/code/TransformerEngine/docs/examples/attention', 'VSCODE_IPC_HOOK_CLI': '/tmp/vscode-ipc-3cbb5860-fc6c-4cad-a32a-079cd8f8fe81.sock', '_': '/opt/conda/envs/py_3.10/bin/python', 'TORCHINDUCTOR_CACHE_DIR': '/tmp/torchinductor_root', 'CUDA_MODULE_LOADING': 'LAZY', 'NVTE_FLASH_ATTN': '0', 'NVTE_UNFUSED_ATTN': '1', 'NVTE_FUSED_ATTN_FORCE_WORKSPACE_OPT': '1', 'NVTE_FUSED_ATTN_BACKEND': '1', 'NVTE_LOG_AOTRITON_CONFIG': '1', 'NVTE_DEBUG': '1', 'NVTE_DEBUG_LEVEL': '2', 'NVTE_FUSED_ATTN': '1', 'NVTE_FUSED_ATTN_AOTRITON': '1', 'NVTE_FUSED_ATTN_CK': '0'})\n", - "\n", - "attn_fwd(aotriton): q_shape: (2, 16, 2048, 128), q_stride: (4194304, 128, 2048, 1), kv_shape: (2, 16, 2048, 128), k_stride: (4194304, 128, 2048, 1), v_stride: (4194304, 128, 2048, 1), scaling_factor: 0.0883883, M_shape: (32, 2048), M_stride: (2048, 1), o_shape: (2, 16, 2048, 128), o_stride: (4194304, 128, 2048, 1), is_training: 1, dropout_p: 0, philox_seed: 4294967297, philox_offset: 140517004526856, causal mask: 1\n", - "\n", - "attn_bwd(aotriton): q_shape: (2, 16, 2048, 128), q_stride: (4194304, 128, 2048, 1), kv_shape: (2, 16, 2048, 128), k_stride: (4194304, 128, 2048, 1), v_stride: (4194304, 128, 2048, 1), scaling_factor: 0.0883883, M_shape: (32, 2048), M_stride: (2048, 1), o_shape: (2, 16, 2048, 128), o_stride: (4194304, 128, 2048, 1), dropout_p: 0, philox_seed: 140240993893488, philox_offset: 140240882807152, causal mask: 1\n", - "\n", - "attn_fwd(aotriton): q_shape: (2, 16, 2048, 128), q_stride: (4194304, 128, 2048, 1), kv_shape: (2, 16, 2048, 128), k_stride: (4194304, 128, 2048, 1), v_stride: (4194304, 128, 2048, 1), scaling_factor: 0.0883883, M_shape: (32, 2048), M_stride: 
(2048, 1), o_shape: (2, 16, 2048, 128), o_stride: (4194304, 128, 2048, 1), is_training: 1, dropout_p: 0, philox_seed: 4294967297, philox_offset: 140517004526856, causal mask: 1\n", - "\n", - "attn_bwd(aotriton): q_shape: (2, 16, 2048, 128), q_stride: (4194304, 128, 2048, 1), kv_shape: (2, 16, 2048, 128), k_stride: (4194304, 128, 2048, 1), v_stride: (4194304, 128, 2048, 1), scaling_factor: 0.0883883, M_shape: (32, 2048), M_stride: (2048, 1), o_shape: (2, 16, 2048, 128), o_stride: (4194304, 128, 2048, 1), dropout_p: 0, philox_seed: 140240993893488, philox_offset: 140240883216272, causal mask: 1\n", - "\n", - "attn_fwd(aotriton): q_shape: (2, 16, 2048, 128), q_stride: (4194304, 128, 2048, 1), kv_shape: (2, 16, 2048, 128), k_stride: (4194304, 128, 2048, 1), v_stride: (4194304, 128, 2048, 1), scaling_factor: 0.0883883, M_shape: (32, 2048), M_stride: (2048, 1), o_shape: (2, 16, 2048, 128), o_stride: (4194304, 128, 2048, 1), is_training: 1, dropout_p: 0, philox_seed: 4294967297, philox_offset: 140517004526856, causal mask: 1\n", - "\n", - "attn_bwd(aotriton): q_shape: (2, 16, 2048, 128), q_stride: (4194304, 128, 2048, 1), kv_shape: (2, 16, 2048, 128), k_stride: (4194304, 128, 2048, 1), v_stride: (4194304, 128, 2048, 1), scaling_factor: 0.0883883, M_shape: (32, 2048), M_stride: (2048, 1), o_shape: (2, 16, 2048, 128), o_stride: (4194304, 128, 2048, 1), dropout_p: 0, philox_seed: 140240993893488, philox_offset: 140240883216272, causal mask: 1\n", - " FusedAttention Module ... FusedAttention AOTriton Kernels (fwd+bwd)\n", - "Model ... \n", - "test_0 54.2266 ... 0.7426\n", - "test_1 56.3781 ... 0.0000\n", - "test_2 0.0000 ... 0.0000\n", - "test_3 0.0000 ... 0.0000\n", - "\n", - "[4 rows x 17 columns]\n", "Running test_2 with cuDNN attention...\n", "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/transformer_engine/pytorch/attention.py:4832: UserWarning: window_size should be (-1, 0) or (>=0, 0) for attn_mask_type=causal\n", " warnings.warn(\n", - "Running CK Backend\n", "Running test_3 with cuDNN attention...\n", "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/transformer_engine/pytorch/attention.py:4832: UserWarning: window_size should be (-1, 0) or (>=0, 0) for attn_mask_type=causal\n", " warnings.warn(\n", - "Running CK Backend\n", "\n", " cuDNN fwd+bwd (ms) flash-attn fwd+bwd (ms) cuDNN vs flash speedup\n", - "test_0 0.0833 0.0 0.0\n", - "test_1 0.7294 0.0 0.0\n", - "test_2 1.2165 0.0 0.0\n", - "test_3 15.9134 0.0 0.0\n" + "test_0 0.0805 0.0 0.0\n", + "test_1 0.7510 0.0 0.0\n", + "test_2 2.6646 0.0 0.0\n", + "test_3 16.6992 0.0 0.0\n" ] } ], @@ -418,7 +348,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "id": "906b8cf1", "metadata": {}, "outputs": [ @@ -435,7 +365,7 @@ } ], "source": [ - "!cd ../../../ && NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=1 python docs/examples/attention/example_attention.py" + "!NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=1 python example_attention.py" ] }, { @@ -448,7 +378,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 27, "id": "d3637094", "metadata": {}, "outputs": [ @@ -458,7 +388,7 @@ "text": [ "\n", "Run cuDNN attention...\n", - "[DEBUG | DotProductAttention]: Running with config={'transformer_engine_version': '1.11.0+18ee57c', 'compute_capability': 'sm94', 'flash_attn_version': , 'cudnn_version': '99.0.0', 'qkv_type': , 'qkv_dtype': torch.bfloat16, 'qkv_layout': 'bshd_bshd_bshd', 'batch_size': 2, 'num_heads': 16, 'num_gqa_groups': 16, 'max_seqlen_q': 512, 'max_seqlen_kv': 512, 'head_dim_qk': 64, 'head_dim_v': 64, 
'attn_mask_type': 'no_mask', 'window_size': (-1, -1), 'alibi_slopes_shape': None, 'core_attention_bias_type': 'no_bias', 'core_attention_bias_shape': None, 'core_attention_bias_requires_grad': False, 'pad_between_seqs': False, 'attention_dropout': 0.0, 'context_parallel': False, 'deterministic': False, 'is_training': True, 'fp8': False, 'fp8_meta': {'fp8_checkpoint': False, 'fp8_group': None, 'recipe': margin=0, format=HYBRID, amax_history_len=1024, wgrad_override=False, fp8_dpa=False, fp8_mha=False}}\n", + "[DEBUG | DotProductAttention]: Running with config={'transformer_engine_version': '1.11.0+2081355', 'compute_capability': 'sm94', 'flash_attn_version': , 'cudnn_version': '99.0.0', 'qkv_type': , 'qkv_dtype': torch.bfloat16, 'qkv_layout': 'bshd_bshd_bshd', 'batch_size': 2, 'num_heads': 16, 'num_gqa_groups': 16, 'max_seqlen_q': 512, 'max_seqlen_kv': 512, 'head_dim_qk': 64, 'head_dim_v': 64, 'attn_mask_type': 'no_mask', 'window_size': (-1, -1), 'alibi_slopes_shape': None, 'core_attention_bias_type': 'no_bias', 'core_attention_bias_shape': None, 'core_attention_bias_requires_grad': False, 'pad_between_seqs': False, 'attention_dropout': 0.0, 'context_parallel': False, 'deterministic': False, 'is_training': True, 'fp8': False, 'fp8_meta': {'fp8_checkpoint': False, 'fp8_group': None, 'recipe': margin=0, format=HYBRID, amax_history_len=1024, wgrad_override=False, fp8_dpa=False, fp8_mha=False}}\n", "[DEBUG | DotProductAttention]: Disabling FlashAttention due to NVTE_FLASH_ATTN=0\n", "[DEBUG | DotProductAttention]: Available backends = {FlashAttention=False, FusedAttention=True (sub-backend 1), UnfusedDotProductAttention=True}\n", "[DEBUG | DotProductAttention]: Selected backend = FusedAttention (sub-backend 1)\n", @@ -469,7 +399,7 @@ } ], "source": [ - "!cd ../../../ && NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=2 python docs/examples/attention/example_attention.py" + "!NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=2 python example_attention.py" ] }, { @@ -588,7 +518,7 @@ " \n", " \n", " ROCm attention\n", - " `bshd`, `sbhd`\n", + " `bshd`, `sbhd` `thd`\n", " \n", " PyTorch: 2 formats, i.e. 10 layouts \n", " [1](https://github.com/ROCm/TransformerEngine/blob/8898b7db2289793fe51632a7c5f1eb742bac47c7/transformer_engine/common/fused_attn_rocm/fused_attn_ck.cpp#L108-L114)\n", @@ -656,7 +586,7 @@ " \n", " \n", " ROCm attention\n", - "
  • no_mask, causal_mask, causal_bottom_right_mask (Only CK)
  • \n", + "
  • `no_mask`, `padding`, `arbitrary`
  • `causal`, `padding_causal`
  • `causal_bottom_right`, `padding_causal_bottom_right`
  • \n", " \n", " \n", " Framework-native attention\n", @@ -673,14 +603,14 @@ "\n", "* JAX and PaddlePaddle: Users should provide the `attention_mask` tensor in shape `[batch_size, 1, seqlen_q, seqlen_kv]`.\n", "\n", - "**qkv_format=thd:** Transformer Engine extracts the max sequence length information from `q`, `k`, `v` if `max_seqlen_q` and `max_seqlen_kv` are not provided. This requires GPU-CPU copy and synchronization operations. For performance reasons, please set `max_seqlen_q` and `max_seqlen_kv` to their appropriate values for `thd` QKV format.\n", + "**qkv_format=thd:** Transformer Engine extracts the max sequence length information from `q`, `k`, `v` if `max_seqlen_q` and `max_seqlen_kv` are not provided. This requires GPU-CPU copy and synchronization operations. For performance reasons, please set `max_seqlen_q` and `max_seqlen_kv` to their appropriate values for `thd` QKV format. On ROCm, please use `padding` mask with no padding between sequences, `no_bias` on CK sub-backend for now.\n", "\n", "**Arbitrary mask:** cuDNN does not support `Arbitrary` mask type as of v9.3. However, users can convert the mask to a regular `post_scale_bias` bias and achieve the same functionality. An example script for this conversion is [arbitrary_mask_to_post_scale_bias.py](https://raw.githubusercontent.com/ROCm/TransformerEngine/refs/heads/dev/docs/examples/attention/arbitrary_mask_to_post_scale_bias.py).\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "id": "a1f25a9b", "metadata": {}, "outputs": [ @@ -689,14 +619,14 @@ "output_type": "stream", "text": [ "Run with post_scale_bias:\n", - "[DEBUG | DotProductAttention]: Running with config={'transformer_engine_version': '1.11.0+18ee57c', 'compute_capability': 'sm94', 'flash_attn_version': , 'cudnn_version': '99.0.0', 'qkv_type': , 'qkv_dtype': torch.bfloat16, 'qkv_layout': 'bs3hd', 'batch_size': 4, 'num_heads': 16, 'num_gqa_groups': 16, 'max_seqlen_q': 2048, 'max_seqlen_kv': 2048, 'head_dim_qk': 64, 'head_dim_v': 64, 'attn_mask_type': 'no_mask', 'window_size': (-1, -1), 'alibi_slopes_shape': None, 'core_attention_bias_type': 'post_scale_bias', 'core_attention_bias_shape': 'bhss', 'core_attention_bias_requires_grad': False, 'pad_between_seqs': False, 'attention_dropout': 0.0, 'context_parallel': False, 'deterministic': False, 'is_training': True, 'fp8': False, 'fp8_meta': {'fp8_checkpoint': False, 'fp8_group': None, 'recipe': margin=0, format=HYBRID, amax_history_len=1024, wgrad_override=False, fp8_dpa=False, fp8_mha=False}}\n", + "[DEBUG | DotProductAttention]: Running with config={'transformer_engine_version': '1.11.0+2081355', 'compute_capability': 'sm94', 'flash_attn_version': , 'cudnn_version': '99.0.0', 'qkv_type': , 'qkv_dtype': torch.bfloat16, 'qkv_layout': 'bs3hd', 'batch_size': 4, 'num_heads': 16, 'num_gqa_groups': 16, 'max_seqlen_q': 2048, 'max_seqlen_kv': 2048, 'head_dim_qk': 64, 'head_dim_v': 64, 'attn_mask_type': 'no_mask', 'window_size': (-1, -1), 'alibi_slopes_shape': None, 'core_attention_bias_type': 'post_scale_bias', 'core_attention_bias_shape': 'bhss', 'core_attention_bias_requires_grad': False, 'pad_between_seqs': False, 'attention_dropout': 0.0, 'context_parallel': False, 'deterministic': False, 'is_training': True, 'fp8': False, 'fp8_meta': {'fp8_checkpoint': False, 'fp8_group': None, 'recipe': margin=0, format=HYBRID, amax_history_len=1024, wgrad_override=False, fp8_dpa=False, fp8_mha=False}}\n", "[DEBUG | DotProductAttention]: Disabling FlashAttention due to NVTE_FLASH_ATTN=0\n", "[DEBUG | 
DotProductAttention]: Available backends = {FlashAttention=False, FusedAttention=True (sub-backend 1), UnfusedDotProductAttention=True}\n", "[DEBUG | DotProductAttention]: Selected backend = FusedAttention (sub-backend 1)\n", "[INFO | DotProductAttention]: Running with FusedAttention backend (sub-backend 1)\n", "\n", "Run with arbitrary mask:\n", - "[DEBUG | DotProductAttention]: Running with config={'transformer_engine_version': '1.11.0+18ee57c', 'compute_capability': 'sm94', 'flash_attn_version': , 'cudnn_version': '99.0.0', 'qkv_type': , 'qkv_dtype': torch.bfloat16, 'qkv_layout': 'bs3hd', 'batch_size': 4, 'num_heads': 16, 'num_gqa_groups': 16, 'max_seqlen_q': 2048, 'max_seqlen_kv': 2048, 'head_dim_qk': 64, 'head_dim_v': 64, 'attn_mask_type': 'arbitrary', 'window_size': (-1, -1), 'alibi_slopes_shape': None, 'core_attention_bias_type': 'no_bias', 'core_attention_bias_shape': None, 'core_attention_bias_requires_grad': False, 'pad_between_seqs': False, 'attention_dropout': 0.0, 'context_parallel': False, 'deterministic': False, 'is_training': True, 'fp8': False, 'fp8_meta': {'fp8_checkpoint': False, 'fp8_group': None, 'recipe': margin=0, format=HYBRID, amax_history_len=1024, wgrad_override=False, fp8_dpa=False, fp8_mha=False}}\n", + "[DEBUG | DotProductAttention]: Running with config={'transformer_engine_version': '1.11.0+2081355', 'compute_capability': 'sm94', 'flash_attn_version': , 'cudnn_version': '99.0.0', 'qkv_type': , 'qkv_dtype': torch.bfloat16, 'qkv_layout': 'bs3hd', 'batch_size': 4, 'num_heads': 16, 'num_gqa_groups': 16, 'max_seqlen_q': 2048, 'max_seqlen_kv': 2048, 'head_dim_qk': 64, 'head_dim_v': 64, 'attn_mask_type': 'arbitrary', 'window_size': (-1, -1), 'alibi_slopes_shape': None, 'core_attention_bias_type': 'no_bias', 'core_attention_bias_shape': None, 'core_attention_bias_requires_grad': False, 'pad_between_seqs': False, 'attention_dropout': 0.0, 'context_parallel': False, 'deterministic': False, 'is_training': True, 'fp8': False, 'fp8_meta': {'fp8_checkpoint': False, 'fp8_group': None, 'recipe': margin=0, format=HYBRID, amax_history_len=1024, wgrad_override=False, fp8_dpa=False, fp8_mha=False}}\n", "[DEBUG | DotProductAttention]: Disabling FlashAttention due to NVTE_FLASH_ATTN=0\n", "[DEBUG | DotProductAttention]: Disabling FusedAttention for arbitrary mask\n", "[DEBUG | DotProductAttention]: Available backends = {FlashAttention=False, FusedAttention=False, UnfusedDotProductAttention=True}\n", @@ -708,7 +638,7 @@ } ], "source": [ - "!cd ../../../ && NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=2 python docs/examples/attention/arbitrary_mask_to_post_scale_bias.py" + "!NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=2 python arbitrary_mask_to_post_scale_bias.py" ] }, { @@ -722,6 +652,8 @@ "\n", "Transformer Engine supports 4 attention bias types, `no_bias`, `pre_scale_bias`, `post_scale_bias`, and `ALiBi` (with/without custom slopes). As of Transformer Engine 1.10, their support matrix is as follows.\n", "\n", + "For ROCm backend, AOTriton only supports `no_bias` attention bias type currently\n", + "\n", "\n", " \n", " \n", @@ -752,12 +684,12 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", "\n", " \n", @@ -793,7 +725,9 @@ "\n", "- `DelayedScaling.fp8_mha=True (default=False)`: This option, on top of `fp8_dpa=True`, removes the casting operations at the beginning and end of the `FusedAttention` module. This feature is experimental. 
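A minimal sketch of turning these two recipe options on is shown below. It assumes the `transformer_engine.pytorch` module and the `DelayedScaling` recipe fields named above; the `MultiheadAttention` sizes are illustrative assumptions, and the FP8 path requires a backend with FP8 attention support (not the ROCm CK/AOTriton sub-backends, as noted later in this section):

```python
# Hedged sketch: enable FP8 dot-product attention through the DelayedScaling recipe.
# fp8_dpa / fp8_mha are the recipe options described above; the module sizes here
# are illustrative assumptions, not values taken from this notebook.
import torch
import transformer_engine.pytorch as te
from transformer_engine.common.recipe import DelayedScaling, Format

# fp8_dpa=True runs DotProductAttention in FP8 (inputs cast in, outputs cast out);
# fp8_mha=True would additionally remove the casts around FusedAttention (experimental).
fp8_recipe = DelayedScaling(fp8_format=Format.HYBRID, fp8_dpa=True, fp8_mha=False)

mha = te.MultiheadAttention(
    hidden_size=1024,
    num_attention_heads=16,
    params_dtype=torch.bfloat16,
).cuda()

# Default input layout is [seq, batch, hidden]
x = torch.randn(512, 2, 1024, dtype=torch.bfloat16, device="cuda")

with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
    out = mha(x)
out.sum().backward()
```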
\n", "\n", - "Examples of using the two features are available at [test_dpa_fp8_vs_f16](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py) and [test_mha_fp8_vs_f16](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py). To disable FP8 attention for backward and only use it for forward, users can also set `NVTE_FP8_DPA_BWD=0 (default=1)`." + "Examples of using the two features are available at [test_dpa_fp8_vs_f16](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py) and [test_mha_fp8_vs_f16](https://github.com/NVIDIA/TransformerEngine/blob/main/tests/pytorch/fused_attn/test_fused_attn.py). To disable FP8 attention for backward and only use it for forward, users can also set `NVTE_FP8_DPA_BWD=0 (default=1)`.\n", + "\n", + "On ROCm backend, FP8 features are only supported on MI300 and currently, `FusedAttention` sub-backends Composable kernel and AOTriton do not support FP8." ] } ], diff --git a/docs/examples/attention/example_attention.py b/docs/examples/attention/example_attention.py index 15022005a..e96136a42 100644 --- a/docs/examples/attention/example_attention.py +++ b/docs/examples/attention/example_attention.py @@ -9,7 +9,14 @@ import torch import nvtx import transformer_engine -from tests.pytorch.fused_attn.test_fused_attn import ( + +# Add path to tests directory +tests_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), "../../../tests/pytorch/fused_attn") +) +sys.path.append(tests_path) + +from test_fused_attn import ( ModelConfig, _get_attention_backends, _run_dot_product_attention,
    Backend
    ROCm attention
    PyTorch: `no_bias`, `post_scale_bias`, `ALiBi` (with standard slope)
    `post_scale_bias`: BHSS (required for dbias), 1HSS, B1SS, 11SS, 1HSS
    `post_scale_bias`
  • BHSS, 1HSS, B1SS, 11SS, 1HSS for forward, BHSS for backward
  • Batch stride for bias is bias_h x s_q x s_kv (bias_h=1 for B1SS and bias_h=h for BHSS)
  • `post_scale_bias`: same as QKV type
    ROCm 3.0.0: gfx942
    ROCm 3.0.0+: gfx942
    JAX, PaddlePaddle: `no_bias`, `pre_scale_bias`, `post_scale_bias`
    JAX, PaddlePaddle: `no_bias`, `post_scale_bias`
    ALiBi slopes: FP32
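For reference, here is a minimal sketch of the `post_scale_bias` path summarized in this row. The constructor and forward argument names (`kv_channels`, `core_attention_bias_type`, `core_attention_bias`) follow the config dumps shown earlier in this notebook; the sizes and the 1HSS bias layout are illustrative assumptions, and exact signatures may vary between Transformer Engine versions:

```python
# Hedged sketch: DotProductAttention with a post_scale_bias in the 1HSS layout
# ([1, heads, seq_q, seq_kv], broadcast over the batch dimension).
import torch
from transformer_engine.pytorch import DotProductAttention

batch, heads, seq, head_dim = 2, 16, 512, 64

dpa = DotProductAttention(
    num_attention_heads=heads,
    kv_channels=head_dim,
    attn_mask_type="no_mask",
).cuda()

# Default qkv_format is sbhd: [seq, batch, heads, head_dim]
q = torch.randn(seq, batch, heads, head_dim,
                dtype=torch.bfloat16, device="cuda", requires_grad=True)
k = torch.randn_like(q)
v = torch.randn_like(q)

# post_scale_bias is added to the scaled attention scores before softmax
bias = torch.randn(1, heads, seq, seq, dtype=torch.bfloat16, device="cuda")

out = dpa(q, k, v,
          core_attention_bias_type="post_scale_bias",
          core_attention_bias=bias)
out.sum().backward()
```

Per the row above, the backward pass only produces dbias for the BHSS shape, so a bias tensor that itself needs gradients should be allocated in the BHSS layout.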