
Commit fb2fff8

[CI][tutorials] Split tutorial 06-fused-attention.py run in build process to accelerate runtime (#4993)
Closes #4948

Implementation details:

1. Previously the `mxfp` suite in the workflow was responsible for running tests and for running tutorial 6. I moved those tests to the `rest` suite; the mxfp test takes ~1 min, so the move does not affect runtime significantly. I split tutorial 6 into 3 separate parts (total compute time ~45 min -> ~25 min for the slowest part after the split).
2. I added parsing of environment variables to tutorial 6 to specify which config to run.
3. I modified `test-triton.sh` and `pytest-utils.sh` to support tutorial selection based on input arguments. For that I moved the select-from-file logic from `pytest-utils.sh` to `test-triton.sh`. I think that is reasonable, because that way it is clear how some benchmarks get skipped.
4. Further investigation of tutorial 6 performance shows that the majority of the time is spent on forward-kernel autotuning, which also happens during the backward phase because of a single forward call used for shape inference. That autotuning is dtype specific (separate for FP8 & FP16), and FP8 is much slower. Moreover, the backward mode for FP8 reuses the FP16 forward and backward pass. So, to optimize autotune runs, there are 3 configs: head dim 64; head dim 128 with autotune for FP8 (forward FP8 only); and head dim 128 with autotune for FP16 (forward FP16 plus backward FP8 & FP16).

An alternative implementation would be to put tutorial 6 in the file with the tutorial list, set the environment variables, and then run the tutorials. It would require fewer changes, but it would make the new split functionality non-obvious and hard to use when only a specific subconfig should run. With this implementation everything stays explicit and obvious.
1 parent b137b65 commit fb2fff8
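
To exercise the three parts locally, each new test-triton.sh flag selects one of the configs described above. A minimal sketch, assuming the script is invoked from the repository root with a working Triton install (the flags and their env-var mappings come from the scripts/test-triton.sh diff below):

    # Part 1: head dim 64 (exports HEAD_DIM=64)
    ./scripts/test-triton.sh --tutorial-fa-64
    # Part 2: head dim 128, FP8 forward only (exports HEAD_DIM=128 FWD_FP8_ONLY=1)
    ./scripts/test-triton.sh --tutorial-fa-128-fwdfp8
    # Part 3: head dim 128, FP16 forward plus FP8/FP16 backward (exports HEAD_DIM=128 FWD_FP8_SKIP=1)
    ./scripts/test-triton.sh --tutorial-fa-128-nofwdfp8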

File tree

4 files changed: +120 -31 lines changed


.github/workflows/build-test-reusable.yml

Lines changed: 12 additions & 12 deletions
@@ -195,9 +195,11 @@ jobs:
       matrix:
         suite:
           - minicore
-          - mxfp
           - scaled_dot
           - rest
+          - tutorial-fa-64
+          - tutorial-fa-128-fwdfp8
+          - tutorial-fa-128-nofwdfp8
     timeout-minutes: 720
     runs-on: ${{ fromJson(inputs.runner_label && format('["linux", "{0}"]', inputs.runner_label) || format('["linux", "{0}", "{1}", "{2}"]', inputs.device, inputs.driver_version, inputs.runner_version)) }}
     defaults:
@@ -295,7 +297,7 @@ jobs:
           ${{ env.TRITON_TEST_CMD }} --minicore

       - name: Run mxfp tests
-        if: matrix.suite == 'mxfp'
+        if: matrix.suite == 'rest'
         run: |
           ${{ env.TRITON_TEST_CMD }} --mxfp

@@ -309,15 +311,7 @@
         run: |
           ${{ env.TRITON_TEST_CMD }} --interpreter

-      # FIXME: make sure new tutorials are added to one of the groups (mxfp, scaled_dot, rest)
-
-      - name: Select tutorials to run (mxfp)
-        if: matrix.suite == 'mxfp'
-        run: |
-          cat <<EOF | tee tutorials.txt
-          06-fused-attention
-          EOF
-
+      # FIXME: make sure new tutorials are added to one of the groups (scaled_dot, rest, tutorial-faX)
       - name: Select tutorials to run (scaled_dot)
         if: matrix.suite == 'scaled_dot'
         run: |
@@ -341,10 +335,16 @@
           EOF

       - name: Run Tutorials
-        if: matrix.suite == 'mxfp' || matrix.suite == 'scaled_dot' || matrix.suite == 'rest'
+        if: matrix.suite == 'scaled_dot' || matrix.suite == 'rest'
         run: |
           ${{ env.TRITON_TEST_CMD }} --select-from-file tutorials.txt --tutorial

+      # Run 06-fused-attention.py separately, because it is split into 3 configs
+      - name: Run Flash Attention tutorials
+        if: matrix.suite == 'tutorial-fa-64' || matrix.suite == 'tutorial-fa-128-fwdfp8' || matrix.suite == 'tutorial-fa-128-nofwdfp8'
+        run: |
+          ${{ env.TRITON_TEST_CMD }} "--${{ matrix.suite }}"
+
       - name: Install transformers
         if: matrix.suite == 'rest'
         run: |

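A note on the new step above: because the three suite names match the new test-triton.sh flags, the quoted expression "--${{ matrix.suite }}" expands to a single CLI argument. A small sketch of that expansion, using a plain shell variable to stand in for the matrix value:

    # With matrix.suite == "tutorial-fa-128-fwdfp8" the step effectively runs:
    #   $TRITON_TEST_CMD --tutorial-fa-128-fwdfp8
    suite="tutorial-fa-128-fwdfp8"
    printf '%s\n' "--${suite}"    # prints --tutorial-fa-128-fwdfp8
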
python/tutorials/06-fused-attention.py

Lines changed: 53 additions & 4 deletions
@@ -24,6 +24,47 @@
 DEVICE = triton.runtime.driver.active.get_active_torch_device()


+def parse_config():
+    head_dim_txt = os.getenv("HEAD_DIM", "")
+    print("HEAD_DIM", head_dim_txt)
+
+    head_dims = [64, 128]
+    try:
+        head_dim = int(head_dim_txt)
+        head_dims = [head_dim]
+    except ValueError:
+        pass
+
+    # With FWD_FP8_ONLY we will only run forward with FP8 and will not run backward
+    # With FWD_FP8_SKIP we will skip forward with FP8, but will run forward with FP16 and backward with FP8 & FP16
+    # The reason is that currently the slowest step is kernel autotuning, which is only done for forward pass
+    # The slowest autotune is FP8, which is several times slower than FP16
+    # However, backward pass currently involves calling forward pass, hence, has the same slow time
+    # But, backward pass for FP8 is actually just backward pass for FP16, there is no difference, so it uses FP16 forward tuning.
+    # So from a workload perspective the best strategy for parallel execution is to run separately
+    # 1. Forward pass with FP8, which will trigger autotune(FP8-FWD)
+    # 2. Forward pass with FP16 and backward with FP8 & FP16, which will trigger only autotune(FP16-FWD)
+    fwd_fp8_only_txt = os.getenv("FWD_FP8_ONLY", "0")
+    fwd_fp8_skip_txt = os.getenv("FWD_FP8_SKIP", "0")
+    print("FWD_FP8_ONLY", fwd_fp8_only_txt)
+    print("FWD_FP8_SKIP", fwd_fp8_skip_txt)
+    if fwd_fp8_only_txt == "1":
+        fwd_dtypes = ['fp8']
+        modes = ['fwd']
+    elif fwd_fp8_skip_txt == "1":
+        fwd_dtypes = ['fp16']
+        modes = ['fwd', 'bwd']
+    else:
+        fwd_dtypes = ['fp8', 'fp16']
+        modes = ['fwd', 'bwd']
+
+    return head_dims, fwd_dtypes, modes
+
+
+HEAD_DIMS, FWD_DTYPES, MODES = parse_config()
+print("HEAD_DIM_OPTIONS", HEAD_DIMS, "FWD_DTYPES", FWD_DTYPES, "MODES", MODES)
+
+
 def is_hip():
     return triton.runtime.driver.active.get_current_target().backend == "hip"

@@ -705,15 +746,23 @@ def test_op(Z, H, N_CTX, HEAD_DIM, causal, warp_specialize, mode, provider, dtyp
     # Enable warpspec for causal fwd on Hopper
     enable_ws = mode == "fwd" and (is_blackwell() or (is_hopper() and not causal))
     for warp_specialize in [False, True] if enable_ws else [False]:
+
+        if HEAD_DIM not in HEAD_DIMS or mode not in MODES:
+            continue
+        include_fp8 = mode != 'fwd' or 'fp8' in FWD_DTYPES
+        include_fp16 = mode != 'fwd' or 'fp16' in FWD_DTYPES
+
         configs.append(
             triton.testing.Benchmark(
                 x_names=["N_CTX"],
                 x_vals=[2**i for i in range(10, 15)],
                 line_arg="provider",
-                line_vals=["triton-fp16"] + (["triton-fp8"] if TORCH_HAS_FP8 else []) +
-                (["flash"] if HAS_FLASH else []),
-                line_names=["Triton [FP16]"] + (["Triton [FP8]"] if TORCH_HAS_FP8 else []) +
-                (["Flash-2"] if HAS_FLASH else []),
+                line_vals=((["triton-fp16"] if include_fp16 else []) +
+                           (["triton-fp8"] if TORCH_HAS_FP8 and include_fp8 else []) +
+                           (["flash"] if HAS_FLASH else [])),
+                line_names=((["Triton [FP16]"] if include_fp16 else []) +
+                            (["Triton [FP8]"] if TORCH_HAS_FP8 and include_fp8 else []) +
+                            (["Flash-2"] if HAS_FLASH else [])),
                 styles=[("red", "-"), ("blue", "-"), ("green", "-")],
                 ylabel="TFLOPS",
                 plot_name=

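Since parse_config() only reads environment variables, the tutorial can also be driven directly with the same settings the CI parts use. A hedged sketch, assuming it is run from python/tutorials with the tutorial's usual dependencies installed:

    # Part 1: head dim 64, both dtypes, fwd and bwd
    HEAD_DIM=64 python 06-fused-attention.py
    # Part 2: head dim 128, forward pass with FP8 only (triggers the slow FP8 forward autotune)
    HEAD_DIM=128 FWD_FP8_ONLY=1 python 06-fused-attention.py
    # Part 3: head dim 128, FP16 forward plus FP8/FP16 backward (only the FP16 forward autotune)
    HEAD_DIM=128 FWD_FP8_SKIP=1 python 06-fused-attention.py
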
scripts/pytest-utils.sh

Lines changed: 0 additions & 4 deletions
@@ -58,10 +58,6 @@ pytest() {
 }

 run_tutorial_test() {
-    if [[ -f $TRITON_TEST_SELECTFILE ]] && ! grep -qF "$1" "$TRITON_TEST_SELECTFILE"; then
-        return
-    fi
-
     echo
     echo "****** Running $1 test ******"
     echo

scripts/test-triton.sh

Lines changed: 55 additions & 11 deletions
@@ -18,6 +18,9 @@ TEST:
     --softmax
     --gemm
     --flash-attention
+    - tutorial-fa-64
+    - tutorial-fa-128-fwdfp8
+    - tutorial-fa-128-nofwdfp8
     --flex-attention
     --instrumentation
     --inductor
@@ -110,6 +113,27 @@ while (( $# != 0 )); do
            TEST_DEFAULT=false
            shift
            ;;
+        --tutorial-fa-64)
+            TEST_TUTORIAL=true
+            TEST_TUTORIAL_FA=true
+            FA_CONFIG="HEAD_DIM=64"
+            TEST_DEFAULT=false
+            shift
+            ;;
+        --tutorial-fa-128-fwdfp8)
+            TEST_TUTORIAL=true
+            TEST_TUTORIAL_FA=true
+            FA_CONFIG="HEAD_DIM=128 FWD_FP8_ONLY=1"
+            TEST_DEFAULT=false
+            shift
+            ;;
+        --tutorial-fa-128-nofwdfp8)
+            TEST_TUTORIAL=true
+            TEST_TUTORIAL_FA=true
+            FA_CONFIG="HEAD_DIM=128 FWD_FP8_SKIP=1"
+            TEST_DEFAULT=false
+            shift
+            ;;
        --microbench)
            TEST_MICRO_BENCHMARKS=true
            TEST_DEFAULT=false
@@ -371,17 +395,37 @@ run_tutorial_tests() {
    python -m pip install matplotlib pandas tabulate -q
    cd $TRITON_PROJ/python/tutorials

-    run_tutorial_test "01-vector-add"
-    run_tutorial_test "02-fused-softmax"
-    run_tutorial_test "03-matrix-multiplication"
-    run_tutorial_test "04-low-memory-dropout"
-    run_tutorial_test "05-layer-norm"
-    run_tutorial_test "06-fused-attention"
-    run_tutorial_test "07-extern-functions"
-    run_tutorial_test "08-grouped-gemm"
-    run_tutorial_test "09-persistent-matmul"
-    run_tutorial_test "10-experimental-block-pointer"
-    run_tutorial_test "10i-experimental-block-pointer"
+    tutorials=(
+        "01-vector-add"
+        "02-fused-softmax"
+        "03-matrix-multiplication"
+        "04-low-memory-dropout"
+        "05-layer-norm"
+        "06-fused-attention"
+        "07-extern-functions"
+        "08-grouped-gemm"
+        "09-persistent-matmul"
+        "10-experimental-block-pointer"
+        "10i-experimental-block-pointer"
+    )
+    if [ "${TEST_TUTORIAL_FA:-false}" = true ]; then
+        tutorials=(
+            "06-fused-attention"
+        )
+
+        if [ -n "${FA_CONFIG:-}" ]; then
+            # Contains the specific config for the fused attention tutorial
+            export $FA_CONFIG
+        fi
+    fi
+
+    for tutorial in "${tutorials[@]}"; do
+        if [[ -f $TRITON_TEST_SELECTFILE ]] && ! grep -qF "$tutorial" "$TRITON_TEST_SELECTFILE"; then
+            continue
+        fi
+
+        run_tutorial_test "$tutorial"
+    done
 }

 run_microbench_tests() {

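One detail in run_tutorial_tests above: FA_CONFIG holds space-separated NAME=VALUE pairs, and the unquoted `export $FA_CONFIG` relies on shell word splitting so that each pair is exported separately. A standalone sketch of that behavior (default IFS assumed):

    FA_CONFIG="HEAD_DIM=128 FWD_FP8_ONLY=1"
    # Unquoted expansion splits on spaces, so export receives two NAME=VALUE words.
    export $FA_CONFIG
    echo "$HEAD_DIM"        # 128
    echo "$FWD_FP8_ONLY"    # 1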