from pathlib import Path
import json
+ import triton
import triton.profiler as proton
import torch
import triton_bench.swiglu
from triton_bench.mxfp import downcast_to_mxfp
from triton_bench.matmul_ogs import MicroscalingCtx, matmul_ogs, PrecisionConfig, FlexCtx
from triton_bench.numerics import InFlexData
- from triton_bench.routing import routing_torch, simulate_expert_sharded_routing
+ from triton_bench.routing import routing, simulate_expert_sharded_routing
from triton_bench.meta import cuda_capability_geq

- if torch.cuda.is_available():
+
+ def is_hip_cdna4():
+     target = triton.runtime.driver.active.get_current_target()
+     return target.backend == 'hip' and target.arch == 'gfx950'
+
+
+ if torch.cuda.is_available() and not is_hip_cdna4():
    from triton._C.libtriton import nvidia
    cublas_workspace = torch.empty(32 * 1024 * 1024, device="cuda", dtype=torch.uint8)
    cublas = nvidia.cublas.CublasLt(cublas_workspace)


def _query_gpu_specs():
+     if is_hip_cdna4():
+         # no spec data yet.
+         return None
    import subprocess
    cmd = ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader", "-i=0"]
    output = subprocess.check_output(cmd, stderr=subprocess.DEVNULL).decode().strip()
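The new guard asks Triton's runtime driver which target it is compiling for. Below is a minimal sketch of the same check and how a caller might branch on it, assuming get_current_target() returns an object with backend and arch fields; describe_device is a hypothetical helper, not part of this commit.

import triton

def describe_device():
    # Query the active compilation target from Triton's runtime driver.
    target = triton.runtime.driver.active.get_current_target()
    if target.backend == "hip" and target.arch == "gfx950":
        # CDNA4 has no entry in the spec table yet, so _query_gpu_specs() returns None.
        return "hip / gfx950 (CDNA4, SPECS unavailable)"
    return f"{target.backend} / {target.arch}"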
@@ -86,17 +96,19 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype,
    for i in range(100):
        x = torch.randn((batch, dim1), device=dev)
        x = x.to(wg.dtype if n_expts_tot > 1 else x_dtype)
-         # TODO: activate proton here when fast routing is done
+         proton.activate()
        if n_expts_tot > 1:
            logits = matmul_ogs(x, wg, bg, precision_config=pcg)
-             rdata, gather_indx, scatter_indx = routing_torch(logits, n_expts_act)
+             rdata, gather_indx, scatter_indx = routing(logits, n_expts_act)
            if EP > 1:
+                 proton.deactivate()
+                 # TODO: activate proton here when fast expert parallelism simulation is done
                m = logits.shape[0] * EP
                _, rdata, gather_indx, scatter_indx = simulate_expert_sharded_routing(m, rdata, EP, device=dev)
+                 proton.activate()
            x = x.to(x_dtype)
        else:
            rdata, gather_indx, scatter_indx = None, None, None
-         proton.activate()
        # c0 = torch.empty((x.shape[0], w1.shape[-1]), device=dev, dtype=x.dtype)
        # c1 = torch.empty((x.shape[0], w2.shape[-1]), device=dev, dtype=x.dtype)
        # cublas.matmul(x, w1.squeeze(0), c0)
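The activate/deactivate pair controls what proton records, which is how the simulated expert-parallel routing is kept out of the measurement. A minimal, self-contained sketch of that bracketing pattern, assuming proton's session API (start/activate/deactivate/finalize), a CUDA device, and a placeholder matmul workload:

import torch
import triton.profiler as proton

proton.start("bracketing-demo")  # open a profiling session
proton.activate()                # everything from here on is recorded
a = torch.randn(2048, 2048, device="cuda")
b = a @ a                        # measured GPU work
proton.deactivate()              # pause recording for host-side simulation / bookkeeping
checksum = float(b.sum())        # excluded from the profile
proton.activate()                # resume recording
c = b @ b                        # measured again
proton.finalize()                # flush the recorded profile to disk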
@@ -119,8 +131,10 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype,
    # TODO: proton should really be recording that in the json instead of
    # relying on the user to aggregate
    tot_time = sum(x["metrics"].get("time (ns)", 0) for x in data[0]["children"])
-     min_time_flops = sum([tot_flops[w] / SPECS[f"MAX_TFLOPS{w}"] for w in [8, 16]]) * 1e-3
-     min_time_bytes = tot_bytes / SPECS["MAX_TBPS"] * 1e-3
+     min_time_flops = min_time_bytes = 0
+     if SPECS is not None:
+         min_time_flops = sum([tot_flops[w] / SPECS[f"MAX_TFLOPS{w}"] for w in [8, 16]]) * 1e-3
+         min_time_bytes = tot_bytes / SPECS["MAX_TBPS"] * 1e-3
    min_time = max(min_time_flops, min_time_bytes)
    util = min_time / tot_time
    tflops = sum([tot_flops[w] for w in [8, 16]]) / tot_time * 1e-3
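The guarded block computes a roofline-style lower bound on runtime (the slower of the compute-bound and memory-bound estimates) and now degrades gracefully when no spec table exists for the device. A self-contained restatement of that logic, using the same field names as above; roofline_util is an illustrative name, not part of the commit.

def roofline_util(tot_time, tot_flops, tot_bytes, specs):
    # Lower bound from per-precision peak TFLOPS and from peak memory bandwidth (TB/s);
    # with specs unavailable (None) both bounds collapse to 0, and so does the utilization.
    min_time_flops = min_time_bytes = 0
    if specs is not None:
        min_time_flops = sum(tot_flops[w] / specs[f"MAX_TFLOPS{w}"] for w in [8, 16]) * 1e-3
        min_time_bytes = tot_bytes / specs["MAX_TBPS"] * 1e-3
    min_time = max(min_time_flops, min_time_bytes)
    return min_time / tot_time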
@@ -130,9 +144,9 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype,


if __name__ == "__main__":
-     has_native_mx4 = torch.cuda.get_device_capability(0)[0] >= 10
+     has_native_mx4 = torch.cuda.get_device_capability(0)[0] >= 10 or is_hip_cdna4()
    qxdtype = "fp8" if has_native_mx4 else "bf16"
    print(bench_mlp(8192, 8192, 8192, 1, 1, "fp8", "fp8", TP=1, EP=1, name="dense"))
    print(bench_mlp(8192, 8192, 8192, 1, 1, qxdtype, "mx4", TP=1, EP=1, name="dense"))
-     print(bench_mlp(1024, 5120, 8192, 128, 4, "fp8", "fp8", TP=4, EP=2, name="llama4"))
-     print(bench_mlp(1024, 5120, 8192, 128, 4, qxdtype, "mx4", TP=4, EP=2, name="llama4"))
+     print(bench_mlp(2048, 5120, 8192, 128, 4, "fp8", "fp8", TP=4, EP=1, name="llama4"))
+     print(bench_mlp(2048, 5120, 8192, 128, 4, qxdtype, "mx4", TP=4, EP=1, name="llama4"))
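The __main__ block now treats CDNA4 like Blackwell when choosing the activation dtype for the mx4-weight benchmarks: with native MX4 support the quantized runs use fp8 activations, otherwise they fall back to bf16. A small sketch of that selection, assuming the is_hip_cdna4() helper from the top of the file; pick_quantized_dtype is illustrative, not part of the commit.

import torch

def pick_quantized_dtype():
    # Native MX4 support: NVIDIA compute capability >= 10 (Blackwell) or AMD CDNA4.
    has_native_mx4 = (torch.cuda.is_available()
                      and torch.cuda.get_device_capability(0)[0] >= 10) or is_hip_cdna4()
    return "fp8" if has_native_mx4 else "bf16"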