intel
diff --git a/‎.github/workflows/integration-tests.yml‎
Lines changed: 4 additions & 25 deletions b/‎.github/workflows/integration-tests.yml‎
Lines changed: 4 additions & 25 deletions
diff --git a/‎.github/workflows/integration-tests.yml.in‎
Lines changed: 2 additions & 9 deletions b/‎.github/workflows/integration-tests.yml.in‎
Lines changed: 2 additions & 9 deletions
diff --git a/‎bench/bench/bench_mlp.py‎
Lines changed: 16 additions & 4 deletions b/‎bench/bench/bench_mlp.py‎
Lines changed: 16 additions & 4 deletions
diff --git a/‎bench/triton_bench/matmul_ogs_details/opt_flags.py‎
Lines changed: 2 additions & 1 deletion b/‎bench/triton_bench/matmul_ogs_details/opt_flags.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎bench/triton_bench/matmul_ogs_details/opt_flags_amd.py‎
Lines changed: 8 additions & 0 deletions b/‎bench/triton_bench/matmul_ogs_details/opt_flags_amd.py‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h‎
Lines changed: 9 additions & 4 deletions b/‎include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h‎
Lines changed: 9 additions & 4 deletions
diff --git a/‎include/triton/Conversion/TritonGPUToLLVM/Utility.h‎
Lines changed: 1 addition & 2 deletions b/‎include/triton/Conversion/TritonGPUToLLVM/Utility.h‎
Lines changed: 1 addition & 2 deletions
@@ -31,7 +31,7 @@ env:
 jobs:
   Runner-Preparation:
     runs-on: ubuntu-latest
-    timeout-minutes: 30
+    timeout-minutes: 45
     outputs:
       matrix-CUDA: ${{ steps.set-matrix.outputs.matrix-CUDA }}
       matrix-HIP: ${{ steps.set-matrix.outputs.matrix-HIP }}
@@ -198,12 +198,7 @@ jobs:
             ~/.triton/nvidia
             ~/.triton/json
           key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-json-${{ steps.cache-key.outputs.json }}
-      - # Cache ~/.triton/cache because the vast majority of unit test time is
-        # spent compiling.  Triton won't (well, should not) use these cached files
-        # if something internal to Triton changes, because Triton's internal
-        # source code is part of the cache key.
-        #
-        # Similarly, cache ~/.cache/ccache to speed up compilation.
+      - # Cache ~/.ccache to speed up compilation.
         #
         # On branch `main` we always start from an empty cache, i.e. we skip the
         # "restore" step.  This is to prevent the caches from accumulating stale
@@ -214,7 +209,6 @@ jobs:
         uses: actions/cache/restore@v4
         with:
           path: |
-            ~/.triton/cache
             ~/.ccache
           # Restore the most recent cache entry.
           restore-keys: |
@@ -285,7 +279,6 @@ jobs:
         uses: actions/cache/save@v4
         with:
           path: |
-            ~/.triton/cache
             ~/.ccache
           key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
   Integration-Tests-AMD:
@@ -336,12 +329,7 @@ jobs:
             ~/.triton/nvidia
             ~/.triton/json
           key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-json-${{ steps.cache-key.outputs.json }}
-      - # Cache ~/.triton/cache because the vast majority of unit test time is
-        # spent compiling.  Triton won't (well, should not) use these cached files
-        # if something internal to Triton changes, because Triton's internal
-        # source code is part of the cache key.
-        #
-        # Similarly, cache ~/.cache/ccache to speed up compilation.
+      - # Cache ~/.ccache to speed up compilation.
         #
         # On branch `main` we always start from an empty cache, i.e. we skip the
         # "restore" step.  This is to prevent the caches from accumulating stale
@@ -352,7 +340,6 @@ jobs:
         uses: actions/cache/restore@v4
         with:
           path: |
-            ~/.triton/cache
             ~/.ccache
           # Restore the most recent cache entry.
           restore-keys: |
@@ -443,7 +430,6 @@ jobs:
         uses: actions/cache/save@v4
         with:
           path: |
-            ~/.triton/cache
             ~/.ccache
           key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
       - name: Clean up caches
@@ -500,12 +486,7 @@ jobs:
             ~/.triton/nvidia
             ~/.triton/json
           key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-json-${{ steps.cache-key.outputs.json }}
-      - # Cache ~/.triton/cache because the vast majority of unit test time is
-        # spent compiling.  Triton won't (well, should not) use these cached files
-        # if something internal to Triton changes, because Triton's internal
-        # source code is part of the cache key.
-        #
-        # Similarly, cache ~/.cache/ccache to speed up compilation.
+      - # Cache ~/.ccache to speed up compilation.
         #
         # On branch `main` we always start from an empty cache, i.e. we skip the
         # "restore" step.  This is to prevent the caches from accumulating stale
@@ -516,7 +497,6 @@ jobs:
         uses: actions/cache/restore@v4
         with:
           path: |
-            ~/.triton/cache
             ~/.ccache
           # Restore the most recent cache entry.
           restore-keys: |
@@ -572,6 +552,5 @@ jobs:
         uses: actions/cache/save@v4
         with:
           path: |
-            ~/.triton/cache
             ~/.ccache
           key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
@@ -34,7 +34,7 @@ env:
 jobs:
   Runner-Preparation:
     runs-on: ubuntu-latest
-    timeout-minutes: 30
+    timeout-minutes: 45
     outputs:
       matrix-CUDA: ${{ steps.set-matrix.outputs.matrix-CUDA }}
       matrix-HIP: ${{ steps.set-matrix.outputs.matrix-HIP }}
@@ -225,12 +225,7 @@ jobs:
             ~/.triton/json
           key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-json-${{ steps.cache-key.outputs.json }}
 
-      # Cache ~/.triton/cache because the vast majority of unit test time is
-      # spent compiling.  Triton won't (well, should not) use these cached files
-      # if something internal to Triton changes, because Triton's internal
-      # source code is part of the cache key.
-      #
-      # Similarly, cache ~/.cache/ccache to speed up compilation.
+      # Cache ~/.ccache to speed up compilation.
       #
       # On branch `main` we always start from an empty cache, i.e. we skip the
       # "restore" step.  This is to prevent the caches from accumulating stale
@@ -242,7 +237,6 @@ jobs:
         uses: actions/cache/restore@v4
         with:
           path: |
-            ~/.triton/cache
             ~/.ccache
           # Restore the most recent cache entry.
           restore-keys: |
@@ -325,7 +319,6 @@ jobs:
         uses: actions/cache/save@v4
         with:
           path: |
-            ~/.triton/cache
             ~/.ccache
           key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
 
 
@@ -1,5 +1,6 @@
 from pathlib import Path
 import json
+import triton
 import triton.profiler as proton
 import torch
 import triton_bench.swiglu
@@ -9,7 +10,13 @@
 from triton_bench.routing import routing_torch, simulate_expert_sharded_routing
 from triton_bench.meta import cuda_capability_geq
 
-if torch.cuda.is_available():
+
+def is_hip_cdna4():
+    # Report whether the currently active Triton target is the HIP backend
+    # with architecture 'gfx950' (the arch this helper treats as CDNA4).
+    # Queries the active driver on every call; nothing is cached here.
+    target = triton.runtime.driver.active.get_current_target()
+    return target.backend == 'hip' and target.arch == 'gfx950'
+
+
+if torch.cuda.is_available() and not is_hip_cdna4():
     from triton._C.libtriton import nvidia
     cublas_workspace = torch.empty(32 * 1024 * 1024, device="cuda", dtype=torch.uint8)
     cublas = nvidia.cublas.CublasLt(cublas_workspace)
@@ -18,6 +25,9 @@
 
 
 def _query_gpu_specs():
+    if is_hip_cdna4():
+        # no spec data yet.
+        return None
     import subprocess
     cmd = ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader", "-i=0"]
     output = subprocess.check_output(cmd, stderr=subprocess.DEVNULL).decode().strip()
@@ -119,8 +129,10 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype,
         # TODO: proton should really be recording that in the json instead of
         # relying on the user to aggregate
         tot_time = sum(x["metrics"].get("time (ns)", 0) for x in data[0]["children"])
-        min_time_flops = sum([tot_flops[w] / SPECS[f"MAX_TFLOPS{w}"] for w in [8, 16]]) * 1e-3
-        min_time_bytes = tot_bytes / SPECS["MAX_TBPS"] * 1e-3
+        min_time_flops = min_time_bytes = 0
+        if SPECS is not None:
+            min_time_flops = sum([tot_flops[w] / SPECS[f"MAX_TFLOPS{w}"] for w in [8, 16]]) * 1e-3
+            min_time_bytes = tot_bytes / SPECS["MAX_TBPS"] * 1e-3
         min_time = max(min_time_flops, min_time_bytes)
         util = min_time / tot_time
         tflops = sum([tot_flops[w] for w in [8, 16]]) / tot_time * 1e-3
@@ -130,7 +142,7 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype,
 
 
 if __name__ == "__main__":
-    has_native_mx4 = torch.cuda.get_device_capability(0)[0] >= 10
+    has_native_mx4 = torch.cuda.get_device_capability(0)[0] >= 10 or is_hip_cdna4()
     qxdtype = "fp8" if has_native_mx4 else "bf16"
     print(bench_mlp(8192, 8192, 8192, 1, 1, "fp8", "fp8", TP=1, EP=1, name="dense"))
     print(bench_mlp(8192, 8192, 8192, 1, 1, qxdtype, "mx4", TP=1, EP=1, name="dense"))
 
@@ -61,6 +61,7 @@ def make_default_opt_flags_amd(
         block_m = 128
     elif tokens_per_expt >= 512 and n >= 2048:
         block_m = 128
+
     else:
         block_m = max(32, min(triton.next_power_of_2(tokens_per_expt), 64))
     if routing_data is not None:
@@ -90,7 +91,7 @@ def make_default_opt_flags_amd(
     num_stages = 2
     is_persistent = False
     # AMD-specific
-    target_kernel_kwargs = {"waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2}
+    target_kernel_kwargs = {"waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1}
     return OptFlags(
         block_m=block_m,
         block_n=block_n,
 
@@ -2,6 +2,11 @@
 import triton
 
 
+def is_hip_cdna4():
+    # Report whether the currently active Triton target is the HIP backend
+    # with architecture 'gfx950' (the arch this helper treats as CDNA4).
+    # Queries the active driver on every call; nothing is cached here.
+    target = triton.runtime.driver.active.get_current_target()
+    return target.backend == 'hip' and target.arch == 'gfx950'
+
+
 def compute_block_nk(n, block_m, grid_m, num_xcds, lhs_dtype, rhs_dtype, microscaling_ctx):
     lhs_width = lhs_dtype.itemsize
     rhs_width = rhs_dtype.itemsize if microscaling_ctx.weight_scale is None else 0.5
@@ -18,6 +23,9 @@ def compute_block_nk(n, block_m, grid_m, num_xcds, lhs_dtype, rhs_dtype, microsc
     else:
         block_n = 128
 
+    if is_hip_cdna4() and block_m == 128:
+        block_n = 512
+
     # block_k needs to match the cacheline size (128B)
     block_k = int(128 // min(lhs_width, rhs_width))
 
 
@@ -94,10 +94,15 @@ class TargetInfoBase {
 
   virtual bool supportVectorizedAtomics() const = 0;
 
-  // Helper used by targets to annotate store operations during lowering to
-  // llvm.
-  virtual void storeOpAnnotation(triton::gpu::LocalStoreOp op,
-                                 size_t localStoreOpCount, Type type) const {}
+  // Annotate local store operations with target-specific information during
+  // lowering to LLVM.
+  virtual void localStoreOpAnnotation(triton::gpu::LocalStoreOp op,
+                                      size_t localStoreOpCount,
+                                      Type type) const {}
+  // Annotate local load operations with target-specific information during
+  // lowering to LLVM. `llLoadOp` is the generated LLVM load op.
+  virtual void localLoadOpAnnotation(triton::gpu::LocalLoadOp localLoadOp,
+                                     Operation *llLoadOp) const {}
 
   virtual ~TargetInfoBase() {}
 };
 
@@ -706,8 +706,7 @@ emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
     Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
     std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback);
 
-SmallVector<Value> loadSharedToDistributed(RankedTensorType dstTy,
-                                           triton::gpu::MemDescType srcTy,
+SmallVector<Value> loadSharedToDistributed(triton::gpu::LocalLoadOp localLoadOp,
                                            Type elemLlvmTy,
                                            const SharedMemoryObject &smemObj,
                                            Location loc, RewriterBase &rewriter,