intel
diff --git a/‎.github/workflows/integration-tests.yml‎
Lines changed: 4 additions & 25 deletions b/‎.github/workflows/integration-tests.yml‎
Lines changed: 4 additions & 25 deletions
diff --git a/‎.github/workflows/integration-tests.yml.in‎
Lines changed: 2 additions & 9 deletions b/‎.github/workflows/integration-tests.yml.in‎
Lines changed: 2 additions & 9 deletions
diff --git a/‎bench/bench/bench_mlp.py‎
Lines changed: 16 additions & 4 deletions b/‎bench/bench/bench_mlp.py‎
Lines changed: 16 additions & 4 deletions
diff --git a/‎bench/triton_bench/matmul_ogs_details/opt_flags.py‎
Lines changed: 2 additions & 1 deletion b/‎bench/triton_bench/matmul_ogs_details/opt_flags.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎bench/triton_bench/matmul_ogs_details/opt_flags_amd.py‎
Lines changed: 8 additions & 0 deletions b/‎bench/triton_bench/matmul_ogs_details/opt_flags_amd.py‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h‎
Lines changed: 9 additions & 4 deletions b/‎include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h‎
Lines changed: 9 additions & 4 deletions
diff --git a/‎include/triton/Conversion/TritonGPUToLLVM/Utility.h‎
Lines changed: 1 addition & 2 deletions b/‎include/triton/Conversion/TritonGPUToLLVM/Utility.h‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎include/triton/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.h‎
Lines changed: 65 additions & 37 deletions b/‎include/triton/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.h‎
Lines changed: 65 additions & 37 deletions
@@ -31,7 +31,7 @@ env:
 jobs:
   Runner-Preparation:
     runs-on: ubuntu-latest
-    timeout-minutes: 30
+    timeout-minutes: 45
     outputs:
       matrix-CUDA: ${{ steps.set-matrix.outputs.matrix-CUDA }}
       matrix-HIP: ${{ steps.set-matrix.outputs.matrix-HIP }}
@@ -198,12 +198,7 @@ jobs:
             ~/.triton/nvidia
             ~/.triton/json
           key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-json-${{ steps.cache-key.outputs.json }}
-      - # Cache ~/.triton/cache because the vast majority of unit test time is
-        # spent compiling.  Triton won't (well, should not) use these cached files
-        # if something internal to Triton changes, because Triton's internal
-        # source code is part of the cache key.
-        #
-        # Similarly, cache ~/.cache/ccache to speed up compilation.
+      - # Cache ~/.ccache to speed up compilation.
         #
         # On branch `main` we always start from an empty cache, i.e. we skip the
         # "restore" step.  This is to prevent the caches from accumulating stale
@@ -214,7 +209,6 @@ jobs:
         uses: actions/cache/restore@v4
         with:
           path: |
-            ~/.triton/cache
             ~/.ccache
           # Restore the most recent cache entry.
           restore-keys: |
@@ -285,7 +279,6 @@ jobs:
         uses: actions/cache/save@v4
         with:
           path: |
-            ~/.triton/cache
             ~/.ccache
           key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
   Integration-Tests-AMD:
@@ -336,12 +329,7 @@ jobs:
             ~/.triton/nvidia
             ~/.triton/json
           key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-json-${{ steps.cache-key.outputs.json }}
-      - # Cache ~/.triton/cache because the vast majority of unit test time is
-        # spent compiling.  Triton won't (well, should not) use these cached files
-        # if something internal to Triton changes, because Triton's internal
-        # source code is part of the cache key.
-        #
-        # Similarly, cache ~/.cache/ccache to speed up compilation.
+      - # Cache ~/.ccache to speed up compilation.
         #
         # On branch `main` we always start from an empty cache, i.e. we skip the
         # "restore" step.  This is to prevent the caches from accumulating stale
@@ -352,7 +340,6 @@ jobs:
         uses: actions/cache/restore@v4
         with:
           path: |
-            ~/.triton/cache
             ~/.ccache
           # Restore the most recent cache entry.
           restore-keys: |
@@ -443,7 +430,6 @@ jobs:
         uses: actions/cache/save@v4
         with:
           path: |
-            ~/.triton/cache
             ~/.ccache
           key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
       - name: Clean up caches
@@ -500,12 +486,7 @@ jobs:
             ~/.triton/nvidia
             ~/.triton/json
           key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-json-${{ steps.cache-key.outputs.json }}
-      - # Cache ~/.triton/cache because the vast majority of unit test time is
-        # spent compiling.  Triton won't (well, should not) use these cached files
-        # if something internal to Triton changes, because Triton's internal
-        # source code is part of the cache key.
-        #
-        # Similarly, cache ~/.cache/ccache to speed up compilation.
+      - # Cache ~/.ccache to speed up compilation.
         #
         # On branch `main` we always start from an empty cache, i.e. we skip the
         # "restore" step.  This is to prevent the caches from accumulating stale
@@ -516,7 +497,6 @@ jobs:
         uses: actions/cache/restore@v4
         with:
           path: |
-            ~/.triton/cache
             ~/.ccache
           # Restore the most recent cache entry.
           restore-keys: |
@@ -572,6 +552,5 @@ jobs:
         uses: actions/cache/save@v4
         with:
           path: |
-            ~/.triton/cache
             ~/.ccache
           key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
@@ -34,7 +34,7 @@ env:
 jobs:
   Runner-Preparation:
     runs-on: ubuntu-latest
-    timeout-minutes: 30
+    timeout-minutes: 45
     outputs:
       matrix-CUDA: ${{ steps.set-matrix.outputs.matrix-CUDA }}
       matrix-HIP: ${{ steps.set-matrix.outputs.matrix-HIP }}
@@ -225,12 +225,7 @@ jobs:
             ~/.triton/json
           key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-json-${{ steps.cache-key.outputs.json }}
 
-      # Cache ~/.triton/cache because the vast majority of unit test time is
-      # spent compiling.  Triton won't (well, should not) use these cached files
-      # if something internal to Triton changes, because Triton's internal
-      # source code is part of the cache key.
-      #
-      # Similarly, cache ~/.cache/ccache to speed up compilation.
+      # Cache ~/.ccache to speed up compilation.
       #
       # On branch `main` we always start from an empty cache, i.e. we skip the
       # "restore" step.  This is to prevent the caches from accumulating stale
@@ -242,7 +237,6 @@ jobs:
         uses: actions/cache/restore@v4
         with:
           path: |
-            ~/.triton/cache
             ~/.ccache
           # Restore the most recent cache entry.
           restore-keys: |
@@ -325,7 +319,6 @@ jobs:
         uses: actions/cache/save@v4
         with:
           path: |
-            ~/.triton/cache
             ~/.ccache
           key: triton-artifacts-${{ runner.os }}-${{ runner.arch }}-${{ env.RUNNER_TYPE }}-llvm-${{ steps.cache-key.outputs.llvm }}-${{ steps.cache-key.outputs.datetime }}
 
 
@@ -1,5 +1,6 @@
 from pathlib import Path
 import json
+import triton
 import triton.profiler as proton
 import torch
 import triton_bench.swiglu
@@ -9,7 +10,13 @@
 from triton_bench.routing import routing_torch, simulate_expert_sharded_routing
 from triton_bench.meta import cuda_capability_geq
 
-if torch.cuda.is_available():
+
+def is_hip_cdna4():
+    """Report whether the active Triton target is backend 'hip' with arch 'gfx950'."""
+    active_target = triton.runtime.driver.active.get_current_target()
+    if active_target.backend != 'hip':
+        return False
+    return active_target.arch == 'gfx950'
+
+
+if torch.cuda.is_available() and not is_hip_cdna4():
     from triton._C.libtriton import nvidia
     cublas_workspace = torch.empty(32 * 1024 * 1024, device="cuda", dtype=torch.uint8)
     cublas = nvidia.cublas.CublasLt(cublas_workspace)
@@ -18,6 +25,9 @@
 
 
 def _query_gpu_specs():
+    if is_hip_cdna4():
+        # no spec data yet.
+        return None
     import subprocess
     cmd = ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader", "-i=0"]
     output = subprocess.check_output(cmd, stderr=subprocess.DEVNULL).decode().strip()
@@ -119,8 +129,10 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype,
         # TODO: proton should really be recording that in the json instead of
         # relying on the user to aggregate
         tot_time = sum(x["metrics"].get("time (ns)", 0) for x in data[0]["children"])
-        min_time_flops = sum([tot_flops[w] / SPECS[f"MAX_TFLOPS{w}"] for w in [8, 16]]) * 1e-3
-        min_time_bytes = tot_bytes / SPECS["MAX_TBPS"] * 1e-3
+        min_time_flops = min_time_bytes = 0
+        if SPECS is not None:
+            min_time_flops = sum([tot_flops[w] / SPECS[f"MAX_TFLOPS{w}"] for w in [8, 16]]) * 1e-3
+            min_time_bytes = tot_bytes / SPECS["MAX_TBPS"] * 1e-3
         min_time = max(min_time_flops, min_time_bytes)
         util = min_time / tot_time
         tflops = sum([tot_flops[w] for w in [8, 16]]) / tot_time * 1e-3
@@ -130,7 +142,7 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype,
 
 
 if __name__ == "__main__":
-    has_native_mx4 = torch.cuda.get_device_capability(0)[0] >= 10
+    has_native_mx4 = torch.cuda.get_device_capability(0)[0] >= 10 or is_hip_cdna4()
     qxdtype = "fp8" if has_native_mx4 else "bf16"
     print(bench_mlp(8192, 8192, 8192, 1, 1, "fp8", "fp8", TP=1, EP=1, name="dense"))
     print(bench_mlp(8192, 8192, 8192, 1, 1, qxdtype, "mx4", TP=1, EP=1, name="dense"))
 
@@ -61,6 +61,7 @@ def make_default_opt_flags_amd(
         block_m = 128
     elif tokens_per_expt >= 512 and n >= 2048:
         block_m = 128
+
     else:
         block_m = max(32, min(triton.next_power_of_2(tokens_per_expt), 64))
     if routing_data is not None:
@@ -90,7 +91,7 @@ def make_default_opt_flags_amd(
     num_stages = 2
     is_persistent = False
     # AMD-specific
-    target_kernel_kwargs = {"waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2}
+    target_kernel_kwargs = {"waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1}
     return OptFlags(
         block_m=block_m,
         block_n=block_n,
 
@@ -2,6 +2,11 @@
 import triton
 
 
+def is_hip_cdna4():
+    """Report whether the active Triton target is backend 'hip' with arch 'gfx950'."""
+    active_target = triton.runtime.driver.active.get_current_target()
+    if active_target.backend != 'hip':
+        return False
+    return active_target.arch == 'gfx950'
+
+
 def compute_block_nk(n, block_m, grid_m, num_xcds, lhs_dtype, rhs_dtype, microscaling_ctx):
     lhs_width = lhs_dtype.itemsize
     rhs_width = rhs_dtype.itemsize if microscaling_ctx.weight_scale is None else 0.5
@@ -18,6 +23,9 @@ def compute_block_nk(n, block_m, grid_m, num_xcds, lhs_dtype, rhs_dtype, microsc
     else:
         block_n = 128
 
+    if is_hip_cdna4() and block_m == 128:
+        block_n = 512
+
     # block_k needs to match the cacheline size (128B)
     block_k = int(128 // min(lhs_width, rhs_width))
 
 
@@ -94,10 +94,15 @@ class TargetInfoBase {
 
   virtual bool supportVectorizedAtomics() const = 0;
 
-  // Helper used by targets to annotate store operations during lowering to
-  // llvm.
-  virtual void storeOpAnnotation(triton::gpu::LocalStoreOp op,
-                                 size_t localStoreOpCount, Type type) const {}
+  // Annotate target specific information to local store operations during
+  // lowering to LLVM.
+  virtual void localStoreOpAnnotation(triton::gpu::LocalStoreOp op,
+                                      size_t localStoreOpCount,
+                                      Type type) const {}
+  // Annotate target specific information to local load operations during
+  // lowering to LLVM. `llLoadOp` is the generated LLVM load op.
+  virtual void localLoadOpAnnotation(triton::gpu::LocalLoadOp localLoadOp,
+                                     Operation *llLoadOp) const {}
 
   virtual ~TargetInfoBase() {}
 };
 
@@ -706,8 +706,7 @@ emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
     Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
     std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback);
 
-SmallVector<Value> loadSharedToDistributed(RankedTensorType dstTy,
-                                           triton::gpu::MemDescType srcTy,
+SmallVector<Value> loadSharedToDistributed(triton::gpu::LocalLoadOp localLoadOp,
                                            Type elemLlvmTy,
                                            const SharedMemoryObject &smemObj,
                                            Location loc, RewriterBase &rewriter,
 
@@ -149,6 +149,62 @@ inline int64_t getTMAContigDim(gpu::MemDescType memDescType) {
   return getTMAContigDim(memDescType.getEncoding(), memDescType.getShape());
 }
 
+// Computes the TMA swizzle-mode value for the block type of tensor descriptor
+// `ty`. Returns std::nullopt when the block's shared encoding is neither an
+// NVMMA shared encoding nor a swizzled shared encoding whose vec, perPhase and
+// maxPhase are all 1; in that case an error is emitted on `op` if `op` is
+// non-null.
+inline std::optional<int> getTMASwizzleMode(Operation *op, TensorDescType ty) {
+  auto blockEncoding = ty.getBlockType().getEncoding();
+  auto nvmmaEnc = dyn_cast<gpu::NVMMASharedEncodingAttr>(blockEncoding);
+
+  // Only an NVMMA shared encoding contributes a swizzling width or the
+  // fp4-padded flag; any other accepted encoding is handled as 0 bytes.
+  unsigned widthInBytes = 0;
+  bool isFp4Padded = false;
+  if (nvmmaEnc) {
+    widthInBytes = nvmmaEnc.getSwizzlingByteWidth();
+    isFp4Padded = nvmmaEnc.getFp4Padded();
+  } else {
+    auto plainEnc = dyn_cast<gpu::SwizzledSharedEncodingAttr>(blockEncoding);
+    const bool isTrivial = plainEnc && plainEnc.getVec() == 1 &&
+                           plainEnc.getPerPhase() == 1 &&
+                           plainEnc.getMaxPhase() == 1;
+    if (!isTrivial) {
+      if (op)
+        op->emitError("Unhandled encoding type");
+      return std::nullopt;
+    }
+  }
+
+  assert((!isFp4Padded || widthInBytes == 128) &&
+         "elem type .b4x16_p64 supports only 128B swizzling");
+
+  // Swizzle width in bytes -> mode value; any unlisted width maps to 0.
+  // NOTE(review): values presumably mirror the CUDA tensor-map swizzle enum —
+  // confirm against the consumer of swizzle_mode.
+  switch (widthInBytes) {
+  case 128:
+    return 3;
+  case 64:
+    return 2;
+  case 32:
+    return 1;
+  default:
+    return 0;
+  }
+}
+
+// Computes the TMA element-type value for the block type of tensor descriptor
+// `ty`. An NVMMA shared encoding with the fp4-padded flag set yields 14 without
+// looking at the element width. Otherwise element sizes of 1, 2 and 4 bytes
+// yield 0, 1 and 2. Any other size returns std::nullopt, emitting an error on
+// `op` when `op` is non-null.
+inline std::optional<int> getTMAElementType(Operation *op, TensorDescType ty) {
+  auto blockTy = ty.getBlockType();
+  auto nvmmaEnc =
+      dyn_cast<gpu::NVMMASharedEncodingAttr>(blockTy.getEncoding());
+  if (nvmmaEnc && nvmmaEnc.getFp4Padded())
+    return 14; // .b4x16_p64
+
+  // NOTE(review): assuming the bit width is an integer, widths below 8 divide
+  // to 0 and end up on the error path below.
+  auto byteWidth = blockTy.getElementTypeBitWidth() / 8;
+  if (byteWidth == 1)
+    return 0;
+  if (byteWidth == 2)
+    return 1;
+  if (byteWidth == 4)
+    return 2;
+
+  if (op) {
+    op->emitError()
+        << "Tensor descriptor element type must have size 1, 2, or 4 but got "
+        << byteWidth;
+  }
+  return std::nullopt;
+}
+
 template <typename BuilderT>
 mlir::LogicalResult createTMADesc(mlir::Value tmaPtr,
                                   mlir::triton::MakeTensorDescOp op,
@@ -182,8 +238,6 @@ mlir::LogicalResult createTMADesc(mlir::Value tmaPtr,
     boxDim.push_back(mkI32Constant(shapePerCTA[k]));
 
   unsigned swizzleBytes = mmaEncoding ? mmaEncoding.getSwizzlingByteWidth() : 0;
-  assert(!fp4Padded || swizzleBytes == 128 &&
-                           "elem type .b4x16_p64 supports only 128B swizzling");
   if (!mmaEncoding) {
     auto swizzledEnc = dyn_cast<gpu::SwizzledSharedEncodingAttr>(
         op.getType().getBlockType().getEncoding());
@@ -194,14 +248,10 @@ mlir::LogicalResult createTMADesc(mlir::Value tmaPtr,
     }
   }
 
-  int32_t swizzle_mode = 0;
-  if (swizzleBytes == 128) {
-    swizzle_mode = 3;
-  } else if (swizzleBytes == 64) {
-    swizzle_mode = 2;
-  } else if (swizzleBytes == 32) {
-    swizzle_mode = 1;
-  }
+  auto maybeSwizzleMode = getTMASwizzleMode(op, op.getType());
+  if (!maybeSwizzleMode)
+    return failure();
+  auto swizzleMode = *maybeSwizzleMode;
 
   Value elemSizeVal = builder.template create<arith::ConstantOp>(
       loc, builder.getI64Type(), builder.getI64IntegerAttr(elemSize));
@@ -224,31 +274,9 @@ mlir::LogicalResult createTMADesc(mlir::Value tmaPtr,
     globalStride[i] = builder.template create<arith::MulIOp>(
         loc, globalStride[i], elemSizeVal);
 
-  int elemTypeEnum;
-
-  if (fp4Padded) {
-    elemTypeEnum = 14; // .b4x16_p64
-  } else {
-    switch (elemSize) {
-    case 1: {
-      elemTypeEnum = 0;
-      break;
-    }
-    case 2: {
-      elemTypeEnum = 1;
-      break;
-    }
-    case 4: {
-      elemTypeEnum = 2;
-      break;
-    }
-    default: {
-      op->emitError()
-          << "Tensor descriptor element type must have size 1, 2, or 4 but got "
-          << elemSize;
-      return failure();
-    }
-    }
+  auto elemTypeEnum = getTMAElementType(op, op.getType());
+  if (!elemTypeEnum) {
+    return failure();
   }
 
   builder.template create<triton::ExperimentalTensormapCreateOp>(
@@ -259,9 +287,9 @@ mlir::LogicalResult createTMADesc(mlir::Value tmaPtr,
       /*global_dim=*/globalDim,
       /*global_stride=*/globalStride,
       /*element_strides=*/elementStride,
-      /*elem_type*/ builder.getI32IntegerAttr(elemTypeEnum),
+      /*elem_type*/ builder.getI32IntegerAttr(*elemTypeEnum),
       /*interleave_layout*/ builder.getI32IntegerAttr(0),
-      /*swizzle_mode=*/builder.getI32IntegerAttr(swizzle_mode),
+      /*swizzle_mode=*/builder.getI32IntegerAttr(swizzleMode),
       /*fill_mode=*/builder.getI32IntegerAttr(0));
   return success();
 }