
Commit 586824a

Merge commit '67af519ec69331a3d4e2fc2cd9d45e0165a849a1'
2 parents: ba0c584 + 67af519

10 files changed: 144 additions, 62 deletions


Makefile

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ PYTHON ?= python
 BUILD_DIR := $(shell cd python; $(PYTHON) -c 'from build_helpers import get_cmake_dir; print(get_cmake_dir())')
 TRITON_OPT := $(BUILD_DIR)/bin/triton-opt
 PYTEST := $(PYTHON) -m pytest
-LLVM_BUILD_PATH ?= $(realpath .llvm-project/build)
+LLVM_BUILD_PATH ?= "$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))/.llvm-project/build"
 NUM_PROCS ?= 8

 # Incremental builds

lib/Conversion/TritonInstrumentToLLVM/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -8,4 +8,5 @@ add_triton_library(TritonInstrumentToLLVM
   TritonGPUIR
   TritonInstrumentIR
   TritonNvidiaGPUIR
+  NVGPUIR
 )

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/Partition.cpp

Lines changed: 6 additions & 9 deletions
@@ -2,6 +2,7 @@
 #include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
 #include "llvm/ADT/SCCIterator.h"
+#include "llvm/IR/Use.h"

 using namespace mlir;
 using namespace triton;
@@ -119,16 +120,14 @@ bool WarpSchedule::trySchedule(Partition *partition, Operation *op) {

 FailureOr<WarpSchedule> WarpSchedule::deserialize(scf::ForOp loop) {
   auto stages = loop->getAttrOfType<ArrayAttr>(kPartitionStagesAttrName);
-  if (!stages) {
-    return mlir::emitWarning(loop.getLoc(), "missing '")
-           << kPartitionStagesAttrName << "' attribute";
-  }
+  if (!stages)
+    return failure();

   WarpSchedule result;
   for (auto [idx, attr] : llvm::enumerate(stages)) {
     auto stage = dyn_cast<IntegerAttr>(attr);
     if (!stage || stage.getInt() < 0) {
-      return mlir::emitWarning(loop.getLoc(), "partition stages attribute '")
+      return mlir::emitError(loop.getLoc(), "partition stages attribute '")
              << kPartitionStagesAttrName << "' has invalid element " << attr;
     }

@@ -140,10 +139,8 @@ FailureOr<WarpSchedule> WarpSchedule::deserialize(scf::ForOp loop) {
     Partition *partition = result.getRootPartition();
     if (auto attr = op.getAttrOfType<IntegerAttr>(kPartitionAttrName)) {
       int64_t idx = attr.getInt();
-      if (idx < 0 || idx >= result.partitions.size()) {
-        return mlir::emitWarning(op.getLoc(), "invalid partition index ")
-               << idx;
-      }
+      if (idx < 0 || idx >= result.partitions.size())
+        return mlir::emitError(op.getLoc(), "invalid partition index ") << idx;
       partition = result.partitions[idx].get();
     }
     result.insert(partition, &op);

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionScheduling.cpp

Lines changed: 39 additions & 1 deletion
@@ -149,10 +149,14 @@ static void scheduleUsers(scf::ForOp loop, WarpSchedule &schedule,
 // first-order partition assignment to the operations in the scheme and its
 // users and/or dependencies. This sets up the initial partitioning of the ops.
 static std::optional<WarpSchedule> getInitialSchedule(scf::ForOp loop) {
-  WarpSchedule schedule;
+  // Check for an existing schedule.
+  if (FailureOr<WarpSchedule> scheduleOr = WarpSchedule::deserialize(loop);
+      succeeded(scheduleOr))
+    return {std::move(*scheduleOr)};

   // Start by creating the default partition, a partition for for all loads, and
   // a partition for all MMAs.
+  WarpSchedule schedule;
   Partition *defaultPartition = schedule.addPartition(0);
   Partition *mmaPartition = schedule.addPartition(1);
   Partition *loadPartition = schedule.addPartition(0);
@@ -479,6 +483,39 @@ void propagatePartitions(scf::ForOp loop, WarpSchedule &schedule) {
   }
 }

+// Rematerialize chains of broadcasts where the user is in a different partition
+// than the broadcast to reduce the amount of data that needs to be transferred.
+void rematerializeBroadcasts(WarpSchedule &schedule, OpOperand *use) {
+  static_assert(
+      std::is_base_of_v<OpTrait::OneResult<BroadcastOp>, BroadcastOp> &&
+      std::is_base_of_v<OpTrait::OneResult<ExpandDimsOp>, ExpandDimsOp>);
+
+  Operation *defOp = use->get().getDefiningOp();
+  while (isa_and_nonnull<BroadcastOp, ExpandDimsOp>(defOp)) {
+    Operation *clone = OpBuilder(defOp).clone(*defOp);
+    Partition *userPartition = schedule.getPartition(use->getOwner());
+    assert(userPartition && "user not scheduled");
+    schedule.insert(userPartition, clone);
+    use->set(clone->getResult(0));
+
+    defOp = clone->getOperand(0).getDefiningOp();
+    use = &clone->getOpOperand(0);
+  }
+}
+
+void optimizeSchedule(scf::ForOp loop, WarpSchedule &schedule) {
+  for (Partition &partition : schedule.getPartitions()) {
+    SmallVector<OpOperand *> uses;
+    schedule.iterateOutputs(loop, &partition,
+                            [&](Operation *defOp, OpOperand &use) {
+                              if (!isa<scf::YieldOp>(use.getOwner()))
+                                uses.push_back(&use);
+                            });
+    for (OpOperand *use : uses)
+      rematerializeBroadcasts(schedule, use);
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // Pass Definition
 //===----------------------------------------------------------------------===//
@@ -507,6 +544,7 @@ void PartitionScheduling::runOnOperation() {
   for (scf::ForOp loop : loops) {
     if (std::optional<WarpSchedule> schedule = getInitialSchedule(loop)) {
       propagatePartitions(loop, *schedule);
+      optimizeSchedule(loop, *schedule);
       schedule->serialize(loop);
     }
   }

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/RewritePartitionDependencies.cpp

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ struct UseInfo {
 int UseInfo::getMaxUseDistance(const Partition &partition) {
   int maxDistance = 0;
   for (auto [usePartition, distance] : llvm::make_first_range(consumers)) {
-    int dist = 2 + distance;
+    int dist = 1 + distance;
     maxDistance = std::max(maxDistance, dist);
   }
   return maxDistance;

python/test/unit/language/test_core.py

Lines changed: 30 additions & 12 deletions
@@ -7276,11 +7276,13 @@ def mul_add(data):
 # -----------------------


-@pytest.mark.parametrize("arch", ["sm70", "sm80", "sm90"])
+@pytest.mark.parametrize("arch", ["sm70", "sm80", "sm90", "gfx942", "gfx950", "gfx1200"])
 @pytest.mark.parametrize("env_var_override", [False, True])
 def test_override_arch(arch, env_var_override, device):
-    if not is_cuda():
-        pytest.xfail('arch only for CUDA')
+    if arch.startswith("sm") and not is_cuda():
+        pytest.xfail(f"{arch} arch only for CUDA")
+    elif arch.startswith("gfx") and not is_hip():
+        pytest.xfail(f"{arch} arch only for HIP")

     @triton.jit
     def simple(data, out):
@@ -7291,15 +7293,31 @@ def simple(data, out):
     data = torch.randn((128, ), device=device, dtype=torch.float32)
     out = torch.empty_like(data)

-    if env_var_override:
-        os.environ["TRITON_OVERRIDE_ARCH"] = str(arch)
-        h = simple[(1, )](data, out)
-        os.environ.pop("TRITON_OVERRIDE_ARCH")
-    else:
-        h = simple[(1, )](data, out, arch=arch)
-    torch.testing.assert_close(data * 1.5 + 1.0, out)
-    ttgir_cc = re.search(r'cuda:(\d+)', h.asm["ttgir"])
-    assert ttgir_cc.group(1) == arch[2:]
+    if is_cuda():
+        if env_var_override:
+            os.environ["TRITON_OVERRIDE_ARCH"] = str(arch)
+            h = simple[(1, )](data, out)
+            os.environ.pop("TRITON_OVERRIDE_ARCH")
+        else:
+            h = simple[(1, )](data, out, arch=arch)
+        torch.testing.assert_close(data * 1.5 + 1.0, out)
+        ttgir_cc = re.search(r'cuda:(\d+)', h.asm["ttgir"])
+        assert ttgir_cc.group(1) == arch[2:]
+    elif is_hip():
+        # For HIP, the generated kernel is a binary containing the final ISA. So we cannot run
+        # them like CUDA side if the chip doesn't match. Here we just check generated ISA.
+        if env_var_override:
+            os.environ["TRITON_OVERRIDE_ARCH"] = str(arch)
+            h = simple.warmup(data, out, grid=(1, ))
+            os.environ.pop("TRITON_OVERRIDE_ARCH")
+        else:
+            h = simple.warmup(data, out, arch=arch, grid=(1, ))
+        ttgir_gfx = re.search(r'hip:(\w+)', h.asm["ttgir"])
+        ttgir_warp = re.search(r'"ttg.threads-per-warp" = (\d+)', h.asm["ttgir"])
+        amdgcn_gfx = re.search(r'.amdgcn_target "amdgcn-amd-amdhsa--(\w+)"', h.asm["amdgcn"])
+        assert ttgir_gfx.group(1) == arch
+        assert int(ttgir_warp.group(1)) == (32 if arch == "gfx1200" else 64)
+        assert amdgcn_gfx.group(1) == arch


 # -----------------------
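
The HIP branch above only compiles the kernel, so the same mechanism can be used to cross-compile and inspect IR on any machine with an AMD (HIP) build of Triton. A minimal sketch under that assumption; the kernel, tensor, and target here are illustrative, while TRITON_OVERRIDE_ARCH, warmup(), and h.asm come from the test above:

import os
import re

import torch
import triton
import triton.language as tl


@triton.jit
def scale_kernel(ptr, BLOCK: tl.constexpr):  # illustrative kernel, not from this commit
    offs = tl.arange(0, BLOCK)
    tl.store(ptr + offs, tl.load(ptr + offs) * 1.5)


# ROCm builds of PyTorch expose HIP devices under the "cuda" device string.
data = torch.randn(128, device="cuda")

# warmup() compiles without launching, so the requested arch need not match the installed GPU.
os.environ["TRITON_OVERRIDE_ARCH"] = "gfx942"
h = scale_kernel.warmup(data, BLOCK=128, grid=(1, ))
os.environ.pop("TRITON_OVERRIDE_ARCH")

# The target is recorded in the TTGIR, e.g. "hip:gfx942".
print(re.search(r'hip:(\w+)', h.asm["ttgir"]).group(1))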

python/triton_kernels/tests/test_specialize.py

Lines changed: 33 additions & 29 deletions
@@ -53,32 +53,36 @@ def cache_hook(*args, **kwargs):
         fn_name = kwargs["fn"].name
         module_name = kwargs["fn"].module

-    triton.knobs.runtime.jit_cache_hook = cache_hook
-    o = torch.empty((1, ), dtype=torch.float32, device=device)
-    k = specialized_kernel[(1, )](o, )
-    hash = k.hash
-    assert o.item() == 1.0
-    assert module_name == "tests.test_specialize"
-    assert fn_name == "cacheable_kernel"
-
-    compile_count = 0
-
-    def count_hook(*args, **kwargs):
-        nonlocal compile_count
-        compile_count += 1
-
-    triton.knobs.runtime.jit_cache_hook = count_hook
-    # clear the cache
-    specialized_kernel.device_caches.clear()
-
-    # retrieve the kernel from name and preload it.
-    fn = retrieve_fn(module_name, fn_name)
-    assert fn == specialized_kernel
-    preload = fn.preload(specialization_data)
-    assert compile_count == 1
-    assert preload.hash == hash
-
-    # verify that we hit the cache.
-    compile_count = 0
-    specialized_kernel[(1, )](o, )
-    assert compile_count == 0
+    prev_hook = triton.knobs.runtime.jit_cache_hook
+    try:
+        triton.knobs.runtime.jit_cache_hook = cache_hook
+        o = torch.empty((1, ), dtype=torch.float32, device=device)
+        k = specialized_kernel[(1, )](o, )
+        hash = k.hash
+        assert o.item() == 1.0
+        assert module_name == "tests.test_specialize"
+        assert fn_name == "cacheable_kernel"
+
+        compile_count = 0
+
+        def count_hook(*args, **kwargs):
+            nonlocal compile_count
+            compile_count += 1
+
+        triton.knobs.runtime.jit_cache_hook = count_hook
+        # clear the cache
+        specialized_kernel.device_caches.clear()
+
+        # retrieve the kernel from name and preload it.
+        fn = retrieve_fn(module_name, fn_name)
+        assert fn == specialized_kernel
+        preload = fn.preload(specialization_data)
+        assert compile_count == 1
+        assert preload.hash == hash
+
+        # verify that we hit the cache.
+        compile_count = 0
+        specialized_kernel[(1, )](o, )
+        assert compile_count == 0
+    finally:
+        triton.knobs.runtime.jit_cache_hook = prev_hook
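
The save/restore added above could equally be packaged as a context manager so any test that swaps the hook restores the previous one even on failure. A minimal sketch; the helper name is hypothetical and not part of Triton, only triton.knobs.runtime.jit_cache_hook comes from the test above:

import contextlib

import triton


@contextlib.contextmanager
def temporary_jit_cache_hook(hook):
    # Hypothetical helper: install `hook`, then put back whatever hook was
    # registered before, even if the body raises.
    prev = triton.knobs.runtime.jit_cache_hook
    triton.knobs.runtime.jit_cache_hook = hook
    try:
        yield
    finally:
        triton.knobs.runtime.jit_cache_hook = prev

Used as "with temporary_jit_cache_hook(cache_hook): ...", it gives the same guarantee as the try/finally in the diff while keeping the test body flat.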

python/triton_kernels/triton_kernels/numerics_details/mxfp.py

Lines changed: 2 additions & 2 deletions
@@ -52,7 +52,7 @@ def downcast_to_mxfp(src_tensor: torch.Tensor, out_quant_type: torch.dtype, axis
     kernel_scale = out_scale.view(-1, out_scale.shape[-1])

     BLOCK_OUT_DIM = 128
-    BLOCK_QUANT_DIM = MXFP_BLOCK_SIZE
+    BLOCK_QUANT_DIM = MXFP_BLOCK_SIZE.value
     grid_out = triton.cdiv(kernel_src_tensor.shape[0], BLOCK_OUT_DIM)
     grid_quant = triton.cdiv(kernel_src_tensor.shape[1], BLOCK_QUANT_DIM)

@@ -93,7 +93,7 @@ def upcast_from_mxfp(tensor: torch.Tensor, scale: torch.Tensor, dtype: torch.dty
     reshaped_tensor = tensor.view(-1, tensor.shape[-1])
     reshaped_scale = scale.view(-1, scale.shape[-1])
     BLOCK_OUT_DIM = 128
-    BLOCK_QUANT_DIM = MXFP_BLOCK_SIZE
+    BLOCK_QUANT_DIM = MXFP_BLOCK_SIZE.value
     blocks_out_dim = triton.cdiv(reshaped_out.shape[0], BLOCK_OUT_DIM)
     blocks_quant_dim = triton.cdiv(reshaped_out.shape[1], BLOCK_QUANT_DIM)
     _upcast_from_mxfp[(blocks_out_dim, blocks_quant_dim)](reshaped_out, *reshaped_out.stride(), reshaped_scale,
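
The .value unwrap matters because the same block-size constant is shared between device code and host-side grid math. A minimal sketch, assuming MXFP_BLOCK_SIZE is a triton.language.constexpr; the constant defined here is a stand-in, not the module's actual definition:

import triton
import triton.language as tl

# Stand-in for MXFP_BLOCK_SIZE. Inside @triton.jit kernels a constexpr can be used
# directly; host-side launch-grid arithmetic wants the plain Python int instead.
MXFP_BLOCK_SIZE = tl.constexpr(32)

BLOCK_QUANT_DIM = MXFP_BLOCK_SIZE.value           # -> 32, a plain int
grid_quant = triton.cdiv(4096, BLOCK_QUANT_DIM)   # host-side computation of the launch grid
print(type(BLOCK_QUANT_DIM), grid_quant)          # <class 'int'> 128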

test/TritonGPU/partition-scheduling.mlir

Lines changed: 25 additions & 1 deletion
@@ -28,7 +28,6 @@ tt.func public @attention_forward(
   %zero = arith.constant dense<0.0> : tensor<256x64xf32, #blocked>
   %one = arith.constant dense<1.0> : tensor<256xf32, #ttg.slice<{dim = 1, parent = #blocked}>>

-  %QK_tmem, %QK_tok = ttng.tmem_alloc : () -> (!ttg.memdesc<256x64xf32, #tmem_acc, #ttng.tensor_memory, mutable>, !ttg.async.token)

   %loop_outs:4 = scf.for %i = %c0_i32 to %n_tiles step %c64_i32 iter_args(
     %l_i = %one,
@@ -46,6 +45,7 @@
     %K_shared = ttg.local_alloc %K : (tensor<64x64xf16, #load_blocked>) -> !ttg.memdesc<64x64xf16, #shared, #smem>

     %K_trans = ttg.memdesc_trans %K_shared {order = array<i32: 1, 0>} : !ttg.memdesc<64x64xf16, #shared, #smem> -> !ttg.memdesc<64x64xf16, #shared_T, #smem>
+    %QK_tmem, %QK_tok = ttng.tmem_alloc : () -> (!ttg.memdesc<256x64xf32, #tmem_acc, #ttng.tensor_memory, mutable>, !ttg.async.token)
     %QK_mma_tok = ttng.tc_gen5_mma %Q_shared, %K_trans, %QK_tmem[%QK_tok], %false, %true : !ttg.memdesc<256x64xf16, #shared, #smem>, !ttg.memdesc<64x64xf16, #shared_T, #smem>, !ttg.memdesc<256x64xf32, #tmem_acc, #ttng.tensor_memory, mutable>

     %QK, %QK_load_tok = ttng.tmem_load %QK_tmem[%QK_mma_tok] : !ttg.memdesc<256x64xf32, #tmem_acc, #ttng.tensor_memory, mutable> -> tensor<256x64xf32, #blocked>
@@ -138,4 +138,28 @@ tt.func public @mma_operand_view(
   tt.return
 }

+// CHECK-LABEL: @optimize_broadcast
+tt.func @optimize_broadcast(%arg0: i32) {
+  %c0_i32 = arith.constant 0 : i32
+  %c1_i32 = arith.constant 1 : i32
+  // CHECK: scf.for
+  scf.for %i = %c0_i32 to %arg0 step %c1_i32 : i32 {
+    // CHECK: [[X:%.*]] = "producer"{{.*}}partition = 0
+    %x = "producer"() {ttg.partition = 0 : i32} : () -> tensor<128xf32>
+
+    // CHECK-DAG: [[X0_P0:%.*]] = tt.expand_dims [[X]] {{.*}}partition = 0
+    // CHECK-DAG: [[X0_P1:%.*]] = tt.expand_dims [[X]] {{.*}}partition = 1
+    %x0 = tt.expand_dims %x {axis = 0 : i32} : tensor<128xf32> -> tensor<1x128xf32>
+    // CHECK-DAG: [[X1_P0:%.*]] = tt.broadcast [[X0_P0]] {{.*}}partition = 0
+    // CHECK-DAG: [[X1_P1:%.*]] = tt.broadcast [[X0_P1]] {{.*}}partition = 1
+    %x1 = tt.broadcast %x0 : tensor<1x128xf32> -> tensor<128x128xf32>
+
+    // CHECK: "use"([[X1_P0]]) {{.*}}partition = 0
+    "use"(%x1) {ttg.partition = 0 : i32} : (tensor<128x128xf32>) -> ()
+    // CHECK: "use"([[X1_P1]]) {{.*}}partition = 1
+    "use"(%x1) {ttg.partition = 1 : i32} : (tensor<128x128xf32>) -> ()
+  } {tt.warp_specialize, ttg.partition.stages = [0 : i32, 1 : i32]}
+  tt.return
+}
+
 }

test/TritonGPU/rewrite-partition-dependencies.mlir

Lines changed: 6 additions & 6 deletions
@@ -10,7 +10,7 @@ module attributes {"ttg.num-warps" = 4 : i32} {
 // CHECK-LABEL: @two_consumers
 tt.func @two_consumers(%lb: i32, %ub: i32, %step: i32) {
   // CHECK: [[C0:%.*]] = arith.constant 0 : i32
-  // CHECK-NEXT: [[ABUF:%.*]] = ttg.local_alloc : () -> !ttg.memdesc<2x1xi32, {{.*}}>
+  // CHECK-NEXT: [[ABUF:%.*]] = ttg.local_alloc : () -> !ttg.memdesc<1x1xi32, {{.*}}>
   // CHECK-NEXT: [[AREF:%.*]] = nvws.aref.create [[ABUF]]
   scf.for %i = %lb to %ub step %step iter_args() -> () : i32 {
     %0 = "op_a"() {ttg.partition = 0} : () -> !ty
@@ -40,7 +40,7 @@ tt.func @two_consumers(%lb: i32, %ub: i32, %step: i32) {
 // CHECK-LABEL: @distance_one
 tt.func @distance_one(%lb: i32, %ub: i32, %step: i32) {
   // CHECK: [[C0:%.*]] = arith.constant 0 : i32
-  // CHECK: [[ABUF:%.*]] = ttg.local_alloc : () -> !ttg.memdesc<2x1xi32, {{.*}}>
+  // CHECK: [[ABUF:%.*]] = ttg.local_alloc : () -> !ttg.memdesc<1x1xi32, {{.*}}>
   // CHECK-NEXT: [[AREF:%.*]] = nvws.aref.create [[ABUF]]
   %cst = arith.constant dense<0> : !ty
   // CHECK: scf.for [[IV:%.*]] = [[LB:%.*]] to [[UB:%.*]] step [[STEP:%.*]] iter_args([[K:%.*]] = {{.*}})
@@ -63,9 +63,9 @@ tt.func @distance_one(%lb: i32, %ub: i32, %step: i32) {
 }

 tt.func @complex_case(%lb: i32, %ub: i32, %step: i32) {
-  // CHECK: [[ABUF1:%.*]] = ttg.local_alloc : () -> !ttg.memdesc<2x1xi32, {{.*}}>
+  // CHECK: [[ABUF1:%.*]] = ttg.local_alloc : () -> !ttg.memdesc<1x1xi32, {{.*}}>
   // CHECK-NEXT: [[AREF1:%.*]] = nvws.aref.create [[ABUF1]]
-  // CHECK-NEXT: [[ABUF2:%.*]] = ttg.local_alloc : () -> !ttg.memdesc<2x1xi32, {{.*}}>
+  // CHECK-NEXT: [[ABUF2:%.*]] = ttg.local_alloc : () -> !ttg.memdesc<1x1xi32, {{.*}}>
   // CHECK-NEXT: [[AREF2:%.*]] = nvws.aref.create [[ABUF2]]
   %cst = arith.constant dense<0> : !ty
   // CHECK: scf.for [[IV:%.*]] = [[LB:%.*]] to [[UB:%.*]] step [[STEP:%.*]] iter_args([[K:%.*]] = {{.*}}, [[L:%.*]] = {{.*}})
@@ -337,7 +337,7 @@ tt.func @no_def_op(%lb: i32, %ub: i32, %step: i32) {
 module attributes {"ttg.num-warps" = 4 : i32} {

 tt.func @invalid_attribute(%lb: i32, %ub: i32, %step: i32) {
-  // expected-warning @below {{partition stages attribute 'ttg.partition.stages' has invalid element "a"}}
+  // expected-error @below {{partition stages attribute 'ttg.partition.stages' has invalid element "a"}}
   scf.for %i = %lb to %ub step %step : i32 {
     scf.yield
   } {ttg.partition.stages = ["a"]}
@@ -359,7 +359,7 @@ module attributes {"ttg.num-warps" = 4 : i32} {

 tt.func @invalid_attribute(%lb: i32, %ub: i32, %step: i32) {
   scf.for %k = %lb to %ub step %step : i32 {
-    // expected-warning @below {{invalid partition index -1}}
+    // expected-error @below {{invalid partition index -1}}
     "op"() {ttg.partition = -1} : () -> ()
     scf.yield
   } {ttg.partition.stages = [2, 2]}
