
Commit dcc60a8

masahi authored and anmyachev committed
[WS] Fix for aref insert when descriptor_load and local_alloc are in different partitions (#8197)
I was wrongly assuming that the partitions for `descriptor_load` and `local_alloc` are always the same. When they differ, we need to create two arefs: one between the TMA and default partitions, and another between the default and MMA partitions. To do that, we need to treat each of the two ops as a producer operation. [The necessary logic is already there](https://github.com/triton-lang/triton/blob/e28c618576be7b4c78a2083ade058ff20040b266/third_party/nvidia/lib/Dialect/NVWS/Transforms/InsertAref.cpp#L489-L505), so we just need to recognize this case.
1 parent 0fdcc6c commit dcc60a8
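To make the case distinction concrete, here is a standalone C++ sketch of the decision described above. `Op`, the integer partition sets, and `fuseLoadAndAlloc` are simplified stand-ins invented for this example; only the same-partition check mirrors the actual fix in `isLoadAndAlloc` below.

```cpp
#include <iostream>
#include <optional>
#include <set>
#include <utility>

// Simplified stand-in for an MLIR op carrying partition assignments.
struct Op {
  const char *name;
  std::set<int> partitionIds;
};

// Mirrors the fixed isLoadAndAlloc: fuse load + alloc into a single producer
// only when both ops live in the same partition(s); otherwise report failure
// so the caller treats each op as its own producer.
std::optional<std::pair<const Op *, const Op *>>
fuseLoadAndAlloc(const Op &load, const Op &alloc) {
  if (load.partitionIds == alloc.partitionIds)
    return std::make_pair(&alloc, &load);
  return std::nullopt;
}

int main() {
  Op load{"tt.descriptor_load", {2}}; // TMA partition
  Op alloc{"ttg.local_alloc", {0}};   // default partition
  if (fuseLoadAndAlloc(load, alloc))
    std::cout << "same partition: one producer, one aref\n";
  else
    std::cout << "different partitions: two producers, two arefs\n";
}
```

Running it prints "different partitions: two producers, two arefs", which is the configuration exercised by the new test below.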

File tree

3 files changed: +51 −3 lines


lib/Dialect/TritonGPU/Transforms/WarpSpecialization/AutomaticWarpSpecialization.cpp

Lines changed: 1 addition & 2 deletions
```diff
@@ -35,8 +35,7 @@ struct AutomaticWarpSpecialization
 void AutomaticWarpSpecialization::runOnOperation() {
   OpPassManager pm;
   pm.addPass(createTritonGPUPartitionScheduling());
-  // TODO: re-enable once the regression is fixed.
-  // pm.addPass(createNVWSInsertAref());
+  pm.addPass(createNVWSInsertAref());
   pm.addPass(createTritonGPULoadMMASpecialization({numStages}));
   pm.addPass(createTritonGPURewritePartitionDependencies());
   // `int-range-optimizations` and SCCP are good at cleaning up loop arithmetic.
```

test/NVWS/insert_aref.mlir

Lines changed: 46 additions & 0 deletions
```diff
@@ -2,6 +2,7 @@

 #blocked = #ttg.blocked<{sizePerThread = [1, 128], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0]}>
+#blocked2 = #ttg.blocked<{sizePerThread = [128, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
 #linear = #ttg.linear<{register = [[0, 1], [0, 2], [32, 0], [64, 0], [0, 4]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0]], warp = [[0, 0], [0, 0]], block = []}>
 #shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
 #shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 16}>
@@ -166,4 +167,49 @@ module attributes {"ttg.num-warps" = 4 : i32, ttg.target = "cuda:100"} {
   } {tt.num_stages = 2 : i64, tt.scheduled_max_stage = 1 : i32, tt.warp_specialize, ttg.partition.stages = [0 : i32, 1 : i32, 0 : i32], ttg.warp_specialize.tag = 0 : i32}
   tt.return
 }
+
+// FUNC-LABEL: @local_alloc_default_partition
+// CHECK: @local_alloc_default_partition
+tt.func @local_alloc_default_partition(%arg0: i32, %arg1: i32, %arg2: i32, %arg3: !tt.tensordesc<tensor<128x128xf16, #shared>>, %arg4: !tt.tensordesc<tensor<128x128xf16, #shared>>) {
+  %true = arith.constant true
+  %c0_i32 = arith.constant 0 : i32
+  %c1_i32 = arith.constant 1 : i32
+  %c128_i32 = arith.constant 128 : i32
+  // CHECK: [[AREF_LHS_TRANS:%.*]] = nvws.aref.create {{.*}} : <[!ttg.memdesc<1x128x128xf16, #shared1, #smem, mutable>]>
+  // CHECK: [[AREF_RHS:%.*]] = nvws.aref.create {{.*}} : <[!ttg.memdesc<1x128x128xf16, #shared, #smem, mutable>]>
+  // CHECK: [[AREF_LHS:%.*]] = nvws.aref.create {{.*}} : <[!ttg.memdesc<1x128x128xf16, #shared, #smem, mutable>]>
+  %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked>
+  %result, %token = ttng.tmem_alloc : () -> (!ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.async.token)
+  %0 = ttng.tmem_store %cst, %result[%token], %true : tensor<128x128xf32, #blocked> -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
+
+  %1 = scf.for %arg5 = %c0_i32 to %arg0 step %c1_i32 iter_args(%arg6 = %0) -> (!ttg.async.token) : i32 {
+    %2 = arith.muli %arg5, %c128_i32 {loop.cluster = 1 : i32, loop.stage = 0 : i32} : i32
+    // CHECK: [[AREF_LHS_PUT_BUF:%.*]], {{.*}} = nvws.aref.put.enter [[AREF_LHS]] {{.*}}ttg.partition = array<i32: 2>}
+    // CHECK: nvws.descriptor_load {{.*}} 32768 [[AREF_LHS_PUT_BUF]] {{.*}}ttg.partition = array<i32: 2>}
+
+    // CHECK: [[AREF_LHS_GET_BUF:%.*]], {{.*}} = nvws.aref.get.enter [[AREF_LHS]] {{.*}}ttg.partition = array<i32: 0>}
+    // CHECK: [[TMA_RES_REG:%.*]] = ttg.local_load [[AREF_LHS_GET_BUF]] {{.*}}ttg.partition = array<i32: 0>}
+
+    // CHECK: [[AREF_LHS_TRANS_PUT_BUF:%.*]], {{.*}} = nvws.aref.put.enter [[AREF_LHS_TRANS]] {{.*}}ttg.partition = array<i32: 0>}
+    // CHECK: ttg.local_store [[TMA_RES_REG]], [[AREF_LHS_TRANS_PUT_BUF]] {{.*}}ttg.partition = array<i32: 0>}
+
+    // CHECK: [[AREF_LHS_TRANS_GET_BUF:%.*]], {{.*}} = nvws.aref.get.enter [[AREF_LHS_TRANS]] {{.*}}ttg.partition = array<i32: 1>}
+    // CHECK: [[LHS:%.*]] = ttg.memdesc_trans [[AREF_LHS_TRANS_GET_BUF]] {{.*}}ttg.partition = array<i32: 1>}
+
+    %3 = tt.descriptor_load %arg3[%arg1, %2] {loop.cluster = 1 : i32, loop.stage = 0 : i32, ttg.partition = array<i32: 2>} : !tt.tensordesc<tensor<128x128xf16, #shared>> -> tensor<128x128xf16, #blocked2>
+    %5 = ttg.local_alloc %3 {loop.cluster = 0 : i32, loop.stage = 1 : i32, ttg.partition = array<i32: 0>} : (tensor<128x128xf16, #blocked2>) -> !ttg.memdesc<128x128xf16, #shared1, #smem>
+    %lhs_trans = ttg.memdesc_trans %5 {loop.cluster = 0 : i32, loop.stage = 1 : i32, order = array<i32: 1, 0>, ttg.partition = array<i32: 1>} : !ttg.memdesc<128x128xf16, #shared1, #smem> -> !ttg.memdesc<128x128xf16, #shared, #smem>
+
+    %4 = tt.descriptor_load %arg4[%arg2, %2] {loop.cluster = 1 : i32, loop.stage = 0 : i32, ttg.partition = array<i32: 2>} : !tt.tensordesc<tensor<128x128xf16, #shared>> -> tensor<128x128xf16, #blocked1>
+    %6 = ttg.local_alloc %4 {loop.cluster = 0 : i32, loop.stage = 1 : i32, ttg.partition = array<i32: 2>} : (tensor<128x128xf16, #blocked1>) -> !ttg.memdesc<128x128xf16, #shared, #smem>
+    %7 = ttg.memdesc_trans %6 {loop.cluster = 0 : i32, loop.stage = 1 : i32, order = array<i32: 1, 0>, ttg.partition = array<i32: 1>} : !ttg.memdesc<128x128xf16, #shared, #smem> -> !ttg.memdesc<128x128xf16, #shared1, #smem>
+
+    // CHECK: ttng.tc_gen5_mma [[LHS]]
+    %8 = ttng.tc_gen5_mma %lhs_trans, %7, %result[%arg6], %true, %true {loop.cluster = 0 : i32, loop.stage = 1 : i32, ttg.partition = array<i32: 1>} : !ttg.memdesc<128x128xf16, #shared, #smem>, !ttg.memdesc<128x128xf16, #shared1, #smem>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
+    scf.yield %8 : !ttg.async.token
+  } {tt.num_stages = 2 : i32, tt.scheduled_max_stage = 1 : i32, tt.warp_specialize, ttg.partition.stages = [0 : i32, 1 : i32, 0 : i32], ttg.warp_specialize.tag = 0 : i32}
+  %result_0, %token_1 = ttng.tmem_load %result[%1] : !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> -> tensor<128x128xf32, #blocked>
+  "use"(%result_0) : (tensor<128x128xf32, #blocked>) -> ()
+  tt.return
+}
 }
```

third_party/nvidia/lib/Dialect/NVWS/Transforms/InsertAref.cpp

Lines changed: 4 additions & 1 deletion
```diff
@@ -66,7 +66,10 @@ std::optional<std::pair<AllocOp, LoadOp>> isLoadAndAlloc(Value result) {
   auto alloc = result.getDefiningOp<AllocOp>();
   if (!alloc)
     return std::nullopt;
-  if (auto load = alloc.getSrc().template getDefiningOp<LoadOp>()) {
+  if (auto load = alloc.getSrc().template getDefiningOp<LoadOp>();
+      *getPartitionIds(alloc) == *getPartitionIds(load)) {
+    // if alloc and load are in different partitions, they are treated as two
+    // different producer operations.
     return std::make_pair(alloc, load);
   }
   return std::nullopt;
```
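The patched condition uses C++17's if-with-initializer form: the declaration runs first and stays scoped to the `if`/`else`, and a separate condition then gates the branch. Below is a minimal self-contained illustration of that form; `findDefiningOp` is a made-up helper standing in for `getDefiningOp<LoadOp>()`, not anything from the pass.

```cpp
#include <iostream>
#include <optional>

// Made-up stand-in for getDefiningOp<LoadOp>(): may or may not find a value.
std::optional<int> findDefiningOp(bool found) {
  if (found)
    return 42;
  return std::nullopt;
}

int main() {
  // The initializer runs first and `load` is scoped to the if/else; the
  // separate condition gates the branch, just as the partition-ID comparison
  // does in the patched isLoadAndAlloc.
  if (auto load = findDefiningOp(true); load && *load % 2 == 0)
    std::cout << "condition holds: fuse into one producer\n";
  else
    std::cout << "condition fails: fall through\n";
}
```

When the partition comparison fails in the real code, control falls through to `return std::nullopt;`, so the load and the alloc are each treated as their own producer and get their own aref.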
