Commit 84a7073

[NVWS] Support tmem_alloc(desc_load()) pattern in aref insertion (#7734)
A follow-up to triton-lang/triton#7581, handling one remaining case supported by the TMA code in `LoadMMASpecialization`. Consider [these lines of code](https://github.com/triton-lang/triton/blob/main/lib/Dialect/TritonGPU/Transforms/WarpSpecialization/LoadMMASpecialization.cpp#L71-L73) in `LoadMMASpecialization` and [the corresponding lit test](https://github.com/triton-lang/triton/blob/main/test/TritonGPU/load-mma-specialization.mlir#L763-L816).

Main WS supports the load pattern `tmem_alloc(desc_load())`; I assume this pattern is used only for loading scales. Importantly, main WS lowers `tmem_alloc(desc_load())` in exactly the same way as `local_alloc(desc_load())`, **meaning the tmem scale operand is replaced by smem scales** after WS. This happens to work because `tc_gen5_mma_scaled` accepts both tmem and smem scale operands. However, there is a contract for scales in SMEM: their layout must be compatible with `tcgen05.cp`, because `MMALowering` generates `tcgen05.cp` on scales that live in SMEM. The use of `tcgen05.cp` in turn ensures that pipelining scaled MMAv5 is safe without double-buffering scales in TMEM, so in Triton, having scales in SMEM also implies that MMA pipelining is applicable. See triton-lang/triton#6019 for more details on SMEM scales.

Scales whose layout is compatible with `tcgen05.cp` are placed into SMEM during `OptimizeDotOperand`. So if WS sees tmem scales, it implies that `tcgen05.cp` cannot be used and MMA cannot be pipelined, and blindly replacing tmem scales with smem scales is incorrect.

This PR handles the case correctly by adding a `local_load` on the TMA buffer, whose result is then consumed by the tmem alloc. Since the scales are put into TMEM by a tmem store in this case, MMA cannot be pipelined. This is probably the first example where the load is async but the MMA is sync (not specialized): if we detect this pattern, the MMA op must have been put into the default partition. If we really want to support this pattern end to end, we would need to update the partition scheduling and the `pipelineMMA` code as well (probably not worth it).
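
For orientation, here is a rough before/after sketch of the scales path described above, distilled from the lit test added in this commit. It is schematic pseudo-IR rather than valid MLIR: types, tokens, and most operands are elided, and value names such as `%aref`, `%scales_desc`, and `%scales_r` are illustrative only.

    // Before aref insertion: scales are TMA-loaded and fed directly into a TMEM alloc.
    %scales    = tt.descriptor_load %scales_desc[...]
    %scales_tm = ttng.tmem_alloc %scales                      // #tmem_scales encoding
    %acc       = ttng.tc_gen5_mma_scaled ..., %scales_tm, ...

    // After aref insertion: the TMA load lands in the aref's SMEM buffer, the consumer
    // reads it back into registers with local_load, and the tmem_alloc (a tmem store
    // under the hood) keeps a register operand, so the MMA still sees TMEM scales.
    %buf       = nvws.aref.put.enter %aref ...
                 nvws.descriptor_load %scales_desc[...], %buf ...
                 nvws.aref.put.exit %aref ...
    %buf2      = nvws.aref.get.enter %aref ...
    %scales_r  = ttg.local_load %buf2 ...
                 nvws.aref.get.exit %aref ...
    %scales_tm = ttng.tmem_alloc %scales_r
    %acc       = ttng.tc_gen5_mma_scaled ..., %scales_tm, ...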
1 parent: 39f3c20

2 files changed: 129 additions & 53 deletions

test/NVWS/insert_aref.mlir

Lines changed: 41 additions & 0 deletions
@@ -2,10 +2,15 @@
 
 #blocked = #ttg.blocked<{sizePerThread = [1, 128], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0]}>
+#linear = #ttg.linear<{register = [[0, 1], [0, 2], [32, 0], [64, 0], [0, 4]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0]], warp = [[0, 0], [0, 0]], block = []}>
 #shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
 #shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 16}>
+#shared2 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
+#shared3 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 8}>
+#shared4 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 8}>
 #smem = #ttg.shared_memory
 #tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true>
+#tmem_scales = #ttng.tensor_memory_scales_encoding<>
 
 module attributes {"ttg.num-warps" = 4 : i32, ttg.target = "cuda:100"} {
 // FUNC-LABEL: @warp_specialize_tma_matmul
@@ -129,4 +134,40 @@ module attributes {"ttg.num-warps" = 4 : i32, ttg.target = "cuda:100"} {
 } {tt.num_stages = 2 : i32, tt.scheduled_max_stage = 1 : i32, tt.warp_specialize, ttg.partition.stages = [0 : i32, 1 : i32, 0 : i32]}
 tt.return
 }
+
+// CHECK-LABEL: @matmul_scaled_rhs_scales_tma
+tt.func @matmul_scaled_rhs_scales_tma(%arg0: i32, %arg1: i32, %arg2: i32, %arg3: !tt.tensordesc<tensor<128x64xf8E4M3FN, #shared3>>, %arg4: !tt.tensordesc<tensor<128x64xf8E4M3FN, #shared3>>, %arg5: !tt.tensordesc<tensor<128x8xi8, #shared2>>) {
+%true = arith.constant true
+%c0_i32 = arith.constant 0 : i32
+%c1_i32 = arith.constant 1 : i32
+%c64_i32 = arith.constant 64 : i32
+%cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked>
+%cst_0 = arith.constant dense<127> : tensor<128x8xi8, #linear>
+%result = ttng.tmem_alloc %cst_0 : (tensor<128x8xi8, #linear>) -> !ttg.memdesc<128x8xi8, #tmem_scales, #ttng.tensor_memory>
+%0 = scf.for %arg6 = %c0_i32 to %arg0 step %c1_i32 iter_args(%arg7 = %cst) -> (tensor<128x128xf32, #blocked>) : i32 {
+%1 = arith.muli %arg6, %c64_i32 {loop.cluster = 1 : i32, loop.stage = 0 : i32} : i32
+%2 = tt.descriptor_load %arg3[%arg1, %1] {loop.cluster = 1 : i32, loop.stage = 0 : i32, ttg.partition = 2 : i32} : !tt.tensordesc<tensor<128x64xf8E4M3FN, #shared3>> -> tensor<128x64xf8E4M3FN, #blocked1>
+%3 = tt.descriptor_load %arg4[%arg2, %1] {loop.cluster = 1 : i32, loop.stage = 0 : i32, ttg.partition = 2 : i32} : !tt.tensordesc<tensor<128x64xf8E4M3FN, #shared3>> -> tensor<128x64xf8E4M3FN, #blocked1>
+%5 = ttg.local_alloc %2 {loop.cluster = 0 : i32, loop.stage = 1 : i32, ttg.partition = 2 : i32} : (tensor<128x64xf8E4M3FN, #blocked1>) -> !ttg.memdesc<128x64xf8E4M3FN, #shared3, #smem>
+%6 = ttg.local_alloc %3 {loop.cluster = 0 : i32, loop.stage = 1 : i32, ttg.partition = 2 : i32} : (tensor<128x64xf8E4M3FN, #blocked1>) -> !ttg.memdesc<128x64xf8E4M3FN, #shared3, #smem>
+
+// CHECK: nvws.aref.put.enter
+// CHECK: nvws.descriptor_load
+// CHECK: nvws.aref.put.exit
+%4 = tt.descriptor_load %arg5[%arg1, %c0_i32] {loop.cluster = 1 : i32, loop.stage = 0 : i32, ttg.partition = 2 : i32} : !tt.tensordesc<tensor<128x8xi8, #shared2>> -> tensor<128x8xi8, #linear>
+
+// CHECK: nvws.aref.get.enter
+// CHECK: [[REG:%.*]] = ttg.local_load
+// CHECK: nvws.aref.get.exit
+// CHECK: tmem_alloc [[REG]]
+%result_1 = ttng.tmem_alloc %4 {loop.cluster = 0 : i32, loop.stage = 1 : i32, ttg.partition = 2 : i32} : (tensor<128x8xi8, #linear>) -> !ttg.memdesc<128x8xi8, #tmem_scales, #ttng.tensor_memory>
+
+%7 = ttg.memdesc_trans %6 {loop.cluster = 0 : i32, loop.stage = 1 : i32, order = array<i32: 1, 0>, ttg.partition = 1 : i32} : !ttg.memdesc<128x64xf8E4M3FN, #shared3, #smem> -> !ttg.memdesc<64x128xf8E4M3FN, #shared4, #smem>
+%result_2, %token = ttng.tmem_alloc %arg7 {loop.cluster = 0 : i32, loop.stage = 1 : i32, ttg.partition = 0 : i32} : (tensor<128x128xf32, #blocked>) -> (!ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.async.token)
+%8 = ttng.tc_gen5_mma_scaled %5, %7, %result_2[%token], %result, %result_1, %true, %true lhs = e4m3 rhs = e4m3 {loop.cluster = 0 : i32, loop.stage = 1 : i32, tt.self_latency = 1 : i32, ttg.partition = 1 : i32} : !ttg.memdesc<128x64xf8E4M3FN, #shared3, #smem>, !ttg.memdesc<64x128xf8E4M3FN, #shared4, #smem>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.memdesc<128x8xi8, #tmem_scales, #ttng.tensor_memory>, !ttg.memdesc<128x8xi8, #tmem_scales, #ttng.tensor_memory>
+%result_3, %token_4 = ttng.tmem_load %result_2[%8] {loop.cluster = 0 : i32, loop.stage = 1 : i32, ttg.partition = 0 : i32} : !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> -> tensor<128x128xf32, #blocked>
+scf.yield %result_3 : tensor<128x128xf32, #blocked>
+} {tt.num_stages = 2 : i64, tt.scheduled_max_stage = 1 : i32, tt.warp_specialize, ttg.partition.stages = [0 : i32, 1 : i32, 0 : i32]}
+tt.return
+}
 }

third_party/nvidia/lib/Dialect/NVWS/Transforms/InsertAref.cpp

Lines changed: 88 additions & 53 deletions
@@ -49,19 +49,37 @@ SmallVector<ProducedValueInfo> getProducedValues(Operation *op, Block *loopBody,
   return producedValues;
 };
 
+template <typename AllocOp, typename LoadOp>
+std::optional<std::pair<AllocOp, LoadOp>> isLoadAndAlloc(Value result) {
+  auto alloc = result.getDefiningOp<AllocOp>();
+  if (!alloc)
+    return std::nullopt;
+  if (auto load = alloc.getSrc().template getDefiningOp<LoadOp>()) {
+    return std::make_pair(alloc, load);
+  }
+  return std::nullopt;
+}
+
+// if result is defined by descriptor_load followed by alloc, return the alloc
+// and the load ops as a pair.
+template <typename AllocOp> auto isDescLoadAndAlloc(Value result) {
+  return isLoadAndAlloc<AllocOp, triton::DescriptorOpInterface>(result);
+}
+
+template <typename AllocOp> auto isGlobalLoadAndAlloc(Value result) {
+  return isLoadAndAlloc<AllocOp, triton::LoadOp>(result);
+}
+
 ArefCreateOp createAref(OpBuilder &builder, ProducedValueInfo &producedValue) {
   auto result = producedValue.result;
-  MemDescType arefBufType;
 
-  if (auto memDescType = dyn_cast<MemDescType>(result.getType())) {
-    arefBufType = getMultiBufferedType(memDescType, 1);
-  } else if (auto tensorType = dyn_cast<RankedTensorType>(result.getType())) {
-    // if result is a value, create memdesctype for location where value will
-    // be stored
+  auto getSmemDescType = [](Value tensorResult) {
+    auto tensorType = cast<RankedTensorType>(tensorResult.getType());
     MemDescType memDescType;
     Attribute SharedMemorySpace =
         SharedMemorySpaceAttr::get(tensorType.getContext());
-    if (auto load = result.getDefiningOp<triton::DescriptorOpInterface>()) {
+    if (auto load =
+            tensorResult.getDefiningOp<triton::DescriptorOpInterface>()) {
      // A use of TMA which is not immediately consumed by LocalAlloc
      // This case applies, for example, when TMA is followed by SIMT ops
      // or MMAv2 is used.
@@ -73,15 +91,25 @@ ArefCreateOp createAref(OpBuilder &builder, ProducedValueInfo &producedValue) {
    } else {
      llvm_unreachable("Only TMA is expected for now.");
    }
-    arefBufType = getMultiBufferedType(memDescType, 1);
+    return memDescType;
+  };
+
+  MemDescType memDescType;
+  if (isDescLoadAndAlloc<LocalAllocOp>(result)) {
+    memDescType = dyn_cast<MemDescType>(result.getType());
+  } else if (auto opt = isDescLoadAndAlloc<TMEMAllocOp>(result)) {
+    auto descLoadResult = opt->first.getSrc();
+    memDescType = getSmemDescType(descLoadResult);
+  } else if (isa<RankedTensorType>(result.getType())) {
+    memDescType = getSmemDescType(result);
  } else {
-    std::string msg = "unsupported produced value type: " +
+    std::string msg = "createAref: unsupported produced value type: " +
                      mlir::debugString(result.getType());
    llvm::report_fatal_error(msg.c_str());
  }
 
-  assert(arefBufType &&
-         (isa<SharedMemorySpaceAttr>(arefBufType.getMemorySpace())));
+  MemDescType arefBufType = getMultiBufferedType(memDescType, 1);
+  assert(isa<SharedMemorySpaceAttr>(arefBufType.getMemorySpace()));
  auto loc = result.getLoc();
  auto alloc = triton::nvws::createAlloc(builder, loc, arefBufType, Value());
  return createArefCreateOp(builder, {arefBufType}, {alloc->getResult(0)}, loc);
@@ -127,26 +155,15 @@ void createNVWSDescriptorLoadOp(OpBuilder &builder, Operation *ttDescLoadOp,
  }
 }
 
-bool isDescLoadAndAlloc(Value result) {
-  auto alloc = result.getDefiningOp<LocalAllocOp>();
-  if (!alloc)
-    return false;
-  return alloc.getSrc().getDefiningOp<triton::DescriptorOpInterface>();
-}
-
-bool isGlobalLoadAndAlloc(Value result) {
-  auto alloc = result.getDefiningOp<LocalAllocOp>();
-  if (!alloc)
-    return false;
-  return alloc.getSrc().getDefiningOp<triton::LoadOp>();
-}
-
 StageCluster getStageClusterForProducer(Value producedValue) {
-  if (isDescLoadAndAlloc(producedValue) ||
-      isGlobalLoadAndAlloc(producedValue)) {
-    auto alloc = producedValue.getDefiningOp<LocalAllocOp>();
-    auto loadOp = alloc.getSrc().getDefiningOp();
-    return getStageCluster(loadOp);
+  if (auto opt = isDescLoadAndAlloc<LocalAllocOp>(producedValue)) {
+    return getStageCluster(opt->second);
+  } else if (auto opt = isDescLoadAndAlloc<TMEMAllocOp>(producedValue)) {
+    return getStageCluster(opt->second);
+  } else if (auto opt = isGlobalLoadAndAlloc<LocalAllocOp>(producedValue)) {
+    return getStageCluster(opt->second);
+  } else if (auto opt = isGlobalLoadAndAlloc<TMEMAllocOp>(producedValue)) {
+    return getStageCluster(opt->second);
  }
  return getStageCluster(producedValue.getDefiningOp());
 }
@@ -173,15 +190,21 @@ SmallVector<Operation *> createArefPut(PartitionBuilder &builder,
 
  auto producerKind = AsyncOp::NONE;
  SmallVector<Operation *> staleOps;
-  if (isDescLoadAndAlloc(result)) {
-    auto alloc = result.getDefiningOp<LocalAllocOp>();
-    auto descOp = alloc.getSrc().getDefiningOp();
+  if (auto opt = isDescLoadAndAlloc<LocalAllocOp>(result)) {
+    auto [alloc, descOp] = *opt;
    createNVWSDescriptorLoadOp(builder, descOp, dataBuf, producerPartition,
                               schedule, loc);
    producerKind = AsyncOp::TMALoad;
    staleOps.push_back(alloc);
    staleOps.push_back(descOp);
-  } else if (isGlobalLoadAndAlloc(result)) {
+  } else if (auto opt = isDescLoadAndAlloc<TMEMAllocOp>(result)) {
+    auto descOp = opt->second;
+    createNVWSDescriptorLoadOp(builder, descOp, dataBuf, producerPartition,
+                               schedule, loc);
+    producerKind = AsyncOp::TMALoad;
+    staleOps.push_back(descOp);
+  } else if (isGlobalLoadAndAlloc<LocalAllocOp>(result) ||
+             isGlobalLoadAndAlloc<TMEMAllocOp>(result)) {
    llvm_unreachable("cpasync not supported yet");
  } else if (auto tensorType = dyn_cast<RankedTensorType>(result.getType())) {
    if (auto descOp = result.getDefiningOp<triton::DescriptorOpInterface>()) {
@@ -197,7 +220,7 @@ SmallVector<Operation *> createArefPut(PartitionBuilder &builder,
      llvm_unreachable("Aref for values not supported yet");
    }
  } else {
-    std::string msg = "unsupported produced value type: " +
+    std::string msg = "createArefPut: unsupported produced value type: " +
                      mlir::debugString(result.getType());
    llvm::report_fatal_error(msg.c_str());
  }
@@ -327,26 +350,34 @@ void createArefGet(PartitionBuilder &builder, scf::ForOp loop,
  Value token = getEnterOp.getToken();
 
  Operation *exitInsertPointAfter = nullptr;
+
+  auto replaceUsesWithLocalLoad = [&](Value result, StageCluster stageCluster) {
+    auto localLoadOp = builder.createInto<LocalLoadOp>(
+        *consumerPartition, stageCluster, result.getType(), dataBuf);
+    result.replaceAllUsesWith(localLoadOp.getResult());
+    schedule.insert(consumerPartition, localLoadOp);
+    if (consumers.size() == 1) {
+      // If there is only one consumer and we hit this code path, the empty
+      // barrier can be released after local load.
+      exitInsertPointAfter = localLoadOp;
+    }
+  };
+
  for (auto result : results) {
-    if (auto memDescType = dyn_cast<MemDescType>(result.getType())) {
+    if (auto localAlloc = result.getDefiningOp<LocalAllocOp>()) {
+      auto memDescType = cast<MemDescType>(result.getType());
      auto callback = [&](Operation *oldOp, Operation *newOp) {
        assert(schedule.getPartition(oldOp) == consumerPartition);
        schedule.insert(consumerPartition, newOp);
      };
-      replaceUsesAndPropagateType(builder, result.getDefiningOp(), dataBuf,
-                                  callback);
-    } else if (auto tensorType = dyn_cast<RankedTensorType>(result.getType())) {
-      auto localLoadOp = builder.createInto<LocalLoadOp>(
-          *consumerPartition, stageClusterEnter, tensorType, dataBuf);
-      result.replaceAllUsesWith(localLoadOp.getResult());
-      schedule.insert(consumerPartition, localLoadOp);
-      if (consumers.size() == 1) {
-        // If there is only one consumer and we hit this code path, the empty
-        // barrier can be released after local load.
-        exitInsertPointAfter = localLoadOp;
-      }
+      replaceUsesAndPropagateType(builder, localAlloc, dataBuf, callback);
+    } else if (auto tmemAlloc = result.getDefiningOp<TMEMAllocOp>()) {
+      builder.setInsertionPoint(tmemAlloc);
+      replaceUsesWithLocalLoad(tmemAlloc.getSrc(), stageClusterEnter);
+    } else if (isa<RankedTensorType>(result.getType())) {
+      replaceUsesWithLocalLoad(result, stageClusterEnter);
    } else {
-      std::string msg = "unsupported produced value type: " +
+      std::string msg = "createArefGet: unsupported produced value type: " +
                        mlir::debugString(result.getType());
      llvm::report_fatal_error(msg.c_str());
    }
@@ -384,9 +415,12 @@ bool insertArefs(PartitionBuilder &builder, scf::ForOp loop,
 
  processResultUses(producedValue.result);
 
-  if (isDescLoadAndAlloc(producedValue.result)) {
+  if (auto opt = isDescLoadAndAlloc<LocalAllocOp>(producedValue.result)) {
    // Process the register use as well
-    auto alloc = producedValue.result.getDefiningOp<LocalAllocOp>();
+    auto alloc = opt->first;
+    processResultUses(alloc.getSrc());
+  } else if (auto opt = isDescLoadAndAlloc<TMEMAllocOp>(producedValue.result)) {
+    auto alloc = opt->first;
    processResultUses(alloc.getSrc());
  }
 
@@ -446,7 +480,8 @@ class NVWSArefInsertion
        return WalkResult::advance();
      }
      // Only handles load ops for now.
-      if (isDescLoadAndAlloc(op->getResult(0)) ||
+      if (isDescLoadAndAlloc<LocalAllocOp>(op->getResult(0)) ||
+          isDescLoadAndAlloc<TMEMAllocOp>(op->getResult(0)) ||
          (allowDescLoadRegUse &&
           (isa<triton::DescriptorOpInterface>(op)))) {
        ops.push_back(op);
@@ -459,7 +494,7 @@ class NVWSArefInsertion
        getProducedValues(op, loop.getBody(), *schedule);
    for (auto producedValue : producedValues) {
      PartitionBuilder builder(op->getLoc(), op);
-      builder.setInsertionPointAfter(op);
+      builder.setInsertionPoint(op);
      if (insertArefs(builder, loop, *schedule, producedValue, arefTag))
        arefTag++;
    }
