
Commit 54606e8

[hopper][WS] Use required layout for buffers (#7284)
When creating buffers, the required layout should be decided by the consumer. We previously used the MMA layout unconditionally, which broke the case where the consumer wasn't actually a dot op.
1 parent d78b4f9 commit 54606e8
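
In short: instead of giving every shared-memory buffer the NVMMA (mma) encoding, the pass now picks the encoding the consumer actually needs. A minimal sketch of the selection logic, condensed from the WSCodePartition.cpp hunk below — the helper name pickSharedLayout and its parameter list are illustrative only; in the commit the logic is inlined in createBuffer and relies on that file's existing getActualConsumers helper and its tt/ttg/ttng namespace aliases:

// Illustrative sketch, not the literal commit: createBuffer inlines this
// logic. Assumes the aliases used in WSCodePartition.cpp (tt = mlir::triton,
// ttg = mlir::triton::gpu, ttng = mlir::triton::nvidia_gpu) and its
// getActualConsumers helper.
static mlir::Attribute
pickSharedLayout(mlir::MLIRContext *context, mlir::Operation *srcOp,
                 mlir::Operation *dstOp, llvm::ArrayRef<int64_t> sliceShape,
                 llvm::ArrayRef<unsigned> order, ttg::CTALayoutAttr CTALayout,
                 mlir::Type elemType) {
  // A dot-op consumer still requires the NVMMA shared encoding.
  if (llvm::any_of(getActualConsumers(dstOp), [](mlir::Operation *op) {
        return isa<mlir::triton::DotOpInterface>(op);
      }))
    return ttg::NVMMASharedEncodingAttr::get(context, sliceShape, order,
                                             CTALayout, elemType,
                                             /*fp4Padded*/ false);
  // A TMA-load producer keeps the encoding its descriptor dictates.
  if (auto tmaLoad = dyn_cast<tt::DescriptorLoadOp>(srcOp))
    return ttng::getEncodingFromDescriptor(tmaLoad, tmaLoad.getType(),
                                           tmaLoad.getDesc());
  // Otherwise fall back to a plain unswizzled shared layout for now.
  return ttg::SwizzledSharedEncodingAttr::get(context, 1, 1, 1, order,
                                              CTALayout);
}

The new test pins this down: its 128xf32 scale buffer never feeds a dot op, so the CHECK lines expect a #ttg.swizzled_shared encoding for it, while the dot operands keep #ttg.nvmma_shared.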

3 files changed: +89, -8 lines

test/Hopper/WarpSpecialization/ws_code_partition.mlir

Lines changed: 45 additions & 0 deletions
@@ -260,3 +260,48 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
     tt.return
   }
 }
+
+
+// -----
+
+// CHECK-DAG: #[[$SHARED:.*]] = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
+// CHECK-DAG: #[[$SHARED1:.*]] = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = false, elementBitWidth = 8}>
+// CHECK-LABEL: @_fbgemm_grouped_gemm_fp8_rowwise_ws
+// CHECK: ttg.local_alloc : () -> !ttg.memdesc<1x64x64xf8E4M3FN, #[[$SHARED1]], #smem, mutable>
+// CHECK: ttg.local_alloc : () -> !ttg.memdesc<1x128x64xf8E4M3FN, #[[$SHARED1]], #smem, mutable>
+// CHECK: ttg.local_alloc : () -> !ttg.memdesc<1x128xf32, #[[$SHARED]], #smem, mutable>
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
+#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 128, 32]}>
+#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = false, elementBitWidth = 8}>
+#shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
+#shared2 = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = true, elementBitWidth = 8}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @_fbgemm_grouped_gemm_fp8_rowwise_ws(%arg0: !tt.ptr<i8, 0> {tt.nv_tma_desc = 1 : i32}, %arg1: i32, %arg2: !tt.ptr<i8, 0> {tt.nv_tma_desc = 1 : i32}, %arg3: !tt.ptr<i8, 0> {tt.nv_tma_desc = 1 : i32}) attributes {noinline = false} {
+    %c0_i32 = arith.constant {async_task_id = array<i32: 0, 1, 2>} 0 : i32
+    %c2048_i32 = arith.constant {async_task_id = array<i32: 0, 1, 2>} 2048 : i32
+    %c64_i32 = arith.constant {async_task_id = array<i32: 0, 1, 2>} 64 : i32
+    %cst = arith.constant {async_task_id = array<i32: 0, 1, 2>} dense<0.000000e+00> : tensor<64x128xf32, #mma>
+    %0 = tt.get_program_id x {async_task_id = array<i32: 0, 1, 2>} : i32
+    %1 = ttng.reinterpret_tensor_descriptor %arg0 {async_task_id = array<i32: 0>} : !tt.ptr<i8, 0> to !tt.tensordesc<tensor<64x64xf8E4M3FN, #shared>>
+    %2 = ttng.reinterpret_tensor_descriptor %arg2 {async_task_id = array<i32: 0>} : !tt.ptr<i8, 0> to !tt.tensordesc<tensor<128x64xf8E4M3FN, #shared>>
+    %3 = ttng.reinterpret_tensor_descriptor %arg3 {async_task_id = array<i32: 0>} : !tt.ptr<i8, 0> to !tt.tensordesc<tensor<128xf32, #shared1>>
+    scf.for %arg4 = %0 to %arg1 step %c64_i32 : i32 {
+      %4 = arith.muli %arg4, %c2048_i32 {async_task_id = array<i32: 0>} : i32
+      %5 = scf.for %arg5 = %c0_i32 to %c2048_i32 step %c64_i32 iter_args(%arg6 = %cst) -> (tensor<64x128xf32, #mma>) : i32 {
+        %8 = tt.descriptor_load %1[%4, %arg5] {async_task_id = array<i32: 0>} : !tt.tensordesc<tensor<64x64xf8E4M3FN, #shared>> -> tensor<64x64xf8E4M3FN, #blocked>
+        %9 = ttg.local_alloc %8 {async_task_id = array<i32: 1>} : (tensor<64x64xf8E4M3FN, #blocked>) -> !ttg.memdesc<64x64xf8E4M3FN, #shared, #smem>
+        %10 = tt.descriptor_load %2[%4, %arg5] {async_task_id = array<i32: 0>} : !tt.tensordesc<tensor<128x64xf8E4M3FN, #shared>> -> tensor<128x64xf8E4M3FN, #blocked>
+        %11 = ttg.local_alloc %10 {async_task_id = array<i32: 1, 2>} : (tensor<128x64xf8E4M3FN, #blocked>) -> !ttg.memdesc<128x64xf8E4M3FN, #shared, #smem>
+        %12 = ttg.memdesc_trans %11 {async_task_id = array<i32: 1, 2>, order = array<i32: 1, 0>} : !ttg.memdesc<128x64xf8E4M3FN, #shared, #smem> -> !ttg.memdesc<64x128xf8E4M3FN, #shared2, #smem>
+        %13 = ttng.warp_group_dot %9, %12, %arg6 {async_task_id = array<i32: 1>, inputPrecision = 0 : i32, maxNumImpreciseAcc = 1073741824 : i32} : !ttg.memdesc<64x64xf8E4M3FN, #shared, #smem> * !ttg.memdesc<64x128xf8E4M3FN, #shared2, #smem> -> tensor<64x128xf32, #mma>
+        scf.yield {async_task_id = array<i32: 1, 2>} %13 : tensor<64x128xf32, #mma>
+      } {async_task_id = array<i32: 0, 1, 2>}
+      %6 = tt.descriptor_load %3[%4] {async_task_id = array<i32: 0>} : !tt.tensordesc<tensor<128xf32, #shared1>> -> tensor<128xf32, #blocked1>
+      %7 = ttg.convert_layout %6 {async_task_id = array<i32: 1, 2>} : tensor<128xf32, #blocked1> -> tensor<128xf32, #ttg.slice<{dim = 0, parent = #blocked}>>
+    } {async_task_id = array<i32: 1, 2>}
+    tt.return
+  }
+}

third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/WSCodePartition.cpp

Lines changed: 38 additions & 2 deletions
@@ -19,6 +19,7 @@
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h"
 #include "triton/Dialect/TritonGPU/Transforms/TritonGPUConversion.h"
+#include "triton/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.h"
 #include <unordered_set>

 namespace tt = mlir::triton;
@@ -725,10 +726,19 @@ DenseMap<Channel *, Value> createBuffer(
   auto &channels = channelsGroupedByProducers[channelInOrder];
   auto srcValue = channelInOrder->getSrcOperand();
   auto srcOp = channelInOrder->getSrcOp();
+  auto dstOp = channelInOrder->getDstOp();
   auto *channel = channels.front();
   unsigned numBuffers = channel->numBuffers;
   Value buffer;

+  LLVM_DEBUG({
+    LDBG("Creating buffers for channel:");
+    LDBG("Producer:");
+    DBGS() << *srcOp << "\n";
+    LDBG("Consumer:");
+    DBGS() << *dstOp << "\n";
+  });
+
   // For TMEM channel, multi-buffer TMEM alloc
   if (channel->channelKind == DataChannelKind::TMEM) {
     // Move TMEM alloc to the beginning of the function.
@@ -745,8 +755,34 @@ DenseMap<Channel *, Value> createBuffer(

   // Get shape, layout and type of a slice
   auto sliceShape = tensorType.getShape();
-  auto sharedLayout = ttg::NVMMASharedEncodingAttr::get(
-      context, sliceShape, order, CTALayout, elemType, /*fp4Padded*/ false);
+  // Check the consumer type
+  auto actualConsumers = getActualConsumers(dstOp);
+  LLVM_DEBUG({
+    DBGS() << "actual consumers: \n";
+    for (auto consumerOp : actualConsumers) {
+      DBGS() << *consumerOp << "\n";
+    }
+  });
+
+  bool requireMMASharedEncoding =
+      llvm::any_of(actualConsumers, [](Operation *op) {
+        return isa<mlir::triton::DotOpInterface>(op);
+      });
+
+  Attribute sharedLayout;
+  if (requireMMASharedEncoding) {
+    sharedLayout = ttg::NVMMASharedEncodingAttr::get(
+        context, sliceShape, order, CTALayout, elemType,
+        /*fp4Padded*/ false);
+  } else if (auto tmaLoad = dyn_cast<tt::DescriptorLoadOp>(srcOp)) {
+    sharedLayout = ttng::getEncodingFromDescriptor(
+        tmaLoad, tmaLoad.getType(), tmaLoad.getDesc());
+  } else {
+    // Create an unswizzled layout for now.
+    // TODO: optimize it based on the consumer.
+    sharedLayout = ttg::SwizzledSharedEncodingAttr::get(context, 1, 1, 1,
+                                                        order, CTALayout);
+  }

   // Get shape, layout and type of the complete buffer
   SmallVector<int64_t> bufferShape(sliceShape.begin(), sliceShape.end());

third_party/nvidia/hopper/lib/Transforms/WarpSpecialization/WSLowerMem.cpp

Lines changed: 6 additions & 6 deletions
@@ -57,8 +57,8 @@ createAsyncCopy(const DenseMap<Channel *, Value> &bufferMap, Channel *c,

   // Get shape, layout and type of a slice
   auto sliceShape = tensorType.getShape();
-  auto sharedLayout = ttg::NVMMASharedEncodingAttr::get(
-      context, sliceShape, order, CTALayout, elemType, /*fp4Padded*/ false);
+  auto sharedLayout =
+      dyn_cast<triton::gpu::MemDescType>(buffer.getType()).getEncoding();
   auto sliceType = RankedTensorType::get(sliceShape, elemType, sharedLayout);

   Attribute sharedMemorySpace =
@@ -118,8 +118,8 @@ createLocalCopy(const DenseMap<Channel *, Value> &bufferMap, Channel *channel,

   // Get shape, layout and type of a slice
   auto sliceShape = tensorType.getShape();
-  auto sharedLayout = ttg::NVMMASharedEncodingAttr::get(
-      context, sliceShape, order, CTALayout, elemType, /*fp4Padded*/ false);
+  auto sharedLayout =
+      dyn_cast<triton::gpu::MemDescType>(buffer.getType()).getEncoding();
   auto sliceType = RankedTensorType::get(sliceShape, elemType, sharedLayout);

   Attribute sharedMemorySpace =
@@ -205,8 +205,8 @@ Value getBufferForPipelineStage(OpBuilderWithAsyncTaskIds &builder,

   // Get shape, layout and type of a slice
   auto sliceShape = tensorType.getShape();
-  auto sharedLayout = ttg::NVMMASharedEncodingAttr::get(
-      context, sliceShape, order, CTALayout, elemType, /*fp4Padded*/ false);
+  auto sharedLayout =
+      dyn_cast<triton::gpu::MemDescType>(buffer.getType()).getEncoding();
   auto sliceType = RankedTensorType::get(sliceShape, elemType, sharedLayout);

   Attribute sharedMemorySpace =
