Skip to content

Commit 1143c03

Browse files
authored
[BACKEND] Fix layout picked during TMA store pipelining (#6978)
We were picking a shared-memory layout inconsistent with the TMA descriptor. Instead, the layout should be decided during pipelining, reusing the encoding already chosen for the descriptor.
1 parent c79e5d6 commit 1143c03

File tree

4 files changed

+18
-18
lines changed

4 files changed

+18
-18
lines changed

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -437,7 +437,11 @@ def NVMMASharedEncodingAttr :
437437
} else {
438438
swizzlingByteWidth = 0;
439439
}
440-
if (shapePerCTA.size() < 2 || shapePerCTA[order[1]] < 8) {
440+
int flattenOutterDim = 1;
441+
for (int i = 1; i < shapePerCTA.size(); i++) {
442+
flattenOutterDim *= shapePerCTA[order[i]];
443+
}
444+
if (shapePerCTA.size() < 2 || flattenOutterDim < 8) {
441445
swizzlingByteWidth = 0;
442446
}
443447
bool transposed = order[0] == 0;

lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -220,8 +220,10 @@ getSharedMemoryScale(Value arg, mlir::PatternRewriter &rewriter, Location loc) {
220220
auto CTALayout = getCTALayout(argType.getEncoding());
221221
// No swizzling for scale for now
222222
auto newLayout = NVMMASharedEncodingAttr::get(
223-
argType.getContext(), argType.getShape(), newOrder, CTALayout,
224-
argType.getElementType(), false);
223+
argType.getContext(), /*swizzlingByteWidth=*/0,
224+
/*transposed=*/false,
225+
/*elementBitWidth=*/argType.getElementType().getIntOrFloatBitWidth(),
226+
/*fp4Padded=*/false, CTALayout);
225227
auto newType = MemDescType::get(argType.getShape(), argType.getElementType(),
226228
newLayout, SharedMemorySpace);
227229
rewriter.setInsertionPointAfterValue(arg);

lib/Dialect/TritonGPU/Transforms/Pipeliner/TMAStoresPipeline.cpp

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -33,16 +33,8 @@ static SmallVector<TMAStore> getTMAStores(scf::ForOp forOp) {
3333
static Value createAlloc(scf::ForOp &forOp, const TMAStore &store) {
3434
OpBuilder builder(forOp);
3535
RankedTensorType ty = store.src.getType();
36-
// Is this one correct or should it always be [2, 1, 0]?
37-
auto order = triton::gpu::getOrderForMemory(ty);
38-
auto ctaLayout = ttg::getCTALayout(ty.getEncoding());
39-
Attribute encoding = ttg::SwizzledSharedEncodingAttr::get(
40-
ty.getContext(), 1, 1, 1, order, ctaLayout);
41-
if (ty.getRank() > 1) {
42-
encoding = ttg::NVMMASharedEncodingAttr::get(
43-
ty.getContext(), ty.getShape(), order, ctaLayout, ty.getElementType(),
44-
/*fp4Padded*/ false);
45-
}
36+
auto encoding =
37+
triton::nvidia_gpu::getEncodingFromDescriptor(store.op, ty, store.desc);
4638
Attribute sharedMemorySpace =
4739
triton::gpu::SharedMemorySpaceAttr::get(ty.getContext());
4840
Type memdescType =
@@ -58,7 +50,6 @@ static void createTMAAsyncCopy(scf::ForOp forOp, const TMAStore &store,
5850
OpBuilder builder(store.op);
5951
Location loc = store.op->getLoc();
6052
RankedTensorType ty = store.src.getType();
61-
auto ctaLayout = ttg::getCTALayout(ty.getEncoding());
6253

6354
// Put wait before the local_store make the store truly async. We know
6455
// that we are the only user of the CopyLocalToGlobal.

test/TritonGPU/loop-pipeline-hopper.mlir

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -448,20 +448,23 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
448448

449449
// -----
450450
// Test pipelining of descriptor_store
451-
#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
452-
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
451+
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
452+
#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = false, elementBitWidth = 8}>
453453
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
454+
// CHECK: #[[$SHARED:.+]] = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = false, elementBitWidth = 8}>
454455
// CHECK-LABEL: tma_store_pipeline
455-
tt.func public @tma_store_pipeline(%arg0: tensor<1xf32, #blocked>, %arg1: !tt.tensordesc<tensor<1xf32, #shared>>, %arg2: i32, %arg3: i32) attributes {noinline = false} {
456+
tt.func public @tma_store_pipeline(%arg0: tensor<128x128xf32, #blocked>, %arg1: !tt.tensordesc<tensor<128x128xf32, #shared>>, %arg2: i32, %arg3: i32) attributes {noinline = false} {
456457
%c0_i32 = arith.constant 0 : i32
458+
// CHECK: ttg.local_alloc : () -> !ttg.memdesc<128x128xf32, #[[$SHARED]], #smem, mutable>
459+
// CHECK: scf.for
457460
scf.for %arg4 = %c0_i32 to %arg3 step %arg2 : i32 {
458461
%1 = arith.divsi %arg4, %arg2 : i32
459462
// CHECK: ttng.async_tma_store_wait {pendings = 0 : i32}
460463
// CHECK-NEXT: ttg.local_store
461464
// CHECK-NEXT: ttng.fence_async_shared
462465
// CHECK-NEXT: ttng.tensor_desc_to_tma_ptr
463466
// CHECK-NEXT: ttng.async_tma_copy_local_to_global
464-
tt.descriptor_store %arg1[%1], %arg0 : !tt.tensordesc<tensor<1xf32, #shared>>, tensor<1xf32, #blocked>
467+
tt.descriptor_store %arg1[%1, %1], %arg0 : !tt.tensordesc<tensor<128x128xf32, #shared>>, tensor<128x128xf32, #blocked>
465468
}
466469
tt.return
467470
}

0 commit comments

Comments (0)