[BACKEND] Apply padding when lowering ttg.memdesc_index (#7696)

AlexAUT · web-flow · commit ea4bdaf9d662 · 2025-07-29T09:48:23.000-07:00
triton-lang/triton#7622 introduced `ttg.memdesc_index`, which applies a constant offset to the base pointer of the smem object. For padded layouts we also need to add padding based on that offset, similar to what triton-lang/triton#7404 did for the old subview operation. I also adjusted the lit test to check that padding is actually generated by the lowering of `ttg.memdesc_index`. The previous version of the test did not fail because its checks also matched the lowering of `ttg.local_load`/`ttg.local_store`.
diff --git a/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp
@@ -493,6 +493,15 @@ struct MemDescIndexOpConversion
     auto prevOffsets = smemObj.getOffsets();
     SmallVector<Value> offsetVals(prevOffsets.end() - dstTy.getRank(),
                                   prevOffsets.end());
+
+    // Apply padding based on the amount we move the base ptr
+    if (auto padEnc = dyn_cast<PaddedSharedEncodingAttr>(dstTy.getEncoding())) {
+      auto bitwidth = dstTy.getElementTypeBitWidth();
+      Value padOffset = emitPadding(loc, rewriter, padEnc, bitwidth, offset,
+                                    /*offsetInBytes=*/false);
+      offset = b.add(offset, padOffset);
+    }
+
     // Advance the pointer and keep the opOffsets as the new shape
     smemObj = SharedMemoryObject(b.gep(elemPtrTy, llvmElemTy, base, offset),
                                  llvmElemTy, offsetVals);
diff --git a/test/Conversion/amd/tritongpu_to_llvm.mlir b/test/Conversion/amd/tritongpu_to_llvm.mlir
@@ -412,34 +412,30 @@ module attributes {"ttg.target" = "hip:gfx942", "ttg.num-ctas" = 1 : i32, "ttg.n
 
 // -----
 
-// CHECK-LABEL: padded_shared_layout_subview
+// GFX950-LABEL: padded_shared_layout_subview
 #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 8], warpsPerCTA = [2, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
-#shared = #ttg.padded_shared<[128:+4, 256:+8] {order = [1, 0]}>
+#shared = #ttg.padded_shared<[128:+4] {order = [1, 0]}>
 #smem = #ttg.shared_memory
 module attributes {"ttg.target" = "hip:gfx942", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
   tt.func @padded_shared_layout_subview(%arg0: !ttg.memdesc<2x64x64xf16, #shared, #smem, mutable>) {
     %c0_i32 = arith.constant 0 : i32
     %c1_i32 = arith.constant 1 : i32
-    // Skip two constants from the stride calculation
+    // Skip three constants from the stride calculation
+    // GFX950: llvm.mlir.constant
+    // GFX950: llvm.mlir.constant
+    // GFX950: llvm.mlir.constant
 
-    // CHECK-DAG: %[[CST0:.+]] = llvm.mlir.constant(0 : i32)
-    // CHECK-DAG: %[[CST3:.+]] = llvm.mlir.constant(3 : i32)
-    // CHECK-DAG: %[[CST4:.+]] = llvm.mlir.constant(4 : i32)
-    // CHECK-DAG: %[[CST8:.+]] = llvm.mlir.constant(8 : i32)
-    // CHECK-DAG: %[[CST9:.+]] = llvm.mlir.constant(9 : i32)
+    // GFX950-DAG: %[[CST0:.+]] = llvm.mlir.constant(0 : i32)
+    // GFX950-DAG: %[[CST7:.+]] = llvm.mlir.constant(7 : i32)
+    // GFX950-DAG: %[[CST2:.+]] = llvm.mlir.constant(2 : i32)
 
-    //  CHECK: %[[SHR0:.+]] = llvm.ashr %[[ADD:.+]], %[[CST8]] : i32
-    // CHECK-NEXT: %[[SHL0:.+]] = llvm.shl %[[SHR0]], %[[CST3]] : i32
-    // CHECK-NEXT: %[[ADD0:.+]] = llvm.add %[[SHL0]], %[[CST0]] : i32
-    // CHECK-NEXT: %[[SHR1:.+]] = llvm.ashr %[[ADD]], %[[CST9]] : i32
-    // CHECK-NEXT: %[[SHL1:.+]] = llvm.shl %[[SHR1]], %[[CST4]] : i32
-    // CHECK-NEXT: %[[ADD1:.+]] = llvm.add %[[ADD0]], %[[SHL1]] : i32
-    // CHECK-NEXT: %[[ADD2:.+]] = llvm.add %[[ADD]], %[[ADD1]] : i32
-    // CHECK: llvm.getelementptr inbounds %{{.+}}[%[[ADD2]]]
+    // GFX950: %[[SHR0:.+]] = llvm.ashr %[[ADD:.+]], %[[CST7]] : i32
+    // GFX950-NEXT: %[[SHL0:.+]] = llvm.shl %[[SHR0]], %[[CST2]] : i32
+    // GFX950-NEXT: %[[ADD1:.+]] = llvm.add %[[CST0]], %[[SHL0]] : i32
+    // GFX950-NEXT: %[[ADD2:.+]] = llvm.add %[[ADD]], %[[ADD1]] : i32
+    // GFX950: llvm.getelementptr %{{.+}}[%[[ADD2]]]
 
     %1 = ttg.memdesc_index %arg0, %c1_i32 : !ttg.memdesc<2x64x64xf16, #shared, #smem, mutable> -> !ttg.memdesc<64x64xf16, #shared, #smem, mutable>
-    %2 = ttg.local_load %1 : !ttg.memdesc<64x64xf16, #shared, #smem, mutable> -> tensor<64x64xf16, #blocked>
-    ttg.local_store %2, %1 : tensor<64x64xf16, #blocked> -> !ttg.memdesc<64x64xf16, #shared, #smem, mutable>
     tt.return
   }
 }