Skip to content

Commit 9e7dfc6

Browse files
authored
[Dialect] Verify local/tmem store/load/alloc reg shape and type matches mem shape and type (#7144)
1 parent b66b9ce commit 9e7dfc6

File tree

7 files changed

+41
-23
lines changed

7 files changed

+41
-23
lines changed

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,12 @@ bool isInnermostContiguous(MemDescType type, unsigned numElems);
283283
LinearLayout inferReshapeLinearLayout(ArrayRef<int64_t> srcShape,
284284
Attribute srcEnc,
285285
ArrayRef<int64_t> dstShape);
286+
287+
// Verify the types of operations that operate on memory.
288+
LogicalResult verifyMemoryOpTypes(Operation *op, ShapedType srcTy,
289+
ShapedType dstTy);
290+
// Verify a memory allocation operation.
291+
LogicalResult verifyAllocOp(Operation *op, Value src, MemDescType dstTy);
286292
} // namespace mlir::triton::gpu
287293

288294
#endif // TRITON_DIALECT_TRITONGPU_IR_DIALECT_H_

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -163,8 +163,6 @@ def TTG_LocalAllocOp : TTG_Op<"local_alloc", [DeclareOpInterfaceMethods<MemoryEf
163163
];
164164

165165
let extraClassDeclaration = [{
166-
static LogicalResult verifyAllocOp(Operation *op, Value src,
167-
MemDescType dstTy);
168166
bool isSharedMemoryAlloc() {
169167
return isa_and_nonnull<SharedMemorySpaceAttr>(getType().getMemorySpace());
170168
}
@@ -312,6 +310,7 @@ def TTG_LocalLoadOp : TTG_Op<"local_load"> {
312310
Arg<TTG_MemDescType, "", [MemRead<SharedMemory>]>:$src,
313311
Optional<TTG_AsyncToken>:$token
314312
);
313+
let results = (outs TT_Tensor:$result);
315314

316315
let builders = [
317316
OpBuilder<(ins "Type":$retType, "Value":$src),
@@ -321,8 +320,7 @@ def TTG_LocalLoadOp : TTG_Op<"local_load"> {
321320

322321
// Use qualified() otherwise "!ttg.memdesc<X>" is printed as "<X>".
323322
let assemblyFormat = [{$src (`token` $token^)? attr-dict `:` qualified(type($src)) `->` type($result)}];
324-
325-
let results = (outs TT_Tensor:$result);
323+
let hasVerifier = 1;
326324
}
327325

328326
def TTG_LocalStoreOp : TTG_Op<"local_store"> {

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -529,8 +529,22 @@ OpFoldResult LocalAllocOp::fold(FoldAdaptor adaptor) {
529529
return loadSrc;
530530
}
531531

532-
LogicalResult LocalAllocOp::verifyAllocOp(Operation *op, Value src,
533-
MemDescType dstTy) {
532+
// Verify that the register (tensor) side of a memory operation agrees with
// the memory (memdesc) side in both element type and shape.
//
// \p srcTy is the source operand's type and \p dstTy the destination's; both
// are ShapedType so tensor and memdesc types can be compared uniformly.
// Returns failure() with a diagnostic on \p op when they disagree.
LogicalResult verifyMemoryOpTypes(Operation *op, ShapedType srcTy,
                                  ShapedType dstTy) {
  if (srcTy.getElementType() != dstTy.getElementType()) {
    // Print the element type (not the whole source type) so the diagnostic
    // matches its own label and is symmetric with the destination side.
    return op->emitOpError("source element type ")
           << srcTy.getElementType() << " must match destination element type "
           << dstTy.getElementType();
  }
  if (srcTy.getShape() != dstTy.getShape()) {
    // Bracket each shape individually: "source shape [..] must match
    // destination shape [..]" (the brackets were previously misnested).
    return op->emitOpError("source shape [")
           << srcTy.getShape() << "] must match destination shape ["
           << dstTy.getShape() << "]";
  }
  return success();
}
546+
547+
LogicalResult verifyAllocOp(Operation *op, Value src, MemDescType dstTy) {
534548
if (dstTy.getShape() != dstTy.getAllocShape())
535549
return op->emitOpError("result shape and its alloc shape must match");
536550

@@ -542,12 +556,7 @@ LogicalResult LocalAllocOp::verifyAllocOp(Operation *op, Value src,
542556
return success();
543557
}
544558

545-
auto srcTy = cast<RankedTensorType>(src.getType());
546-
if (srcTy.getElementType() != dstTy.getElementType())
547-
return op->emitOpError("result element type must source element type");
548-
if (srcTy.getShape() != dstTy.getShape())
549-
return op->emitOpError("result shape must match source shape");
550-
return success();
559+
return verifyMemoryOpTypes(op, cast<RankedTensorType>(src.getType()), dstTy);
551560
}
552561

553562
LogicalResult LocalAllocOp::verify() {
@@ -561,7 +570,12 @@ LogicalResult LocalAllocOp::verify() {
561570
// A local_store must target mutable shared memory, and the stored tensor must
// match the destination memdesc in shape and element type.
LogicalResult LocalStoreOp::verify() {
  MemDescType dstTy = getDst().getType();
  if (!dstTy.getMutableMemory())
    return emitOpError("Cannot store into immutable memory");
  return verifyMemoryOpTypes(*this, getSrc().getType(), dstTy);
}
575+
576+
// LocalLoadOp
577+
LogicalResult LocalLoadOp::verify() {
578+
return verifyMemoryOpTypes(*this, getSrc().getType(), getType());
565579
}
566580

567581
// AsyncCopyGlobalToLocalOp

lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -455,7 +455,8 @@ LogicalResult TMEMStoreOp::verify() {
455455
if (!getDst().getType().getMutableMemory()) {
456456
return emitOpError("Cannot store into an immutable alloc");
457457
}
458-
return success();
458+
return triton::gpu::verifyMemoryOpTypes(*this, getSrc().getType(),
459+
getDst().getType());
459460
}
460461

461462
// -- TMEMLoadOp --
@@ -466,7 +467,7 @@ LogicalResult TMEMLoadOp::verify() {
466467
if (!isa<triton::nvidia_gpu::TensorMemoryEncodingAttr>(
467468
getSrc().getType().getEncoding()))
468469
return emitOpError("should use tensor memory encoding.");
469-
return success();
470+
return triton::gpu::verifyMemoryOpTypes(*this, getSrc().getType(), getType());
470471
}
471472

472473
// -- TMEMAllocOp --
@@ -476,8 +477,7 @@ LogicalResult TMEMAllocOp::verify() {
476477
if (!isa<TensorMemoryEncodingAttr, TensorMemoryScalesEncodingAttr>(
477478
getType().getEncoding()))
478479
return emitOpError("should use tensor memory encoding");
479-
480-
return LocalAllocOp::verifyAllocOp(*this, getSrc(), getType());
480+
return triton::gpu::verifyAllocOp(*this, getSrc(), getType());
481481
}
482482

483483
void TMEMAllocOp::getEffects(

test/Analysis/test-membar.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -605,7 +605,7 @@ tt.func @convert_layout5(%A : !tt.ptr<f16>) {
605605
// CHECK: ttg.local_load
606606
// CHECK-NEXT: gpu.barrier
607607
// CHECK: ttg.local_load
608-
%3 = ttg.local_load %0 : !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<16x16xf16, #AL>
608+
%3 = ttg.local_load %0 : !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<32x16xf16, #AL>
609609
%4 = ttg.local_load %1 : !ttg.memdesc<16x16xf16, #A_SHARED, #ttg.shared_memory, mutable> -> tensor<16x16xf16, #AL>
610610
tt.return
611611
}

test/TritonGPU/optimize-partition-warps.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,9 +148,9 @@ tt.func @tmem_min_4_warps(%tensor_desc: !ttg.memdesc<64x64xf32, #tmem, #ttng.ten
148148
}
149149
// CHECK: partition1{{.*}} num_warps(4)
150150
partition1(%desc: !ttg.memdesc<64x64xf32, #tmem, #ttng.tensor_memory, mutable>) num_warps(8) {
151-
%cst = arith.constant dense<0> : tensor<64x64xi32, #blocked2d_8>
151+
%cst = arith.constant dense<0.0> : tensor<64x64xf32, #blocked2d_8>
152152
%true = arith.constant true
153-
ttng.tmem_store %cst, %desc, %true : tensor<64x64xi32, #blocked2d_8> -> !ttg.memdesc<64x64xf32, #tmem, #ttng.tensor_memory, mutable>
153+
ttng.tmem_store %cst, %desc, %true : tensor<64x64xf32, #blocked2d_8> -> !ttg.memdesc<64x64xf32, #tmem, #ttng.tensor_memory, mutable>
154154
ttg.warp_return
155155
}
156156
// CHECK: partition2{{.*}} num_warps(4)

test/TritonNvidiaGPU/test_tensor_memory_allocation.mlir

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
1414
%cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked>
1515
%cst0 = arith.constant dense<0.000000e+00> : tensor<128x128xf16, #blocked>
1616
%cst1 = arith.constant dense<0.000000e+00> : tensor<64x64xf16, #blocked>
17-
%cst2 = arith.constant dense<0.000000e+00> : tensor<64x256xf16, #blocked>
17+
%cst2 = arith.constant dense<0.000000e+00> : tensor<64x128xf16, #blocked>
1818
%cst3 = arith.constant dense<0> : tensor<64x4xi8, #linear>
1919
%cst4 = arith.constant dense<0.000000e+00> : tensor<64x128xf16, #blocked>
2020

@@ -39,8 +39,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
3939
// CHECK: ttng.tmem_alloc %{{.+}} {tensor_memory_col_offset = 128 : i32, tensor_memory_row_offset = 0 : i32}
4040
%6 = ttng.tmem_alloc %cst : (tensor<128x128xf32, #blocked>) -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
4141

42-
ttng.tmem_store %cst2, %4, %true : tensor<64x256xf16, #blocked> -> !ttg.memdesc<64x128xf16, #tmem2, #ttng.tensor_memory, mutable>
43-
ttng.tmem_store %cst2, %5, %true : tensor<64x256xf16, #blocked> -> !ttg.memdesc<64x128xf16, #tmem2, #ttng.tensor_memory, mutable>
42+
ttng.tmem_store %cst2, %4, %true : tensor<64x128xf16, #blocked> -> !ttg.memdesc<64x128xf16, #tmem2, #ttng.tensor_memory, mutable>
43+
ttng.tmem_store %cst2, %5, %true : tensor<64x128xf16, #blocked> -> !ttg.memdesc<64x128xf16, #tmem2, #ttng.tensor_memory, mutable>
4444
ttng.tmem_store %cst, %6, %true : tensor<128x128xf32, #blocked> -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
4545

4646
%7 = ttng.tmem_alloc : () -> !ttg.memdesc<64x4xi8, #tmem_scales, #ttng.tensor_memory, mutable>

0 commit comments

Comments
 (0)