
Commit 05b500c

[TritonGPU] Misc cleanups (#6402)
These are some diffs split from other patches that didn't quite make it.

* Adds verifiers for TMA scatter ops
* Some misc code cleanup
1 parent 1fe4aa5 commit 05b500c

File tree

6 files changed: +62 −33 lines changed

* include/triton/Dialect/Triton/IR/TritonOps.td
* include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td
* lib/Conversion/TritonGPUToLLVM/FuncOpToLLVM.cpp
* lib/Dialect/Triton/IR/Ops.cpp
* lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp
* test/Analysis/test-membar.mlir


include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 5 additions & 2 deletions
@@ -1340,9 +1340,10 @@ def TT_DescriptorGatherOp : TT_Op<"descriptor_gather", [MemoryEffects<[MemRead<G
   let hasVerifier = 1;
 
   let extraClassDeclaration = [{
-    // TMA gathers have resstrictions on the minimum size of the gather result.
+    // TMA gathers have restrictions on the minimum size of the gather result.
     // This function verifies the result type.
-    static LogicalResult verifyResultType(Operation *op, mlir::ShapedType type);
+    static LogicalResult verifyResultType(Operation *op, ShapedType resultType,
+                                          RankedTensorType indicesType);
   }];
 }
 
@@ -1360,6 +1361,8 @@ def TT_DescriptorScatterOp : TT_Op<"descriptor_scatter", [
     $desc `[` $x_offsets `,` $y_offset `]` `,` $src
     attr-dict `:` type(operands)
   }];
+
+  let hasVerifier = 1;
 }
 
 def TT_ExperimentalTensormapCreateOp: TT_Op<
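Note: `let hasVerifier = 1;` is the standard MLIR ODS switch that declares a `verify()` hook on the generated op class; the dialect's Ops.cpp must then define it, and MLIR runs it whenever the op is built or parsed. The definition this patch pairs with it (shown in the lib/Dialect/Triton/IR/Ops.cpp diff below) is:

LogicalResult DescriptorScatterOp::verify() {
  // Reuse the shared gather/scatter shape checks: the scattered source is
  // validated against the descriptor block and the x-offset indices.
  return verifyGatherScatterOp(*this, getDesc().getType().getBlockType(),
                               getSrc().getType(), getXOffsets().getType());
}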

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td

Lines changed: 2 additions & 0 deletions
@@ -366,6 +366,8 @@ def TTNG_AsyncTMAScatterOp : TTNG_Op<"async_tma_scatter", [DeclareOpInterfaceMet
     $desc_ptr `[` $x_offsets `,` $y_offset `]` $src
     attr-dict `:` type(operands)
   }];
+
+  let hasVerifier = 1;
 }
 
 def TTNG_TMAStoreWaitOp : TTNG_Op<"async_tma_store_wait"> {

lib/Conversion/TritonGPUToLLVM/FuncOpToLLVM.cpp

Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,4 @@
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h"
 #include "triton/Conversion/TritonGPUToLLVM/Utility.h"
@@ -171,7 +172,7 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {
     if (auto totalNumWarps = funcOp.getParentOp()->getAttrOfType<IntegerAttr>(
             "ttg.total-num-warps"))
       numWarps = totalNumWarps.getInt();
-    newFuncOp->setAttr("nvvm.reqntid",
+    newFuncOp->setAttr(NVVM::NVVMDialect::getReqntidAttrName(),
                        rewriter.getDenseI32ArrayAttr(32 * numWarps));
 
     rewriter.eraseOp(funcOp);
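The change above replaces the hard-coded "nvvm.reqntid" string with the NVVM dialect's accessor for that attribute name. A minimal sketch of the pattern, with a hypothetical helper name (only `getReqntidAttrName()` and `getDenseI32ArrayAttr()` are real MLIR APIs, as used in the diff):

#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/IR/Builders.h"

using namespace mlir;

// Hypothetical helper: pin the exact CTA thread count required by a kernel.
// Going through the dialect's getter keeps the attribute key in sync with
// NVVM instead of duplicating the "nvvm.reqntid" string at every call site.
static void setRequiredThreads(Operation *kernel, OpBuilder &b, int numWarps) {
  kernel->setAttr(NVVM::NVVMDialect::getReqntidAttrName(),
                  b.getDenseI32ArrayAttr(32 * numWarps)); // 32 threads per warp
}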

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 43 additions & 28 deletions
@@ -1245,69 +1245,84 @@ LogicalResult GatherOp::inferReturnTypes(
 }
 
 // -- DescriptorGatherOp
-LogicalResult DescriptorGatherOp::verifyResultType(Operation *op,
-                                                   mlir::ShapedType type) {
-  if (type.getRank() != 2)
-    return op->emitOpError("result must be a 2D tensor, but got ") << type;
+LogicalResult
+DescriptorGatherOp::verifyResultType(Operation *op, ShapedType resultType,
+                                     RankedTensorType indicesType) {
+  if (indicesType.getRank() != 1)
+    return op->emitOpError("x offsets must be a 1D tensor, but got ")
+           << indicesType;
+  if (resultType.getRank() != 2)
+    return op->emitOpError("result must be a 2D tensor, but got ")
+           << resultType;
 
   // The swizzling of TMA accesses matches that of the MMAv3 shared memory
   // layouts. However, these have minimum size requirements.
   // TODO: We can support smaller gather sizes by padding the `local_alloc` this
   // lowers to to the nearest minimum tile size.
-  if (unsigned rows = type.getShape()[0]; rows < 8) {
+  if (unsigned rows = resultType.getShape()[0]; rows < 8) {
     return op->emitOpError("gather must have at least 8 rows, but got ")
            << rows;
   }
 
-  Type dtype = type.getElementType();
+  Type dtype = resultType.getElementType();
   if (dtype.getIntOrFloatBitWidth() > 32)
     return op->emitOpError("TMA dtype cannot be greater than 32 bits");
 
   unsigned minCols = 32 / dtype.getIntOrFloatBitWidth() * 8;
-  if (unsigned cols = type.getShape()[1]; cols < minCols) {
+  if (unsigned cols = resultType.getShape()[1]; cols < minCols) {
     return op->emitOpError("gather of ")
            << dtype << " must have at least " << minCols << " columns, but got "
            << cols;
   }
 
+  if (resultType.getShape()[0] != indicesType.getShape()[0]) {
+    return op->emitOpError("result tensor must have as many rows as indices (")
+           << indicesType.getShape()[0] << "), but got " << resultType;
+  }
+
   return success();
 }
 
-LogicalResult DescriptorGatherOp::verify() {
-  RankedTensorType blockType = getDesc().getType().getBlockType();
+static LogicalResult verifyGatherScatterOp(Operation *op,
+                                           RankedTensorType blockType,
+                                           RankedTensorType resultType,
+                                           RankedTensorType indicesType) {
   // Gather from `!tt.tensordesc<tensor<1xMxdtype>>`.
-  if (blockType.getRank() != 2)
-    return emitOpError("block must be a 2D tensor, but got ") << blockType;
-  if (blockType.getShape()[0] != 1)
-    return emitOpError("block must have exactly 1 row, but got ") << blockType;
-
-  // With x offsets `tensor<Nxinttype>`.
-  RankedTensorType indicesType = getXOffsets().getType();
-  if (indicesType.getRank() != 1)
-    return emitOpError("x offsets must be a 1D tensor, but got ")
-           << indicesType;
+  if (blockType.getRank() != 2) {
+    return op->emitOpError("block must be a 2D tensor, but got ") << blockType;
+  }
+  if (blockType.getShape()[0] != 1) {
+    return op->emitOpError("block must have exactly 1 row, but got ")
+           << blockType;
+  }
 
-  // Into `tensor<NxMxdtype>`.
-  RankedTensorType resultType = getType();
-  if (failed(verifyResultType(*this, resultType)))
+  // With x offsets `tensor<Nxinttype>` into `tensor<NxMxdtype>`.
+  if (failed(DescriptorGatherOp::verifyResultType(op, resultType, indicesType)))
     return failure();
 
-  if (resultType.getShape()[0] != indicesType.getShape()[0]) {
-    return emitOpError("result tensor must have as many rows as indices (")
-           << indicesType.getShape()[0] << "), but got " << resultType;
-  }
   if (resultType.getShape()[1] != blockType.getShape()[1]) {
-    return emitOpError("result tensor number of columns must match block (")
+    return op->emitOpError("result tensor number of columns must match block (")
            << blockType.getShape()[1] << "), but got " << resultType;
   }
   if (resultType.getElementType() != blockType.getElementType()) {
-    return emitOpError("result tensor element type must match block (")
+    return op->emitOpError("result tensor element type must match block (")
           << blockType.getElementType() << "), but got " << resultType;
   }
 
   return success();
 }
 
+LogicalResult DescriptorGatherOp::verify() {
+  return verifyGatherScatterOp(*this, getDesc().getType().getBlockType(),
+                               getResult().getType(), getXOffsets().getType());
+}
+
+// -- DescriptorScatterOp --
+LogicalResult DescriptorScatterOp::verify() {
+  return verifyGatherScatterOp(*this, getDesc().getType().getBlockType(),
+                               getSrc().getType(), getXOffsets().getType());
+}
+
 // -- DescriptorLoadOp --
 static LogicalResult verifyDescriptorLoadStoreType(Operation *op,
                                                    TensorDescType desc,
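To make the size restrictions in `verifyResultType` concrete, here is a small standalone sketch of the column rule (the `32 / bitwidth * 8` constant comes straight from the diff; the function name is illustrative):

#include <cstdio>

// Minimum gather width: 32 / bitwidth * 8 elements, which works out to each
// row spanning at least 32 bytes, matching the MMAv3 swizzle tile minimum.
static unsigned minGatherCols(unsigned elemBitWidth) {
  return 32 / elemBitWidth * 8;
}

int main() {
  // f32 -> 8 columns, f16 -> 16 columns, i8 -> 32 columns; rows must be >= 8.
  printf("f32: %u, f16: %u, i8: %u\n",
         minGatherCols(32), minGatherCols(16), minGatherCols(8));
  return 0;
}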

lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp

Lines changed: 7 additions & 1 deletion
@@ -244,7 +244,8 @@ LogicalResult AsyncTMAGatherOp::verify() {
   triton::gpu::MemDescType resultType = getResult().getType();
   if (!resultType.getMutableMemory())
     return emitOpError("cannot store into immutable memory");
-  return DescriptorGatherOp::verifyResultType(*this, resultType);
+  return DescriptorGatherOp::verifyResultType(*this, resultType,
+                                              getXOffsets().getType());
 }
 
 void AsyncTMAGatherOp::getEffects(
@@ -259,6 +260,11 @@ void AsyncTMAGatherOp::getEffects(
 }
 
 // -- AsyncTMAScatter --
+LogicalResult AsyncTMAScatterOp::verify() {
+  return DescriptorGatherOp::verifyResultType(*this, getSrc().getType(),
+                                              getXOffsets().getType());
+}
+
 void AsyncTMAScatterOp::getEffects(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects) {

test/Analysis/test-membar.mlir

Lines changed: 3 additions & 1 deletion
@@ -787,12 +787,14 @@ tt.func @tma_special_cases(%arg1: !tt.ptr<i8, 0>) -> (tensor<256x64xf16, #blocke
   ttng.async_tma_copy_global_to_local %arg1[%c0, %c0] %alloc, %barrier, %true : !tt.ptr<i8, 0>, !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> -> !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable>
   ttng.wait_barrier %barrier, %c0 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
 
+  // CHECK-NEXT: memdesc_subview
   // CHECK-NEXT: ttng.barrier_expect
   // CHECK-NEXT: ttng.async_tma_gather
   // CHECK-NEXT: gpu.barrier
   // CHECK-NEXT: ttng.wait_barrier
+  %view = ttg.memdesc_subview %alloc[%c0, %c0] : !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable> -> !ttg.memdesc<32x64xf16, #shared, #ttg.shared_memory, mutable>
   ttng.barrier_expect %barrier, 49152, %true : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
-  ttng.async_tma_gather %arg1[%cx, %c0] %alloc, %barrier, %true : !tt.ptr<i8, 0>, tensor<32xi32>, i32, !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>, !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable>, i1
+  ttng.async_tma_gather %arg1[%cx, %c0] %view, %barrier, %true : !tt.ptr<i8, 0>, tensor<32xi32>, i32, !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>, !ttg.memdesc<32x64xf16, #shared, #ttg.shared_memory, mutable>, i1
   ttng.wait_barrier %barrier, %c0 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>
 
   // CHECK-NEXT: gpu.barrier
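The test change follows from the new row check: this gather supplies 32 x-offsets (`tensor<32xi32>`), so its destination must have exactly 32 rows, hence the `memdesc_subview` carved out of the 256x64 allocation. A trivial standalone sketch of that constraint (function name illustrative):

#include <cassert>

// Row rule from the verifier: the gather destination needs one row per index.
static bool rowsMatchIndices(long resultRows, long numIndices) {
  return resultRows == numIndices;
}

int main() {
  assert(!rowsMatchIndices(256, 32)); // old form: whole 256x64 alloc -> rejected
  assert(rowsMatchIndices(32, 32));   // new form: 32x64 subview -> accepted
  return 0;
}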
