
Commit 3ba7d1e

Merge commit '05b500cc1e0ef668167174e47dd5ac88e909c245'
2 parents: 1d6ae76 + 05b500c


42 files changed: 427 additions, 149 deletions

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 6 additions & 3 deletions
@@ -1314,7 +1314,7 @@ def TT_DescriptorStoreOp : TT_Op<"descriptor_store", [
 def TT_DescriptorGatherOp : TT_Op<"descriptor_gather", [MemoryEffects<[MemRead<GlobalMemory>]>]> {
   let summary = "gather multiple rows from a descriptor into a single tensor";
   let description = [{
-    The `tt.desciptor_gather` op will be lowered to NVIDIA TMA
+    The `tt.descriptor_gather` op will be lowered to NVIDIA TMA
     load operations on targets that support it.

     `desc_ptr` is a pointer to the TMA descriptor allocated in global memory.
@@ -1340,9 +1340,10 @@ def TT_DescriptorGatherOp : TT_Op<"descriptor_gather", [MemoryEffects<[MemRead<G
   let hasVerifier = 1;

   let extraClassDeclaration = [{
-    // TMA gathers have resstrictions on the minimum size of the gather result.
+    // TMA gathers have restrictions on the minimum size of the gather result.
     // This function verifies the result type.
-    static LogicalResult verifyResultType(Operation *op, mlir::ShapedType type);
+    static LogicalResult verifyResultType(Operation *op, ShapedType resultType,
+                                          RankedTensorType indicesType);
   }];
 }

@@ -1360,6 +1361,8 @@ def TT_DescriptorScatterOp : TT_Op<"descriptor_scatter", [
     $desc `[` $x_offsets `,` $y_offset `]` `,` $src
     attr-dict `:` type(operands)
   }];
+
+  let hasVerifier = 1;
 }

 def TT_ExperimentalTensormapCreateOp: TT_Op<

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ std::pair<OpResult, int64_t> getDefinitionAndDistance(scf::ForOp forOp,
 std::pair<Operation *, int64_t> getDefiningOpAndDistance(scf::ForOp forOp,
                                                          Value value);

-// Return maxumum length of the vectorized copy between registers and shared
+// Return maximum length of the vectorized copy between registers and shared
 // memory for the given tensor type and shared encoding.
 int getCopyVecBytes(RankedTensorType registerTy,
                     gpu::SharedEncodingTrait sharedEnc);

include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 5 additions & 0 deletions
@@ -237,6 +237,11 @@ SetVector<Value> getNestedOperands(Operation *op);
 // Erase the given loop carried values from the loop, where `loop` is replaced
 // with a new loop.
 void eraseLoopCarriedValues(scf::ForOp &loop, llvm::BitVector indices);
+
+// Return true if two value sets may refer to the same allocation.
+bool mayAliasAllocations(const DenseSet<Value> &lhs,
+                         const DenseSet<Value> &rhs);
+
 } // namespace mlir

 #endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_
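
Note: this header hunk adds only the declaration of `mayAliasAllocations`; the commit's actual implementation lives elsewhere in the changeset and is not shown here. A conservative stand-in with the same signature might look like the following sketch (hypothetical, not the commit's implementation), which flags a possible alias whenever the two sets share a value:

#include "mlir/IR/Value.h"
#include "llvm/ADT/DenseSet.h"

// Hypothetical conservative sketch: two allocation sets may alias if they
// share any value. A real analysis would also look through view-like ops
// before comparing the underlying allocations.
static bool mayAliasAllocationsSketch(const llvm::DenseSet<mlir::Value> &lhs,
                                      const llvm::DenseSet<mlir::Value> &rhs) {
  for (mlir::Value v : lhs)
    if (rhs.contains(v))
      return true;
  return false;
}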

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td

Lines changed: 2 additions & 0 deletions
@@ -366,6 +366,8 @@ def TTNG_AsyncTMAScatterOp : TTNG_Op<"async_tma_scatter", [DeclareOpInterfaceMet
     $desc_ptr `[` $x_offsets `,` $y_offset `]` $src
     attr-dict `:` type(operands)
   }];
+
+  let hasVerifier = 1;
 }

 def TTNG_TMAStoreWaitOp : TTNG_Op<"async_tma_store_wait"> {

lib/Conversion/TritonGPUToLLVM/FuncOpToLLVM.cpp

Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,4 @@
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h"
 #include "triton/Conversion/TritonGPUToLLVM/Utility.h"
@@ -171,7 +172,7 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {
     if (auto totalNumWarps = funcOp.getParentOp()->getAttrOfType<IntegerAttr>(
             "ttg.total-num-warps"))
       numWarps = totalNumWarps.getInt();
-    newFuncOp->setAttr("nvvm.reqntid",
+    newFuncOp->setAttr(NVVM::NVVMDialect::getReqntidAttrName(),
                        rewriter.getDenseI32ArrayAttr(32 * numWarps));

     rewriter.eraseOp(funcOp);
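
Note: the second hunk swaps the hard-coded "nvvm.reqntid" string for the NVVM dialect's attribute-name getter, so the name stays in sync with the dialect definition. A minimal sketch of the resulting pattern (the helper function and its name are illustrative, not from the commit):

#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/IR/Builders.h"

// Pin the exact launch thread count on a lowered function: one Triton warp
// is 32 threads on NVIDIA targets, so `reqntid` is 32 * numWarps.
static void setReqntid(mlir::Operation *fn, mlir::OpBuilder &b, int numWarps) {
  fn->setAttr(mlir::NVVM::NVVMDialect::getReqntidAttrName(),
              b.getDenseI32ArrayAttr({32 * numWarps}));
}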

lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp

Lines changed: 1 addition & 1 deletion
@@ -358,7 +358,7 @@ struct ReduceOpConversion
          resultIdx < resultDim; ++resultIdx) {
       auto smemIdx = resultIdx < op.getAxis() ? resultIdx : resultIdx + 1;
       if (resultShape[resultIdx] > smemShape[smemIdx]) {
-        // When srcShape smaller then src sizePerThread, only srcShape
+        // When srcShape smaller than src sizePerThread, only srcShape
         // elements is accumulated in smem. Modulo smemShape effectively
         // replicates srcShape elements to src sizePerThread.
         readIdx[smemIdx] =
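
Note: the reworded comment describes a wrap-around read: when the source shape is smaller than sizePerThread, shared memory holds only srcShape accumulated elements, and reads beyond that extent wrap via a modulo. A tiny standalone illustration of that index arithmetic (a sketch, not the lowering code itself):

#include <cstdint>

// Reads past the number of distinct accumulated elements wrap around,
// replicating the srcShape elements across the full sizePerThread range.
// Mirrors the modulo applied to readIdx[smemIdx] above.
static int64_t wrapSmemReadIndex(int64_t resultIdx, int64_t smemDim) {
  return resultIdx % smemDim;
}

For example, with smemDim = 4, result indices 0..7 read smem slots 0, 1, 2, 3, 0, 1, 2, 3.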

lib/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.cpp

Lines changed: 1 addition & 1 deletion
@@ -628,7 +628,7 @@ void populateTritonPatterns(TritonGPUTypeConverter &typeConverter,
 }
 // Proton patterns
 // NOTE: Because Proton's inputs are scalars and not tensors this conversion
-// isn't strictly nessessary however you could envision a case where we pass in
+// isn't strictly necessary however you could envision a case where we pass in
 // tensors in for Triton object specific tracing operations in which case we
 // would need to fill in the OpConversionPattern
 void populateProtonPatterns(TritonGPUTypeConverter &typeConverter,

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 50 additions & 35 deletions
@@ -1245,73 +1245,88 @@ LogicalResult GatherOp::inferReturnTypes(
 }

 // -- DescriptorGatherOp
-LogicalResult DescriptorGatherOp::verifyResultType(Operation *op,
-                                                   mlir::ShapedType type) {
-  if (type.getRank() != 2)
-    return op->emitOpError("result must be a 2D tensor, but got ") << type;
+LogicalResult
+DescriptorGatherOp::verifyResultType(Operation *op, ShapedType resultType,
+                                     RankedTensorType indicesType) {
+  if (indicesType.getRank() != 1)
+    return op->emitOpError("x offsets must be a 1D tensor, but got ")
+           << indicesType;
+  if (resultType.getRank() != 2)
+    return op->emitOpError("result must be a 2D tensor, but got ")
+           << resultType;

   // The swizzling of TMA accesses matches that of the MMAv3 shared memory
   // layouts. However, these have minimum size requirements.
   // TODO: We can support smaller gather sizes by padding the `local_alloc` this
   // lowers to to the nearest minimum tile size.
-  if (unsigned rows = type.getShape()[0]; rows < 8) {
+  if (unsigned rows = resultType.getShape()[0]; rows < 8) {
     return op->emitOpError("gather must have at least 8 rows, but got ")
            << rows;
   }

-  Type dtype = type.getElementType();
+  Type dtype = resultType.getElementType();
   if (dtype.getIntOrFloatBitWidth() > 32)
     return op->emitOpError("TMA dtype cannot be greater than 32 bits");

   unsigned minCols = 32 / dtype.getIntOrFloatBitWidth() * 8;
-  if (unsigned cols = type.getShape()[1]; cols < minCols) {
+  if (unsigned cols = resultType.getShape()[1]; cols < minCols) {
     return op->emitOpError("gather of ")
            << dtype << " must have at least " << minCols << " columns, but got "
            << cols;
   }

+  if (resultType.getShape()[0] != indicesType.getShape()[0]) {
+    return op->emitOpError("result tensor must have as many rows as indices (")
+           << indicesType.getShape()[0] << "), but got " << resultType;
+  }
+
   return success();
 }

-LogicalResult DescriptorGatherOp::verify() {
-  RankedTensorType blockType = getDesc().getType().getBlockType();
+static LogicalResult verifyGatherScatterOp(Operation *op,
+                                           RankedTensorType blockType,
+                                           RankedTensorType resultType,
+                                           RankedTensorType indicesType) {
   // Gather from `!tt.tensordesc<tensor<1xMxdtype>>`.
-  if (blockType.getRank() != 2)
-    return emitOpError("block must be a 2D tensor, but got ") << blockType;
-  if (blockType.getShape()[0] != 1)
-    return emitOpError("block must have exactly 1 row, but got ") << blockType;
-
-  // With x offsets `tensor<Nxinttype>`.
-  RankedTensorType indicesType = getXOffsets().getType();
-  if (indicesType.getRank() != 1)
-    return emitOpError("x offsets must be a 1D tensor, but got ")
-           << indicesType;
+  if (blockType.getRank() != 2) {
+    return op->emitOpError("block must be a 2D tensor, but got ") << blockType;
+  }
+  if (blockType.getShape()[0] != 1) {
+    return op->emitOpError("block must have exactly 1 row, but got ")
+           << blockType;
+  }

-  // Into `tensor<NxMxdtype>`.
-  RankedTensorType resultType = getType();
-  if (failed(verifyResultType(*this, resultType)))
+  // With x offsets `tensor<Nxinttype>` into `tensor<NxMxdtype>`.
+  if (failed(DescriptorGatherOp::verifyResultType(op, resultType, indicesType)))
     return failure();

-  if (resultType.getShape()[0] != indicesType.getShape()[0]) {
-    return emitOpError("result tensor must have as many rows as indices (")
-           << indicesType.getShape()[0] << "), but got " << resultType;
-  }
   if (resultType.getShape()[1] != blockType.getShape()[1]) {
-    return emitOpError("result tensor number of columns must match block (")
+    return op->emitOpError("result tensor number of columns must match block (")
            << blockType.getShape()[1] << "), but got " << resultType;
   }
   if (resultType.getElementType() != blockType.getElementType()) {
-    return emitOpError("result tensor element type must match block (")
+    return op->emitOpError("result tensor element type must match block (")
            << blockType.getElementType() << "), but got " << resultType;
   }

   return success();
 }

+LogicalResult DescriptorGatherOp::verify() {
+  return verifyGatherScatterOp(*this, getDesc().getType().getBlockType(),
+                               getResult().getType(), getXOffsets().getType());
+}
+
+// -- DescriptorScatterOp --
+LogicalResult DescriptorScatterOp::verify() {
+  return verifyGatherScatterOp(*this, getDesc().getType().getBlockType(),
+                               getSrc().getType(), getXOffsets().getType());
+}
+
 // -- DescriptorLoadOp --
-static LogicalResult verifyDesciptorLoadStoreType(Operation *op,
-                                                  TensorDescType desc,
-                                                  RankedTensorType tensor) {
+static LogicalResult verifyDescriptorLoadStoreType(Operation *op,
+                                                   TensorDescType desc,
+                                                   RankedTensorType tensor) {
   RankedTensorType block = desc.getBlockType();
   ArrayRef<int64_t> blockShape = block.getShape();
   ArrayRef<int64_t> tensorShape = tensor.getShape();
@@ -1328,17 +1343,17 @@ static LogicalResult verifyDesciptorLoadStoreType(Operation *op,
   if (blockShape == tensorShape &&
       block.getElementType() == tensor.getElementType())
     return success();
-  return op->emitOpError("tensor desciptor block and tensor types must match");
+  return op->emitOpError("tensor descriptor block and tensor types must match");
 }

 LogicalResult DescriptorLoadOp::verify() {
-  return verifyDesciptorLoadStoreType(*this, getDesc().getType(), getType());
+  return verifyDescriptorLoadStoreType(*this, getDesc().getType(), getType());
 }

 // -- DescriptorStoreOp --
 LogicalResult DescriptorStoreOp::verify() {
-  return verifyDesciptorLoadStoreType(*this, getDesc().getType(),
-                                      getSrc().getType());
+  return verifyDescriptorLoadStoreType(*this, getDesc().getType(),
+                                       getSrc().getType());
 }

 // -- ExperimentalTensormapCreateOp --
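
Note: the minimum-size checks in `verifyResultType` encode the TMA/MMAv3 tile constraints shown in the hunk above: at least 8 rows, and `32 / bitwidth * 8` columns, which works out to each gathered row spanning at least 32 bytes. A standalone worked sketch of that arithmetic (same formula as the verifier; the helper name is illustrative):

#include <cassert>

// Minimum column count for a TMA gather result, per element bit width.
// Each row must cover 32 bytes: 32-bit -> 8 cols (8 * 4 B), 16-bit -> 16
// cols (16 * 2 B), 8-bit -> 32 cols (32 * 1 B).
static unsigned minGatherCols(unsigned elemBitWidth) {
  return 32 / elemBitWidth * 8;
}

int main() {
  assert(minGatherCols(32) == 8);  // f32/i32
  assert(minGatherCols(16) == 16); // f16/bf16
  assert(minGatherCols(8) == 32);  // 8-bit types
  return 0;
}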

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 2 additions & 2 deletions
@@ -502,7 +502,7 @@ static LogicalResult parseBoolAttrValue(AsmParser &parser, Attribute attr,
                                         bool &value, StringRef desc) {
   auto boolAttr = mlir::dyn_cast<BoolAttr>(attr);
   if (!boolAttr) {
-    parser.emitError(parser.getNameLoc(), "expected an bool type in ") << desc;
+    parser.emitError(parser.getNameLoc(), "expected a bool type in ") << desc;
     return failure();
   }
   value = boolAttr.getValue();
@@ -2798,7 +2798,7 @@ std::string getSharedLayoutStr(RankedTensorType tensorType,
   // Shared layouts are a mapping of (block, offset) --> (...)

   // We can just use a single int to index into elementMapping because
-  // the 'swizzle' operation rearranges the indicies---and we want to keep it
+  // the 'swizzle' operation rearranges the indices---and we want to keep it
   // that way
   int32_t idx = 0;
   // Enumerate all the offsets for each block

lib/Dialect/TritonGPU/Transforms/FuseNestedLoops.cpp

Lines changed: 1 addition & 1 deletion
@@ -461,7 +461,7 @@ static scf::IfOp eraseIfResults(ImplicitLocOpBuilder &b, scf::IfOp ifOp,
 // epilogueK and the first iteration of bodyj(K+1). Hence the `- N` term in the
 // total number of iterations.
 //
-// What the above Python-psuedo-code glosses over is SSA dependency management.
+// What the above Python-pseudo-code glosses over is SSA dependency management.
 // To interpret the pseudocode as SSA IR, just imagine everything is put back
 // into allocas and SSA formation re-runs after fusion, which one should note
 // will introduce undefs.
