intel
diff --git a/include/triton/Dialect/Triton/IR/OpInterfaces.h b/include/triton/Dialect/Triton/IR/OpInterfaces.h
Lines changed: 2 additions & 0 deletions
diff --git a/include/triton/Dialect/Triton/IR/Traits.h b/include/triton/Dialect/Triton/IR/Traits.h
Lines changed: 0 additions & 47 deletions
diff --git a/include/triton/Dialect/Triton/IR/TritonInterfaces.td b/include/triton/Dialect/Triton/IR/TritonInterfaces.td
Lines changed: 0 additions & 1 deletion
diff --git a/include/triton/Dialect/Triton/IR/TritonOpInterfaces.td b/include/triton/Dialect/Triton/IR/TritonOpInterfaces.td
Lines changed: 21 additions & 1 deletion
diff --git a/include/triton/Dialect/Triton/IR/TritonOps.td b/include/triton/Dialect/Triton/IR/TritonOps.td
Lines changed: 2 additions & 2 deletions
diff --git a/include/triton/Dialect/TritonGPU/Transforms/Utility.h b/include/triton/Dialect/TritonGPU/Transforms/Utility.h
Lines changed: 1 addition & 7 deletions
diff --git a/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td b/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td
Lines changed: 6 additions & 3 deletions
diff --git a/include/triton/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.h b/include/triton/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.h
Lines changed: 15 additions & 6 deletions
diff --git a/lib/Analysis/AxisInfo.cpp b/lib/Analysis/AxisInfo.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/lib/Conversion/TritonGPUToLLVM/TypeConverter.cpp b/lib/Conversion/TritonGPUToLLVM/TypeConverter.cpp
Lines changed: 8 additions & 1 deletion
@@ -11,6 +11,8 @@ namespace impl {
 
 LogicalResult verifyTransposeOpInterface(Operation *op);
 
+LogicalResult verifyDotOpInterface(Operation *op);
+
 } // namespace impl
 
 } // namespace triton
 
@@ -58,53 +58,6 @@ class VerifyTensorLayoutsTrait
   }
 };
 
-// Verify if the op is a dot-like operation.
-// A dot-like operation should have three operands.
-// The first two operands should share a common dimension, and the result
-// should have the dimensions of the two operands that are not shared.
-// A dot-like operation can be either 2d or 3d.
-// In the 3d case, the first dimension of operands is the batch dimension.
-template <class ConcreteType>
-class DotLike : public TraitBase<ConcreteType, DotLike> {
-public:
-  static LogicalResult verifyTrait(Operation *op) {
-    if (op->getNumOperands() < 3)
-      return op->emitOpError("expected at least 3 operands");
-    auto aTy = cast<ShapedType>(op->getOperand(0).getType());
-    auto bTy = cast<ShapedType>(op->getOperand(1).getType());
-    auto cTy = cast<ShapedType>(op->getOperand(2).getType());
-    auto aShape = aTy.getShape();
-    auto bShape = bTy.getShape();
-    auto cShape = cTy.getShape();
-    // Check if all 3d or all 2d
-    if (aShape.size() != 2 && aShape.size() != 3)
-      return op->emitOpError("expected operands to be 2d or 3d");
-    if (aShape.size() != bShape.size() || aShape.size() != cShape.size())
-      return op->emitOpError("expected all operands to have the same rank");
-    // Check if the first two operands share a common dimension
-    // TODO: enable back with an interface to support scaled dot.
-    // if (aShape[aShape.size() - 1] != bShape[aShape.size() - 2])
-    //   return op->emitOpError("expected the last dimension of the first
-    //   operand "
-    //                          "to be equal to the second-to-last dimension of
-    //                          " "the second operand");
-    // Check the batch dimension
-    if (aShape.size() == 3 &&
-        (aShape[0] != cShape[0] || bShape[0] != cShape[0]))
-      return op->emitOpError("expected the first dimension of the first "
-                             "operand to be equal to the first dimension of "
-                             "the result");
-    // Check the output shape
-    if (cShape[cShape.size() - 2] != aShape[aShape.size() - 2] ||
-        cShape[cShape.size() - 1] != bShape[aShape.size() - 1])
-      return op->emitOpError(
-          "expected the output shape to be the concatenation of the last "
-          "dimension of the first operand and the last dimension of the "
-          "second ");
-    return success();
-  }
-};
-
 template <typename ConcreteType>
 class SameOperandsAndResultEncoding
     : public TraitBase<ConcreteType, SameOperandsAndResultEncoding> {
 
@@ -6,7 +6,6 @@ include "mlir/Interfaces/InferTypeOpInterface.td"
 
 def TensorSizeTrait : NativeOpTrait<"TensorSizeTrait">;
 def VerifyTensorLayoutsTrait : NativeOpTrait<"VerifyTensorLayoutsTrait">;
-def DotLike : NativeOpTrait<"DotLike">;
 def SameOperandsEncoding : NativeOpTrait<"SameOperandsEncoding">;
 def SameOperandsAndResultEncoding : NativeOpTrait<"SameOperandsAndResultEncoding">;
 def SameLoadStoreOperandsShape : NativeOpTrait<"SameLoadStoreOperandsShape">;
 
@@ -29,7 +29,27 @@ def TransposeOpInterface : OpInterface<"TransposeOpInterface"> {
 			/*args=*/(ins)>
   ];
 
-	let verify = [{ return ::mlir::triton::impl::verifyTransposeOpInterface($_op); }];
+  let verify = [{ return ::mlir::triton::impl::verifyTransposeOpInterface($_op); }];
+}
+
+def DotOpInterface : OpInterface<"DotOpInterface"> {
+  let description = [{
+	This interface is implemented by operations that perform a dot product.
+  }];
+
+  let cppNamespace = "::mlir::triton";
+
+  let methods = [
+	InterfaceMethod<
+			/*desc=*/[{
+			  Verifies the dimensions of the A and B DotOp operands.
+		  }],
+			/*retType=*/"bool",
+			/*methodName=*/"verifyDims",
+			/*args=*/(ins)>
+	];
+
+  let verify = [{ return ::mlir::triton::impl::verifyDotOpInterface($_op); }];
 }
 
 
 
@@ -631,7 +631,7 @@ def TT_GetNumProgramsOp : TT_Op<"get_num_programs", [Pure]> {
 //
 def TT_DotOp : TT_Op<"dot", [Pure,
                              DeclareOpInterfaceMethods<InferTypeOpInterface>,
-                             DotLike,
+                             DeclareOpInterfaceMethods<DotOpInterface>,
                              TypesMatchWith<"result's type matches accumulator's type",
                                             "d", "c", "$_self">]> {
     let summary = "dot";
@@ -671,7 +671,7 @@ def TT_DotOp : TT_Op<"dot", [Pure,
 //
 def TT_DotScaledOp : TT_Op<"dot_scaled", [Pure,
                              AttrSizedOperandSegments,
-                             DotLike,
+                             DeclareOpInterfaceMethods<DotOpInterface>,
                              TypesMatchWith<"result's type matches accumulator's type",
                                             "d", "c", "$_self">]> {
     let summary = "dot_scaled";
 
@@ -200,13 +200,7 @@ StringRef getAMDArch(Operation *module);
 std::optional<mlir::triton::gpu::SwizzledSharedEncodingAttr>
 getSharedEncIfAllUsersAreDotEnc(Value val, bool &incompatible);
 
-enum class MMALoadType {
-  SharedV3,
-  Registers,     // may be v2 or v3
-  DoNotPipeline, // could be a valid shared/registers MMA operand, but skip
-                 // pipelining
-};
-MMALoadType getMMALoadType(Operation *loadOp);
+bool canUseMMAv3Pipelining(Operation *loadOp);
 
 // Convert \param op operands and results to layout \param encoding.
 void convertOpEncoding(Attribute encoding, Operation *op);
 
@@ -27,6 +27,7 @@ include "mlir/Dialect/Arith/IR/ArithBase.td"
 include "triton/Dialect/Triton/IR/TritonTypes.td"
 include "triton/Dialect/Triton/IR/TritonAttrDefs.td"
 include "triton/Dialect/Triton/IR/TritonInterfaces.td"
+include "triton/Dialect/Triton/IR/TritonOpInterfaces.td"
 include "triton/Dialect/TritonGPU/IR/TritonGPUTypes.td"
 include "triton/Dialect/TritonGPU/IR/TritonGPUTypeInterfaces.td"
 include "mlir/IR/OpBase.td"
@@ -71,7 +72,7 @@ def TTNG_ClusterWaitOp : TTNG_Op<"cluster_wait", []> {
 //
 def TTNG_WarpGroupDotOp : TTNG_Op<"warp_group_dot", [DeclareOpInterfaceMethods<InferTypeOpInterface>,
                                                      DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
-                                                     DotLike,
+                                                     DeclareOpInterfaceMethods<DotOpInterface>,
                                                      TypesMatchWith<"result's type matches accumulator's type",
                                                                      "d", "c", "$_self">]> {
     let summary = "warp group dot";
@@ -325,7 +326,7 @@ def TTNG_TMAStoreWaitOp : TTNG_Op<"async_tma_store_wait"> {
   let assemblyFormat = "attr-dict";
 }
 
-def TTNG_TCGen5MMAOp : TTNG_Op<"tc_gen5_mma", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, DotLike]> {
+def TTNG_TCGen5MMAOp : TTNG_Op<"tc_gen5_mma", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, DeclareOpInterfaceMethods<DotOpInterface>]> {
     let summary = "block level op mapping to tensorcore gen5 mma";
 
     let description = [{
@@ -343,11 +344,12 @@ def TTNG_TCGen5MMAOp : TTNG_Op<"tc_gen5_mma", [DeclareOpInterfaceMethods<MemoryE
                          I1:$pred,
                          Optional<TTG_MemDescType>:$barrier,
                          OptionalAttr<UnitAttr>:$two_ctas);
+
     // TODO: improve printing format.
     let assemblyFormat = "$a`,` $b`,` $d`,` $useD`,` $pred (`,` $barrier^)? attr-dict `:` functional-type(operands, results)";
 }
 
-def TTNG_TCGen5MMAScaledOp : TTNG_Op<"tc_gen5_mma_scaled", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, DotLike]> {
+def TTNG_TCGen5MMAScaledOp : TTNG_Op<"tc_gen5_mma_scaled", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, DeclareOpInterfaceMethods<DotOpInterface>]> {
     let summary = "block level op mapping to tensorcore gen5 mma";
 
     let description = [{
@@ -366,6 +368,7 @@ def TTNG_TCGen5MMAScaledOp : TTNG_Op<"tc_gen5_mma_scaled", [DeclareOpInterfaceMe
                          I1:$useD,
                          I1:$pred,
                          Optional<TTG_MemDescType>:$barrier);
+
     // TODO: improve printing format.
     let assemblyFormat = "$a `,` $b `,` $d `,` $a_scale `,` $b_scale `,` $useD`,` $pred `lhs` `=` $a_type `rhs` `=` $b_type (`,` $barrier^)? attr-dict `:` functional-type(operands, results)";
 }
 
@@ -50,8 +50,18 @@ mlir::LogicalResult createTMADesc(mlir::Value tmaPtr,
 
   Value elemSizeVal = builder.template create<arith::ConstantOp>(
       loc, builder.getI64Type(), builder.getI64IntegerAttr(elemSize));
-  Value globalStride = builder.template create<arith::MulIOp>(
-      loc, op.getStrides()[0], elemSizeVal);
+
+  SmallVector<Value> globalDim(llvm::reverse(op.getShape()));
+  SmallVector<Value> globalStride;
+  for (int k = op.getStrides().size() - 2; k >= 0; --k) {
+    globalStride.push_back(op.getStrides()[k]);
+  }
+
+  SmallVector<Value> elementStride(globalDim.size(), mkI32Constant(1));
+
+  for (int i = 0; i < globalStride.size(); ++i)
+    globalStride[i] = builder.template create<arith::MulIOp>(
+        loc, globalStride[i], elemSizeVal);
 
   int elemTypeEnum;
   switch (elemSize) {
@@ -75,15 +85,14 @@ mlir::LogicalResult createTMADesc(mlir::Value tmaPtr,
   }
   }
 
-  auto one = mkI32Constant(1);
   builder.template create<triton::ExperimentalTensormapCreateOp>(
       loc,
       /*desc_ptr=*/tmaPtr,
       /*global_address=*/op.getBase(),
       /*box_dim=*/boxDim,
-      /*global_dim=*/ValueRange{op.getShape()[1], op.getShape()[0]},
-      /*global_stride=*/ValueRange{globalStride},
-      /*element_strides=*/ValueRange{one, one},
+      /*global_dim=*/globalDim,
+      /*global_stride=*/globalStride,
+      /*element_strides=*/elementStride,
       /*elem_type*/ builder.getI32IntegerAttr(elemTypeEnum),
       /*interleave_layout*/ builder.getI32IntegerAttr(0),
       /*swizzle_mode=*/builder.getI32IntegerAttr(swizzle_mode),
 
@@ -935,7 +935,7 @@ class ShROpAxisInfoVisitor final : public BinaryOpVisitorImpl<OpTy> {
       // Treat [2^n,2^n+1,...]'s divisibility as 1 instead of 2^n
       lhsDivisibility = 1;
     }
-    return std::max<int64_t>(1, lhsDivisibility / (1 << shift));
+    return std::max<int64_t>(1, lhsDivisibility / (int64_t(1) << shift));
   }
 
   int64_t getConstancy(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs,
 
@@ -47,10 +47,17 @@ Type TritonGPUToLLVMTypeConverter::convertTritonTensorType(
 Type TritonGPUToLLVMTypeConverter::convertMemDescType(
     MemDescType type, const TargetInfoBase &targetInfo) {
   auto ctx = type.getContext();
-  SmallVector<Type, 4> types;
   // base ptr
   auto ptrType =
       LLVM::LLVMPointerType::get(ctx, targetInfo.getSharedAddressSpace());
+
+  if (isa<triton::nvidia_gpu::TensorMemoryEncodingAttr,
+          triton::nvidia_gpu::TensorMemoryScalesEncodingAttr>(
+          type.getEncoding())) {
+    return ptrType;
+  }
+
+  SmallVector<Type, 4> types;
   types.push_back(ptrType);
   auto rank = type.getRank();
   // offsets
Original file line number	Diff line number	Diff line change
`@@ -935,7 +935,7 @@ class ShROpAxisInfoVisitor final : public BinaryOpVisitorImpl<OpTy> {`
`935`	`935`	`// Treat [2^n,2^n+1,...]'s divisibility as 1 instead of 2^n`
`936`	`936`	`lhsDivisibility = 1;`
`937`	`937`	`}`
`938`		`- return std::max<int64_t>(1, lhsDivisibility / (1 << shift));`
	`938`	`+ return std::max<int64_t>(1, lhsDivisibility / (int64_t(1) << shift));`
`939`	`939`	`}`
`940`	`940`
`941`	`941`	`int64_t getConstancy(OpTy op, const AxisInfo &lhs, const AxisInfo &rhs,`