Skip to content

Commit e4035e9

Browse files
Merge OpenAI Triton commit 63cecbd (#3550)
This PR changes the Triton base from 72193bb to 63cecbd (Feb 24). Pass rate: 97.65%. Please do not squash and merge this PR.
2 parents b7604a9 + db7a20d commit e4035e9

File tree

9 files changed

+127
-121
lines changed

9 files changed

+127
-121
lines changed

include/triton/Dialect/Triton/IR/TritonOpInterfaces.td

Lines changed: 30 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -7,47 +7,54 @@ include "mlir/IR/OpBase.td"
77
def TransposeOpInterface : OpInterface<"TransposeOpInterface"> {
88
let description = [{
99
This interface is implemented by operations that perform a transpose.
10-
It provides methods to access common properties such as the order attribute and the source operand.
10+
It provides methods to access common properties such as the order attribute
11+
and the source operand.
1112
}];
1213

1314
let cppNamespace = "::mlir::triton";
1415

1516
let methods = [
1617
InterfaceMethod<
17-
/*desc=*/[{
18-
Get the source operand of the transposition.
19-
}],
20-
/*retType=*/"::mlir::Value",
21-
/*methodName=*/"getSrc",
22-
/*args=*/(ins)>,
18+
/*desc=*/"Get the source operand of the transposition.",
19+
/*retType=*/"::mlir::Value",
20+
/*methodName=*/"getSrc",
21+
/*args=*/(ins)>,
2322
InterfaceMethod<
24-
/*desc=*/[{
25-
Get the order of the transposition.
26-
}],
27-
/*retType=*/"::mlir::ArrayRef<int32_t>",
28-
/*methodName=*/"getOrder",
29-
/*args=*/(ins)>
23+
/*desc=*/"Get the order of the transposition.",
24+
/*retType=*/"::mlir::ArrayRef<int32_t>",
25+
/*methodName=*/"getOrder",
26+
/*args=*/(ins)>
3027
];
3128

32-
let verify = [{ return ::mlir::triton::impl::verifyTransposeOpInterface($_op); }];
29+
let verify = [{
30+
return ::mlir::triton::impl::verifyTransposeOpInterface($_op);
31+
}];
3332
}
3433

3534
def DotOpInterface : OpInterface<"DotOpInterface"> {
3635
let description = [{
37-
This interface is implemented by operations that perform a dot product.
36+
This interface is implemented by operations that perform a dot product.
3837
}];
3938

4039
let cppNamespace = "::mlir::triton";
4140

4241
let methods = [
43-
InterfaceMethod<
44-
/*desc=*/[{
45-
Verifies the dimensions of the A and B DotOp operands.
46-
}],
47-
/*retType=*/"bool",
48-
/*methodName=*/"verifyDims",
49-
/*args=*/(ins)>
50-
];
42+
InterfaceMethod<
43+
/*desc=*/"Get the LHS A tensor",
44+
/*retType=*/"::mlir::Value",
45+
/*methodName=*/"getA",
46+
/*args=*/(ins)>,
47+
InterfaceMethod<
48+
/*desc=*/"Get the RHS B tensor",
49+
/*retType=*/"::mlir::Value",
50+
/*methodName=*/"getB",
51+
/*args=*/(ins)>,
52+
InterfaceMethod<
53+
/*desc=*/"Verify the dimensions of the A and B DotOp operands.",
54+
/*retType=*/"bool",
55+
/*methodName=*/"verifyDims",
56+
/*args=*/(ins)>
57+
];
5158

5259
let verify = [{ return ::mlir::triton::impl::verifyDotOpInterface($_op); }];
5360
}

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -681,29 +681,30 @@ def TT_DotScaledOp : TT_Op<"dot_scaled", [Pure,
681681
let summary = "dot_scaled";
682682

683683
let description = [{
684-
$d = matrix_multiply(scale($lhs, $lhs_scale), scale(rlhs, $rhs_scale)) + $c.
684+
$d = matrix_multiply(scale($a, $a_scale), scale($b, $b_scale)) + $c.
685685
Where scale(x, s) is a function that applies the scale per block following microscaling spec.
686686
}];
687687

688688
let arguments = (
689689
ins
690690
// inputs are floats if we have a type for them, otherwise (fp4),
691691
// they are packed in pairs in an I8Tensor
692-
RankedTensorOf<[TT_Float,I8]>:$lhs,
693-
RankedTensorOf<[TT_Float,I8]>:$rhs,
692+
RankedTensorOf<[TT_Float,I8]>:$a,
693+
RankedTensorOf<[TT_Float,I8]>:$b,
694694
TT_FloatTensor:$c,
695-
Optional<RankedTensorOf<[TT_Float, I8]>>:$lhs_scale,
696-
Optional<RankedTensorOf<[TT_Float, I8]>>:$rhs_scale,
697-
TT_ScaleDotElemTypeAttr:$lhs_type,
698-
TT_ScaleDotElemTypeAttr:$rhs_type,
695+
Optional<RankedTensorOf<[TT_Float, I8]>>:$a_scale,
696+
Optional<RankedTensorOf<[TT_Float, I8]>>:$b_scale,
697+
TT_ScaleDotElemTypeAttr:$a_elem_type,
698+
TT_ScaleDotElemTypeAttr:$b_elem_type,
699699
BoolAttr:$fastMath
700700
);
701701

702702
let results = (outs TT_FloatTensor:$d);
703703

704704
let assemblyFormat = [{
705-
$lhs (`scale` $lhs_scale^)? `,` $rhs (`scale` $rhs_scale^)? `,` $c `lhs` `=` $lhs_type `rhs` `=` $rhs_type attr-dict
706-
`:` type($lhs) (`,` type($lhs_scale)^)? `*` type($rhs) (`,` type($rhs_scale)^)? `->` type($d)
705+
$a (`scale` $a_scale^)? `,` $b (`scale` $b_scale^)? `,` $c
706+
`lhs` `=` $a_elem_type `rhs` `=` $b_elem_type attr-dict
707+
`:` type($a) (`,` type($a_scale)^)? `*` type($b) (`,` type($b_scale)^)? `->` type($d)
707708
}];
708709
}
709710

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -318,14 +318,14 @@ bool DotOp::verifyDims() {
318318

319319
//-- DotScaledOp --
320320
bool DotScaledOp::verifyDims() {
321-
auto aShape = this->getLhs().getType().getShape();
322-
auto bShape = this->getRhs().getType().getShape();
321+
auto aShape = this->getA().getType().getShape();
322+
auto bShape = this->getB().getType().getShape();
323323

324324
auto aKdim = aShape[aShape.size() - 1];
325325
auto bKdim = bShape[aShape.size() - 2];
326-
if (this->getLhsType() == ScaleDotElemType::E2M1)
326+
if (this->getAElemType() == ScaleDotElemType::E2M1)
327327
aKdim *= 2;
328-
if (this->getRhsType() == ScaleDotElemType::E2M1)
328+
if (this->getBElemType() == ScaleDotElemType::E2M1)
329329
bKdim *= 2;
330330

331331
return aKdim == bKdim;

lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -629,7 +629,7 @@ class ScaledBlockedToMMAv5
629629
mlir::isa<NvidiaMmaEncodingAttr>(oldRetType.getEncoding()))
630630
return failure();
631631

632-
if (dotOp.getLhsScale() == nullptr || dotOp.getRhsScale() == nullptr) {
632+
if (dotOp.getAScale() == nullptr || dotOp.getBScale() == nullptr) {
633633
return failure();
634634
}
635635

@@ -643,18 +643,18 @@ class ScaledBlockedToMMAv5
643643
return failure();
644644
Location loc = dotOp.getLoc();
645645
// operands
646-
Value a = dotOp.getLhs();
647-
Value b = dotOp.getRhs();
648-
auto oldAType = dotOp.getLhs().getType();
649-
auto oldBType = dotOp.getRhs().getType();
646+
Value a = dotOp.getA();
647+
Value b = dotOp.getB();
648+
auto oldAType = a.getType();
649+
auto oldBType = b.getType();
650650

651651
bool IsAMixedPrecFp4 = false;
652652
bool IsBMixedPrecFp4 = false;
653653

654-
if (dotOp.getLhsType() != dotOp.getRhsType()) {
655-
if (dotOp.getLhsType() == ScaleDotElemType::E2M1)
654+
if (dotOp.getAElemType() != dotOp.getBElemType()) {
655+
if (dotOp.getAElemType() == ScaleDotElemType::E2M1)
656656
IsAMixedPrecFp4 = true;
657-
else if (dotOp.getRhsType() == ScaleDotElemType::E2M1)
657+
else if (dotOp.getBElemType() == ScaleDotElemType::E2M1)
658658
IsBMixedPrecFp4 = true;
659659
}
660660

@@ -676,8 +676,8 @@ class ScaledBlockedToMMAv5
676676
// descriptor requires options that are unavailable to the .kind=mxf4 mma.
677677
// This is likely preferable over a silent runtime performance degradation
678678
// from running f4xf4 via .kind=mxf8f6f4
679-
if (dotOp.getLhsType() == ScaleDotElemType::E2M1 &&
680-
dotOp.getRhsType() == ScaleDotElemType::E2M1) {
679+
if (dotOp.getAElemType() == ScaleDotElemType::E2M1 &&
680+
dotOp.getBElemType() == ScaleDotElemType::E2M1) {
681681
k = 64;
682682
}
683683
SmallVector<unsigned> instrShape = {m, n, k};
@@ -701,8 +701,8 @@ class ScaledBlockedToMMAv5
701701
auto acc = rewriter.create<triton::nvidia_gpu::TMEMAllocOp>(
702702
loc, accMemDescType, cvtAcc);
703703

704-
RankedTensorType oldScaleAType = dotOp.getLhsScale().getType();
705-
RankedTensorType oldScaleBType = dotOp.getRhsScale().getType();
704+
RankedTensorType oldScaleAType = dotOp.getAScale().getType();
705+
RankedTensorType oldScaleBType = dotOp.getBScale().getType();
706706

707707
Attribute scaleEncoding =
708708
triton::nvidia_gpu::TensorMemoryScalesEncodingAttr::get(
@@ -724,8 +724,8 @@ class ScaledBlockedToMMAv5
724724
RankedTensorType newScaleBType = RankedTensorType::get(
725725
oldScaleBType.getShape(), oldScaleBType.getElementType(), scaleBLayout);
726726

727-
auto lhsScale = addSmemStageToScaleLoad(dotOp.getLhsScale(), rewriter);
728-
auto rhsScale = addSmemStageToScaleLoad(dotOp.getRhsScale(), rewriter);
727+
auto lhsScale = addSmemStageToScaleLoad(dotOp.getAScale(), rewriter);
728+
auto rhsScale = addSmemStageToScaleLoad(dotOp.getBScale(), rewriter);
729729

730730
Value newScaleA =
731731
rewriter.create<ConvertLayoutOp>(loc, newScaleAType, lhsScale);
@@ -737,8 +737,8 @@ class ScaledBlockedToMMAv5
737737
loc, scaleBType, newScaleB);
738738
auto vTrue = rewriter.create<arith::ConstantIntOp>(dotOp.getLoc(), 1, 1);
739739
rewriter.create<triton::nvidia_gpu::TCGen5MMAScaledOp>(
740-
loc, a, b, acc, scaleA, scaleB, dotOp.getLhsType(), dotOp.getRhsType(),
741-
vTrue, vTrue, Value());
740+
loc, a, b, acc, scaleA, scaleB, dotOp.getAElemType(),
741+
dotOp.getBElemType(), vTrue, vTrue, Value());
742742

743743
auto ld =
744744
rewriter.create<triton::nvidia_gpu::TMEMLoadOp>(loc, newAccType, acc);
@@ -792,17 +792,17 @@ static void decomposeMixedModeDotOp(ModuleOp mod, int computeCapability) {
792792
// Transpose scaled_dot ops that have a scale on lhs.
793793
static void transposeDotOp(DotScaledOp dotOp) {
794794
OpBuilder builder(dotOp);
795-
Value lhs = dotOp.getLhs();
795+
Value lhs = dotOp.getA();
796796
std::array<int, 2> transOrder = {1, 0};
797797
Value lhsTransposed = builder.create<TransOp>(lhs.getLoc(), lhs, transOrder);
798-
Value rhs = dotOp.getRhs();
798+
Value rhs = dotOp.getB();
799799
Value rhsTransposed = builder.create<TransOp>(rhs.getLoc(), rhs, transOrder);
800800
Value c = dotOp.getC();
801801
Value cTransposed = builder.create<TransOp>(c.getLoc(), c, transOrder);
802802
Value result = builder.create<DotScaledOp>(
803803
dotOp.getLoc(), cTransposed.getType(), rhsTransposed, lhsTransposed,
804-
cTransposed, dotOp.getRhsScale(), dotOp.getLhsScale(), dotOp.getRhsType(),
805-
dotOp.getLhsType(), dotOp.getFastMath());
804+
cTransposed, dotOp.getBScale(), dotOp.getAScale(), dotOp.getBElemType(),
805+
dotOp.getAElemType(), dotOp.getFastMath());
806806
Operation *transposedResult =
807807
builder.create<TransOp>(result.getLoc(), result, transOrder);
808808
dotOp.replaceAllUsesWith(transposedResult);
@@ -814,7 +814,7 @@ static void transposeDots(ModuleOp m) {
814814
// want to use rhs from register for mmav3.
815815
SmallVector<DotScaledOp> toTranspose;
816816
m.walk([&](DotScaledOp dotOp) -> void {
817-
if (dotOp.getLhsScale() == nullptr && dotOp.getRhsScale() != nullptr)
817+
if (dotOp.getAScale() == nullptr && dotOp.getBScale() != nullptr)
818818
toTranspose.push_back(dotOp);
819819
});
820820
for (DotScaledOp dotOp : toTranspose) {

lib/Dialect/TritonGPU/Transforms/DecomposeScaledBlocked.cpp

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
#include "mlir/IR/Types.h"
44
#include "mlir/IR/Value.h"
55
#include "mlir/Support/LogicalResult.h"
6-
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
76

87
#include "triton/Dialect/Triton/IR/Dialect.h"
98
#include "triton/Dialect/TritonGPU/IR/Attributes.h"
@@ -32,8 +31,8 @@ class DecomposeScaledBlocked : public OpRewritePattern<DotScaledOp> {
3231
LogicalResult matchAndRewrite(DotScaledOp scaledDotOp,
3332
PatternRewriter &rewriter) const override {
3433
// Types
35-
auto computeType = getComputeType(scaledDotOp.getLhsType(),
36-
scaledDotOp.getRhsType(), rewriter);
34+
auto computeType = getComputeType(scaledDotOp.getAElemType(),
35+
scaledDotOp.getBElemType(), rewriter);
3736
auto loc = scaledDotOp.getLoc();
3837

3938
auto cvtDotOperand = [&](TypedValue<RankedTensorType> v,
@@ -185,12 +184,11 @@ class DecomposeScaledBlocked : public OpRewritePattern<DotScaledOp> {
185184
TypedValue<RankedTensorType> scaleArg(PatternRewriter &rewriter,
186185
DotScaledOp scaledDotOp, int opIdx,
187186
FloatType computeType) const {
188-
auto v = opIdx == 0 ? scaledDotOp.getLhs() : scaledDotOp.getRhs();
189-
auto scale =
190-
opIdx == 0 ? scaledDotOp.getLhsScale() : scaledDotOp.getRhsScale();
187+
auto v = opIdx == 0 ? scaledDotOp.getA() : scaledDotOp.getB();
188+
auto scale = opIdx == 0 ? scaledDotOp.getAScale() : scaledDotOp.getBScale();
191189
auto isFp4 =
192-
(opIdx == 0 ? scaledDotOp.getLhsType() : scaledDotOp.getRhsType()) ==
193-
ScaleDotElemType::E2M1;
190+
ScaleDotElemType::E2M1 ==
191+
(opIdx == 0 ? scaledDotOp.getAElemType() : scaledDotOp.getBElemType());
194192
auto fastMath = scaledDotOp.getFastMath();
195193

196194
auto *ctx = rewriter.getContext();

third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/MFMA.cpp

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -516,10 +516,10 @@ struct ScaledDotOpMFMAConversionHelper : DotOpMFMAConversionHelper {
516516
assert((mDim == nDim && (mDim == 32 || mDim == 16 || mDim == 4)) ||
517517
(mDim == 64 && nDim == 4) || (mDim == 4 && nDim == 64));
518518

519-
Value a = op.getLhs();
520-
Value b = op.getRhs();
521-
Value aScale = op.getLhsScale();
522-
Value bScale = op.getRhsScale();
519+
Value a = op.getA();
520+
Value b = op.getB();
521+
Value aScale = op.getAScale();
522+
Value bScale = op.getBScale();
523523
bool isAScaleConstant = aScale.getDefiningOp<arith::ConstantOp>();
524524
bool isBScaleConstant = bScale.getDefiningOp<arith::ConstantOp>();
525525
Value d = op.getD();
@@ -528,8 +528,8 @@ struct ScaledDotOpMFMAConversionHelper : DotOpMFMAConversionHelper {
528528
auto dTensorTy = cast<RankedTensorType>(d.getType());
529529
auto elemTyA = aTensorTy.getElementType();
530530
auto elemTyB = bTensorTy.getElementType();
531-
ScaleDotElemType aElemType = op.getLhsType();
532-
ScaleDotElemType bElemType = op.getRhsType();
531+
ScaleDotElemType aElemType = op.getAElemType();
532+
ScaleDotElemType bElemType = op.getBElemType();
533533

534534
const auto kDimOperandSize = aTensorTy.getShape().back();
535535

@@ -576,10 +576,10 @@ struct ScaledDotOpMFMAConversionHelper : DotOpMFMAConversionHelper {
576576
constexpr int scaleKWidth = 1;
577577
constexpr int scaleKBase = 1;
578578

579-
Value loadedA = adaptor.getLhs();
580-
Value loadedB = adaptor.getRhs();
581-
Value loadedAScale = adaptor.getLhsScale();
582-
Value loadedBScale = adaptor.getRhsScale();
579+
Value loadedA = adaptor.getA();
580+
Value loadedB = adaptor.getB();
581+
Value loadedAScale = adaptor.getAScale();
582+
Value loadedBScale = adaptor.getBScale();
583583
Value loadedC = adaptor.getC();
584584

585585
auto numRepM = repA[1];
@@ -709,12 +709,12 @@ LogicalResult convertScaledMFMA(triton::DotScaledOp op,
709709
triton::DotScaledOp::Adaptor adaptor,
710710
const LLVMTypeConverter *typeConverter,
711711
ConversionPatternRewriter &rewriter) {
712-
assert(isa<DotOperandEncodingAttr>(op.getLhs().getType().getEncoding()) &&
713-
isa<DotOperandEncodingAttr>(op.getRhs().getType().getEncoding()) &&
712+
assert(isa<DotOperandEncodingAttr>(op.getA().getType().getEncoding()) &&
713+
isa<DotOperandEncodingAttr>(op.getB().getType().getEncoding()) &&
714714
"Both lhs and rhs should be DotOperand layout.");
715715

716-
assert(isa<LinearEncodingAttr>(op.getLhsScale().getType().getEncoding()) &&
717-
isa<LinearEncodingAttr>(op.getRhsScale().getType().getEncoding()) &&
716+
assert(isa<LinearEncodingAttr>(op.getAScale().getType().getEncoding()) &&
717+
isa<LinearEncodingAttr>(op.getBScale().getType().getEncoding()) &&
718718
"Both LhsScale and RhsScale should be linear layout.");
719719

720720
auto cTensorTy = op.getC().getType();

0 commit comments

Comments
 (0)