@@ -135,11 +135,16 @@ TypedValue<RankedTensorType> DecomposeScaledBlocked::broadcastScale(
 }

 TypedValue<RankedTensorType> DecomposeScaledBlocked::maskNan(
-    PatternRewriter &rewriter, DotScaledOp scaledDotOp, ModuleOp mod,
+    PatternRewriter &rewriter, DotScaledOp scaledDotOp,
     TypedValue<RankedTensorType> mxfp, TypedValue<RankedTensorType> scale,
     int dim) const {
+  // Skip NaN checks if fastMath
+  if (scaledDotOp.getFastMath())
+    return mxfp;
+
   // Implement tl.where(scale == 0xFF, float("nan"), mxfp)
   auto loc = scale.getLoc();
+  auto mod = scaledDotOp->getParentOfType<ModuleOp>();

   // Scale is NaN
   auto scaleTy = scale.getType();
@@ -180,7 +185,6 @@ DecomposeScaledBlocked::scaleArg(PatternRewriter &rewriter,
   auto fastMath = scaledDotOp.getFastMath();

   auto loc = v.getLoc();
-  auto mod = scaledDotOp->getParentOfType<ModuleOp>();
   auto rank = v.getType().getRank();
   auto kDim = opIdx == 0 ? rank - 1 : rank - 2;

@@ -196,9 +200,33 @@ DecomposeScaledBlocked::scaleArg(PatternRewriter &rewriter,
   if (!scale)
     return v;

+  // 1) Cast scale to fp16/bf16, broadcast it and convert its layout
+  auto reshapeScale = extendAndBroadcastScale(rewriter, scaledDotOp, scale,
+                                              computeType, v.getType(), opIdx);
+
+  // 2) Multiply
+  auto mxfp = cast<TypedValue<RankedTensorType>>(
+      rewriter.create<arith::MulFOp>(loc, v, reshapeScale).getResult());
+
+  // 3) If the scale is NaN, return NaN, else return the scaled value.
+  return maskNan(rewriter, scaledDotOp, mxfp, scale, kDim);
+}
+
+TypedValue<RankedTensorType> DecomposeScaledBlocked::extendAndBroadcastScale(
+    PatternRewriter &rewriter, DotScaledOp scaledDotOp,
+    TypedValue<RankedTensorType> &scale, FloatType computeType,
+    RankedTensorType dstType, int opIdx) const {
+  auto loc = scale.getLoc();
+  auto mod = scaledDotOp->getParentOfType<ModuleOp>();
+  auto v = opIdx == 0 ? scaledDotOp.getA() : scaledDotOp.getB();
+  auto rank = v.getType().getRank();
+  auto kDim = opIdx == 0 ? rank - 1 : rank - 2;
+
   // For some weird reason, we take the scale with shape as if it were coming
   // from the lhs even when it's the rhs. In a normal world, we should accept
-  // this parametre transposed, as we do with the mxfp.
+  // this parameter transposed, as we do with the mxfp.
+  //
+  // Notice: this is an in-place change.
   if (opIdx == 1) {
     auto order = getTransposeOrder(rank);
     scale = rewriter.create<TransOp>(loc, scale, order);
@@ -207,21 +235,9 @@ DecomposeScaledBlocked::scaleArg(PatternRewriter &rewriter,
   // 1) Cast scale to compute type (fp16/bf16)
   auto scale16 = scaleTo16(rewriter, scale, computeType);

-  // 2) Broadcast scale to the same shape and layout as v
+  // 2) Broadcast scale to the same shape as v and convert the layout
   auto reshapeScale = broadcastScale(rewriter, scaledDotOp, mod, scale16, kDim);
-  reshapeScale =
-      rewriter.create<ConvertLayoutOp>(loc, v.getType(), reshapeScale);
-
-  // 3) Multiply
-  auto mxfp = cast<TypedValue<RankedTensorType>>(
-      rewriter.create<arith::MulFOp>(loc, v, reshapeScale).getResult());
-
-  // Skip NaN checks if fastMath
-  if (fastMath)
-    return mxfp;
-
-  // 4) If the scale is NaN, return NaN, else return the scaled value.
-  return maskNan(rewriter, scaledDotOp, mod, mxfp, scale, kDim);
+  return rewriter.create<ConvertLayoutOp>(loc, dstType, reshapeScale);
 }

 TypedValue<RankedTensorType>
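
For readability, here is roughly how the scaleArg body reads once this diff is applied, stitched together from the context and '+' lines above. The function signature and the code that produces v and scale sit outside these hunks, so they are assumed from context rather than shown.

  // Consolidated "after" view of the scaleArg body (sketch only; taken from
  // the hunks above, not from the full file).
  auto loc = v.getLoc();
  auto rank = v.getType().getRank();
  auto kDim = opIdx == 0 ? rank - 1 : rank - 2;

  if (!scale)
    return v;

  // 1) Cast scale to fp16/bf16, broadcast it and convert its layout.
  auto reshapeScale = extendAndBroadcastScale(rewriter, scaledDotOp, scale,
                                              computeType, v.getType(), opIdx);

  // 2) Multiply.
  auto mxfp = cast<TypedValue<RankedTensorType>>(
      rewriter.create<arith::MulFOp>(loc, v, reshapeScale).getResult());

  // 3) maskNan now owns the fastMath early-out: it returns mxfp untouched
  //    when fastMath is set, otherwise NaN wherever the scale was NaN.
  return maskNan(rewriter, scaledDotOp, mxfp, scale, kDim);

The net effect is that the NaN masking (including the fastMath early-out) is encapsulated in maskNan, while the scale extension, broadcast, and layout conversion move into the new extendAndBroadcastScale helper, which looks up the ModuleOp from scaledDotOp instead of taking it as a parameter.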