Commit a1aa27f

Merge branch 'main' into tkuczynski/enable_test_small_batch_matmul
2 parents 76c7f03 + e14f5b9 commit a1aa27f

57 files changed, +2007 -445 lines changed


README.md

Lines changed: 1 addition & 0 deletions
@@ -232,6 +232,7 @@ See [`python/triton/knobs.py`](python/triton/knobs.py) for the full list of conf
 - `TRITON_F32_DEFAULT` sets the default input precision of `tl.dot` when using 32-bit floats, which can be either `ieee`, `tf32`, or `tf32x3`.
 - `TRITON_FRONT_END_DEBUGGING=1` disables exception wrapping when an error occurs in the compiler frontend, allowing the full stack trace to be seen.
 - `TRITON_DISABLE_LINE_INFO=1` removes all line information from the module.
+- `PTXAS_OPTIONS` passes additional command-line options to the PTX assembler `ptxas` (only on NVIDIA).
 
 > [!NOTE]
 > Some of these environment variables don't have a knob in `knobs.py` -- those are only relevant to the C++ layer(s), hence they don't exist in the Python layer.

include/triton/Dialect/Triton/IR/TritonOpInterfaces.td

Lines changed: 7 additions & 1 deletion
@@ -49,7 +49,12 @@ def DotOpInterface : OpInterface<"DotOpInterface"> {
         /*retType=*/"::mlir::Value",
         /*methodName=*/"getB",
         /*args=*/(ins)>,
-    InterfaceMethod<
+    InterfaceMethod<
+        /*desc=*/"Get the output tensor",
+        /*retType=*/"::mlir::Value",
+        /*methodName=*/"getD",
+        /*args=*/(ins)>,
+    InterfaceMethod<
         /*desc=*/"Verify the dimensions of the A and B DotOp operands.",
         /*retType=*/"bool",
         /*methodName=*/"verifyDims",
@@ -64,6 +69,7 @@ def DotOpInterface : OpInterface<"DotOpInterface"> {
     auto aTy = cast<ShapedType>($_op.getA().getType());
     auto bTy = cast<ShapedType>($_op.getB().getType());
     auto cTy = cast<ShapedType>($_op->getOperand(2).getType());
+    auto dTy = cast<ShapedType>($_op.getD().getType());
     auto aShape = aTy.getShape();
     auto bShape = bTy.getShape();
     auto cShape = cTy.getShape();
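With `getD` on the interface, passes can reach a dot op's output tensor without naming the concrete op. A minimal caller sketch (hypothetical pass code, not part of this commit; assumes the interface class lives in the usual `::mlir::triton` namespace):

    // Hypothetical: works for any op implementing DotOpInterface.
    if (auto dotOp = llvm::dyn_cast<mlir::triton::DotOpInterface>(op)) {
      mlir::Value a = dotOp.getA();
      mlir::Value b = dotOp.getB();
      mlir::Value d = dotOp.getD(); // output tensor, newly exposed here
      // e.g. verify d's element type matches the accumulator's
    }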

include/triton/Dialect/TritonGPU/IR/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
@@ -24,3 +24,8 @@ set(LLVM_TARGET_DEFINITIONS TritonGPUTypeInterfaces.td)
 mlir_tablegen(TypeInterfaces.h.inc -gen-type-interface-decls)
 mlir_tablegen(TypeInterfaces.cpp.inc -gen-type-interface-defs)
 add_public_tablegen_target(TritonGPUTypeInterfacesIncGen)
+
+set(LLVM_TARGET_DEFINITIONS TritonGPUOpInterfaces.td)
+mlir_tablegen(OpInterfaces.h.inc -gen-op-interface-decls)
+mlir_tablegen(OpInterfaces.cpp.inc -gen-op-interface-defs)
+add_public_tablegen_target(TritonGPUOpInterfacesIncGen)

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 7 additions & 0 deletions
@@ -135,6 +135,13 @@ LinearLayout chooseScaledMfmaScaleLayout(MLIRContext *ctx, int dotOperandIdx,
                                          ArrayRef<unsigned> tilesPerWarp,
                                          ArrayRef<unsigned> warpsPerCTA);
 
+LinearLayout getSM120DotScaledScaleLayout(MLIRContext *ctx, int dotOperandIdx,
+                                          ArrayRef<int64_t> dotOperandShape,
+                                          ArrayRef<unsigned> tilesPerWarp,
+                                          ArrayRef<unsigned> warpsPerCTA,
+                                          unsigned instrM, unsigned instrN,
+                                          CTALayoutAttr ctaLayoutAttr);
+
 // Create LinearLayout for nvidia mma tile.
 LinearLayout nvidiaMmaTile(MLIRContext *ctx, ArrayRef<unsigned> tileShape,
                            unsigned kWidth, ArrayRef<unsigned> order,
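A rough sense of how a backend might call the new helper; every argument value below is invented for illustration, and `ctx`/`ctaLayout` are assumed to be in scope:

    // Hypothetical call; operand shape and instruction sizes are examples.
    LinearLayout scaleLl = getSM120DotScaledScaleLayout(
        ctx, /*dotOperandIdx=*/0, /*dotOperandShape=*/{128, 64},
        /*tilesPerWarp=*/{1, 1}, /*warpsPerCTA=*/{4, 1},
        /*instrM=*/16, /*instrN=*/8, ctaLayout);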

include/triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h

Lines changed: 3 additions & 0 deletions
@@ -1,8 +1,11 @@
 #ifndef TRITON_GPU_DIALECT_INTERFACES_H
 #define TRITON_GPU_DIALECT_INTERFACES_H
 
+#include "mlir/IR/OpDefinition.h"
+
 // clang-format off
 #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"
+#include "triton/Dialect/TritonGPU/IR/OpInterfaces.h.inc"
 #include "triton/Dialect/TritonGPU/IR/AttrInterfaces.h.inc"
 // clang-format on

include/triton/Dialect/TritonGPU/IR/TritonGPUOpInterfaces.td

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+#ifndef TRITONGPU_OP_INTERFACES
+#define TRITONGPU_OP_INTERFACES
+
+include "mlir/IR/OpBase.td"
+
+def UpcastFpOpInterface : OpInterface<"UpcastFpOpInterface"> {
+  let description = [{
+    This interface is for operations that upcast floating-point numbers.
+  }];
+
+  let cppNamespace = "::mlir::triton::gpu";
+
+  let methods = [
+    InterfaceMethod<
+      /*desc=*/"Infer destination encoding",
+      /*retType=*/"mlir::Attribute",
+      /*methodName=*/"inferDstEncoding",
+      /*args=*/(ins "unsigned":$opIdx, "mlir::Attribute":$srcEnc)
+    >,
+    InterfaceMethod<
+      /*desc=*/"Infer operand encoding from dst encoding",
+      /*retType=*/"mlir::Attribute",
+      /*methodName=*/"inferSrcEncoding",
+      /*args=*/(ins "unsigned":$opIdx, "mlir::Attribute":$dstEnc)
+    >
+  ];
+}
+
+#endif // TRITONGPU_OP_INTERFACES
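Once tablegen generates the C++ class, encoding inference can treat every fp-upcast op uniformly instead of special-casing each one. A minimal sketch of a caller; the helper function itself is invented for illustration:

    // Hypothetical helper in a layout-inference pass.
    mlir::Attribute inferDstEncodingThroughUpcast(mlir::Operation *op,
                                                  unsigned opIdx,
                                                  mlir::Attribute srcEnc) {
      if (auto upcast = llvm::dyn_cast<mlir::triton::gpu::UpcastFpOpInterface>(op))
        return upcast.inferDstEncoding(opIdx, srcEnc);
      return {}; // not an upcast op; caller falls back to per-op logic
    }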

include/triton/Dialect/TritonGPU/Transforms/DecomposeScaledBlocked.h

Lines changed: 10 additions & 4 deletions
@@ -22,16 +22,22 @@ class DecomposeScaledBlocked : public OpRewritePattern<DotScaledOp> {
                ModuleOp mod, TypedValue<RankedTensorType> scale,
                int dim) const;
   TypedValue<RankedTensorType> maskNan(PatternRewriter &rewriter,
-                                       DotScaledOp scaledDotOp, ModuleOp mod,
+                                       DotScaledOp scaledDotOp,
                                        TypedValue<RankedTensorType> mxfp,
                                        TypedValue<RankedTensorType> scale,
                                        int dim) const;
-  TypedValue<RankedTensorType> scaleArg(PatternRewriter &rewriter,
-                                        DotScaledOp scaledDotOp, int opIdx,
-                                        FloatType computeType) const;
+  virtual TypedValue<RankedTensorType> scaleArg(PatternRewriter &rewriter,
+                                                DotScaledOp scaledDotOp,
+                                                int opIdx,
+                                                FloatType computeType) const;
   TypedValue<RankedTensorType>
   cvtDotOperand(PatternRewriter &rewriter, DotScaledOp scaledDotOp, int opIdx,
                 TypedValue<RankedTensorType> v) const;
+  TypedValue<RankedTensorType>
+  extendAndBroadcastScale(PatternRewriter &rewriter, DotScaledOp scaledDotOp,
+                          TypedValue<RankedTensorType> &scale,
+                          FloatType computeType, RankedTensorType dstType,
+                          int opIdx) const;
   static SmallVector<int, 2> getTransposeOrder(int rank);
 };
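Making scaleArg virtual implies the decomposition is meant to be subclassed, with a backend overriding only how scales are prepared. A hypothetical specialization (class name and body invented):

    // Hypothetical backend-specific pattern; everything else is inherited.
    class MyBackendDecomposeScaledBlocked : public DecomposeScaledBlocked {
      using DecomposeScaledBlocked::DecomposeScaledBlocked;

      TypedValue<RankedTensorType> scaleArg(PatternRewriter &rewriter,
                                            DotScaledOp scaledDotOp, int opIdx,
                                            FloatType computeType) const override {
        // A real backend would materialize its own scale layout here; this
        // sketch simply defers to the generic path.
        return DecomposeScaledBlocked::scaleArg(rewriter, scaledDotOp, opIdx,
                                                computeType);
      }
    };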

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 4 additions & 1 deletion
@@ -40,7 +40,10 @@ bool isPureScalarOp(Operation *op);
 bool getDominatingValueSetOpsToHoist(
     DominanceInfo &domInfo, Operation *refOp, ArrayRef<Value> valueSet,
     llvm::SetVector<Operation *> &toHoist,
-    function_ref<bool(Operation *)> canHoist = isPureScalarOp);
+    function_ref<bool(Operation *)> canHoist = isPureScalarOp,
+    function_ref<bool(BlockArgument)> canUseArg = [](BlockArgument) {
+      return false;
+    });
 
 // Hoist the given set of operations above the reference operation.
 void hoistOpsBefore(Operation *refOp,
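The new canUseArg parameter defaults to rejecting every block argument, which keeps the previous behavior; callers opt in by supplying a predicate. A hypothetical call site (the values and the predicate are invented):

    // Hypothetical: also allow hoisted ops to use refOp's block arguments.
    llvm::SetVector<Operation *> toHoist;
    bool ok = getDominatingValueSetOpsToHoist(
        domInfo, refOp, values, toHoist, isPureScalarOp,
        [&](BlockArgument arg) { return arg.getOwner() == refOp->getBlock(); });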

lib/Analysis/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@ add_triton_library(TritonAnalysis
   TritonGPUTableGen
   TritonGPUAttrDefsIncGen
   TritonGPUTypeInterfacesIncGen
+  TritonGPUOpInterfacesIncGen
 
   LINK_LIBS PUBLIC
   MLIRAnalysis

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 12 additions & 4 deletions
@@ -1158,6 +1158,14 @@ SharedMemoryObject::getMaskSpanOffsets(triton::gpu::MemDescType srcTy) {
   if (allocShape == shape) {
     return 0;
   }
+  if (auto paddedEncoding = dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(
+          srcTy.getEncoding())) {
+    // The mask is used to fold the constant part of a memory address into
+    // an immediate operand. A padded layout adds extra address computation
+    // between the main offset computation and the actual memory access,
+    // breaking that constant folding, so return a full mask to disable it.
+    return ~uint64_t(0);
+  }
   auto totalLl = triton::gpu::toLinearLayout(allocShape, srcTy.getEncoding());
   auto dimNames = standardOutDimNames(ctx, shape.size());
   // Remove the kBlock dimension
@@ -1194,14 +1202,15 @@ Value SharedMemoryObject::getShmemOffset(Location loc, RewriterBase &rewriter,
     return b.i32_val(0);
   }
 
+  LinearLayout ll;
   // We return the offset without the padding. The padding will be added in
   // the lowering.
   if (auto paddedSharedEncoding =
           dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(
               srcTy.getEncoding())) {
-    auto allocShape64 = srcTy.getAllocShape();
-    SmallVector<unsigned> allocShape(allocShape64.begin(), allocShape64.end());
-    return LLVM::linearize(rewriter, loc, offsets, allocShape);
+    ll = paddedSharedEncoding.getLinearComponent();
+  } else {
+    ll = triton::gpu::toLinearLayout(srcTy);
   }
 
   auto dimNames = standardOutDimNames(ctx, offsets.size());
@@ -1210,7 +1219,6 @@ Value SharedMemoryObject::getShmemOffset(Location loc, RewriterBase &rewriter,
     logicalOffsets.push_back({dim, offset});
   }
 
-  LinearLayout ll = triton::gpu::toLinearLayout(srcTy);
   ll = ll.sublayout({str_attr("offset")}, dimNames);
   auto offset =
       applyLinearLayout(loc, rewriter, ll.invert(), logicalOffsets)[0].second;
