
Commit ac96e7f

Merge OpenAI Triton commit 2ad519c (#5126)
This PR changes the Triton base from 6fa1dd6 to 2ad519c (Sep 10). Pass rate: 98.8%.
2 parents: d6b921e + f3ed6e8 · commit: ac96e7f

File tree: 26 files changed, +1467 −322 lines

README.md

Lines changed: 1 addition & 0 deletions

@@ -232,6 +232,7 @@ See [`python/triton/knobs.py`](python/triton/knobs.py) for the full list of conf
 - `TRITON_F32_DEFAULT` sets the default input precision of `tl.dot` when using 32-bit floats, which can be either `ieee`, `tf32`, or `tf32x3`.
 - `TRITON_FRONT_END_DEBUGGING=1` disables exception wrapping when an error occurs in the compiler frontend, allowing the full stack trace to be seen.
 - `TRITON_DISABLE_LINE_INFO=1` removes all line information from the module.
+- `PTXAS_OPTIONS` passes additional command-line options to the PTX assembler `ptxas` (only on NVIDIA).

 > [!NOTE]
 > Some of these environment variables don't have a knob in `knobs.py` -- those are only relevant to the C++ layer(s), hence they don't exist in the Python layer.
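
As a hedged usage sketch for the new `PTXAS_OPTIONS` variable (setting it from a C++ host process rather than the shell, and the `-v` flag itself, are illustrative assumptions, not part of this commit):

#include <cstdlib>

int main() {
  // Illustrative only: "-v" asks ptxas to report per-kernel resource
  // usage; any valid ptxas command-line option can be forwarded here.
  setenv("PTXAS_OPTIONS", "-v", /*overwrite=*/1);
  // ... Triton compilation launched from this process would pick it up ...
  return 0;
}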

include/triton/Dialect/Triton/IR/TritonOpInterfaces.td

Lines changed: 7 additions & 1 deletion

@@ -49,7 +49,12 @@ def DotOpInterface : OpInterface<"DotOpInterface"> {
         /*retType=*/"::mlir::Value",
         /*methodName=*/"getB",
         /*args=*/(ins)>,
-      InterfaceMethod<
+      InterfaceMethod<
+          /*desc=*/"Get the output tensor",
+          /*retType=*/"::mlir::Value",
+          /*methodName=*/"getD",
+          /*args=*/(ins)>,
+      InterfaceMethod<
         /*desc=*/"Verify the dimensions of the A and B DotOp operands.",
         /*retType=*/"bool",
         /*methodName=*/"verifyDims",
@@ -64,6 +69,7 @@ def DotOpInterface : OpInterface<"DotOpInterface"> {
     auto aTy = cast<ShapedType>($_op.getA().getType());
     auto bTy = cast<ShapedType>($_op.getB().getType());
     auto cTy = cast<ShapedType>($_op->getOperand(2).getType());
+    auto dTy = cast<ShapedType>($_op.getD().getType());
     auto aShape = aTy.getShape();
     auto bShape = bTy.getShape();
     auto cShape = cTy.getShape();
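
The new `getD` accessor exposes the dot op's result tensor alongside the existing `getA`/`getB` methods. A minimal sketch of client code, assuming an op implementing `DotOpInterface` in the `mlir::triton` namespace (the surrounding pass logic is illustrative, not from this commit):

// Illustrative only: query the output type through the interface.
if (auto dotOp = llvm::dyn_cast<mlir::triton::DotOpInterface>(op)) {
  auto dTy = llvm::cast<mlir::ShapedType>(dotOp.getD().getType());
  auto cTy = llvm::cast<mlir::ShapedType>(op->getOperand(2).getType());
  // The result D is expected to match the accumulator C's shape.
  assert(dTy.getShape() == cTy.getShape());
}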

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 7 additions & 0 deletions

@@ -135,6 +135,13 @@ LinearLayout chooseScaledMfmaScaleLayout(MLIRContext *ctx, int dotOperandIdx,
                                          ArrayRef<unsigned> tilesPerWarp,
                                          ArrayRef<unsigned> warpsPerCTA);

+LinearLayout getSM120DotScaledScaleLayout(MLIRContext *ctx, int dotOperandIdx,
+                                          ArrayRef<int64_t> dotOperandShape,
+                                          ArrayRef<unsigned> tilesPerWarp,
+                                          ArrayRef<unsigned> warpsPerCTA,
+                                          unsigned instrM, unsigned instrN,
+                                          CTALayoutAttr ctaLayoutAttr);
+
 // Create LinearLayout for nvidia mma tile.
 LinearLayout nvidiaMmaTile(MLIRContext *ctx, ArrayRef<unsigned> tileShape,
                            unsigned kWidth, ArrayRef<unsigned> order,

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 4 additions & 1 deletion

@@ -40,7 +40,10 @@ bool isPureScalarOp(Operation *op);
 bool getDominatingValueSetOpsToHoist(
     DominanceInfo &domInfo, Operation *refOp, ArrayRef<Value> valueSet,
     llvm::SetVector<Operation *> &toHoist,
-    function_ref<bool(Operation *)> canHoist = isPureScalarOp);
+    function_ref<bool(Operation *)> canHoist = isPureScalarOp,
+    function_ref<bool(BlockArgument)> canUseArg = [](BlockArgument) {
+      return false;
+    });

 // Hoist the given set of operations above the reference operation.
 void hoistOpsBefore(Operation *refOp,
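
The new `canUseArg` callback lets a caller whitelist block arguments that hoisted ops may depend on; the default rejects every argument, preserving the previous behavior. A hedged sketch of a call site (the enclosing loop, the value set, and the induction-variable policy are assumptions made for illustration):

// Illustrative only: additionally allow hoisted ops to reference the
// enclosing scf.for's induction variable.
llvm::SetVector<mlir::Operation *> toHoist;
bool ok = getDominatingValueSetOpsToHoist(
    domInfo, refOp, valueSet, toHoist, isPureScalarOp,
    [&](mlir::BlockArgument arg) { return arg == forOp.getInductionVar(); });
// If `ok`, `toHoist` can then be moved above `refOp` via hoistOpsBefore
// (declared above; full signature not shown in this diff).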

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 77 additions & 0 deletions

@@ -1405,6 +1405,83 @@ LinearLayout chooseDsReadB64TrLayout(Attribute enc, ArrayRef<int64_t> shape,
   return chooseDotDsReadB64TrLayout(dot, shape, elemBitWidth);
 }

+// Warp-level block scaling (sm_120, m16n8k32)
+// Reference: NVIDIA PTX ISA "Warp-level block scaling"
+// https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-block-scaling
+//
+// Semantics:
+//   D = (A * SF_A) * (B * SF_B) + C
+//   scale_vec::1X -> SF_A shape Mx1 (per-row), SF_B shape 1xN (per-col)
+//
+// Providers (within each warp quad of 4 lanes):
+//   - A scales are provided by a lane-pair selected by thread-id-a ∈ {0,1}
+//     (0 => lanes {0,1}, 1 => lanes {2,3} in the quad).
+//   - B scales are provided by a single lane selected by thread-id-b ∈
+//     {0,1,2,3}.
+//
+// Byte selectors (which subfield of the 32-bit metadata is used):
+//   - 1X: 1 byte => byte-id ∈ {0,1,2,3}
+//
+// Implementation notes:
+//   - We support only scale_vec::1X for now.
+//   - We choose a fixed provider for A (thread-id-a = 0) and B
+//     (thread-id-b = 0).
+//   - In this implementation, each lane in a quad has the same scale factor.
+LinearLayout getSM120DotScaledScaleLayout(
+    MLIRContext *ctx, int dotOperandIdx, ArrayRef<int64_t> dotOperandShape,
+    ArrayRef<unsigned> tilesPerWarp, ArrayRef<unsigned> warpsPerCTA,
+    unsigned mmaInstrM, unsigned mmaInstrN, CTALayoutAttr ctaLayoutAttr) {
+  unsigned rank = dotOperandShape.size();
+  auto outDims = standardOutDimNames(ctx, rank);
+
+  StringAttr kRegister = StringAttr::get(ctx, "register");
+  StringAttr kLane = StringAttr::get(ctx, "lane");
+  StringAttr kWarp = StringAttr::get(ctx, "warp");
+
+  const unsigned mIndex = 0;
+  const unsigned nIndex = 1;
+  const int instrM = mmaInstrM;
+  const int instrN = mmaInstrN;
+  const int kSize = dotOperandShape[1];
+  const int mWarps = warpsPerCTA[mIndex];
+  const int nWarps = warpsPerCTA[nIndex];
+  const int totalWarps = mWarps * nWarps;
+  const unsigned mRep_warp = tilesPerWarp[mIndex];
+  const unsigned nRep_warp = tilesPerWarp[nIndex];
+  const unsigned kRep = std::min<unsigned>(kSize, 2);
+
+  std::vector<std::vector<int32_t>> registerBase;
+  std::vector<std::vector<int32_t>> laneBase;
+  std::vector<std::vector<int32_t>> warpBase;
+  if (dotOperandIdx == 0) { // per-row A-scale
+    laneBase = {{0, 8}, {0, 0}, {0, 1}, {0, 2}, {0, 4}};
+    for (int offset = instrM * mWarps; offset < instrM * mWarps * mRep_warp;
+         offset <<= 1)
+      registerBase.push_back({0, offset});
+    for (int w = mWarps; w < totalWarps; w <<= 1)
+      warpBase.push_back({0, 0});
+    for (int offset = instrM; offset < instrM * mWarps; offset <<= 1)
+      warpBase.push_back({0, offset});
+  } else { // per-col B-scale
+    laneBase = {{0, 0}, {0, 0}, {0, 1}, {0, 2}, {0, 4}};
+    if (nRep_warp > 1)
+      registerBase.push_back({0, nWarps * instrN});
+    for (int k = 1; k < kRep; k += 1)
+      registerBase.push_back({1 << (k - 1), 0});
+    for (int offset = instrN; offset < instrN * nWarps; offset <<= 1)
+      warpBase.push_back({0, offset});
+    for (int w = nWarps; w < totalWarps; w <<= 1)
+      warpBase.push_back({0, 0});
+  }
+
+  const unsigned kIdx = (dotOperandShape[0] == 1) ? 0 : 1;
+  const unsigned mnIdx = 1 - kIdx;
+  LinearLayout ctaLayout(
+      {{kRegister, registerBase}, {kLane, laneBase}, {kWarp, warpBase}},
+      {outDims[kIdx], outDims[mnIdx]});
+  return combineCtaCgaWithShape(ctaLayout, ctaLayoutAttr, dotOperandShape);
+}
+
 LinearLayout chooseScaledMfmaScaleLayout(MLIRContext *ctx, int dotOperandIdx,
                                          ArrayRef<int64_t> dotOperandShape,
                                          unsigned mfmaMDim,
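
For orientation, a hedged example of constructing an A-scale layout with the new helper (every concrete value below is an assumption made for the sketch, not something this commit pins down):

// Illustrative only: A-scale (dotOperandIdx == 0) for a 128x128x128 dot
// using the sm_120 m16n8k32 instruction, a 2x2 warp grid, 1 tile per warp.
// With scale_vec::1X, A carries one scale per row per 32-wide K slice,
// so the assumed scale-operand shape is M x (K / 32) = 128 x 4.
SmallVector<int64_t> scaleShape = {128, 4};
SmallVector<unsigned> tilesPerWarp = {1, 1};
SmallVector<unsigned> warpsPerCTA = {2, 2};
LinearLayout aScaleLayout = getSM120DotScaledScaleLayout(
    ctx, /*dotOperandIdx=*/0, scaleShape, tilesPerWarp, warpsPerCTA,
    /*mmaInstrM=*/16, /*mmaInstrN=*/8, ctaLayoutAttr);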
