
Commit 2d934e7

Merge commit '31baa6d284b6a52ba464bfc260bc9395e66a4ac8'
2 parents: e510e90 + 31baa6d

37 files changed: +1137 / -234 lines changed

include/triton/Dialect/Triton/IR/TritonOpInterfaces.td

Lines changed: 7 additions & 1 deletion
@@ -49,7 +49,12 @@ def DotOpInterface : OpInterface<"DotOpInterface"> {
         /*retType=*/"::mlir::Value",
         /*methodName=*/"getB",
         /*args=*/(ins)>,
-      InterfaceMethod<
+      InterfaceMethod<
+        /*desc=*/"Get the output tensor",
+        /*retType=*/"::mlir::Value",
+        /*methodName=*/"getD",
+        /*args=*/(ins)>,
+      InterfaceMethod<
         /*desc=*/"Verify the dimensions of the A and B DotOp operands.",
         /*retType=*/"bool",
         /*methodName=*/"verifyDims",
@@ -64,6 +69,7 @@ def DotOpInterface : OpInterface<"DotOpInterface"> {
     auto aTy = cast<ShapedType>($_op.getA().getType());
     auto bTy = cast<ShapedType>($_op.getB().getType());
     auto cTy = cast<ShapedType>($_op->getOperand(2).getType());
+    auto dTy = cast<ShapedType>($_op.getD().getType());
     auto aShape = aTy.getShape();
    auto bShape = bTy.getShape();
    auto cShape = cTy.getShape();
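The new getD accessor lets the default verifyDims body fetch the output type alongside A, B, and C, presumably so the dimensions of D can be validated as well. As a rough standalone illustration of that contract (a toy model, not MLIR or Triton code; all names and shapes below are invented), the output of a dot should be MxN when A is MxK and B is KxN:

#include <cstdio>

// Toy model of the extended interface: a, b, d stand in for the tensors
// returned via getA()/getB()/getD(), and verifyDims() checks that D is MxN
// when A is MxK and B is KxN.
struct Shape2D { int rows, cols; };

struct ToyDot {
  Shape2D a, b, d;
  bool verifyDims() const {
    return a.cols == b.rows && d.rows == a.rows && d.cols == b.cols;
  }
};

int main() {
  ToyDot ok{{16, 32}, {32, 8}, {16, 8}};   // D matches A's M and B's N
  ToyDot bad{{16, 32}, {32, 8}, {16, 16}}; // D's N disagrees with B's N
  std::printf("ok: %d  bad: %d\n", ok.verifyDims(), bad.verifyDims());
  return 0;
}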

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 8 additions & 0 deletions
@@ -19,6 +19,7 @@ class AMDRotatingSharedEncodingAttr;
 class AMDMfmaEncodingAttr;
 class TensorOrMemDesc;
 class MemDescType;
+class CTALayoutAttr;
 
 // - BlockedEncodingAttrs have the following input dimensions.
 //
@@ -126,6 +127,13 @@ LinearLayout chooseScaledMfmaScaleLayout(MLIRContext *ctx, int dotOperandIdx,
                                          ArrayRef<unsigned> tilesPerWarp,
                                          ArrayRef<unsigned> warpsPerCTA);
 
+LinearLayout getSM120DotScaledScaleLayout(MLIRContext *ctx, int dotOperandIdx,
+                                          ArrayRef<int64_t> dotOperandShape,
+                                          ArrayRef<unsigned> tilesPerWarp,
+                                          ArrayRef<unsigned> warpsPerCTA,
+                                          unsigned instrM, unsigned instrN,
+                                          CTALayoutAttr ctaLayoutAttr);
+
 // Create LinearLayout for nvidia mma tile.
 LinearLayout nvidiaMmaTile(MLIRContext *ctx, ArrayRef<unsigned> tileShape,
                            unsigned kWidth, ArrayRef<unsigned> order,

lib/Conversion/TritonGPUToLLVM/HistogramOpToLLVM.cpp

Lines changed: 18 additions & 20 deletions
@@ -25,7 +25,6 @@ static SmallVector<Value> computeWarpLevelHistogram(
   int numBits = llvm::Log2_64(numBins);
   int numBitsLaneId = llvm::Log2_64(numThreadPerWarp);
   unsigned numElementsPerThreads = getTotalElemsPerThread(srcType);
-  unsigned numThreadWithUniqueData = getThreadsPerWarp(srcType)[0];
   // The histogram is distributed across threads, each thread owns `numBins /
   // numThreadPerWarp` bins.
   SmallVector<Value> warpLevelHistogram(numBins / numThreadPerWarp, zero);
@@ -43,10 +42,6 @@ static SmallVector<Value> computeWarpLevelHistogram(
       numThreadPerWarp == 32 ? 0xFFFFFFFF : 0xFFFFFFFFFFFFFFFF;
   Value fullMask = b.int_val(numThreadPerWarp, fullMaskValue);
   Value mask = fullMask;
-  // If not all threads have unique data, mask out the redundant ones.
-  if (numThreadWithUniqueData < numThreadPerWarp) {
-    mask = b.int_val(numThreadPerWarp, (1ULL << numThreadWithUniqueData) - 1);
-  }
   for (int i = 0; i < numBitsLaneId; i++) {
     Value updateMask =
         b.select(b.icmp_ne(b.and_(threadId, b.i32_val(1 << i)), zero),
@@ -96,8 +91,6 @@ static SmallVector<Value> computeCrossWarpHistogram(
     Value threadId, int numWarps) {
   auto b = TritonLLVMOpBuilder(loc, rewriter);
   SmallVector<Value> histogramValues;
-  unsigned numWarpsWithUniqueData = mlir::triton::gpu::getWarpsPerCTA(
-      srcType.getEncoding(), srcType.getShape())[0];
   Value laneId = b.and_(threadId, b.i32_val(numThreadPerWarp - 1));
   // Initialize the shared memory with zeros.
   int64_t numElementPerThread =
@@ -112,19 +105,6 @@ static SmallVector<Value> computeCrossWarpHistogram(
   }
   b.barrier();
   Block *afterAtomics = nullptr;
-  // If some warps have replicated data we need to skip those warps when
-  // accumulating.
-  if (numWarpsWithUniqueData < numWarps) {
-    Block *currentBlock = rewriter.getInsertionBlock();
-    afterAtomics =
-        rewriter.splitBlock(currentBlock, rewriter.getInsertionPoint());
-    Block *atomicBlock = rewriter.createBlock(afterAtomics);
-    rewriter.setInsertionPointToEnd(currentBlock);
-    Value cond = b.icmp_ult(
-        threadId, b.i32_val(numWarpsWithUniqueData * numThreadPerWarp));
-    rewriter.create<LLVM::CondBrOp>(loc, cond, atomicBlock, afterAtomics);
-    rewriter.setInsertionPointToStart(atomicBlock);
-  }
   // Apply atomic add to update the histogram in shared memory.
   for (int i = 0; i < warpLevelHistogram.size(); ++i) {
     Value warpLevelHistogramValue = warpLevelHistogram[i];
@@ -209,6 +189,24 @@ struct HistogramOpConversion
         loc, rewriter, srcType, baseSharedMemPtr, warpLevelHistogram, numBins,
         numThreadsPerWarp, innerDimIndices, threadId, numWarps);
 
+    // Depending on the layout, some threads may have duplicate data. We can
+    // account for this by calculating a "replication factor" and dividing the
+    // results by it to avoid overcounting.
+    auto replicationFactor = numWarps * numThreadsPerWarp;
+    auto threadsPerWarp = getThreadsPerWarp(srcType);
+    auto warpsPerCTA =
+        getWarpsPerCTA(srcType.getEncoding(), srcType.getShape());
+    replicationFactor /= std::accumulate(
+        threadsPerWarp.begin(), threadsPerWarp.end(), 1, std::multiplies<>());
+    replicationFactor /= std::accumulate(warpsPerCTA.begin(), warpsPerCTA.end(),
+                                         1, std::multiplies<>());
+
+    auto b = TritonLLVMOpBuilder(loc, rewriter);
+    for (auto i = 0; i < histogramValue.size(); ++i) {
+      histogramValue[i] =
+          b.sdiv(histogramValue[i], b.i32_val(replicationFactor));
+    }
+
     Value results = packLLElements(loc, typeConverter, histogramValue, rewriter,
                                    op.getType());
     rewriter.replaceOp(op, results);
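To see why the final division recovers the correct counts, here is a self-contained sketch of the replication-factor arithmetic with assumed values (the configuration below is illustrative, not taken from this commit):

#include <cstdio>

int main() {
  // Assumed configuration: a CTA of 4 warps x 32 threads, but the source
  // layout places unique data on only 1 warp (32 threads); the other warps
  // hold copies, so every element is atomically added 4 times.
  unsigned numWarps = 4, numThreadsPerWarp = 32;
  unsigned uniqueThreadsPerWarp = 32; // product of getThreadsPerWarp(srcType)
  unsigned uniqueWarpsPerCTA = 1;     // product of getWarpsPerCTA(enc, shape)
  unsigned replicationFactor = (numWarps * numThreadsPerWarp) /
                               (uniqueThreadsPerWarp * uniqueWarpsPerCTA);
  // Each histogram bin is then divided by this factor, as in the loop above.
  std::printf("replicationFactor = %u\n", replicationFactor); // prints 4
  return 0;
}

When every thread and warp holds unique data the two products equal numThreadsPerWarp and numWarps, the factor is 1, and the division is a no-op.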

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 0 additions & 6 deletions
@@ -1339,12 +1339,6 @@ AMDWmmaEncodingAttr::verify(function_ref<mlir::InFlightDiagnostic()> emitError,
   if (version != 1 && version != 2) {
     return emitError() << "WMMA version must be in the [1, 2] range";
   }
-  // Transposed layout is needed for bypassing LDS between multiple dots.
-  // Version 1 tt.dot results and tt.dot operand layouts are different,
-  // therefore we test and support transposed only for version 2.
-  if (version != 2 && isTransposed) {
-    return emitError() << "Transposed WMMA is supported only for version 2";
-  }
   return success();
 }
 

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 77 additions & 0 deletions
@@ -1407,6 +1407,83 @@ LinearLayout chooseDsReadB64TrLayout(Attribute enc, ArrayRef<int64_t> shape,
   return chooseDotDsReadB64TrLayout(dot, shape, elemBitWidth);
 }
 
+// Warp-level block scaling (sm_120, m16n8k32)
+// Reference: NVIDIA PTX ISA "Warp-level block scaling"
+// https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-block-scaling
+//
+// Semantics:
+//   D = (A * SF_A) * (B * SF_B) + C
+//   scale_vec::1X -> SF_A shape Mx1 (per-row), SF_B shape 1xN (per-col)
+//
+// Providers (within each warp quad of 4 lanes):
+//   - A scales are provided by a lane-pair selected by thread-id-a ∈ {0,1}
+//     (0 => lanes {0,1}, 1 => lanes {2,3} in the quad).
+//   - B scales are provided by a single lane selected by thread-id-b ∈
+//     {0,1,2,3}.
+//
+// Byte selectors (which subfield of the 32-bit metadata is used):
+//   - 1X: 1 byte => byte-id ∈ {0,1,2,3}
+//
+// Implementation notes:
+//   - We support only scale_vec::1X for now.
+//   - We choose a fixed provider for A (thread-id-a = 0) and B (thread-id-b =
+//     0)
+//   - In this implementation, each lane in a quad has the same scale factor.
+LinearLayout getSM120DotScaledScaleLayout(
+    MLIRContext *ctx, int dotOperandIdx, ArrayRef<int64_t> dotOperandShape,
+    ArrayRef<unsigned> tilesPerWarp, ArrayRef<unsigned> warpsPerCTA,
+    unsigned mmaInstrM, unsigned mmaInstrN, CTALayoutAttr ctaLayoutAttr) {
+  unsigned rank = dotOperandShape.size();
+  auto outDims = standardOutDimNames(ctx, rank);
+
+  StringAttr kRegister = StringAttr::get(ctx, "register");
+  StringAttr kLane = StringAttr::get(ctx, "lane");
+  StringAttr kWarp = StringAttr::get(ctx, "warp");
+
+  const unsigned mIndex = 0;
+  const unsigned nIndex = 1;
+  const int instrM = mmaInstrM;
+  const int instrN = mmaInstrN;
+  const int kSize = dotOperandShape[1];
+  const int mWarps = warpsPerCTA[mIndex];
+  const int nWarps = warpsPerCTA[nIndex];
+  const int totalWarps = mWarps * nWarps;
+  const unsigned mRep_warp = tilesPerWarp[mIndex];
+  const unsigned nRep_warp = tilesPerWarp[nIndex];
+  const unsigned kRep = std::min<unsigned>(kSize, 2);
+
+  std::vector<std::vector<int32_t>> registerBase;
+  std::vector<std::vector<int32_t>> laneBase;
+  std::vector<std::vector<int32_t>> warpBase;
+  if (dotOperandIdx == 0) { // per-row A-scale
+    laneBase = {{0, 8}, {0, 0}, {0, 1}, {0, 2}, {0, 4}};
+    for (int offset = instrM * mWarps; offset < instrM * mWarps * mRep_warp;
+         offset <<= 1)
+      registerBase.push_back({0, offset});
+    for (int w = mWarps; w < totalWarps; w <<= 1)
+      warpBase.push_back({0, 0});
+    for (int offset = instrM; offset < instrM * mWarps; offset <<= 1)
+      warpBase.push_back({0, offset});
+  } else { // per-col B-scale
+    laneBase = {{0, 0}, {0, 0}, {0, 1}, {0, 2}, {0, 4}};
+    if (nRep_warp > 1)
+      registerBase.push_back({0, nWarps * instrN});
+    for (int k = 1; k < kRep; k += 1)
+      registerBase.push_back({1 << (k - 1), 0});
+    for (int offset = instrN; offset < instrN * nWarps; offset <<= 1)
+      warpBase.push_back({0, offset});
+    for (int w = nWarps; w < totalWarps; w <<= 1)
+      warpBase.push_back({0, 0});
+  }
+
+  const unsigned kIdx = (dotOperandShape[0] == 1) ? 0 : 1;
+  const unsigned mnIdx = 1 - kIdx;
+  LinearLayout ctaLayout(
+      {{kRegister, registerBase}, {kLane, laneBase}, {kWarp, warpBase}},
+      {outDims[kIdx], outDims[mnIdx]});
+  return combineCtaCgaWithShape(ctaLayout, ctaLayoutAttr, dotOperandShape);
+}
+
 LinearLayout chooseScaledMfmaScaleLayout(MLIRContext *ctx, int dotOperandIdx,
                                          ArrayRef<int64_t> dotOperandShape,
                                          unsigned mfmaMDim,
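For intuition about the basis tables above, the following standalone sketch evaluates the per-row A-scale laneBase the way such a layout is typically applied, assuming the convention that each set bit of an input index XORs in its basis vector (this is a reading of the code, not code from the commit). The all-zero basis for lane bit 1 means lanes that differ only in that bit map to the same scale element.

#include <array>
#include <cstdio>
#include <vector>

int main() {
  // {k, m} contribution per lane-id bit 0..4, copied from the A-scale branch
  // of getSM120DotScaledScaleLayout.
  std::vector<std::array<int, 2>> laneBase = {
      {0, 8}, {0, 0}, {0, 1}, {0, 2}, {0, 4}};
  for (int lane = 0; lane < 32; ++lane) {
    int k = 0, m = 0;
    for (int bit = 0; bit < (int)laneBase.size(); ++bit) {
      if (lane & (1 << bit)) { // XOR-combine the basis of every set bit
        k ^= laneBase[bit][0];
        m ^= laneBase[bit][1];
      }
    }
    std::printf("lane %2d -> (k=%d, m=%d)\n", lane, k, m);
  }
  return 0;
}

Because every m contribution is a distinct power of two, the XORs act like sums here: the 32 lanes cover rows 0 through 15, each row held by the two lanes that differ only in bit 1, and the register/warp bases then translate that 16-row pattern across M tiles and warps.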

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 6 additions & 0 deletions
@@ -908,6 +908,12 @@ LogicalResult WarpSpecializeOp::verify() {
                      "cannot be nested inside another `ttg.warp_specialize` op");
   }
 
+  std::optional<int> numWarps = maybeLookupNumWarps(*this);
+  if (numWarps && *numWarps % 4 != 0) {
+    return mlir::emitError(getLoc()) << "warp-specialized kernels requires "
+                                        "num_warps to be a multiple of 4";
+  }
+
   return success();
 }
 
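A quick sanity check of the new rule, written as a standalone sketch that mirrors the added condition rather than the Triton verifier itself: only multiples of 4 pass, and an unknown num_warps is not rejected.

#include <cstdio>
#include <optional>

// Mirrors the added check: reject only when num_warps is known and not a
// multiple of 4 (maybeLookupNumWarps may return no value).
static bool numWarpsOkForWarpSpecialize(std::optional<int> numWarps) {
  return !(numWarps && *numWarps % 4 != 0);
}

int main() {
  for (int n : {1, 2, 4, 6, 8, 12})
    std::printf("num_warps=%d -> %s\n", n,
                numWarpsOkForWarpSpecialize(n) ? "ok" : "error");
  std::printf("num_warps unset -> %s\n",
              numWarpsOkForWarpSpecialize(std::nullopt) ? "ok" : "error");
  return 0;
}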
