Limit the minimum repeatCount value to make sure the number of elements of DPAS operand A is enough to be distributed to each logical lane. (#4787)

chengjunlu · web-flow · commit c97bd76a24c8 · 2025-08-01T12:53:14.000+08:00
The number of elements of the matrix for dot operand A may be less than
the sub-group size.
E.g., for the case: repCount = 1, systolic depth = 8, ops per channel = 1
and threads_per_warp = 16.

This is not supported by the IGC scalar backend. We have to raise the
minimum repCount to 2 for that case, padding the operand as needed.

Signed-off-by: Lu,Chengjun <chengjun.lu@intel.com>
diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp
@@ -746,6 +746,24 @@ LogicalResult DotOperandEncodingAttr::verify(
           return emitError() << "ttg.dot_op kWidth parameter must match the "
                                 "parent's opsPerChannel";
       }
+
+      unsigned repeatCount = parentAttr.getRepeatCount();
+      unsigned systolicDepth = parentAttr.getSystolicDepth();
+      unsigned threadsPerWarp = parentAttr.getThreadsPerWarp();
+      // OpsPerChannel: 1 is for float32 type. 2 is for f16/bf16 type. 4 is for
+      // i8 type, where 2 i8 elements are packed into i16. The number of packed
+      // elements per row for A operand is respectively: 8, 16, 16.
+      unsigned numPackedElemPerRowForA =
+          opsPerChannel == 1 ? systolicDepth : systolicDepth * 2;
+      if (repeatCount * numPackedElemPerRowForA < threadsPerWarp)
+        return emitError()
+               << "The DPAS encoding implies an invalid layout for A "
+                  "operand. The non-uniform matrix A could not be "
+                  "referred in kernel with threadsPerWarp: "
+               << threadsPerWarp
+               << ". numPackedElemPerRowForA:" << numPackedElemPerRowForA
+               << ". RC:" << repeatCount << ", systolicDepth:" << systolicDepth
+               << ", opsPerChan:" << opsPerChannel;
     } else {
       // operand B
       if (kWidth != parentAttr.getOpsPerChannel())
diff --git a/test/TritonIntelGPU/tritonintelgpu-invalid.mlir b/test/TritonIntelGPU/tritonintelgpu-invalid.mlir
@@ -214,3 +214,14 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     tt.return
   }
 }
+
+// -----
+
+#dpas = #ttig.dpas<{repeatCount = 1, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
+// expected-error @below {{The DPAS encoding implies an invalid layout for A operand. The non-uniform matrix A could not be referred in kernel}}
+#dot_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
+
+// -----
+
+// expected-error @below {{threadsPerWarp could not be smaller than the execution size}}
+#dpas = #ttig.dpas<{repeatCount = 1, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 8, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
diff --git a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
@@ -291,7 +291,7 @@ LogicalResult DpasEncodingAttr::verify(
     ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError,
     unsigned repeatCount, unsigned systolicDepth, unsigned executionSize,
     unsigned opsPerChan, ::llvm::ArrayRef<unsigned> warpsPerCTA,
-    ::llvm::ArrayRef<unsigned> repCluster, unsigned sugGroupSize) {
+    ::llvm::ArrayRef<unsigned> repCluster, unsigned subGroupSize) {
   if (repeatCount > 8 || repeatCount < 1) {
     return emitError() << "repeatCount must be in the range [1, 8], but was:"
                        << repeatCount;
@@ -311,6 +311,12 @@ LogicalResult DpasEncodingAttr::verify(
                        << repCluster.size();
   }
 
+  if (subGroupSize < executionSize) {
+    return emitError() << "threadsPerWarp could not be smaller than the "
+                          "execution size. got subGroupSize:"
+                       << subGroupSize << ", executionSize:" << executionSize;
+  }
+
   return success();
 }
 
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/AccelerateMatmul.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/AccelerateMatmul.cpp
@@ -126,6 +126,12 @@ class BlockedToDPAS : public OpRewritePattern<tt::DotOp> {
     unsigned repeatCount =
         std::min(dpasCap.repeatCount, (unsigned)retShape[rank - 2] /*M*/);
     unsigned threadsPerWarp = ttg::TritonGPUDialect::getThreadsPerWarp(mod);
+    unsigned numElemsPerRowForA =
+        opsPerChan == 1
+            ? dpasCap.systolicDepth
+            : dpasCap.systolicDepth * 2; // A is packed to i16 or i32.
+    unsigned minM = mlir::ceil<unsigned>(threadsPerWarp, numElemsPerRowForA);
+    repeatCount = std::max(repeatCount, minM);
     auto dpasEnc = ttgi::DpasEncodingAttr::get(
         oldRetType.getContext(), repeatCount, dpasCap.systolicDepth,
         dpasCap.executionSize, opsPerChan, warpsPerTile, repCluster,