[FlexDecoding] Support M < 8 tt.dot with DPAS to optimize the flex decoding performance. (#4727)

chengjunlu · web-flow · commit b2fcf6482c2c · 2025-07-22T19:00:41.000-04:00
Use the M size as the repeat count if M < 8. This helps reduce the
number of redundant duplicated values in the DotOp and DPAS layouts.

Signed-off-by: Lu,Chengjun <chengjun.lu@intel.com>
diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py
@@ -3927,7 +3927,7 @@ def get_test_dot_vdot2_cases():
 
 
 def get_test_small_dots_cases():
-    if not is_cuda():
+    if not (is_cuda() or is_xpu()):
         return []
     return [(2, 4, 32, 1, False, False, 'None', 'ieee', 'float16', 'float32', 1, None),
             (1, 2, 32, 1, False, False, 'None', 'ieee', 'float8e5', 'float32', 1, None)]
@@ -6211,6 +6211,8 @@ def kernel(Out):
         dim=1, parent=DotOperandLayout(parent=MmaLayout([2, 0], [4, 1, 1], [1, 1, 1], [1, 1, 1], [2, 1, 0], [1, 16, 8]),
                                        op_idx=1, k_width=2)),
     DpasLayout(repeatCount=8, systolic_depth=8, execution_size=8, ops_per_chan=1, threads_per_warp=32,
+               warps_per_cta=[4, 1], rep_cluster=[1, 1]),
+    DpasLayout(repeatCount=2, systolic_depth=8, execution_size=8, ops_per_chan=1, threads_per_warp=32,
                warps_per_cta=[4, 1], rep_cluster=[1, 1])
 ]
 
diff --git a/test/TritonIntelGPU/accelerate-matmul-pvc.mlir b/test/TritonIntelGPU/accelerate-matmul-pvc.mlir
@@ -333,3 +333,38 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
     tt.return
   }
 }
+
+// -----
+
+// CHECK: #[[$DPAS0:.+]] = #ttig.dpas<{repeatCount = 1, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [1, 1], A = [1, 16], B = [16, 16], C = [1, 16]}>
+// CHECK: #[[$DPAS1:.+]] = #ttig.dpas<{repeatCount = 2, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [1, 1], A = [2, 16], B = [16, 16], C = [2, 16]}>
+// CHECK: #[[$DPAS2:.+]] = #ttig.dpas<{repeatCount = 4, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [1, 1], A = [4, 16], B = [16, 16], C = [4, 16]}>
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32, "ttig.min_sg_size" = 16 : i32, "ttig.support_dpas"} {
+  tt.func @M_smaller_than_8(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}) {
+    // CHECK-LABEL: M_smaller_than_8
+    %b = arith.constant dense<0.000000e+00> : tensor<128x16xf16, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>
+
+    // CHECK: tt.dot {{.*}} -> tensor<1x16xf32, #[[$DPAS0]]>
+    %a0 = arith.constant dense<0.000000e+00> : tensor<1x128xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>
+    %zero0 = arith.constant dense<0.000000e+00> : tensor<1x16xf32, #blocked>
+    %result0 = tt.dot %a0, %b, %zero0 : tensor<1x128xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<128x16xf16, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<1x16xf32, #blocked>
+    %result_ptr0 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<1x16x!tt.ptr<f32>, #blocked>
+    tt.store %result_ptr0, %result0 : tensor<1x16x!tt.ptr<f32>, #blocked>
+
+    // CHECK: tt.dot {{.*}} -> tensor<2x16xf32, #[[$DPAS1]]>
+    %a1 = arith.constant dense<0.000000e+00> : tensor<2x128xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>
+    %zero1 = arith.constant dense<0.000000e+00> : tensor<2x16xf32, #blocked>
+    %result1 = tt.dot %a1, %b, %zero1 : tensor<2x128xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<128x16xf16, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<2x16xf32, #blocked>
+    %result_ptr1 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<2x16x!tt.ptr<f32>, #blocked>
+    tt.store %result_ptr1, %result1 : tensor<2x16x!tt.ptr<f32>, #blocked>
+
+    // CHECK: tt.dot {{.*}} -> tensor<4x16xf32, #[[$DPAS2]]>
+    %a2 = arith.constant dense<0.000000e+00> : tensor<4x128xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>
+    %zero2 = arith.constant dense<0.000000e+00> : tensor<4x16xf32, #blocked>
+    %result2 = tt.dot %a2, %b, %zero2 : tensor<4x128xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<128x16xf16, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<4x16xf32, #blocked>
+    %result_ptr2 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<4x16x!tt.ptr<f32>, #blocked>
+    tt.store %result_ptr2, %result2 : tensor<4x16x!tt.ptr<f32>, #blocked>
+    tt.return
+  }
+}
diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py
@@ -67,9 +67,7 @@ def min_dot_size(device_props: dict):
     # M: repeatCount. 1,2,4,8
     # N: executionSize. 16 for PVC, 8 for ATS
     # K: systolicDepth x opsPerChan. systolicDepth must be 8
-
-    # default 8 because 1,2,4 is not supported by our backend now.
-    repeat_count = 8
+    repeat_count = 1
     sdepth = 8
     exec_size = min(device_props["sub_group_sizes"])
 
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/AccelerateMatmul.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/AccelerateMatmul.cpp
@@ -123,9 +123,11 @@ class BlockedToDPAS : public OpRewritePattern<tt::DotOp> {
     size_t rank = retShape.size();
     SmallVector<unsigned> repCluster(rank, 1);
 
+    unsigned repeatCount =
+        std::min(dpasCap.repeatCount, (unsigned)retShape[rank - 2] /*M*/);
     unsigned threadsPerWarp = ttg::TritonGPUDialect::getThreadsPerWarp(mod);
     auto dpasEnc = ttgi::DpasEncodingAttr::get(
-        oldRetType.getContext(), dpasCap.repeatCount, dpasCap.systolicDepth,
+        oldRetType.getContext(), repeatCount, dpasCap.systolicDepth,
         dpasCap.executionSize, opsPerChan, warpsPerTile, repCluster,
         threadsPerWarp);
 
@@ -157,7 +159,7 @@ class BlockedToDPAS : public OpRewritePattern<tt::DotOp> {
       repCluster[rank - 1] = repClusterDimN;
 
       dpasEnc = ttgi::DpasEncodingAttr::get(
-          oldRetType.getContext(), dpasCap.repeatCount, dpasCap.systolicDepth,
+          oldRetType.getContext(), repeatCount, dpasCap.systolicDepth,
           dpasCap.executionSize, opsPerChan, warpsPerTile, repCluster,
           threadsPerWarp);
     }