
Commit 6cb654f

Enable DPAS when sub-group-size=32 on GPU arch Xe+ and later. (#4869)
This PR enables DPAS (Dot Product Accumulate Systolic) when sub-group-size=32 on GPU by introducing a new environment variable, TRITON_INTEL_ENABLE_DPAS_FOR_WARP_SIZE_32. When this variable is set, DPAS operations are allowed with warp sizes of 16 or 32 threads instead of being restricted to the minimum sub-group size.

Key changes:
- Adds environment variable support to conditionally enable DPAS for warp size 32
- Modifies the DPAS analysis logic to support larger warp sizes when the flag is enabled
- Updates the threads-per-warp setting logic to respect the new environment variable

Signed-off-by: Lu,Chengjun <[email protected]>
1 parent 5c914b6 commit 6cb654f
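
In short, the gating rule this commit introduces (distilled from the DPAS.cpp change below) can be summarized by the following standalone C++ sketch. It is illustrative only; the function name canUseDPASFor is not part of the actual code.

#include <cassert>

enum class Result { True, False };

// Illustrative sketch of the decision: with the flag set, DPAS is accepted
// for warp sizes of 16 or 32 on Xe+ parts; otherwise the warp size must
// match the device's minimum sub-group size.
Result canUseDPASFor(unsigned threadsPerWarp, unsigned minSGSize,
                     bool enableWarp32ViaEnv) {
  assert((minSGSize == 8 || minSGSize == 16 || minSGSize == 32) &&
         "unexpected minimum sub-group size");
  if (enableWarp32ViaEnv && minSGSize != 8)
    return (threadsPerWarp == 16 || threadsPerWarp == 32) ? Result::True
                                                          : Result::False;
  return (threadsPerWarp == minSGSize) ? Result::True : Result::False;
}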

File tree

include/triton/Tools/Sys/GetEnv.hpp
test/TritonIntelGPU/accelerate-matmul-pvc.mlir
third_party/intel/lib/Analysis/DPAS.cpp
third_party/intel/lib/TritonAnnotateModule/TritonAnnotateModule.cpp

4 files changed: +59 −20 lines

include/triton/Tools/Sys/GetEnv.hpp

Lines changed: 1 addition & 0 deletions
@@ -48,6 +48,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "TRITON_INTEL_AGGRESSIVE_DPAS_REUSE",
     "TRITON_INTEL_DO_NOT_SINK_INSTR_ACROSS_RGN",
     "TRITON_INTEL_ENABLE_BLOCK_IO_ALL_LAYOUTS",
+    "TRITON_INTEL_ENABLE_DPAS_FOR_WARP_SIZE_32",
     "TRITON_INTEL_ENABLE_FIRST_LOAD_TO_SLM",
     "TRITON_INTEL_ENABLE_INSTR_SCHED",
     "TRITON_INTEL_FAST_MATH",

test/TritonIntelGPU/accelerate-matmul-pvc.mlir

Lines changed: 20 additions & 1 deletion
@@ -1,4 +1,4 @@
-// RUN: env TRITON_INTEL_DECOMPOSE_SCALED_BLOCKED=1 triton-opt %s -split-input-file --tritonintelgpu-accelerate-matmul | FileCheck %s
+// RUN: env TRITON_INTEL_ENABLE_DPAS_FOR_WARP_SIZE_32=1 TRITON_INTEL_DECOMPOSE_SCALED_BLOCKED=1 triton-opt %s -split-input-file --tritonintelgpu-accelerate-matmul | FileCheck %s

 // CHECK: #[[$DPAS:.+]] = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 1], repCluster = [4, 1], A = [32, 16], B = [16, 16], C = [32, 16]}>
 // CHECK: #[[$DPAS_1:.+]] = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 1], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>

@@ -368,3 +368,22 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
     tt.return
   }
 }
+
+// -----
+
+// CHECK: #[[$DPAS:.+]] = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 32, warpsPerCTA = [1, 1], repCluster = [4, 1], A = [32, 8], B = [8, 16], C = [32, 16]}>
+#blocked = #ttg.blocked<{sizePerThread = [4, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 32 : i32, "ttig.min_sg_size" = 16 : i32, "ttig.support_dpas"} {
+  // CHECK-LABEL: dpas_sub_group_size_32
+  tt.func @dpas_sub_group_size_32(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}) {
+    %zero_f32 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #blocked>
+    %a = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>
+    %b = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>
+
+    // CHECK: tt.dot {{.*}}, {{.*}}, {{.*}}, inputPrecision = tf32 : tensor<128x128xf32, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>> * tensor<128x16xf32, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 1}>> -> tensor<128x16xf32, #[[$DPAS]]>
+    %result = tt.dot %a, %b, %zero_f32, inputPrecision = tf32 : tensor<128x128xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<128x16xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x16xf32, #blocked>
+    %result_ptr = tt.splat %arg0 : !tt.ptr<f32> -> tensor<128x16x!tt.ptr<f32>, #blocked>
+    tt.store %result_ptr, %result : tensor<128x16x!tt.ptr<f32>, #blocked>
+    tt.return
+  }
+}

third_party/intel/lib/Analysis/DPAS.cpp

Lines changed: 12 additions & 0 deletions
@@ -2,6 +2,7 @@
 #include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
+#include <triton/Tools/Sys/GetEnv.hpp>
 #include <type_traits>

 namespace mlir::triton::gpu::intel {

@@ -66,6 +67,17 @@ DPASAnalysis::canUseDPAS(FunctionOpInterface funcOp) const {
   unsigned minSGSize = mod->getAttrOfType<IntegerAttr>(
                               TritonIntelGPUDialect::getMinSGSizeAttrName())
                            .getInt();
+  bool enableWarp32 =
+      tools::getBoolEnv("TRITON_INTEL_ENABLE_DPAS_FOR_WARP_SIZE_32");
+  assert((minSGSize == 8 || minSGSize == 16 || minSGSize == 32) &&
+         "Unexpected minimum subgroup size");
+
+  if (enableWarp32 && minSGSize != 8) {
+    // We can support threads_per_warp=16 or 32 on Xe+ and later architectures.
+    return (threadsPerWarp == 16 || threadsPerWarp == 32) ? Result::True
+                                                          : Result::False;
+  }
+
   return (threadsPerWarp == minSGSize) ? Result::True : Result::False;
 }

third_party/intel/lib/TritonAnnotateModule/TritonAnnotateModule.cpp

Lines changed: 26 additions & 19 deletions
@@ -1,6 +1,7 @@
 #include "intel/include/Analysis/DPAS.h"
 #include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"
 #include "intel/include/TritonAnnotateModule/Passes.h"
+#include <triton/Tools/Sys/GetEnv.hpp>

 namespace mlir::triton::gpu::intel {
 #define GEN_PASS_DEF_TRITONANNOTATEMODULE

@@ -53,25 +54,31 @@ struct TritonAnnotateModule
   void setThreadsPerWarp(ModuleOp &mod,
                          const DPASAnalysis &dpasAnalysis) const {
     Builder builder(mod);
-    mod.walk([&](FunctionOpInterface funcOp) {
-      // FIXME: DPAS lowering only implemented for 16 threads per warp, i.e.,
-      // DPAS is not used for devices like ATS.
-      constexpr unsigned supportedThreadsPerWarp = 16;
-      if (minSGSize != supportedThreadsPerWarp)
-        return WalkResult::interrupt();
-
-      if (dpasAnalysis.canUseDPAS(funcOp) == DPASAnalysis::Result::Maybe) {
-        // Set the threads per warp attribute to allow dot operation to be
-        // lowered to DPAS instructions.
-        mod->setAttr(AttrNumThreadsPerWarp,
-                     builder.getI32IntegerAttr(minSGSize));
-        assert(dpasAnalysis.canUseDPAS(funcOp) == DPASAnalysis::Result::True &&
-               "DPASAnalysis should report that dot operations can be "
-               "lowered to DPAS instructions");
-        return WalkResult::interrupt();
-      }
-      return WalkResult::advance();
-    });
+
+    bool enableWarp32 = mlir::triton::tools::getBoolEnv(
+        "TRITON_INTEL_ENABLE_DPAS_FOR_WARP_SIZE_32");
+    if (!enableWarp32) {
+      mod.walk([&](FunctionOpInterface funcOp) {
+        // DPAS lowering only implemented for 16 threads per warp, i.e., DPAS is
+        // not used for devices like ATS.
+        constexpr unsigned supportedThreadsPerWarp = 16;
+        if (minSGSize != supportedThreadsPerWarp)
+          return WalkResult::interrupt();
+
+        if (dpasAnalysis.canUseDPAS(funcOp) == DPASAnalysis::Result::Maybe) {
+          // Set the threads per warp attribute to allow dot operation to be
+          // lowered to DPAS instructions.
+          mod->setAttr(AttrNumThreadsPerWarp,
+                       builder.getI32IntegerAttr(minSGSize));
+          assert(dpasAnalysis.canUseDPAS(funcOp) ==
+                     DPASAnalysis::Result::True &&
+                 "DPASAnalysis should report that dot operations can be "
+                 "lowered to DPAS instructions");
+          return WalkResult::interrupt();
+        }
+        return WalkResult::advance();
+      });
+    }

     // If the threads per warp attribute was not set, use the option value.
     if (!mod->hasAttr(AttrNumThreadsPerWarp))
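
The practical effect of the new guard: when the environment variable is set, setThreadsPerWarp no longer pins threads-per-warp to the minimum sub-group size, and the value falls through to the pass option. A rough model of the resulting selection (illustrative only; names such as threadsPerWarpOption are placeholders, not the actual pass option):

// Simplified sketch of how the warp size ends up being chosen after this change.
unsigned pickThreadsPerWarp(bool enableWarp32, unsigned minSGSize,
                            bool dpasMaybeUsable,
                            unsigned threadsPerWarpOption) {
  if (!enableWarp32 && minSGSize == 16 && dpasMaybeUsable)
    return minSGSize;            // legacy path: force the minimum sub-group size
  return threadsPerWarpOption;   // otherwise honor the requested warp size
}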
