[TritonGEN] Use the module's sub-group size instead of the hard-coded value of 16 in block load. (#4764)

chengjunlu · whitneywhtsang · web-flow · commit 92b6a324205d · 2025-07-24T11:18:12.000+08:00
[TritonGEN] Use the module's sub-group size instead of the hard-coded
value of 16 in block load.

---------

Signed-off-by: Lu,Chengjun <chengjun.lu@intel.com>
Co-authored-by: Whitney Tsang <whitney.tsang@intel.com>
diff --git a/test/TritonGEN/tritongen-invalid.mlir b/test/TritonGEN/tritongen-invalid.mlir
@@ -145,19 +145,23 @@ llvm.func @triton_gen.dpas(%c : vector<8xf32>, %a : vector<8xi16>, %b : vector<8
 
 // -----
 
+module attributes {"ttg.threads-per-warp" = 16 : i32} {
 llvm.func @matrix_2Dblockload(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
   // expected-error @+1 {{'triton_gen.2Dblockload' op result size of 256 bits does not match the expected size of 128 bits}}
   %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<16xi16>
   llvm.return
 }
+}
 
 // -----
 
+module attributes {"ttg.threads-per-warp" = 16 : i32} {
 llvm.func @matrix_2Dblockload(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
   // expected-error @+1 {{'triton_gen.2Dblockload' op transpose and vnni_transform are mutually exclusive}}
   %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=16, tile_height=8, v_blocks=1, transpose=true, vnni_transform=true, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<2xi32>
   llvm.return
 }
+}
 
 // -----
 
@@ -177,19 +181,23 @@ llvm.func @matrix_2Dblockload(%ptr : !llvm.ptr, %base_width : i32, %base_height
 
 // -----
 
+module attributes {"ttg.threads-per-warp" = 16 : i32} {
 llvm.func @matrix_2Dblockload(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
   // expected-error @+1 {{'triton_gen.2Dblockload' op transpose is only supported for 32 and 64 bit elements}}
   %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=1, transpose=true, vnni_transform=false, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<8xi16>
   llvm.return
 }
+}
 
 // -----
 
+module attributes {"ttg.threads-per-warp" = 16 : i32} {
 llvm.func @matrix_2Dblockload(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
   // expected-error @+1 {{'triton_gen.2Dblockload' op vnni_transform is only supported for 8 and 16 bit elements}}
   %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=8, tile_height=8, v_blocks=1, transpose=false, vnni_transform=true, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<4xi32>
   llvm.return
 }
+}
 
 // -----
 
@@ -316,11 +324,13 @@ llvm.func @matrix_2Dblockload(%ptr : !llvm.ptr, %base_width : i32, %base_height
 
 // -----
 
+module attributes {"ttg.threads-per-warp" = 16 : i32} {
 llvm.func @matrix_2Dblockload(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
   // expected-error @+1 {{'triton_gen.2Dblockload' op expecting result element type to be 32 bits}}
   %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=8, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<8xi16>
   llvm.return
 }
+}
 
 // -----
 
diff --git a/test/TritonGEN/tritongen.mlir b/test/TritonGEN/tritongen.mlir
@@ -1,5 +1,6 @@
 // RUN: triton-opt %s -split-input-file -verify-diagnostics | FileCheck %s
 
+module attributes {"ttg.threads-per-warp" = 16 : i32} {
 llvm.func @triton_gen.barrier() {
   // CHECK-LABEL: triton_gen.barrier
   // CHECK: triton_gen.barrier {mem_fence = Local}
@@ -75,3 +76,4 @@ llvm.func @triton_gen.sub_group_block_write(%ptr : !llvm.ptr<3>, %val : i32) {
   triton_gen.sub_group_block_write %ptr, %val : !llvm.ptr<3>, i32
   llvm.return
 }
+}
diff --git a/third_party/intel/lib/Dialect/TritonGEN/IR/TritonGENOps.cpp b/third_party/intel/lib/Dialect/TritonGEN/IR/TritonGENOps.cpp
@@ -13,6 +13,8 @@
 #include "llvm/ADT/STLExtras.h"
 #include <cstdint>
 
+#include "triton/Dialect/TritonGPU/IR/Dialect.h"
+
 using namespace mlir;
 using namespace mlir::triton;
 
@@ -238,7 +240,8 @@ verify2DBlockLoadHWRestriction(TritonGEN::Matrix2DBlockLoadOp op) {
   VectorType resTy = op.getRes().getType();
   unsigned resElemTySize = resTy.getElementType().getIntOrFloatBitWidth();
   unsigned resSize = resTy.getNumElements() * resElemTySize;
-  constexpr unsigned subgroupSize = 16;
+  unsigned subgroupSize = triton::gpu::TritonGPUDialect::getThreadsPerWarp(
+      op->getParentOfType<mlir::ModuleOp>());
   unsigned expectedSize = op.getElemSizeInBits() * op.getTileHeight() *
                           op.getTileWidth() * op.getVBlocks() / subgroupSize;
   if (resSize != expectedSize)