[TritonGEN]: Use GenISA block store if the sub group size is not equal to 16. (#4743)

chengjunlu · etiotto · whitneywhtsang · web-flow · commit b361546b81ec · 2025-07-18T16:06:06.000-04:00
The SPV block store interface only support sub group size = 16.
Use GenISA block store if the sub group size is not equal to 16.

---------

Signed-off-by: Lu,Chengjun &lt;chengjun.lu@intel.com&gt;
Signed-off-by: Tiotto, Ettore &lt;ettore.tiotto@intel.com&gt;
Co-authored-by: Tiotto, Ettore &lt;ettore.tiotto@intel.com&gt;
Co-authored-by: Whitney Tsang &lt;whitney.tsang@intel.com&gt;
diff --git a/test/Conversion/intel/tritongpu_to_llvm_intel_advanced_path.mlir b/test/Conversion/intel/tritongpu_to_llvm_intel_advanced_path.mlir
@@ -1,6 +1,6 @@
 // RUN: env TRITON_INTEL_ADVANCED_PATH=1 triton-opt %s --convert-triton-intel-gpu-to-llvm --convert-tritongen-to-llvm --split-input-file | FileCheck %s
 
-module attributes {"ttig.support_sg_2d_block", "ttig.support_dpas", "ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 1 : i32} {
+module attributes {"ttig.support_sg_2d_block", "ttig.support_dpas", "ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32} {
   // CHECK-DAG: llvm.func spir_funccc @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv8_sDv8_iDv8_fi(i32, vector<8xi16>, vector<8xi32>, vector<8xf32>, i32) -> vector<8xf32> attributes {convergent, memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>, no_unwind, will_return}
   // CHECK-DAG: llvm.func spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv(i32, i32, i32, i32, !llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {no_unwind, will_return}
   // CHECK-DAG: llvm.func spir_funccc @_Z41__spirv_Subgroup2DBlockLoadTransformINTELiiiiPU3AS1viiiDv2_iPv(i32, i32, i32, i32, !llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {no_unwind, will_return}
diff --git a/test/TritonGEN/tritongen-2Dblockstore-to-llvm.mlir b/test/TritonGEN/tritongen-2Dblockstore-to-llvm.mlir
@@ -1,5 +1,6 @@
 // RUN: triton-opt -convert-tritongen-to-llvm -split-input-file %s | FileCheck %s
 
+module attributes {"ttg.threads-per-warp" = 16 : i32} {
 llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<8xi16>) {
   // CHECK:    [[ONE0:%.*]] = llvm.mlir.constant(1 : i32) : i32
   // CHECK:    [[PTR:%.*]] = llvm.ptrtoint %arg0 : !llvm.ptr<1> to i64
@@ -25,11 +26,11 @@ llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base
   triton_gen.2Dblockstore %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=8, tile_width=8, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi16>)
   llvm.return
 }
+}
 
 // -----
 
-// CHECK: llvm.func spir_funccc @_Z33__spirv_Subgroup2DBlockStoreINTELiiiiPvPU3AS1viiiDv2_i(i32, i32, i32, i32, !llvm.ptr {llvm.nonnull, llvm.readonly}, !llvm.ptr<1> {llvm.nonnull, llvm.writeonly}, i32, i32, i32, vector<2xi32>) attributes {no_unwind, will_return}
-
+module attributes {"ttg.threads-per-warp" = 16 : i32} {
 llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<8xi8>) {
   // CHECK:     llvm.func @triton_gen.2Dblockstore(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: vector<8xi8>) {
   // CHECK:       [[C8:%.*]] = llvm.mlir.constant(8 : i32) : i32
@@ -61,7 +62,7 @@ llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base
   triton_gen.2Dblockstore %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=8, tile_width=16, tile_height=8, v_blocks=1, cache_control=L1UC_L3UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi8>)
   llvm.return
 }
-
+}
 // -----
 
 llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<8xi16>) {
@@ -78,6 +79,7 @@ llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base
 
 // -----
 
+module attributes {"ttg.threads-per-warp" = 16 : i32} {
 llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<8xi16>) {
   // CHECK-COUNT-2: llvm.mlir.constant(1 : i32) : i32
   // CHECK:         [[ElemSize:%.*]] = llvm.mlir.constant(1 : i32) : i32
@@ -88,6 +90,7 @@ llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base
   triton_gen.2Dblockstore %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi16>)
   llvm.return
 }
+}
 
 // -----
 
@@ -133,6 +136,7 @@ llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base
 
 // -----
 
+module attributes {"ttg.threads-per-warp" = 16 : i32} {
 llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<8xi16>) {
   // CHECK:       llvm.mlir.constant(2 : i32) : i32
   // CHECK:       [[ElemSize:%.*]] = llvm.mlir.constant(2 : i32) : i32
@@ -143,6 +147,7 @@ llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base
   triton_gen.2Dblockstore %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=16, tile_width=16, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi16>)
   llvm.return
 }
+}
 
 // -----
 
@@ -188,6 +193,7 @@ llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base
 
 // -----
 
+module attributes {"ttg.threads-per-warp" = 16 : i32} {
 llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<8xi32>) {
   // CHECK:       llvm.mlir.constant(4 : i32) : i32
   // CHECK:       [[ElemSize:%.*]] = llvm.mlir.constant(4 : i32) : i32
@@ -198,3 +204,4 @@ llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base
   triton_gen.2Dblockstore %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=32, tile_width=16, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>)
   llvm.return
 }
+}
diff --git a/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp b/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp
@@ -181,16 +181,10 @@ static bool isSPVBuiltinAvailable(TritonGEN::Matrix2DBlockStoreOp op) {
       op.getTileWidth() == 8 && op.getVBlocks() == 1)
     return false;
 
-  // FIXME: The following signatures have correctness issue with SPV interface.
-
-  // intel_sub_group_2d_block_write_8b_1r32x1c
-  if (op.getElemSizeInBits() == 8 && op.getTileHeight() == 1 &&
-      op.getTileWidth() == 32 && op.getVBlocks() == 1)
-    return false;
-
-  // intel_sub_group_2d_block_write_16b_2r16x1c
-  if (op.getElemSizeInBits() == 16 && op.getTileHeight() == 2 &&
-      op.getTileWidth() == 16 && op.getVBlocks() == 1)
+  // FIXME: The SPV block store only support subgroup size 16.
+  int subGroupSize = triton::gpu::TritonGPUDialect::getThreadsPerWarp(
+      op->getParentOfType<mlir::ModuleOp>());
+  if (subGroupSize != 16)
     return false;
 
   return true;