Add i64 type to TritonGEN 2D block I/O operations because the hardware supports a 64-bit data size (#4935)

chengjunlu · etiotto · web-flow · commit 01d1b3645487 · 2025-08-27T09:12:37.000Z
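This PR adds support for the 64-bit integer (i64) data type to the TritonGEN
2D block I/O operations, aligning them with the hardware, which supports
64-bit data sizes. It also raises the verifier limit on
elem_size_in_bits * tile_width * v_blocks from 512 to 1024.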

---------

Signed-off-by: Lu,Chengjun <chengjun.lu@intel.com>
Signed-off-by: Ettore Tiotto <ettore.tiotto@intel.com>
Co-authored-by: Ettore Tiotto <ettore.tiotto@intel.com>
diff --git a/test/Conversion/intel/tritongpu_to_llvm_intel_advanced_path_invalid.mlir b/test/Conversion/intel/tritongpu_to_llvm_intel_advanced_path_invalid.mlir
@@ -1,42 +1,42 @@
 // RUN: env TRITON_INTEL_ADVANCED_PATH=1 triton-opt %s --convert-triton-intel-gpu-to-llvm --verify-diagnostics --split-input-file
 
 module attributes {"ttig.support_sg_2d_block", "ttig.support_dpas", "ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32} {
-  tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<f32>, %arg1: i64, %arg2: i32) {
+  tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<i64>, %arg1: i64, %arg2: i32) {
     %c1_i64 = arith.constant 1 : i64
     %c0_i32 = arith.constant 0 : i32
-    %22 = tt.make_tensor_ptr %arg0, [%arg1, %arg1], [%arg1, %c1_i64], [%arg2, %c0_i32] {order = array<i32: 1, 0>} : <tensor<2x32xf32>>
-    // expected-error @+2 {{expecting elem_size_in_bits * tile_width * v_blocks <= 512}}
+    %22 = tt.make_tensor_ptr %arg0, [%arg1, %arg1], [%arg1, %c1_i64], [%arg2, %c0_i32] {order = array<i32: 1, 0>} : <tensor<2x32xi64>>
+    // expected-error @+2 {{expecting elem_size_in_bits * tile_width * v_blocks <= 1024}}
     // expected-error @+1 {{failed to legalize operation 'ttig.prefetch'}}
-    ttig.prefetch %22 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : !tt.ptr<tensor<2x32xf32>>
+    ttig.prefetch %22 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : !tt.ptr<tensor<2x32xi64>>
     tt.return
   }
 }
 
 // -----
 
 module attributes {"ttig.support_sg_2d_block", "ttig.support_dpas", "ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32} {
-  tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<f32>, %arg1: i64, %arg2: i32) {
+  tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<i64>, %arg1: i64, %arg2: i32) {
     %c1_i64 = arith.constant 1 : i64
     %c0_i32 = arith.constant 0 : i32
-    %22 = tt.make_tensor_ptr %arg0, [%arg1, %arg1], [%arg1, %c1_i64], [%arg2, %c0_i32] {order = array<i32: 1, 0>} : <tensor<2x32xf32>>
-    // expected-error @+2 {{expecting elem_size_in_bits * tile_width * v_blocks <= 512}}
+    %22 = tt.make_tensor_ptr %arg0, [%arg1, %arg1], [%arg1, %c1_i64], [%arg2, %c0_i32] {order = array<i32: 1, 0>} : <tensor<2x32xi64>>
+    // expected-error @+2 {{expecting elem_size_in_bits * tile_width * v_blocks <= 1024}}
     // expected-error @+1 {{failed to legalize operation 'tt.load'}}
-    %res = tt.load %22 {DotIdx = 0 : i32, boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<2x32xf32>>
+    %res = tt.load %22 {DotIdx = 0 : i32, boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<2x32xi64>>
     tt.return
   }
 }
 
 // -----
 
 module attributes {"ttig.support_sg_2d_block", "ttig.support_dpas", "ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32} {
-  tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<f32>, %arg1: i64, %arg2: i32) {
+  tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<i64>, %arg1: i64, %arg2: i32) {
     %c1_i64 = arith.constant 1 : i64
     %c0_i32 = arith.constant 0 : i32
-    %cst = arith.constant dense<0.000000e+00> : tensor<2x32xf32>
-    %22 = tt.make_tensor_ptr %arg0, [%arg1, %arg1], [%arg1, %c1_i64], [%arg2, %c0_i32] {order = array<i32: 1, 0>} : <tensor<2x32xf32>>
-    // expected-error @+2 {{expecting elem_size_in_bits * tile_width * v_blocks <= 512}}
+    %cst = arith.constant dense<0> : tensor<2x32xi64>
+    %22 = tt.make_tensor_ptr %arg0, [%arg1, %arg1], [%arg1, %c1_i64], [%arg2, %c0_i32] {order = array<i32: 1, 0>} : <tensor<2x32xi64>>
+    // expected-error @+2 {{expecting elem_size_in_bits * tile_width * v_blocks <= 1024}}
     // expected-error @+1 {{failed to legalize operation 'tt.store'}}
-    tt.store %22, %cst {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<2x32xf32>>
+    tt.store %22, %cst {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<2x32xi64>>
     tt.return
   }
 }
diff --git a/test/TritonGEN/tritongen-2Dblockload-to-llvm.mlir b/test/TritonGEN/tritongen-2Dblockload-to-llvm.mlir
@@ -801,3 +801,18 @@ llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_
   llvm.return
 }
 }
+
+// -----
+
+module attributes {"ttg.threads-per-warp" = 16 : i32} {
+llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
+  // CHECK:      llvm.mlir.constant(8 : i32) : i32
+  // CHECK:      [[ElemSize:%.*]] = llvm.mlir.constant(8 : i32) : i32
+  // CHECK-NEXT: [[TileWidth:%.*]] = llvm.mlir.constant(8 : i32) : i32
+  // CHECK-NEXT: [[TileHeight:%.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK-NEXT: [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32
+  // CHECK-NEXT: llvm.call spir_funccc @_Z32__spirv_Subgroup2DBlockLoadINTELiiiiPU3AS1viiiDv2_iPv([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], {{.*}}, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (i32, i32, i32, i32, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>, !llvm.ptr{{.*}}) -> ()
+  %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=64, tile_width=8, tile_height=4, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<2xi64>
+  llvm.return
+}
+}
diff --git a/test/TritonGEN/tritongen-2Dblockstore-to-llvm.mlir b/test/TritonGEN/tritongen-2Dblockstore-to-llvm.mlir
@@ -193,3 +193,18 @@ llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base
   llvm.return
 }
 }
+
+// -----
+
+module attributes {"ttg.threads-per-warp" = 16 : i32} {
+llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<2xi64>) {
+  // CHECK:       llvm.mlir.constant(0 : i32) : i32
+  // CHECK:       [[ElemSize:%.*]] = llvm.mlir.constant(8 : i32) : i32
+  // CHECK-DAG:   [[TileWidth:%.*]] = llvm.mlir.constant(8 : i32) : i32
+  // CHECK-DAG:   [[TileHeight:%.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK-DAG:   [[VBlocks:%.*]] = llvm.mlir.constant(1 : i32) : i32
+  // CHECK-NEXT:  llvm.call spir_funccc @_Z33__spirv_Subgroup2DBlockStoreINTELiiiiPvPU3AS1viiiDv2_i([[ElemSize]], [[TileWidth]], [[TileHeight]], [[VBlocks]], [[DEST:%.*]], {{.*}}, %arg2, %arg3, {{.*}}) {{.*}} : (i32, i32, i32, i32, !llvm.ptr{{.*}}, !llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> ()
+  triton_gen.2Dblockstore %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits = 64, tile_width = 8, tile_height = 4, v_blocks = 1, cache_control = Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<2xi64>)
+  llvm.return
+}
+}
diff --git a/test/TritonGEN/tritongen-invalid.mlir b/test/TritonGEN/tritongen-invalid.mlir
@@ -174,8 +174,8 @@ llvm.func @matrix_2Dblockload(%ptr : !llvm.ptr, %base_width : i32, %base_height
 // -----
 
 llvm.func @matrix_2Dblockload(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
-  // expected-error @+1 {{'triton_gen.2Dblockload' op expecting elem_size_in_bits * tile_width * v_blocks <= 512}}
-  %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=4, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<32xi16>
+  // expected-error @+1 {{'triton_gen.2Dblockload' op expecting elem_size_in_bits * tile_width * v_blocks <= 1024}}
+  %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=32, tile_height=8, v_blocks=4, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<32xi16>
   llvm.return
 }
 
@@ -501,8 +501,8 @@ llvm.func @matrix_2Dblockprefetch(%ptr : !llvm.ptr, %base_width : i32, %base_hei
 // -----
 
 llvm.func @matrix_2Dblockprefetch(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
-  // expected-error @+1 {{'triton_gen.2Dblockprefetch' op expecting elem_size_in_bits * tile_width * v_blocks <= 512}}
-  triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=32, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32)
+  // expected-error @+1 {{'triton_gen.2Dblockprefetch' op expecting elem_size_in_bits * tile_width * v_blocks <= 1024}}
+  triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=64, tile_width=32, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32)
   llvm.return
 }
 
diff --git a/test/TritonGEN/tritongen.mlir b/test/TritonGEN/tritongen.mlir
@@ -45,21 +45,33 @@ llvm.func @triton_gen.cache_controls(%arg0: !llvm.ptr) {
 llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
   // CHECK:      llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) {
   // CHECK-NEXT:   %0 = triton_gen.2Dblockload %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 {elem_size_in_bits = 16, tile_width = 16, tile_height = 16, v_blocks = 1, transpose = false, vnni_transform = false, cache_control = Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<16xf16>
+  // CHECK-NEXT:   %1 = triton_gen.2Dblockload %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 {elem_size_in_bits = 32, tile_width = 16, tile_height = 16, v_blocks = 1, transpose = false, vnni_transform = false, cache_control = Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<16xf32>
+  // CHECK-NEXT:   %2 = triton_gen.2Dblockload %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 {elem_size_in_bits = 64, tile_width = 8, tile_height = 16, v_blocks = 1, transpose = false, vnni_transform = false, cache_control = Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<8xi64>
   %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=16, tile_height=16, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<16xf16>
+  %1 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=16, tile_height=16, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<16xf32>
+  %2 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=64, tile_width=8, tile_height=16, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<8xi64>
   llvm.return
 }
 
-llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<16xf32>) {
-  // CHECK:      llvm.func @triton_gen.2Dblockstore(%arg0: !llvm.ptr, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: vector<16xf32>) {
-  // CHECK-NEXT:   triton_gen.2Dblockstore %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6 {elem_size_in_bits = 32, tile_width = 16, tile_height = 8, v_blocks = 1, cache_control = Default} : (!llvm.ptr, i32, i32, i32, i32, i32, vector<16xf32>)
-  triton_gen.2Dblockstore %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=32, tile_width=16, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32, vector<16xf32>)
+llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val1 : vector<16xf16>, %stored_val2 : vector<16xf32>, %stored_val3 : vector<8xi64>) {
+  // CHECK:      llvm.func @triton_gen.2Dblockstore(%arg0: !llvm.ptr, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: vector<16xf16>, %arg7: vector<16xf32>, %arg8: vector<8xi64>) {
+  // CHECK-NEXT:   triton_gen.2Dblockstore %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6 {elem_size_in_bits = 16, tile_width = 16, tile_height = 8, v_blocks = 1, cache_control = Default} : (!llvm.ptr, i32, i32, i32, i32, i32, vector<16xf16>)
+  // CHECK-NEXT:   triton_gen.2Dblockstore %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg7 {elem_size_in_bits = 32, tile_width = 16, tile_height = 8, v_blocks = 1, cache_control = Default} : (!llvm.ptr, i32, i32, i32, i32, i32, vector<16xf32>)
+  // CHECK-NEXT:   triton_gen.2Dblockstore %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg8 {elem_size_in_bits = 64, tile_width = 8, tile_height = 8, v_blocks = 1, cache_control = Default} : (!llvm.ptr, i32, i32, i32, i32, i32, vector<8xi64>)
+  triton_gen.2Dblockstore %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val1 {elem_size_in_bits=16, tile_width=16, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32, vector<16xf16>)
+  triton_gen.2Dblockstore %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val2 {elem_size_in_bits=32, tile_width=16, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32, vector<16xf32>)
+  triton_gen.2Dblockstore %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val3 {elem_size_in_bits=64, tile_width=8, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32, vector<8xi64>)
   llvm.return
 }
 
 llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
   // CHECK:      llvm.func @triton_gen.2Dblockprefetch(%arg0: !llvm.ptr, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) {
+  // CHECK-NEXT:    triton_gen.2Dblockprefetch %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 {elem_size_in_bits = 16, tile_width = 8, tile_height = 8, v_blocks = 1, cache_control = Default} : (!llvm.ptr, i32, i32, i32, i32, i32)
   // CHECK-NEXT:    triton_gen.2Dblockprefetch %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 {elem_size_in_bits = 32, tile_width = 8, tile_height = 8, v_blocks = 1, cache_control = Default} : (!llvm.ptr, i32, i32, i32, i32, i32)
+  // CHECK-NEXT:    triton_gen.2Dblockprefetch %arg0, %arg1, %arg2, %arg3, %arg4, %arg5 {elem_size_in_bits = 64, tile_width = 8, tile_height = 8, v_blocks = 1, cache_control = Default} : (!llvm.ptr, i32, i32, i32, i32, i32)
+  triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=8, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32)
   triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=8, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32)
+  triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=64, tile_width=8, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32)
   llvm.return
 }
 
diff --git a/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENOps.td b/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENOps.td
@@ -154,7 +154,7 @@ def TritonGEN_MatrixDPASOp : TritonGEN_Op<"dpas">,
 }
 
 def TritonGEN_Matrix2DBlockLoadOp : TritonGEN_Op<"2Dblockload">,
-  Results<(outs FixedVectorOfNonZeroRankOf<[TritonGEN_MatrixElemType]>:$res)>,
+  Results<(outs FixedVectorOfNonZeroRankOf<[TritonGEN_MatrixElemType, AnyI64]>:$res)>,
   Arguments<(ins
     Arg<LLVM_AnyPointer, "", [MemRead]>:$ptr,
     I32:$base_width,
@@ -180,6 +180,7 @@ def TritonGEN_Matrix2DBlockLoadOp : TritonGEN_Op<"2Dblockload">,
       $base_width, $base_height, $base_pitch - the shape of matrix
       $x, $y, $tile_width, $tile_height - the starting offsets and shape of the tile to load
       $elem_size_in_bits - the size in bits of the matrix element
+        - 64 for f64, i64
         - 32 for f32, bf32
         - 16 for f16, int16, bf16
         - 8 for int8, int4, int2
@@ -217,7 +218,7 @@ def TritonGEN_Matrix2DBlockStoreOp : TritonGEN_Op<"2Dblockstore">,
     I32Attr:$tile_width,
     I32Attr:$tile_height,
     I32Attr:$v_blocks,
-    FixedVectorOfNonZeroRankOf<[TritonGEN_MatrixElemType]>:$stored_val,
+    FixedVectorOfNonZeroRankOf<[TritonGEN_MatrixElemType, AnyI64]>:$stored_val,
     DefaultValuedAttr<TritonGEN_StoreCacheControl, "::mlir::triton::TritonGEN::StoreCacheControl::DEFAULT">:$cache_control
   )> {
 
@@ -230,6 +231,7 @@ def TritonGEN_Matrix2DBlockStoreOp : TritonGEN_Op<"2Dblockstore">,
       $base_width, $base_height, $base_pitch - the shape of the matrix
       $x, $y, $tile_width, $tile_height - the starting offsets and shape of the tile to store
       $elem_size_in_bits - the size in bits of the matrix element
+        - 64 for f64, i64
         - 32 for f32, bf32
         - 16 for f16, int16, bf16
         - 8 for int8, int4, int2
@@ -274,6 +276,7 @@ def TritonGEN_Matrix2DBlockPrefetchOp : TritonGEN_Op<"2Dblockprefetch">,
       $base_width, $base_height, $base_pitch - the shape of the matrix
     $x, $y, $tile_width, $tile_height - the starting offsets and shape of tile to prefetch
     $elem_size_in_bits - the size in bits of the matrix element
+      - 64 for f64, i64
       - 32 for f32, bf32
       - 16 for f16, int16, bf16
       - 8 for int8, int4, int2
diff --git a/third_party/intel/lib/Dialect/TritonGEN/IR/TritonGENOps.cpp b/third_party/intel/lib/Dialect/TritonGEN/IR/TritonGENOps.cpp
@@ -102,9 +102,9 @@ template <typename Op> static LogicalResult verify2DBlockHWRestriction(Op op) {
 
   uint32_t tileWidth = op.getTileWidth();
   uint32_t vBlocks = op.getVBlocks();
-  if (elemSizeInBits * tileWidth * vBlocks > 512)
+  if (elemSizeInBits * tileWidth * vBlocks > 1024)
     return op->emitOpError(
-        "expecting elem_size_in_bits * tile_width * v_blocks <= 512");
+        "expecting elem_size_in_bits * tile_width * v_blocks <= 1024");
 
   assert(tileWidth >= 1 && tileWidth <= 64 &&
          "tile_width should be between 1 and 64");