intel
diff --git a/python/triton/tools/compile.py
Lines changed: 1 addition & 1 deletion b/python/triton/tools/compile.py
Lines changed: 1 addition & 1 deletion
diff --git a/scripts/patch-pytorch.sh
Lines changed: 1 addition & 0 deletions b/scripts/patch-pytorch.sh
Lines changed: 1 addition & 0 deletions
diff --git a/scripts/patch/pytorch_global_scratch.patch
Lines changed: 13 additions & 0 deletions b/scripts/patch/pytorch_global_scratch.patch
Lines changed: 13 additions & 0 deletions
diff --git a/test/Conversion/intel/arith_to_llvm.mlir
Lines changed: 4 additions & 2 deletions b/test/Conversion/intel/arith_to_llvm.mlir
Lines changed: 4 additions & 2 deletions
diff --git a/test/Conversion/intel/dot_layout_offset.mlir
Lines changed: 2 additions & 2 deletions b/test/Conversion/intel/dot_layout_offset.mlir
Lines changed: 2 additions & 2 deletions
diff --git a/test/Conversion/intel/glue.mlir
Lines changed: 3 additions & 3 deletions b/test/Conversion/intel/glue.mlir
Lines changed: 3 additions & 3 deletions
diff --git a/test/Conversion/intel/shared_to_dot_layout_convert.mlir
Lines changed: 6 additions & 3 deletions b/test/Conversion/intel/shared_to_dot_layout_convert.mlir
Lines changed: 6 additions & 3 deletions
diff --git a/test/Conversion/intel/sub-group-shuffle.mlir
Lines changed: 14 additions & 7 deletions b/test/Conversion/intel/sub-group-shuffle.mlir
Lines changed: 14 additions & 7 deletions
diff --git a/test/Conversion/intel/tritongpu_to_gen.mlir
Lines changed: 1 addition & 1 deletion b/test/Conversion/intel/tritongpu_to_gen.mlir
Lines changed: 1 addition & 1 deletion
diff --git a/test/Conversion/intel/tritongpu_to_gen_dot.mlir
Lines changed: 2 additions & 3 deletions b/test/Conversion/intel/tritongpu_to_gen_dot.mlir
Lines changed: 2 additions & 3 deletions
@@ -183,7 +183,7 @@ def constexpr(s):
             "bin_data": ", ".join([f"0x{x}{y}" for x, y in zip(hex_[::2], hex_[1::2])]),
             "signature": ", ".join([f"{ty_to_cpp(ty)} {name}" for name, ty in zip(arg_names_not_1, arg_types_not_1)]),
             "full_signature": ", ".join([f"{ty_to_cpp(ty)} {name}" for name, ty in zip(arg_names, arg_types)]),
-            "arg_pointers": ", ".join([f"&{arg}" for arg in arg_names_not_1]),
+            "arg_pointers": ", ".join([f"&{arg}" for arg in arg_names_not_1] + ["&global_scratch"]),
             "arg_types": ", ".join(ty_to_cpp(arg) for arg in arg_types_not_1),
             "num_args": len(arg_names_not_1),
             "kernel_docstring": doc_string,
 
@@ -38,3 +38,4 @@ echo "Applying PyTorch patches in $REPO_ROOT"
 apply_patch ./patch/flex_attn_143553.patch
 apply_patch pytorch_fp64.patch
 apply_patch ./patch/Patch_torch_flex_attention_for_autotune_in_benchmark.patch
+apply_patch ./patch/pytorch_global_scratch.patch
@@ -0,0 +1,13 @@
+diff --git a/torch/_inductor/codegen/xpu/device_op_overrides.py b/torch/_inductor/codegen/xpu/device_op_overrides.py
+index 8678e30d26..93a5e50975 100644
+--- a/torch/_inductor/codegen/xpu/device_op_overrides.py
++++ b/torch/_inductor/codegen/xpu/device_op_overrides.py
+@@ -55,7 +55,7 @@ class XPUDeviceOpOverrides(DeviceOpOverrides):
+         return "void *"
+ 
+     def cpp_global_scratch(self, idx: int) -> Optional[tuple[str, str]]:
+-        return None
++        return f"void *global_scratch_{idx} = 0;", f"global_scratch_{idx}"
+ 
+ 
+ register_device_op_overrides("xpu", XPUDeviceOpOverrides())
@@ -6,7 +6,8 @@
 // CHECK-SCALAR-DAG:   llvm.func spir_funccc @_Z27__spirv_ConvertFToBF16INTELf(f32) -> i16 attributes {memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>, no_unwind, will_return}
 
 // CHECK-LABEL:   llvm.func spir_kernelcc @float_to_bfloat_conversion(
-// CHECK-SCALAR:                                             %[[VAL_0:.*]]: !llvm.struct<(f32, f32, f32, f32)>) -> !llvm.struct<(bf16, bf16, bf16, bf16)>
+// CHECK-SCALAR:                                             %[[VAL_0:.*]]: !llvm.struct<(f32, f32, f32, f32)>,
+// CHECK-SCALAR:                                             %[[PTR_1:.*]]: !llvm.ptr<1>) -> !llvm.struct<(bf16, bf16, bf16, bf16)>
 // CHECK-VECTOR:                                             %[[VAL_0:.*]]: vector<32xf32>) -> vector<32xbf16>
 module attributes {"ttig.support_sg_2d_block", "ttig.support_dpas", "ttig.support_bf16_conversion", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   tt.func @float_to_bfloat_conversion(%arg0 : tensor<512xf32, #blocked>) ->  tensor<512xbf16, #blocked>{
@@ -35,7 +36,8 @@ module attributes {"ttig.support_sg_2d_block", "ttig.support_dpas", "ttig.suppor
   }
 
 // CHECK-LABEL:   llvm.func spir_kernelcc @bfloat_to_float_conversion(
-// CHECK-SCALAR:                                             %[[VAL_0:.*]]: !llvm.struct<(bf16, bf16, bf16, bf16)>) -> !llvm.struct<(f32, f32, f32, f32)>
+// CHECK-SCALAR:                                             %[[VAL_0:.*]]: !llvm.struct<(bf16, bf16, bf16, bf16)>,
+// CHECK-SCALAR:                                             %[[PTR_1:.*]]: !llvm.ptr<1>) -> !llvm.struct<(f32, f32, f32, f32)>
   tt.func @bfloat_to_float_conversion(%arg0 : tensor<512xbf16, #blocked>) ->  tensor<512xf32, #blocked>{
 // CHECK-SCALAR:    %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(bf16, bf16, bf16, bf16)>
 // CHECK-SCALAR:    %[[VAL_3:.*]] = llvm.extractvalue %[[VAL_0]][1] : !llvm.struct<(bf16, bf16, bf16, bf16)>
 
@@ -3,7 +3,7 @@
 #dpas = #ttig.dpas<{repeatCount=8, systolicDepth=8, executionSize = 8, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA=[1, 1], repCluster=[2, 2]}>
 #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
 module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32} {
-  // CHECK-LABEL:   llvm.func spir_kernelcc @dot_layout_emit_offset()
+  // CHECK-LABEL:   llvm.func spir_kernelcc @dot_layout_emit_offset(%arg0: !llvm.ptr<1>)
   tt.func public @dot_layout_emit_offset() {
     %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf16, #dot_operand_a>
     // CHECK-COUNT-64:  {{.*}} = llvm.extractvalue {{.*}}
@@ -324,7 +324,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32}
 #dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}>
 module attributes {"ttg.num-warps" = 4 : i32, "ttg.num-ctas" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {
 
-  // CHECK-LABEL:   llvm.func spir_kernelcc @dot_layout_emit_offset()
+  // CHECK-LABEL:   llvm.func spir_kernelcc @dot_layout_emit_offset(%arg0: !llvm.ptr<1>)
   tt.func public @dot_layout_emit_offset() {
     %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf16, #dot_operand_b>
     // CHECK-COUNT-64:           {{.*}} = llvm.extractvalue {{.*}}
 
@@ -3,7 +3,7 @@
 
 module attributes {"ttig.support_sg_2d_block", "ttig.support_dpas", "ttg.num-warps" = 4 : i32} {
 // CHECK-LABEL:   llvm.func spir_kernelcc @test_scalar(
-// CHECK-SAME:                                         %[[VAL_0:.*]]: f32, %[[VAL_1:.*]]: f32, %[[VAL_2:.*]]: f32, %[[VAL_3:.*]]: f32) -> vector<4xf32>
+// CHECK-SAME:                                         %[[VAL_0:.*]]: f32, %[[VAL_1:.*]]: f32, %[[VAL_2:.*]]: f32, %[[VAL_3:.*]]: f32, %[[PTR_1:.*]]: !llvm.ptr<1>) -> vector<4xf32>
 // CHECK:           %[[VAL_8:.*]] = llvm.mlir.poison : vector<4xf32>
 // CHECK:           %[[VAL_9:.*]] = llvm.mlir.constant(0 : i32) : i32
 // CHECK:           %[[VAL_10:.*]] = llvm.insertelement %[[VAL_0]], %[[VAL_8]]{{\[}}%[[VAL_9]] : i32] : vector<4xf32>
@@ -21,7 +21,7 @@ module attributes {"ttig.support_sg_2d_block", "ttig.support_dpas", "ttg.num-war
   }
 
 // CHECK-LABEL:   llvm.func spir_kernelcc @test_vec_2(
-// CHECK-SAME:                                        %[[VAL_0:.*]]: vector<4xf32>, %[[VAL_1:.*]]: vector<4xf32>) -> vector<8xf32>
+// CHECK-SAME:                                        %[[VAL_0:.*]]: vector<4xf32>, %[[VAL_1:.*]]: vector<4xf32>, %[[PTR_1:.*]]: !llvm.ptr<1>) -> vector<8xf32>
 // CHECK:           %[[VAL_4:.*]] = llvm.shufflevector %[[VAL_0]], %[[VAL_1]] [0, 1, 2, 3, 4, 5, 6, 7] : vector<4xf32>
 // CHECK:           llvm.return %[[VAL_4]] : vector<8xf32>
 // CHECK:         }
@@ -31,7 +31,7 @@ module attributes {"ttig.support_sg_2d_block", "ttig.support_dpas", "ttg.num-war
   }
 
 // CHECK-LABEL:   llvm.func spir_kernelcc @test_vec_4(
-// CHECK-SAME:                                        %[[VAL_0:.*]]: vector<4xf32>, %[[VAL_1:.*]]: vector<4xf32>, %[[VAL_2:.*]]: vector<4xf32>, %[[VAL_3:.*]]: vector<4xf32>) -> vector<16xf32>
+// CHECK-SAME:                                        %[[VAL_0:.*]]: vector<4xf32>, %[[VAL_1:.*]]: vector<4xf32>, %[[VAL_2:.*]]: vector<4xf32>, %[[VAL_3:.*]]: vector<4xf32>, %[[PTR_1:.*]]: !llvm.ptr<1>) -> vector<16xf32>
 // CHECK:           %[[VAL_8:.*]] = llvm.shufflevector %[[VAL_0]], %[[VAL_1]] [0, 1, 2, 3, 4, 5, 6, 7] : vector<4xf32>
 // CHECK:           %[[VAL_9:.*]] = llvm.shufflevector %[[VAL_2]], %[[VAL_3]] [0, 1, 2, 3, 4, 5, 6, 7] : vector<4xf32>
 // CHECK:           %[[VAL_10:.*]] = llvm.shufflevector %[[VAL_8]], %[[VAL_9]] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : vector<8xf32>
 
@@ -8,7 +8,8 @@
 
 module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32} {
   // CHECK-LABEL: llvm.func spir_kernelcc @convert_dot(
-  // CHECK-SAME:    %[[VAL_0:.*]]: !llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16)>)
+  // CHECK-SAME:    %[[VAL_0:.*]]: !llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16)>
+  // CHECK-SAME:    %[[PTR_1:.*]]: !llvm.ptr<1>)
   // CHECK-SAME:    attributes {intel_reqd_sub_group_size = 16 : i32, {{.*}}} {
   tt.func @convert_dot(%A: tensor<128x64xf16, #blocked0>) {
     // CHECK-DAG:     %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32
@@ -44,7 +45,8 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
 
 module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32} {
   // CHECK-LABEL: llvm.func spir_kernelcc @convert_dot(
-  // CHECK-SAME:    %[[VAL_0:.*]]: !llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16)>)
+  // CHECK-SAME:    %[[VAL_0:.*]]: !llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16)>
+  // CHECK-SAME:    %[[PTR_1:.*]]: !llvm.ptr<1>)
   // CHECK-SAME:    attributes {intel_reqd_sub_group_size = 16 : i32, {{.*}}} {
   tt.func @convert_dot(%A: tensor<128x64xf16, #blocked0>) {
     // CHECK-DAG:     %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32
@@ -81,7 +83,8 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
 
 module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32} {
   // CHECK-LABEL: llvm.func spir_kernelcc @convert_dot(
-  // CHECK-SAME:    %[[VAL_1:.*]]: !llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16)>)
+  // CHECK-SAME:    %[[VAL_1:.*]]: !llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16)>
+  // CHECK-SAME:    %[[PTR_1:.*]]: !llvm.ptr<1>)
   // CHECK-SAME:    attributes {intel_reqd_sub_group_size = 16 : i32, {{.*}}} {
   tt.func @convert_dot(%B: tensor<64x256xf16, #blocked1>) {
     // CHECK-DAG:     %[[CST_128:.*]] = llvm.mlir.constant(128 : i32) : i32
 
@@ -9,7 +9,8 @@
 
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {
   // CHECK-LABEL:   llvm.func spir_kernelcc @test_f16(
-  // CHECK-SAME:                                      %[[VAL_0:.*]]: !llvm.struct<(f16)>)
+  // CHECK-SAME:                                      %[[VAL_0:.*]]: !llvm.struct<(f16)>
+  // CHECK-SAME:                                      %[[PTR_1:.*]]: !llvm.ptr<1>)
   // CHECK:           %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(f16)>
   // CHECK:           %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
   // CHECK:           llvm.call spir_funccc @_Z17sub_group_shuffleDhj(%[[VAL_2]], %[[VAL_4]])
@@ -49,7 +50,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
   }
 
   // CHECK-LABEL:   llvm.func spir_kernelcc @test_bf16(
-  // CHECK-SAME:                                       %[[VAL_0:.*]]: !llvm.struct<(bf16)>)
+  // CHECK-SAME:                                       %[[VAL_0:.*]]: !llvm.struct<(bf16)>
+  // CHECK-SAME:                                       %[[PTR_1:.*]]: !llvm.ptr<1>)
   // CHECK:           %[[VAL_1:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(bf16)>
   // CHECK:           %[[VAL_2:.*]] = llvm.bitcast %[[VAL_1]] : bf16 to i16
   // CHECK:           %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
@@ -91,7 +93,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
   }
 
   // CHECK-LABEL:   llvm.func spir_kernelcc @test_i1(
-  // CHECK-SAME:                                     %[[VAL_0:.*]]: !llvm.struct<(i1)>)
+  // CHECK-SAME:                                     %[[VAL_0:.*]]: !llvm.struct<(i1)>
+  // CHECK-SAME:                                     %[[PTR_1:.*]]: !llvm.ptr<1>)
   // CHECK:           %[[VAL_1:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(i1)>
   // CHECK:           %[[VAL_2:.*]] = llvm.zext %[[VAL_1]] : i1 to i8
   // CHECK:           %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
@@ -133,7 +136,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
   }
 
   // CHECK-LABEL:   llvm.func spir_kernelcc @test_ptr(
-  // CHECK-SAME:                                      %[[VAL_0:.*]]: !llvm.struct<(ptr<1>)>)
+  // CHECK-SAME:                                      %[[VAL_0:.*]]: !llvm.struct<(ptr<1>)>
+  // CHECK-SAME:                                      %[[PTR_1:.*]]: !llvm.ptr<1>)
   // CHECK:           %[[VAL_1:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(ptr<1>)>
   // CHECK:           %[[VAL_2:.*]] = llvm.ptrtoint %[[VAL_1]] : !llvm.ptr<1> to i64
   // CHECK:           %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
@@ -186,7 +190,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
 
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 32 : i32} {
   // CHECK-LABEL:   llvm.func spir_kernelcc @test_f32(
-  // CHECK-SAME:                                       %[[VAL_0:.*]]: !llvm.struct<(f32)>)
+  // CHECK-SAME:                                       %[[VAL_0:.*]]: !llvm.struct<(f32)>
+  // CHECK-SAME:                                       %[[PTR_1:.*]]: !llvm.ptr<1>)
   // CHECK:           %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(f32)>
   // CHECK:           %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
   // CHECK:           llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_4]])
@@ -269,7 +274,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
 
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {
   // CHECK-LABEL:   llvm.func spir_kernelcc @test_non_sliced_multi_register(
-  // CHECK-SAME:                                                            %[[VAL_0:.*]]: !llvm.struct<(f64, f64)>)
+  // CHECK-SAME:                                                            %[[VAL_0:.*]]: !llvm.struct<(f64, f64)>
+  // CHECK-SAME:                                                            %[[PTR_1:.*]]: !llvm.ptr<1>)
   // CHECK:           %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(f64, f64)>
   // CHECK:           %[[VAL_3:.*]] = llvm.extractvalue %[[VAL_0]][1] : !llvm.struct<(f64, f64)>
   // CHECK:           %[[VAL_5:.*]] = llvm.mlir.constant(0 : i32) : i32
@@ -370,7 +376,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, "ttg.thr
 
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {
   // CHECK-LABEL:   llvm.func spir_kernelcc @test_contiguous(
-  // CHECK-SAME:                                             %[[VAL_0:.*]]: !llvm.struct<(f16, f16)>)
+  // CHECK-SAME:                                             %[[VAL_0:.*]]: !llvm.struct<(f16, f16)>
+  // CHECK-SAME:                                             %[[PTR_1:.*]]: !llvm.ptr<1>)
   tt.func @test_contiguous(%arg0: tensor<32xf16, #ttg.slice<{dim = 1, parent = #blocked}>>) -> tensor<32xf16, #ttg.slice<{dim = 1, parent = #blocked1}>> {
     // CHECK:           %[[VAL_1:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(f16, f16)>
     // CHECK:           %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][1] : !llvm.struct<(f16, f16)>
 
@@ -1,7 +1,7 @@
 // RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm --convert-tritongen-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm
 
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
-  // CHECK: llvm.func spir_kernelcc @test_empty_kernel(%arg0: i64, %arg1: !llvm.ptr<1>)
+  // CHECK: llvm.func spir_kernelcc @test_empty_kernel(%arg0: i64, %arg1: !llvm.ptr<1>, %arg2: !llvm.ptr<1>)
   // Here the 128 comes from the 4 in module attribute multiples 32
   // CHECK-SAME: attributes {intel_reqd_sub_group_size = 32 : i32, triton_gen.max_work_group_size = array<i32: 128, 1, 1>} {
   tt.func @test_empty_kernel(%lb : index, %A : !tt.ptr<f16>) {
 
@@ -73,8 +73,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
 
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
   // CHECK-LABEL: llvm.func spir_kernelcc @dot_f32_tf32_tf32_f32_1(
-  // CHECK-SAME:    %[[A:.*]]: !llvm.struct<(f32, f32, f32, f32)>, %[[B:.*]]: !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>,
-  // CHECK-SAME:    %[[C:.*]]: !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>) attributes {intel_reqd_sub_group_size = 32 : i32, triton_gen.max_work_group_size = array<i32: 32, 1, 1>} {
+  // CHECK-SAME:  %[[A:.*]]: !llvm.struct<(f32, f32, f32, f32)>, %[[B:.*]]: !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>, %[[C:.*]]: !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>, %[[PTR_1:.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 32 : i32, triton_gen.max_work_group_size = array<i32: 32, 1, 1>} {
   tt.func @dot_f32_tf32_tf32_f32_1(%a: tensor<8x8xf32, #dot_operand_a>, %b: tensor<8x16xf32, #dot_operand_b>, %c: tensor<8x16xf32, #dpas>) {
     // COM: To simplify, only check RTNE and its usage for the last element of A, B, C
     // CHECK: %[[A_LAST_VAL:.*]] = llvm.extractvalue %[[A]][3]
@@ -117,7 +116,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
   // CHECK: llvm.func spir_funccc @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv8_sDv8_iDv8_fi(i32, vector<8xi16>, vector<8xi32>, vector<8xf32>, i32) -> vector<8xf32> attributes {convergent, memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>, no_unwind, will_return}
   // CHECK-LABEL: llvm.func spir_kernelcc @dot_rep_cluster_4_2(
   // CHECK-SAME:    %[[A:.*]]: !llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16)>, %[[B:.*]]: !llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16)>,
-  // CHECK-SAME:    %[[C:.*]]: !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>) attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array<i32: 16, 1, 1>} {
+  // CHECK-SAME:    %[[C:.*]]: !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, %[[PTR_1:.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array<i32: 16, 1, 1>} {
   tt.func @dot_rep_cluster_4_2(%a: tensor<32x32xf16, #dot_operand_a>, %b: tensor<32x32xf16, #dot_operand_b>, %c: tensor<32x32xf32, #dpas>) {
     // CHECK:           %[[VAL_3:.*]] = llvm.mlir.undef : vector<8xf32>
     // CHECK:           %[[CST_15:.*]] = llvm.mlir.constant(15 : i32) : i32