intel
diff --git a/‎.github/workflows/build-test-reusable.yml‎
Lines changed: 2 additions & 1 deletion b/‎.github/workflows/build-test-reusable.yml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎.github/workflows/pip-test.yml‎
Lines changed: 2 additions & 1 deletion b/‎.github/workflows/pip-test.yml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎test/Conversion/intel/dot_layout_offset.mlir‎
Lines changed: 2 additions & 2 deletions b/‎test/Conversion/intel/dot_layout_offset.mlir‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎test/Conversion/intel/tritongpu_to_gen.mlir‎
Lines changed: 1 addition & 1 deletion b/‎test/Conversion/intel/tritongpu_to_gen.mlir‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎test/Conversion/intel/tritongpu_to_llvm_intel_advanced_path.mlir‎
Lines changed: 2 additions & 1 deletion b/‎test/Conversion/intel/tritongpu_to_llvm_intel_advanced_path.mlir‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎test/Proton/allocate_shared_memory.mlir‎
Lines changed: 3 additions & 19 deletions b/‎test/Proton/allocate_shared_memory.mlir‎
Lines changed: 3 additions & 19 deletions
diff --git a/‎test/Proton/nvidia/protongpu_to_llvm.mlir‎
Lines changed: 1 addition & 1 deletion b/‎test/Proton/nvidia/protongpu_to_llvm.mlir‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎test/TritonIntelGPU/blockptr_load.mlir‎
Lines changed: 4 additions & 4 deletions b/‎test/TritonIntelGPU/blockptr_load.mlir‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎test/TritonIntelGPU/blockptr_store.mlir‎
Lines changed: 2 additions & 2 deletions b/‎test/TritonIntelGPU/blockptr_store.mlir‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎test/TritonIntelGPU/tritonintelgpu-rewrite-stack-ptr.mlir‎
Lines changed: 6 additions & 6 deletions b/‎test/TritonIntelGPU/tritonintelgpu-rewrite-stack-ptr.mlir‎
Lines changed: 6 additions & 6 deletions
@@ -285,7 +285,8 @@ jobs:
         if: matrix.suite == 'rest' && inputs.driver_version == 'rolling' && inputs.device == 'max1100'
         run: |
           cd third_party/proton/test
-          pytest test_api.py test_lib.py test_profile.py test_viewer.py test_record.py -s -v
+          # FIXME: enable 'test_record.py' back
+          pytest test_api.py test_lib.py test_profile.py test_viewer.py -s -v
           cd ..
 
       - name: Run minicore tests
 
@@ -50,7 +50,7 @@ jobs:
           gh_token: ${{ secrets.GITHUB_TOKEN }}
           python_version: ${{ env.PYTHON_VERSION }}
           # transformers package is required for the inductor (e2e) test
-          wheels_pattern: '{torch,transformers}-*.whl'
+          wheels_pattern: 'torch-*.whl'
 
       - name: Install Triton
         uses: ./.github/actions/setup-triton
@@ -61,6 +61,7 @@ jobs:
           sed -i '/^validate_nccl_dep_consistency.*/d' generate_binary_build_matrix.py
           python -c "from generate_binary_build_matrix import PYTORCH_EXTRA_INSTALL_REQUIREMENTS; print('\n'.join(PYTORCH_EXTRA_INSTALL_REQUIREMENTS['xpu'].split(' | ')))" | tee /tmp/requirements.txt
           pip install -r /tmp/requirements.txt
+          pip install transformers==4.54.0
 
       - name: Run core tests
         run: |
 
@@ -3,7 +3,7 @@
 #dpas = #ttig.dpas<{repeatCount=8, systolicDepth=8, executionSize = 8, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA=[1, 1], repCluster=[2, 2]}>
 #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
 module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {
-  // CHECK-LABEL:   llvm.func spir_kernelcc @dot_layout_emit_offset(%arg0: !llvm.ptr<1>)
+  // CHECK-LABEL:   llvm.func spir_kernelcc @dot_layout_emit_offset(%arg0: !llvm.ptr<1>, %arg1: !llvm.ptr<1>)
   tt.func public @dot_layout_emit_offset() {
     %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf16, #dot_operand_a>
     // CHECK-COUNT-64:  {{.*}} = llvm.extractvalue {{.*}}
@@ -315,7 +315,7 @@ module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32}
 #dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}>
 module attributes {"ttg.num-warps" = 1 : i32, "ttg.num-ctas" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {
 
-  // CHECK-LABEL:   llvm.func spir_kernelcc @dot_layout_emit_offset(%arg0: !llvm.ptr<1>)
+  // CHECK-LABEL:   llvm.func spir_kernelcc @dot_layout_emit_offset(%arg0: !llvm.ptr<1>, %arg1: !llvm.ptr<1>)
   tt.func public @dot_layout_emit_offset() {
     %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf16, #dot_operand_b>
     // CHECK-COUNT-64:           {{.*}} = llvm.extractvalue {{.*}}
 
@@ -1,7 +1,7 @@
 // RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm --convert-tritongen-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm --dump-input-context=20
 
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
-  // CHECK: llvm.func spir_kernelcc @test_empty_kernel(%arg0: i64, %arg1: !llvm.ptr<1>, %arg2: !llvm.ptr<1>)
+  // CHECK: llvm.func spir_kernelcc @test_empty_kernel(%arg0: i64, %arg1: !llvm.ptr<1>, %arg2: !llvm.ptr<1>, %arg3: !llvm.ptr<1>)
   // Here the 128 comes from the 4 in module attribute multiples 32
   // CHECK-SAME: attributes {intel_reqd_sub_group_size = 32 : i32, reqd_work_group_size = array<i32: 128, 1, 1>} {
   tt.func @test_empty_kernel(%lb : index, %A : !tt.ptr<f16>) {
 
@@ -227,7 +227,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.thr
   }
 
   // CHECK-LABEL: llvm.func spir_kernelcc @addptr(
-  // CHECK-SAME:                                  [[VAL_0:%.*]]: !llvm.ptr<1>, [[PTR_1:%.*]]: !llvm.ptr<1>) -> !llvm.ptr<1> attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 128, 1, 1>}
+  // CHECK-SAME:  [[VAL_0:%.*]]: !llvm.ptr<1>, [[PTR_1:%.*]]: !llvm.ptr<1>, [[PTR_2:%.*]]: !llvm.ptr<1>) ->
+  // CHECK-SAME:  !llvm.ptr<1> attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 128, 1, 1>}
   tt.func public @addptr(%arg0: !tt.ptr<f16>) -> !tt.ptr<f16> {
     // CHECK: [[VAL_1:%.*]] = llvm.mlir.constant(0 : i32) : i32
     // CHECK: [[VAL_2:%.*]] = llvm.call spir_funccc @_Z12get_group_idj([[VAL_1]]) {{.*}} : (i32) -> i64
 
@@ -21,28 +21,12 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.num-ctas" = 1 : i32} {
 // -----
 
 #A_SHARED = #ttg.swizzled_shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}>
-// CHECK: ttg.shared = 144 : i32
-module attributes {"ttg.num-warps" = 4 : i32, "ttg.num-ctas" = 1 : i32} {
-  // CHECK-LABEL: allocate_unaligned
-  tt.func @allocate_unaligned(%A : !tt.ptr<f16>) {
-  %cst0 = ttg.local_alloc : () -> !ttg.memdesc<1x6xf16, #A_SHARED, #ttg.shared_memory, mutable>
-  proton.record start "name0"
-  ttg.local_dealloc %cst0 : !ttg.memdesc<1x6xf16, #A_SHARED, #ttg.shared_memory, mutable>
-  proton.record end "name0"
-  // CHECK: ttg.local_alloc  {allocation.offset = 16 : i32}
-  tt.return
-  }
-}
-
-// -----
-
-#A_SHARED = #ttg.swizzled_shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}>
-// CHECK: ttg.shared = 50 : i32
+// CHECK: ttg.shared = 64 : i32
 module attributes {"ttg.num-warps" = 4 : i32, "ttg.num-ctas" = 1 : i32} {
   // CHECK-LABEL: no_proton
   tt.func @no_proton(%A : !tt.ptr<f16>) {
-  %cst0 = ttg.local_alloc : () -> !ttg.memdesc<1x25xf16, #A_SHARED, #ttg.shared_memory, mutable>
-  ttg.local_dealloc %cst0 : !ttg.memdesc<1x25xf16, #A_SHARED, #ttg.shared_memory, mutable>
+  %cst0 = ttg.local_alloc : () -> !ttg.memdesc<1x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
+  ttg.local_dealloc %cst0 : !ttg.memdesc<1x32xf16, #A_SHARED, #ttg.shared_memory, mutable>
   // CHECK: ttg.local_alloc
   // CHECK-NOT: ttg.local_alloc
   tt.return
 
@@ -136,7 +136,7 @@ module attributes {"ttg.num-warps" = 8 : i32, ttg.profile_scratch_memory_alignme
 #smem = #ttg.shared_memory
 module attributes {"ttg.num-warps" = 8 : i32, ttg.profile_scratch_memory_alignment = 128 : i32, ttg.profile_scratch_memory_size = 384 : i32} {
   // CHECK-LABEL: convert_smem_finalize
-  // CHECK-DAG: llvm.nvvm.read.ptx.sreg.smid
+  // CHECK-DAG: nvvm.read.ptx.sreg.smid
   // CHECK-DAG: llvm.extractvalue %{{.*}}[0] : !llvm.struct<(ptr<3>, i32)>
   // CHECK-DAG: llvm.store
   // CHECK-DAG: llvm.cond_br %{{.*}}, ^bb1, ^bb3
 
@@ -107,8 +107,8 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
 #dot1 = #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}>
 module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32, "ttig.support_sg_2d_block"} {
 // CHECK-LABEL:   llvm.func spir_kernelcc @dot_op_a_2d_load(
-// CHECK-SAME:                                              %[[VAL_0:.*]]: !llvm.ptr<1>,
-// CHECK-SAME:                                              %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i64, %[[PTR_1:.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 16, 1, 1>} {
+// CHECK-SAME:    %[[VAL_0:.*]]: !llvm.ptr<1>, %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i64, %[[PTR_1:.*]]: !llvm.ptr<1>,
+// CHECK-SAME:    %[[PTR_2:.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 16, 1, 1>} {
   tt.func public @dot_op_a_2d_load(%arg0: !tt.ptr<f16>, %arg2: i64, %arg4: i64, %arg5: i64, %arg7: i64) {
     %c0_i32 = arith.constant 0 : i32
     %c1_i64 = arith.constant 1 : i64
@@ -170,8 +170,8 @@ module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32,
 #dot1 = #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}>
 module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32, "ttig.support_sg_2d_block"} {
 // CHECK-LABEL:   llvm.func spir_kernelcc @dot_op_b_2d_load(
-// CHECK-SAME:                                              %[[VAL_0:.*]]: !llvm.ptr<1>,
-// CHECK-SAME:                                              %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i64, %[[PTR_1:.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 16, 1, 1>} {
+// CHECK-SAME:    %[[VAL_0:.*]]: !llvm.ptr<1>, %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i64, %[[PTR_1:.*]]: !llvm.ptr<1>,
+// CHECK-SAME:    %[[PTR_2:.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 16, 1, 1>} {
   tt.func public @dot_op_b_2d_load(%arg1: !tt.ptr<f16>, %arg3: i64, %arg4: i64, %arg7: i64) {
     %c0_i32 = arith.constant 0 : i32
     %c1_i64 = arith.constant 1 : i64
 
@@ -241,8 +241,8 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32,
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
 module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32, "ttig.support_sg_2d_block"} {
 // CHECK-LABEL:   llvm.func spir_kernelcc @dpas_layout_2d_store_rep_cluster_4_2(
-// CHECK-SAME:      %[[base:.*]]: !llvm.ptr<1>,
-// CHECK-SAME:      %[[width:.*]]: i64, %[[height:.*]]: i64, %[[rowStride:.*]]: i64, %[[PTR_1:.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 16, 1, 1>} {
+// CHECK-SAME:      %[[base:.*]]: !llvm.ptr<1>, %[[width:.*]]: i64, %[[height:.*]]: i64, %[[rowStride:.*]]: i64, %[[PTR_1:.*]]: !llvm.ptr<1>,
+// CHECK-SAME:      %[[PTR_2:.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 16, 1, 1>} {
   tt.func public @dpas_layout_2d_store_rep_cluster_4_2(%base: !tt.ptr<f16>, %width: i64, %height: i64, %rowStride: i64) {
     %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf16, #dpas>
     %c0_i32 = arith.constant 0 : i32
 
@@ -2,18 +2,18 @@
 
 module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, ttig.support_dpas, ttig.support_sg_2d_block, ttig.target_arch = "spir64", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.shared = 0 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 32 : i32} {
   // CHECK-LABEL:   llvm.mlir.global external @global_smem() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x i8>
-  // CHECK-LABEL:   llvm.func spir_kernelcc @kernel(%arg0: !llvm.ptr<1>, %arg1: !llvm.ptr<1>, %arg2: !llvm.ptr<1>, %arg3: !llvm.ptr<1>)
+  // CHECK-LABEL:   llvm.func spir_kernelcc @kernel(%arg0: !llvm.ptr<1>, %arg1: !llvm.ptr<1>, %arg2: !llvm.ptr<1>, %arg3: !llvm.ptr<1>, %arg4: !llvm.ptr<1>)
   tt.func public @kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
     %0 = tt.load %arg0 : !tt.ptr<f32>
     %1 = tt.load %arg1 : !tt.ptr<f32>
     // CHECK: [[LOAD0:%.*]] = llvm.extractelement {{.*}}[{{.*}}]
     // CHECK: [[LOAD1:%.*]] = llvm.extractelement {{.*}}[{{.*}}]
     // CHECK: [[POISON:%.*]] = llvm.mlir.poison : !llvm.ptr<3>
-    // CHECK: llvm.call spir_funccc @noinline_simple_fn__fp32_fp32_Pfp32__([[LOAD0]], [[LOAD1]], %arg2, [[POISON]], %arg3)
+    // CHECK: llvm.call spir_funccc @noinline_simple_fn__fp32_fp32_Pfp32__([[LOAD0]], [[LOAD1]], %arg2, [[POISON]], %arg3, %arg4)
     tt.call @noinline_simple_fn__fp32_fp32_Pfp32__(%0, %1, %arg2) : (f32, f32, !tt.ptr<f32>) -> ()
     tt.return
   }
-  // CHECK:   llvm.func internal spir_funccc @noinline_simple_fn__fp32_fp32_Pfp32__(%arg0: f32, %arg1: f32, %arg2: !llvm.ptr<1>, %arg3: !llvm.ptr<3>, %arg4: !llvm.ptr<1>)
+  // CHECK:   llvm.func internal spir_funccc @noinline_simple_fn__fp32_fp32_Pfp32__(%arg0: f32, %arg1: f32, %arg2: !llvm.ptr<1>, %arg3: !llvm.ptr<3>, %arg4: !llvm.ptr<1>, %arg5: !llvm.ptr<1>)
   tt.func private @noinline_simple_fn__fp32_fp32_Pfp32__(%arg0: f32 {tt.constancy = 1 : i64, tt.contiguity = 1 : i64, tt.divisibility = 1 : i64}, %arg1: f32 {tt.constancy = 1 : i64, tt.contiguity = 1 : i64, tt.divisibility = 1 : i64}, %arg2: !tt.ptr<f32> {tt.constancy = 1 : i64, tt.contiguity = 1 : i64, tt.divisibility = 16 : i64})  attributes {noinline = true} {
     %0 = arith.addf %arg0, %arg1 fastmath<fast> : f32
     tt.store %arg2, %0 : !tt.ptr<f32>
@@ -29,17 +29,17 @@ module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, tt
 #smem = #ttg.shared_memory
 module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, ttig.support_dpas, ttig.support_sg_2d_block, ttig.target_arch = "spir64", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.shared = 1280 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 16 : i32} {
   // CHECK-LABEL:   llvm.mlir.global external @global_smem() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x i8>
-  // CHECK-LABEL:   llvm.func spir_kernelcc @kernel(%arg0: !llvm.ptr<1>, %arg1: !llvm.ptr<1>, %arg2: !llvm.ptr<1>, %arg3: !llvm.ptr<1>, %arg4: !llvm.ptr<3>)
+  // CHECK-LABEL:   llvm.func spir_kernelcc @kernel(%arg0: !llvm.ptr<1>, %arg1: !llvm.ptr<1>, %arg2: !llvm.ptr<1>, %arg3: !llvm.ptr<1>, %arg4: !llvm.ptr<1>, %arg5: !llvm.ptr<3>)
   tt.func public @kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
     %0 = tt.load %arg0 : !tt.ptr<f32>
     %1 = tt.load %arg1 : !tt.ptr<f32>
     // CHECK: [[LOAD0:%.*]] = llvm.extractelement {{.*}}[{{.*}}]
     // CHECK: [[LOAD1:%.*]] = llvm.extractelement {{.*}}[{{.*}}]
-    // CHECK: llvm.call spir_funccc @noinline_shared_fn__fp32_fp32_Pfp32__([[LOAD0]], [[LOAD1]], %arg2, %arg4, %arg3)
+    // CHECK: llvm.call spir_funccc @noinline_shared_fn__fp32_fp32_Pfp32__([[LOAD0]], [[LOAD1]], %arg2, %arg5, %arg3, %arg4)
     tt.call @noinline_shared_fn__fp32_fp32_Pfp32__(%0, %1, %arg2) {allocation.offset = 0 : i32} : (f32, f32, !tt.ptr<f32>) -> ()
     tt.return
   }
-  // CHECK: llvm.func internal spir_funccc @noinline_shared_fn__fp32_fp32_Pfp32__(%arg0: f32, %arg1: f32, %arg2: !llvm.ptr<1>, %arg3: !llvm.ptr<3>, %arg4: !llvm.ptr<1>)
+  // CHECK: llvm.func internal spir_funccc @noinline_shared_fn__fp32_fp32_Pfp32__(%arg0: f32, %arg1: f32, %arg2: !llvm.ptr<1>, %arg3: !llvm.ptr<3>, %arg4: !llvm.ptr<1>, %arg5: !llvm.ptr<1>)
   // CHECK: llvm.getelementptr %arg3[{{.*}}]
   tt.func private @noinline_shared_fn__fp32_fp32_Pfp32__(%arg0: f32 {tt.constancy = 1 : i64, tt.contiguity = 1 : i64, tt.divisibility = 1 : i64}, %arg1: f32 {tt.constancy = 1 : i64, tt.contiguity = 1 : i64, tt.divisibility = 1 : i64}, %arg2: !tt.ptr<f32> {tt.constancy = 1 : i64, tt.contiguity = 1 : i64, tt.divisibility = 16 : i64}) attributes {noinline = true} {
     %cst = arith.constant dense<16> : tensor<16x1xi32, #blocked>
Original file line number	Diff line number	Diff line change
`@@ -227,7 +227,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.thr`
`227`	`227`	`}`
`228`	`228`
`229`	`229`	`// CHECK-LABEL: llvm.func spir_kernelcc @addptr(`
`230`		`- // CHECK-SAME: [[VAL_0:%.]]: !llvm.ptr<1>, [[PTR_1:%.]]: !llvm.ptr<1>) -> !llvm.ptr<1> attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 128, 1, 1>}`
	`230`	`+ // CHECK-SAME: [[VAL_0:%.]]: !llvm.ptr<1>, [[PTR_1:%.]]: !llvm.ptr<1>, [[PTR_2:%.*]]: !llvm.ptr<1>) ->`
	`231`	`+ // CHECK-SAME: !llvm.ptr<1> attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 128, 1, 1>}`
`231`	`232`	`tt.func public @addptr(%arg0: !tt.ptr<f16>) -> !tt.ptr<f16> {`
`232`	`233`	`// CHECK: [[VAL_1:%.*]] = llvm.mlir.constant(0 : i32) : i32`
`233`	`234`	`// CHECK: [[VAL_2:%.]] = llvm.call spir_funccc @_Z12get_group_idj([[VAL_1]]) {{.}} : (i32) -> i64`