
Commit b9da70d

Rename triton_intel_gpu mnemonic to ttig (#4272)
1 parent e98561c commit b9da70d


49 files changed: 568 additions, 568 deletions
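The change is mechanical: every occurrence of the old dialect prefix `triton_intel_gpu.` shown in this diff (attributes such as `#triton_intel_gpu.dpas`, ops such as `triton_intel_gpu.glue` and `triton_intel_gpu.extract`, module attributes such as `triton_intel_gpu.support_dpas`) becomes `ttig.`. For downstream code that still carries hand-written TTGIR with the old mnemonic, a migration helper might look like the sketch below; it is hypothetical, not part of this commit, and assumes the old prefix is always followed by a `.`.

```python
# Hypothetical migration helper (not part of this commit): rewrite the old
# `triton_intel_gpu.` dialect prefix to the new `ttig.` mnemonic in
# hand-written IR strings or .mlir test files.
import re
from pathlib import Path

_OLD_PREFIX = re.compile(r"\btriton_intel_gpu\.")

def migrate_ir_text(ir: str) -> str:
    """Return `ir` with `triton_intel_gpu.` replaced by `ttig.`."""
    return _OLD_PREFIX.sub("ttig.", ir)

def migrate_file(path: Path) -> None:
    """Rewrite a file in place, e.g. migrate_file(Path("glue.mlir"))."""
    path.write_text(migrate_ir_text(path.read_text()))
```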

docs/BLOCK_LOADS_LAYOUT.md

Lines changed: 3 additions & 3 deletions
@@ -15,14 +15,14 @@ We will first consider the `AxB` GEMM kernel with `A` matrix dimension `[1024, 5
 We will first consider the A matrix load, which will be the same for both the `AxB` and `AxBT` (transpose) cases as transpose does not currently change the DPAS layout. The tensor type (layout) generated by `TTGIR` dialect for the A matrix is:

 ```
-tensor<256x32xf16, #ttg.dot_op<{opIdx = 0, parent = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>, kWidth = 1}>>
+tensor<256x32xf16, #ttg.dot_op<{opIdx = 0, parent = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>, kWidth = 1}>>
 ```

 Note that the tensor type describes the expected data layout going into and out of the `tt.dot` / DPAS instructions. While this type is attached to the load instruction, the data retrieved by the load may not match the layout described above. The lowering of the `LoadOp` to 2D blocked load also implies a conversion to make the loaded data match the desired tensor type. This conversion is instantiated using shuffle vectors.

 We can use the `triton-tensor-layout` utility to print the DPAS layout with a hardware centric view (i.e. register/lane/warp mapping to tensor coordinates) using the following command:
 ```
-./build/cmake.linux-x86_64-cpython-3.10/bin/triton-tensor-layout -l "#ttg.dot_op<{opIdx = 0, parent = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>, kWidth = 1}>" -t "tensor<256x32xf16>" -use-hw-view |& tee A_hw_view
+./build/cmake.linux-x86_64-cpython-3.10/bin/triton-tensor-layout -l "#ttg.dot_op<{opIdx = 0, parent = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>, kWidth = 1}>" -t "tensor<256x32xf16>" -use-hw-view |& tee A_hw_view
 ```
 which produces the following layout

@@ -197,7 +197,7 @@ We now have a linear layout which maps DPAS instructions to locations in the 2D

 We will use the `triton-tensor-layout` utility to print the DPAS layouts for the `B` operand:
 ```
-tensor<32x256xf16, #ttg.dot_op<{opIdx = 1, parent = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>, kWidth = 2}>>
+tensor<32x256xf16, #ttg.dot_op<{opIdx = 1, parent = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>, kWidth = 2}>>

 ```
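For convenience, the command in the doc above can also be driven from Python. The sketch below is illustrative and not part of the repository; the binary path and the `-l`/`-t`/`-use-hw-view` flags are taken from the doc text, and the build path is an assumption you would adjust to your own build directory.

```python
# Sketch of a Python wrapper around the `triton-tensor-layout` invocation
# shown in the doc. Not part of the commit; adjust TOOL to your build.
import subprocess

TOOL = "./build/cmake.linux-x86_64-cpython-3.10/bin/triton-tensor-layout"

def hw_view(layout: str, tensor_ty: str) -> str:
    """Print a layout in the hardware-centric (register/lane/warp) view."""
    proc = subprocess.run([TOOL, "-l", layout, "-t", tensor_ty, "-use-hw-view"],
                          capture_output=True, text=True, check=True)
    return proc.stdout

# Example: the A-operand layout with the new #ttig.dpas mnemonic.
a_layout = ("#ttg.dot_op<{opIdx = 0, parent = #ttig.dpas<{repeatCount = 8, "
            "systolicDepth = 8, executionSize = 16, opsPerChan = 2, "
            "threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], "
            "A = [32, 16], B = [16, 32], C = [32, 32]}>, kWidth = 1}>")
print(hw_view(a_layout, "tensor<256x32xf16>"))
```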

python/test/unit/intel/test_block_load.py

Lines changed: 6 additions & 6 deletions
@@ -18,22 +18,22 @@ def test_block_load_dpas_layout(M, N, dtype_str, transpose, device, tmp_path: pa
     if dtype_str == "int8":
         A_width = 2
         B_width = 4
-        layouts = "#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 4, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 2]}>"
+        layouts = "#mma = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 4, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 2]}>"
     elif dtype_str == "float32":
         A_width = 1
         B_width = 1
-        layouts = "#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2]}>"
+        layouts = "#mma = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2]}>"
     else:
         A_width = 1
         B_width = 2
-        layouts = "#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2]}>"
+        layouts = "#mma = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2]}>"

     block_io = "\"column_major\"" if transpose else "\"row_major\""

     ty = {"float32": "f32", "float16": "f16", "int8": "i8"}[dtype_str]

     ir = layouts + f"""
-module attributes {{triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_bf16_conversion, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block, triton_intel_gpu.target_arch = "spir64", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 16 : i32}} {{
+module attributes {{ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, ttig.support_dpas, ttig.support_sg_2d_block, ttig.target_arch = "spir64", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 16 : i32}} {{
 tt.func public @block_load_dpas_layout(%arg0: !tt.ptr<{ty}> {{tt.divisibility = 16 : i32}}, %arg1: !tt.ptr<{ty}> {{tt.divisibility = 16 : i32}}, %arg2: !tt.ptr<{ty}> {{tt.divisibility = 16: i32}}, %arg3: !tt.ptr<{ty}> {{tt.divisibility = 16: i32}}) attributes {{noinline = false}} {{
 %0 = tt.get_program_id x : i32
 %M_i64 = arith.constant {M} : i64
@@ -43,13 +43,13 @@ def test_block_load_dpas_layout(M, N, dtype_str, transpose, device, tmp_path: pa

 // A matrix
 %1 = tt.make_tensor_ptr %arg0, [%M_i64, %N_i64], [%N_i64, %c1_i64], [%0, %c0_i32] {{order = array<i32: 1, 0>}} : <tensor<{M}x{N}x{ty}, #ttg.dot_op<{{opIdx = 0, parent = #mma, kWidth = {A_width}}}>>>
-%2 = tt.load %1 {{boundaryCheck = array<i32: 0, 1>, triton_intel_gpu.block_io = "row_major"}} : !tt.ptr<tensor<{M}x{N}x{ty}, #ttg.dot_op<{{opIdx = 0, parent = #mma, kWidth = {A_width}}}>>>
+%2 = tt.load %1 {{boundaryCheck = array<i32: 0, 1>, ttig.block_io = "row_major"}} : !tt.ptr<tensor<{M}x{N}x{ty}, #ttg.dot_op<{{opIdx = 0, parent = #mma, kWidth = {A_width}}}>>>
 %3 = tt.make_tensor_ptr %arg1, [%M_i64, %N_i64], [%N_i64, %c1_i64], [%0, %c0_i32] {{order = array<i32: 1, 0>}} : <tensor<{M}x{N}x{ty}, #ttg.dot_op<{{opIdx = 0, parent = #mma, kWidth = {A_width}}}>>>
 tt.store %3, %2 {{boundaryCheck = array<i32: 0, 1>}} : !tt.ptr<tensor<{M}x{N}x{ty}, #ttg.dot_op<{{opIdx = 0, parent = #mma, kWidth = {A_width}}}>>>

 // B matrix
 %4 = tt.make_tensor_ptr %arg2, [%N_i64, %M_i64], {"[%c1_i64, %N_i64]" if transpose else "[%M_i64, %c1_i64]"}, [%c0_i32, %0] {{order = array<i32: 1, 0>}} : <tensor<{N}x{M}x{ty}, #ttg.dot_op<{{opIdx = 1, parent = #mma, kWidth = {B_width}}}>>>
-%5 = tt.load %4 {{boundaryCheck = array<i32: 0, 1>, triton_intel_gpu.block_io = {block_io} }} : !tt.ptr<tensor<{N}x{M}x{ty}, #ttg.dot_op<{{opIdx = 1, parent = #mma, kWidth = {B_width}}}>>>
+%5 = tt.load %4 {{boundaryCheck = array<i32: 0, 1>, ttig.block_io = {block_io} }} : !tt.ptr<tensor<{N}x{M}x{ty}, #ttg.dot_op<{{opIdx = 1, parent = #mma, kWidth = {B_width}}}>>>
 %6 = tt.make_tensor_ptr %arg3, [%N_i64, %M_i64], {"[%c1_i64, %N_i64]" if transpose else "[%M_i64, %c1_i64]"}, [%c0_i32, %0] {{order = array<i32: 1, 0>}} : <tensor<{N}x{M}x{ty}, #ttg.dot_op<{{opIdx = 1, parent = #mma, kWidth = {B_width}}}>>>
 tt.store %6, %5 {{boundaryCheck = array<i32: 0, 1>}} : !tt.ptr<tensor<{N}x{M}x{ty}, #ttg.dot_op<{{opIdx = 1, parent = #mma, kWidth = {B_width}}}>>>
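As an aside on the first hunk above: the test picks DPAS parameters per element type so that `opsPerChan` times the element width stays at 32 bits per channel. The sketch below only summarizes the values visible in the diff; the helper itself is illustrative and not code from the test.

```python
# Illustrative summary (not code from the test) of the dtype-dependent DPAS
# parameters used in test_block_load_dpas_layout. opsPerChan scales with the
# element width so each DPAS channel carries 32 bits (4x8, 1x32, 2x16).
DPAS_PARAMS = {
    # dtype_str: (A_width, B_width, opsPerChan, warpsPerCTA, repCluster)
    "int8":    (2, 4, 4, [1, 4], [1, 2]),
    "float32": (1, 1, 1, [8, 4], [4, 2]),
    "float16": (1, 2, 2, [8, 4], [4, 2]),  # the `else` branch above
}

def dpas_layout(dtype_str: str) -> str:
    """Build the #ttig.dpas layout string the test assembles for `dtype_str`."""
    _, _, ops_per_chan, warps, rep = DPAS_PARAMS[dtype_str]
    return (f"#mma = #ttig.dpas<{{repeatCount = 8, systolicDepth = 8, "
            f"executionSize = 16, opsPerChan = {ops_per_chan}, "
            f"threadsPerWarp = 16, warpsPerCTA = {warps}, repCluster = {rep}}}>")
```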

python/test/unit/language/test_core.py

Lines changed: 1 addition & 1 deletion
@@ -197,7 +197,7 @@ def __init__(self, repeatCount, systolic_depth, execution_size, ops_per_chan, th
         self.rep_cluster = rep_cluster

     def __str__(self):
-        return f"#triton_intel_gpu.dpas<{{repeatCount={self.repeatCount}, systolicDepth={self.systolic_depth}, executionSize = {self.execution_size}, opsPerChan = {self.ops_per_chan}, threadsPerWarp = {self.threads_per_warp}, warpsPerCTA={self.warps_per_cta}, repCluster={self.rep_cluster}}}>"
+        return f"#ttig.dpas<{{repeatCount={self.repeatCount}, systolicDepth={self.systolic_depth}, executionSize = {self.execution_size}, opsPerChan = {self.ops_per_chan}, threadsPerWarp = {self.threads_per_warp}, warpsPerCTA={self.warps_per_cta}, repCluster={self.rep_cluster}}}>"


 class DotOperandLayout:
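For illustration only: with the updated `__str__`, an instance of this layout helper now renders the new mnemonic. The snippet assumes it runs in the context of `test_core.py`; the class name (`DpasLayout` here) and the keyword names after `ops_per_chan` fall outside the hunk shown above, so they are assumptions inferred from the attribute names.

```python
# Hypothetical usage of the layout helper whose __str__ is changed above.
# `DpasLayout` and the trailing keyword names are assumptions; they are not
# visible in this hunk and are inferred from the attribute names.
layout = DpasLayout(repeatCount=8, systolic_depth=8, execution_size=16,
                    ops_per_chan=2, threads_per_warp=16,
                    warps_per_cta=[8, 4], rep_cluster=[4, 2])
print(layout)
# Expected output with the new mnemonic:
# #ttig.dpas<{repeatCount=8, systolicDepth=8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA=[8, 4], repCluster=[4, 2]}>
```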

test/Analysis/test-liveness.mlir

Lines changed: 2 additions & 2 deletions
@@ -63,7 +63,7 @@ module attributes {"ttg.num-warps" = 8 : i32} {
 -> (tensor<8x16xf32>, tensor<8x16xf32>, !tt.ptr<tensor<16x16xf16>>, !tt.ptr<tensor<16x16xf16>>) : i32 {
 // CHECK: [[LOAD_A]] = tt.load %arg6 {DotIdx = 1 : i32} : !tt.ptr<tensor<16x16xf16>>
 // CHECK-NEXT: [[LOAD_B]] = tt.load %arg7 {DotIdx = 1 : i32} : !tt.ptr<tensor<16x16xf16>>
-// CHECK-NEXT: [[EXTRACT]] = triton_intel_gpu.extract %16[0] : tensor<16x32xf16> -> tensor<8x16xf16>
+// CHECK-NEXT: [[EXTRACT]] = ttig.extract %16[0] : tensor<16x32xf16> -> tensor<8x16xf16>
 // CHECK-NEXT: [[DOT1]] = tt.dot [[EXTRACT]], [[LOAD_A]], %cst, inputPrecision = tf32 : tensor<8x16xf16> * tensor<16x16xf16> -> tensor<8x16xf32>
 // CHECK-NEXT: [[DOT2]] = tt.dot [[EXTRACT]], [[LOAD_B]], %cst, inputPrecision = tf32 : tensor<8x16xf16> * tensor<16x16xf16> -> tensor<8x16xf32>
 // CHECK-NEXT: [[ADVANCE1]] = tt.advance %arg6, [%c0_i32, %c64_i32] : <tensor<16x16xf16>>
@@ -72,7 +72,7 @@ module attributes {"ttg.num-warps" = 8 : i32} {

 %75 = tt.load %arg21 {DotIdx = 1 : i32} : !tt.ptr<tensor<16x16xf16>>
 %79 = tt.load %arg25 {DotIdx = 1 : i32} : !tt.ptr<tensor<16x16xf16>>
-%91 = triton_intel_gpu.extract %58[0] : tensor<16x32xf16> -> tensor<8x16xf16>
+%91 = ttig.extract %58[0] : tensor<16x32xf16> -> tensor<8x16xf16>
 %92 = tt.dot %91, %75, %cst_2, inputPrecision = tf32 : tensor<8x16xf16> * tensor<16x16xf16> -> tensor<8x16xf32>
 %107 = tt.dot %91, %79, %cst_2, inputPrecision = tf32 : tensor<8x16xf16> * tensor<16x16xf16> -> tensor<8x16xf32>
 %321 = tt.advance %arg21, [%c0_i32, %c64_i32] : <tensor<16x16xf16>>

test/Conversion/intel/arith_to_llvm.mlir

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 // CHECK-LABEL: llvm.func spir_kernelcc @float_to_bfloat_conversion(
 // CHECK-SCALAR: %[[VAL_0:.*]]: !llvm.struct<(f32, f32, f32, f32)>) -> !llvm.struct<(bf16, bf16, bf16, bf16)>
 // CHECK-VECTOR: %[[VAL_0:.*]]: vector<32xf32>) -> vector<32xbf16>
-module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_intel_gpu.support_bf16_conversion", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+module attributes {"ttig.support_sg_2d_block", "ttig.support_dpas", "ttig.support_bf16_conversion", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 tt.func @float_to_bfloat_conversion(%arg0 : tensor<512xf32, #blocked>) -> tensor<512xbf16, #blocked>{
 // CHECK-SCALAR: %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(f32, f32, f32, f32)>
 // CHECK-SCALAR: %[[VAL_3:.*]] = llvm.extractvalue %[[VAL_0]][1] : !llvm.struct<(f32, f32, f32, f32)>

test/Conversion/intel/dot_layout_offset.mlir

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 // RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s

-#dpas = #triton_intel_gpu.dpas<{repeatCount=8, systolicDepth=8, executionSize = 8, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA=[1, 1], repCluster=[2, 2]}>
+#dpas = #ttig.dpas<{repeatCount=8, systolicDepth=8, executionSize = 8, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA=[1, 1], repCluster=[2, 2]}>
 #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
 module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32} {
 // CHECK-LABEL: llvm.func spir_kernelcc @dot_layout_emit_offset()
@@ -320,7 +320,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32}

 // -----

-#dpas = #triton_intel_gpu.dpas<{repeatCount=8, systolicDepth=8, executionSize = 8, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA=[1, 1], repCluster=[2, 2]}>
+#dpas = #ttig.dpas<{repeatCount=8, systolicDepth=8, executionSize = 8, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA=[1, 1], repCluster=[2, 2]}>
 #dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}>
 module attributes {"ttg.num-warps" = 4 : i32, "ttg.num-ctas" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {

test/Conversion/intel/dpas_to_block_layout_convert.mlir

Lines changed: 2 additions & 2 deletions
@@ -2,7 +2,7 @@


 #blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 16], warpsPerCTA = [16, 2], order = [1, 0]}>
-#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 8], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
+#mma = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 8], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.shared = 67584 : i32, "ttg.threads-per-warp" = 16 : i32} {
 // CHECK-LABEL: llvm.func spir_kernelcc @convert_dpas(
 // CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr<1>)
@@ -94,7 +94,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.sha


 #blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 16], warpsPerCTA = [16, 2], order = [1, 0]}>
-#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 8], repCluster = [2, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
+#mma = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 8], repCluster = [2, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.shared = 67584 : i32, "ttg.threads-per-warp" = 16 : i32} {
 // CHECK-LABEL: llvm.func spir_kernelcc @convert_dpas(
 // CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr<1>)

test/Conversion/intel/glue.mlir

Lines changed: 4 additions & 4 deletions
@@ -1,7 +1,7 @@
 // RUN: env TRITON_INTEL_ADVANCED_PATH=1 \
 // RUN: triton-opt %s -split-input-file --convert-triton-intel-gpu-to-llvm | FileCheck %s

-module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "ttg.num-warps" = 4 : i32} {
+module attributes {"ttig.support_sg_2d_block", "ttig.support_dpas", "ttg.num-warps" = 4 : i32} {
 // CHECK-LABEL: llvm.func spir_kernelcc @test_scalar(
 // CHECK-SAME: %[[VAL_0:.*]]: f32, %[[VAL_1:.*]]: f32, %[[VAL_2:.*]]: f32, %[[VAL_3:.*]]: f32) -> vector<4xf32>
 // CHECK: %[[VAL_8:.*]] = llvm.mlir.poison : vector<4xf32>
@@ -16,7 +16,7 @@ module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.sup
 // CHECK: llvm.return %[[VAL_16]] : vector<4xf32>
 // CHECK: }
 tt.func @test_scalar(%arg0: tensor<1x16xf32>, %arg1: tensor<1x16xf32>, %arg2: tensor<1x16xf32>, %arg3: tensor<1x16xf32>) -> tensor<4x16xf32> {
-%0 = triton_intel_gpu.glue %arg0, %arg1, %arg2, %arg3 : (tensor<1x16xf32>, tensor<1x16xf32>, tensor<1x16xf32>, tensor<1x16xf32>) -> tensor<4x16xf32>
+%0 = ttig.glue %arg0, %arg1, %arg2, %arg3 : (tensor<1x16xf32>, tensor<1x16xf32>, tensor<1x16xf32>, tensor<1x16xf32>) -> tensor<4x16xf32>
 tt.return %0 : tensor<4x16xf32>
 }

@@ -26,7 +26,7 @@ module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.sup
 // CHECK: llvm.return %[[VAL_4]] : vector<8xf32>
 // CHECK: }
 tt.func @test_vec_2(%arg0: tensor<4x16xf32>, %arg1: tensor<4x16xf32>) -> tensor<8x16xf32> {
-%0 = triton_intel_gpu.glue %arg0, %arg1 : (tensor<4x16xf32>, tensor<4x16xf32>) -> tensor<8x16xf32>
+%0 = ttig.glue %arg0, %arg1 : (tensor<4x16xf32>, tensor<4x16xf32>) -> tensor<8x16xf32>
 tt.return %0 : tensor<8x16xf32>
 }

@@ -38,7 +38,7 @@ module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.sup
 // CHECK: llvm.return %[[VAL_10]] : vector<16xf32>
 // CHECK: }
 tt.func @test_vec_4(%arg0: tensor<4x16xf32>, %arg1: tensor<4x16xf32>, %arg2: tensor<4x16xf32>, %arg3: tensor<4x16xf32>) -> tensor<16x16xf32> {
-%0 = triton_intel_gpu.glue %arg0, %arg1, %arg2, %arg3 : (tensor<4x16xf32>, tensor<4x16xf32>, tensor<4x16xf32>, tensor<4x16xf32>) -> tensor<16x16xf32>
+%0 = ttig.glue %arg0, %arg1, %arg2, %arg3 : (tensor<4x16xf32>, tensor<4x16xf32>, tensor<4x16xf32>, tensor<4x16xf32>) -> tensor<16x16xf32>
 tt.return %0 : tensor<16x16xf32>
 }
 }

test/Conversion/intel/shared_to_dot_layout_convert.mlir

Lines changed: 3 additions & 3 deletions
@@ -1,7 +1,7 @@
 // RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-intel-gpu-to-llvm -canonicalize | FileCheck %s

 #blocked0 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 8], warpsPerCTA = [32, 1], order = [1, 0]}>
-#dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 8], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
+#dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 8], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}>
 #shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
 #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
 #dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}>
@@ -37,7 +37,7 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
 // -----

 #blocked0 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 8], warpsPerCTA = [32, 1], order = [1, 0]}>
-#dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 8], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
+#dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 8], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
 #shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
 #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
 #dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}>
@@ -74,7 +74,7 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
 // -----

 #blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 16], warpsPerCTA = [16, 2], order = [1, 0]}>
-#dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 8], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
+#dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 8], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
 #shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
 #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
 #dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}>
