
Commit 76ed95b

[AMD] Add a Concat op to AMDGPU dialect (#6590)
The "concat" operation combines a list of source n-dimensional tensors into a single larger destination tensor. All source tensors must have the same shape, element type, and encoding. The concatenation dimension is inferred from the source and destination shapes provided by the user. For example, two tensors of shape 64x128 can produce a destination shape of 128x128, indicating concatenation along dimension 0; or 64x256, indicating concatenation along dimension 1. Generally, source tensors passed as op arguments can be arranged into the resulting shape in multiple ways. For example, given four tensors of shape 64x64: concat s0<64x64>, s1<64x64>, s2<64x64>, s3<64x64> -> <128x128> They can be laid out in different configurations within the result tensor: 1) s0 s1 s2 s3 2) s0 s2 s1 s3 From a logical tensor perspective, the source tensors are treated as elements of a tensor of tensors. In other words, the 1-D array of input tensors is conceptually reshaped into an n-D grid. The semantics of this op assume a row-major order (or its n-D generalization), meaning the fastest-varying dimension is filled first, and the slowest-varying dimension is filled last. In the example above, this corresponds to layout 1). The source and destination tensors must have identical linear layouts at the CTA tile level. That is, all base vectors for input dimensions must match, except for the register input dimension. The register basis must align on the subset that defines the logical tensor shape of a single CTA tile. This ensures that the concatenation is a no-op, meaning no data rearrangement among threads is required to assemble the destination tensor with the given shape and layout. However, the order of CTA tiles within the layout does not need to match between source and destination layouts. It is the responsibility of the op's lowering logic to handle this correctly. This op is designed to work on logical tensors directly, avoiding the need for complex layout reinterpretation or reshaping. For example, the `tt.join` operation only supports concatenation along the innermost dimension, and requires that the resulting innermost dimension provide 2 elements per thread, distributed across registers. In contrast, this `concat` op imposes no constraints on the concatenation dimension or the size of dimensions. --------- Co-authored-by: Ognjen Plavsic <[email protected]> Co-authored-by: Lei Zhang <[email protected]>

11 files changed (+844, -113 lines)


.github/workflows/integration-tests-amd.yml

Lines changed: 1 addition & 1 deletion
@@ -109,7 +109,7 @@ jobs:
           echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
         fi
         pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
-        pytest --capture=tee-sys -rfs third_party/amd/python/test/test_extract_slice.py
+        pytest --capture=tee-sys -rfs third_party/amd/python/test/test_extract_slice_concat_op.py
         TRITON_ALWAYS_COMPILE=1 pytest --capture=tee-sys -rfs third_party/amd/python/test/test_scalarize_packed_fops.py
         cd python/test/unit
         pytest --capture=tee-sys -rfs -n 12 language runtime \
Lines changed: 174 additions & 0 deletions
@@ -0,0 +1,174 @@
// RUN: triton-opt -split-input-file %s --convert-triton-amdgpu-to-llvm='arch=gfx942' -verify-diagnostics


// Invalid ranks
#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
  tt.func @invalid_concat(
      %arg0: tensor<32x64xf32, #blocked>,
      %arg1: tensor<32x64xf32, #blocked>,
      %arg2: tensor<32x64xf32, #blocked>,
      %arg3: tensor<32x64xf32, #blocked>,
      %arg4: tensor<32x64xf32, #blocked>,
      %arg5: tensor<32x64xf32, #blocked>,
      %arg6: tensor<32x64xf32, #blocked>,
      %arg7: tensor<32x64xf32, #blocked>) {

    // expected-error @+1 {{Source and destination tensors must have the same rank.}}
    %1 = amdgpu.concat %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7 :
      tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked> -> tensor<256xf32, #blocked>
    tt.return
  }
}

// -----

// Invalid shapes 1
#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
  tt.func @invalid_concat(
      %arg0: tensor<32x64xf32, #blocked>,
      %arg1: tensor<32x64xf32, #blocked>,
      %arg2: tensor<32x64xf32, #blocked>,
      %arg3: tensor<32x64xf32, #blocked>,
      %arg4: tensor<32x64xf32, #blocked>,
      %arg5: tensor<32x64xf32, #blocked>,
      %arg6: tensor<32x64xf32, #blocked>,
      %arg7: tensor<32x64xf32, #blocked>) {

    // expected-error @+1 {{Source and destination tensor shapes don't match.}}
    %1 = amdgpu.concat %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7 :
      tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked> -> tensor<257x128xf32, #blocked>
    tt.return
  }
}

// -----

// Invalid shapes 2
#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
  tt.func @invalid_concat(
      %arg0: tensor<32x64xf32, #blocked>,
      %arg1: tensor<32x64xf32, #blocked>,
      %arg2: tensor<32x64xf32, #blocked>,
      %arg3: tensor<32x64xf32, #blocked>,
      %arg4: tensor<32x64xf32, #blocked>,
      %arg5: tensor<32x64xf32, #blocked>,
      %arg6: tensor<32x64xf32, #blocked>,
      %arg7: tensor<32x64xf32, #blocked>) {

    // expected-error @+1 {{Number of source tiles (8) doesn't match required count (16).}}
    %1 = amdgpu.concat %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7 :
      tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked> -> tensor<256x128xf32, #blocked>
    tt.return
  }
}

// -----

// Invalid shapes 3
#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
#blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
  tt.func @invalid_concat(
      %arg0: tensor<32x64xf32, #blocked>,
      %arg1: tensor<32x64xf32, #blocked>,
      %arg2: tensor<32x64xf32, #blocked>,
      %arg3: tensor<32x64xf32, #blocked>,
      %arg4: tensor<32x64xf32, #blocked>,
      %arg5: tensor<32x64xf32, #blocked>,
      %arg6: tensor<32x64xf32, #blocked>,
      %arg7: tensor<32x64xf32, #blocked>) {

    // expected-error @+1 {{CTA tile shapes must match between source and destination tensors.}}
    %1 = amdgpu.concat %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7 :
      tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked> -> tensor<128x128xf32, #blocked1>
    tt.return
  }
}

// -----

// Different types
#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
#blocked1 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
  tt.func @invalid_concat(
      %arg0: tensor<32x64xf32, #blocked1>,
      %arg1: tensor<32x64xf32, #blocked>,
      %arg2: tensor<32x64xf32, #blocked>,
      %arg3: tensor<32x64xf32, #blocked>,
      %arg4: tensor<32x64xf32, #blocked>,
      %arg5: tensor<32x64xf32, #blocked>,
      %arg6: tensor<32x64xf32, #blocked>,
      %arg7: tensor<32x64xf32, #blocked>) {

    // expected-error @+1 {{All sources must have identical tensor types.}}
    %1 = amdgpu.concat %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7 :
      tensor<32x64xf32, #blocked1>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked> -> tensor<128x128xf32, #blocked>
    tt.return
  }
}

// -----

// Invalid element types
#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
  tt.func @invalid_concat(
      %arg0: tensor<32x64xf32, #blocked>,
      %arg1: tensor<32x64xf32, #blocked>,
      %arg2: tensor<32x64xf32, #blocked>,
      %arg3: tensor<32x64xf32, #blocked>,
      %arg4: tensor<32x64xf32, #blocked>,
      %arg5: tensor<32x64xf32, #blocked>,
      %arg6: tensor<32x64xf32, #blocked>,
      %arg7: tensor<32x64xf32, #blocked>) {

    // expected-error @+1 {{Element types of sources and destination must match.}}
    %1 = amdgpu.concat %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7 :
      tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked> -> tensor<256x64xf16, #blocked>
    tt.return
  }
}

// -----

// Different layouts 1
#src_layout = #ttg.linear<{register=[[0, 1], [0, 2], [0, 8], [0, 16], [0, 64], [64, 0]], lane=[[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 4]], warp=[[0, 32], [32, 0]], block=[]}>
#dst_layout = #ttg.linear<{register=[[0, 1], [0, 2], [0, 8], [0, 16], [0, 64], [0, 128], [64, 0], [128, 0]], lane=[[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 4], [0, 0]], warp=[[0, 32], [32, 0]], block=[]}>
module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
  tt.func @invalid_concat(
      %arg0: tensor<128x128xf32, #src_layout>,
      %arg1: tensor<128x128xf32, #src_layout>,
      %arg2: tensor<128x128xf32, #src_layout>,
      %arg3: tensor<128x128xf32, #src_layout>) {

    // expected-error @+1 {{Lane and warp dim basis must match between source and destination layout.}}
    %1 = amdgpu.concat %arg0, %arg1, %arg2, %arg3 :
      tensor<128x128xf32, #src_layout>, tensor<128x128xf32, #src_layout>, tensor<128x128xf32, #src_layout>, tensor<128x128xf32, #src_layout> -> tensor<256x256xf32, #dst_layout>
    tt.return
  }
}

// -----

// Different layouts 2
#src_layout = #ttg.linear<{register=[[0, 1], [0, 2], [0, 8], [0, 16], [0, 64], [64, 0]], lane=[[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 4]], warp=[[0, 32], [32, 0]], block=[]}>
#dst_layout = #ttg.linear<{register=[[0, 0], [0, 1], [0, 2], [0, 8], [0, 16], [0, 64], [0, 128], [64, 0], [128, 0]], lane=[[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 4]], warp=[[0, 32], [32, 0]], block=[]}>
module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
  tt.func @invalid_concat(
      %arg0: tensor<128x128xf32, #src_layout>,
      %arg1: tensor<128x128xf32, #src_layout>,
      %arg2: tensor<128x128xf32, #src_layout>,
      %arg3: tensor<128x128xf32, #src_layout>) {

    // expected-error @+1 {{Register basis must match on a CTA tile between source and destination.}}
    %1 = amdgpu.concat %arg0, %arg1, %arg2, %arg3 :
      tensor<128x128xf32, #src_layout>, tensor<128x128xf32, #src_layout>, tensor<128x128xf32, #src_layout>, tensor<128x128xf32, #src_layout> -> tensor<256x256xf32, #dst_layout>
    tt.return
  }
}

test/TritonGPU/amd/amd-concat-op.mlir

Lines changed: 105 additions & 0 deletions
@@ -0,0 +1,105 @@
// RUN: triton-opt %s -split-input-file --convert-triton-amdgpu-to-llvm='arch=gfx942' | FileCheck %s

// -----

#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
  tt.func @concat_blocked(
      %arg0: tensor<32x64xf32, #blocked1>,
      %arg1: tensor<32x64xf32, #blocked1>,
      %arg2: tensor<32x64xf32, #blocked1>,
      %arg3: tensor<32x64xf32, #blocked1>,
      %arg4: tensor<32x64xf32, #blocked1>,
      %arg5: tensor<32x64xf32, #blocked1>,
      %arg6: tensor<32x64xf32, #blocked1>,
      %arg7: tensor<32x64xf32, #blocked1>) {
    // CHECK: llvm.func @concat_blocked

    // CHECK-COUNT-8: %{{.*}} = llvm.extractvalue %arg0[{{.*}}] : !llvm.struct
    // CHECK-COUNT-8: %{{.*}} = llvm.extractvalue %arg1[{{.*}}] : !llvm.struct
    // CHECK-COUNT-8: %{{.*}} = llvm.extractvalue %arg2[{{.*}}] : !llvm.struct
    // CHECK-COUNT-8: %{{.*}} = llvm.extractvalue %arg3[{{.*}}] : !llvm.struct
    // CHECK-COUNT-8: %{{.*}} = llvm.extractvalue %arg4[{{.*}}] : !llvm.struct
    // CHECK-COUNT-8: %{{.*}} = llvm.extractvalue %arg5[{{.*}}] : !llvm.struct
    // CHECK-COUNT-8: %{{.*}} = llvm.extractvalue %arg6[{{.*}}] : !llvm.struct
    // CHECK-COUNT-8: %{{.*}} = llvm.extractvalue %arg7[{{.*}}] : !llvm.struct

    // CHECK-COUNT-64: %{{[0-9]*}} = llvm.insertvalue %{{.*}} : !llvm.struct

    %1 = amdgpu.concat %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7 :
      tensor<32x64xf32, #blocked1>, tensor<32x64xf32, #blocked1>, tensor<32x64xf32, #blocked1>, tensor<32x64xf32, #blocked1>, tensor<32x64xf32, #blocked1>, tensor<32x64xf32, #blocked1>, tensor<32x64xf32, #blocked1>, tensor<32x64xf32, #blocked1> -> tensor<128x128xf32, #blocked1>
    tt.return
  }
}

// -----

#src_layout = #ttg.linear<{register=[[0, 1], [0, 2], [0, 8], [0, 16], [0, 64], [64, 0]], lane=[[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 4]], warp=[[0, 32], [32, 0]], block=[]}>
#dst_layout = #ttg.linear<{register=[[0, 1], [0, 2], [0, 8], [0, 16], [0, 64], [0, 128], [64, 0], [128, 0]], lane=[[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 4]], warp=[[0, 32], [32, 0]], block=[]}>
module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
  tt.func @concat_ll_2d_1(
      %arg0: tensor<128x128xf32, #src_layout>,
      %arg1: tensor<128x128xf32, #src_layout>,
      %arg2: tensor<128x128xf32, #src_layout>,
      %arg3: tensor<128x128xf32, #src_layout>) {
    // CHECK: llvm.func @concat_ll_2d_1

    // CHECK-COUNT-64: %{{.*}} = llvm.extractvalue %arg0[{{.*}}] : !llvm.struct
    // CHECK-COUNT-64: %{{.*}} = llvm.extractvalue %arg1[{{.*}}] : !llvm.struct
    // CHECK-COUNT-64: %{{.*}} = llvm.extractvalue %arg2[{{.*}}] : !llvm.struct
    // CHECK-COUNT-64: %{{.*}} = llvm.extractvalue %arg3[{{.*}}] : !llvm.struct
    // CHECK-COUNT-256: %{{.*}} = llvm.insertvalue %{{.*}} : !llvm.struct

    %1 = amdgpu.concat %arg0, %arg1, %arg2, %arg3 :
      tensor<128x128xf32, #src_layout>, tensor<128x128xf32, #src_layout>, tensor<128x128xf32, #src_layout>, tensor<128x128xf32, #src_layout> -> tensor<256x256xf32, #dst_layout>
    tt.return
  }
}

// -----

#src_layout = #ttg.linear<{register=[[1, 0], [2, 0], [4, 0]], lane=[[0, 1], [0, 2], [0, 4], [0, 8], [8, 0], [16, 0]], warp=[[0, 16]], block=[]}>
#dst_layout = #ttg.linear<{register=[[1, 0], [2, 0], [4, 0], [32, 0], [0, 32]], lane=[[0, 1], [0, 2], [0, 4], [0, 8], [8, 0], [16, 0]], warp=[[0, 16]], block=[]}>
module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
  tt.func @concat_ll_2d_2(
      %arg0: tensor<32x32xf32, #src_layout>,
      %arg1: tensor<32x32xf32, #src_layout>,
      %arg2: tensor<32x32xf32, #src_layout>,
      %arg3: tensor<32x32xf32, #src_layout>) {
    // CHECK: llvm.func @concat_ll_2d_2

    // CHECK-COUNT-8: %{{.*}} = llvm.extractvalue %arg0[{{.*}}] : !llvm.struct
    // CHECK-COUNT-8: %{{.*}} = llvm.extractvalue %arg1[{{.*}}] : !llvm.struct
    // CHECK-COUNT-8: %{{.*}} = llvm.extractvalue %arg2[{{.*}}] : !llvm.struct
    // CHECK-COUNT-8: %{{.*}} = llvm.extractvalue %arg3[{{.*}}] : !llvm.struct
    // CHECK-COUNT-32: %{{.*}} = llvm.insertvalue %{{.*}} : !llvm.struct

    %1 = amdgpu.concat %arg0, %arg1, %arg2, %arg3 :
      tensor<32x32xf32, #src_layout>, tensor<32x32xf32, #src_layout>, tensor<32x32xf32, #src_layout>, tensor<32x32xf32, #src_layout> -> tensor<64x64xf32, #dst_layout>
    tt.return
  }
}

// -----

#src_layout = #ttg.linear<{register=[[1]], lane=[[2], [4], [8], [16], [32], [64]], warp=[[128]], block=[]}>
#dst_layout = #ttg.linear<{register=[[1], [256], [512]], lane=[[2], [4], [8], [16], [32], [64]], warp=[[128]], block=[]}>
module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
  tt.func @concat_ll_1d(
      %arg0: tensor<256xf32, #src_layout>,
      %arg1: tensor<256xf32, #src_layout>,
      %arg2: tensor<256xf32, #src_layout>,
      %arg3: tensor<256xf32, #src_layout>) {
    // CHECK: llvm.func @concat_ll_1d

    // CHECK-COUNT-2: %{{.*}} = llvm.extractvalue %arg0[{{.*}}] : !llvm.struct
    // CHECK-COUNT-2: %{{.*}} = llvm.extractvalue %arg1[{{.*}}] : !llvm.struct
    // CHECK-COUNT-2: %{{.*}} = llvm.extractvalue %arg2[{{.*}}] : !llvm.struct
    // CHECK-COUNT-2: %{{.*}} = llvm.extractvalue %arg3[{{.*}}] : !llvm.struct
    // CHECK-COUNT-8: %{{.*}} = llvm.insertvalue %{{.*}} : !llvm.struct

    %1 = amdgpu.concat %arg0, %arg1, %arg2, %arg3 :
      tensor<256xf32, #src_layout>, tensor<256xf32, #src_layout>, tensor<256xf32, #src_layout>, tensor<256xf32, #src_layout> -> tensor<1024xf32, #dst_layout>
    tt.return
  }
}

third_party/amd/include/Dialect/TritonAMDGPU/IR/TritonAMDGPUOps.td

Lines changed: 69 additions & 0 deletions
@@ -119,6 +119,75 @@ def ExtractSliceOp : TT_AMDGPU_Op<"extract_slice", [Pure]> {
  let hasVerifier = 1;
}

def ConcatOp : TT_AMDGPU_Op<"concat", [Pure]> {
  let summary = "concat operation";
  let description = [{
    The "concat" operation combines a list of source n-dimensional tensors into a single larger destination tensor.

    All source tensors must have the same shape, element type, and encoding.
    The concatenation dimension is inferred from the source and destination shapes provided by the user.
    For example, two tensors of shape 64x128 can produce a destination shape of 128x128,
    indicating concatenation along dimension 0; or 64x256, indicating concatenation along dimension 1.

    Generally, source tensors passed as op arguments can be arranged into the resulting shape in multiple ways.
    For example, given four tensors of shape 64x64:
      concat s0<64x64>, s1<64x64>, s2<64x64>, s3<64x64> -> <128x128>

    They can be laid out in different configurations within the result tensor:
      1) s0 s1    2) s0 s2
         s2 s3       s1 s3

    From a logical tensor perspective, the source tensors are treated as elements of a tensor of tensors.
    In other words, the 1-D array of input tensors is conceptually reshaped into an n-D grid.
    The semantics of this op assume a row-major order (or its n-D generalization),
    meaning the fastest-varying dimension is filled first, and the slowest-varying dimension is filled last.
    In the example above, this corresponds to layout 1).

    The source and destination tensors must have identical linear layouts at the CTA tile level.
    That is, all base vectors for input dimensions must match, except for the register input dimension.
    The register basis must align on the subset that defines the logical tensor shape of a single CTA tile.

    This ensures that the concatenation is a no-op, meaning no data rearrangement among threads is required
    to assemble the destination tensor with the given shape and layout.
    However, the order of CTA tiles within the layout does not need to match between source and destination layouts.
    It is the responsibility of the op's lowering logic to handle this correctly.

    This op is designed to work on logical tensors directly, avoiding the need for complex layout reinterpretation or reshaping.
    For example, the `tt.join` operation only supports concatenation along the innermost dimension,
    and requires that the resulting innermost dimension provide 2 elements per thread, distributed across registers.
    In contrast, this `concat` op imposes no constraints on the concatenation dimension or the size of dimensions.

    * sources: a list of the input tensors.

    Example 1:

    ```mlir
    #blocked = #ttg.blocked<{sizePerThread = [1, 8],
        threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
    %0 = amdgpu.concat %arg0, %arg1 : tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>
        -> tensor<64x64xf32, #blocked>
    ```

    Example 2:
    ```mlir
    #src_layout = #ttg.linear<{register=[[0, 1], [0, 2], [0, 8], [0, 16], [0, 64], [64, 0]], lane=[[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 4]], warp=[[0, 32], [32, 0]], block=[]}>
    #dst_layout = #ttg.linear<{register=[[0, 1], [0, 2], [0, 8], [0, 16], [0, 64], [0, 128], [64, 0], [128, 0]], lane=[[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 4]], warp=[[0, 32], [32, 0]], block=[]}>
    %0 = amdgpu.concat %arg0, %arg1, %arg2, %arg3 : tensor<128x128xf16, #src_layout>, tensor<128x128xf16, #src_layout>, tensor<128x128xf16, #src_layout>,
        tensor<128x128xf16, #src_layout> -> tensor<256x256xf16, #dst_layout>
    ```
  }];

  let arguments = (ins Variadic<TT_Tensor>:$sources);
  let results = (outs AnyRankedTensor:$result);

  let assemblyFormat = [{
    $sources attr-dict `:` type($sources) `->` type($result)
  }];

  let hasVerifier = 1;
}

//===----------------------------------------------------------------------===//
// InstructionSchedHint
//===----------------------------------------------------------------------===//
