Commit 5b7bc04

Authored by plognjen, oplavsic, and antiagainst
[AMD] Rewrite extract_slice op implementation (#7128)
This PR refactors the extract_slice operation to support two major improvements:

1) Relaxed layout constraints: the operation now allows more flexible source and destination layouts, aligning better with linear layouts.
2) Support for arbitrary tensor ranks: extract_slice is no longer limited to 2D tensors and can now handle tensors of any rank.

The "extract_slice" operation enables extracting a slice of a tensor in registers. It supports the following arguments:

* source: the base tensor on which to create a view tensor
* offsets: offsets into the base tensor at which to create the view

In distributed layouts, tensors are divided into CTA tiles. A CTA tile represents the smallest contiguous portion of a tensor that is distributed across all threads and warps within a workgroup. The extract_slice operation extracts a portion of the tensor that aligns with CTA tile boundaries.

This op is designed to work on logical tensors directly, avoiding the need for complex layout reinterpretation or reshaping. For example, the tt.split operation only supports splitting along the innermost dimension and requires that the resulting innermost dimension provide 2 elements per thread, distributed across registers. In contrast, the extract_slice op imposes no constraints on the extraction dimension or the size of dimensions.

---------

Co-authored-by: Ognjen Plavsic <[email protected]>
Co-authored-by: Lei Zhang <[email protected]>
1 parent 235496e commit 5b7bc04
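
Before the per-file diffs, a minimal sketch of the op in action, distilled from the lit tests in this PR. The function and value names here are ours; the layout, shapes, and offset come from the tests (for this blocked layout the CTA tile is 256x16, so the slice below is tile-aligned):

```mlir
#blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0]}>
tt.func @extract_slice_sketch(%src: tensor<256x128xi32, #blocked>) {
  // Offsets must be multiples of the CTA tile shape, and source and result
  // must agree on the layout of a single CTA tile; the op is then a pure
  // register-level view with no cross-thread data movement.
  %slice = amdgpu.extract_slice %src [0, 0] : tensor<256x128xi32, #blocked> to tensor<256x16xi32, #blocked>
  tt.return
}
```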

File tree

14 files changed: +466 -315 lines changed

test/Conversion/amd/invalid_extractslice_to_llvm.mlir

Lines changed: 27 additions & 36 deletions
@@ -3,37 +3,17 @@
 // Invalid size
 #blocked1 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
 tt.func @invalid_size_input(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
-  // expected-error @+1 {{sizes [256, 2] must be a multiple of shapePerCTATile [256, 16]}}
+  // expected-error @+1 {{result shape must be multiple of shapePerCTATile}}
   %1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x2xi32, #blocked1>
   tt.return
 }
 
 // -----
 
-// Invalid zero source dimension
-#blocked1 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
-tt.func @invalid_size_input(%arg0: tensor<256x0xi32, #blocked1> {tt.divisibility = 16 : i32}) {
-  // expected-error @+1 {{source tensor dimension size zero at dimension 1}}
-  %1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x0xi32, #blocked1> to tensor<256x16xi32, #blocked1>
-  tt.return
-}
-
-// -----
-
-// Invalid zero result dimension
-#blocked1 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
-tt.func @invalid_size_input(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
-  // expected-error @+1 {{result tensor dimension size zero at dimension 1}}
-  %1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x0xi32, #blocked1>
-  tt.return
-}
-
-// -----
-
 // Invalid offset, not multiple of shapePerTile
 #blocked1 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
 tt.func @invalid_offset_input(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
-  // expected-error @+1 {{offset [0, 5] must be a multiple of shapePerCTATile [256, 16]}}
+  // expected-error @+1 {{offset must be multiple of shapePerCTATile}}
   %1 = amdgpu.extract_slice %arg0 [0,5] : tensor<256x128xi32, #blocked1> to tensor<256x16xi32, #blocked1>
   tt.return
 }
@@ -43,7 +23,7 @@ tt.func @invalid_offset_input(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibi
 // Invalid offset, out of bounds for dimension
 #blocked1 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
 tt.func @invalid_offset_input(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
-  // expected-error @+1 {{invalid offset 128 at dimension 1}}
+  // expected-error @+1 {{invalid offset at dimension 1}}
   %1 = amdgpu.extract_slice %arg0 [0,128] : tensor<256x128xi32, #blocked1> to tensor<256x16xi32, #blocked1>
   tt.return
 }
@@ -54,11 +34,10 @@ tt.func @invalid_offset_input(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibi
 #blocked1 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
 #blocked2 = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
 tt.func @invalid_result_layout(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
-  // expected-error @+1 {{result layout must match source layout}}
+  // expected-error @+1 {{CTA tile shapes must match between source and destination tensors.}}
   %1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x16xi32, #blocked2>
   tt.return
 }
-
 // -----
 
 // Invalid result element type
@@ -84,23 +63,13 @@ tt.func @invalid_result_rank(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibil
 // Invalid result shape
 #blocked1 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
 tt.func @invalid_result_rank(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
-  // expected-error @+1 {{result shape cannot be larger than input shape at dimension 1}}
+  // expected-error @+1 {{result shape cannot exceed source shape at dimension 1}}
   %1 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x256xi32, #blocked1>
   tt.return
 }
 
 // -----
 
-// Invalid rank
-#blocked1 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
-tt.func @invalid_rank(%arg0: tensor<256x128x2xi32, #blocked1> {tt.divisibility = 16 : i32}) {
-  // expected-error @+1 {{currently only 2D tensors are supported}}
-  %1 = amdgpu.extract_slice %arg0 [0,0,0] : tensor<256x128x2xi32, #blocked1> to tensor<256x16x2xi32, #blocked1>
-  tt.return
-}
-
-// -----
-
 // Invalid non static offset
 #blocked1 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
 tt.func @invalid_non_static_offset(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}, %arg1: i32) {
@@ -109,3 +78,25 @@ tt.func @invalid_non_static_offset(%arg0: tensor<256x128xi32, #blocked1> {tt.div
   %2 = amdgpu.extract_slice %arg0 [%arg1, 0] : tensor<256x128xi32, #blocked1> to tensor<256x16xi32, #blocked1>
   tt.return
 }
+
+// -----
+
+// Invalid layout 1
+#dst_layout = #ttg.linear<{register=[[0, 1], [0, 2], [0, 8], [0, 16], [0, 64], [64, 0]], lane=[[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 4]], warp=[[0, 32], [32, 0]], block=[]}>
+#src_layout = #ttg.linear<{register=[[0, 0], [0, 1], [0, 2], [0, 8], [0, 16], [0, 64], [0, 128], [64, 0], [128, 0]], lane=[[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 4]], warp=[[0, 32], [32, 0]], block=[]}>
+tt.func @invalid_register_base(%arg0: tensor<256x256xi32, #src_layout> {tt.divisibility = 16 : i32}) {
+  // expected-error @+1 {{Register basis must match on a CTA tile between source and destination}}
+  %2 = amdgpu.extract_slice %arg0 [0, 0] : tensor<256x256xi32, #src_layout> to tensor<128x128xi32, #dst_layout>
+  tt.return
+}
+
+// -----
+
+// Invalid layout 2
+#dst_layout = #ttg.linear<{register=[[0, 1], [0, 2], [0, 8], [0, 16], [0, 64], [64, 0]], lane=[[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 4]], warp=[[0, 32], [32, 0]], block=[]}>
+#src_layout = #ttg.linear<{register=[[0, 1], [0, 2], [0, 8], [0, 16], [0, 64], [0, 128], [64, 0], [128, 0]], lane=[[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 4], [0, 0]], warp=[[0, 32], [32, 0]], block=[]}>
+tt.func @invalid_lane_warp_basis(%arg0: tensor<256x256xi32, #src_layout> {tt.divisibility = 16 : i32}) {
+  // expected-error @+1 {{Lane and warp dim basis must match between source and destination layout}}
+  %2 = amdgpu.extract_slice %arg0 [0, 0] : tensor<256x256xi32, #src_layout> to tensor<128x128xi32, #dst_layout>
+  tt.return
+}
Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+// RUN: triton-opt %s -split-input-file -canonicalize | FileCheck %s
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @canonicalize_after_concat(
+    %arg0: tensor<32x64xf32, #blocked>,
+    %arg1: tensor<32x64xf32, #blocked>,
+    %arg2: tensor<32x64xf32, #blocked>,
+    %arg3: tensor<32x64xf32, #blocked>,
+    %arg4: tensor<32x64xf32, #blocked>,
+    %arg5: tensor<32x64xf32, #blocked>,
+    %arg6: tensor<32x64xf32, #blocked>,
+    %arg7: tensor<32x64xf32, #blocked>) -> tensor<32x64xf32, #blocked> {
+    // CHECK-LABEL: tt.func @canonicalize_after_concat
+
+    %1 = amdgpu.concat %arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7 :
+      tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked>, tensor<32x64xf32, #blocked> -> tensor<128x128xf32, #blocked>
+    %2 = amdgpu.extract_slice %1 [32, 64] : tensor<128x128xf32, #blocked> to tensor<32x64xf32, #blocked>
+    // CHECK: tt.return %arg3 : tensor<32x64xf32, #blocked>
+    tt.return %2 : tensor<32x64xf32, #blocked>
+  }
+}
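
A note on the expected fold above (reasoning ours, not stated in the commit): the 128x128 result of amdgpu.concat is a 4x2 grid of 32x64 tiles with the last dimension varying fastest, so the slice at offset [32, 64] coincides exactly with the operand at grid cell (1, 1), i.e. %arg3. The canonicalizer can therefore forward that operand and drop both ops, which is what the CHECK line verifies.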
Lines changed: 49 additions & 6 deletions
@@ -1,14 +1,57 @@
-// RUN: triton-opt %s --convert-triton-amdgpu-to-llvm="arch=gfx942" | FileCheck %s
+// RUN: triton-opt %s -split-input-file --convert-triton-amdgpu-to-llvm="arch=gfx942" | FileCheck %s
 
 #blocked1 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
 #blocked2 = #ttg.blocked<{sizePerThread = [4, 1], threadsPerWarp = [4, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>
 module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
-  tt.func @basic_insert_slice(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
-    // CHECK: llvm.func @basic_insert_slice
-    // CHECK-COUNT-64: %{{[0-9]*}} = llvm.extractvalue %arg0[{{[0-9]*}}] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)>
-    // CHECK: %64 = llvm.mlir.undef : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
-    // CHECK-COUNT-8: %{{[0-9]*}} = llvm.insertvalue %{{[0-9]*}}, %{{[0-9]*}}[{{[0-9]*}}] : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
+  tt.func @extract_2d_blocked_tensor(%arg0: tensor<256x128xi32, #blocked1> {tt.divisibility = 16 : i32}) {
+    // CHECK-LABEL: llvm.func @extract_2d_blocked_tensor
+    // CHECK-COUNT-64: %{{.*}} = llvm.extractvalue %{{.*}} : !llvm.struct
+    // CHECK-COUNT-8: %{{.*}} = llvm.insertvalue %{{.*}} : !llvm.struct
     %72 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #blocked1> to tensor<256x16xi32, #blocked1>
     tt.return
   }
 }
+
+// -----
+
+#ll1 = #ttg.linear<{register = [[1, 0], [2, 0], [4, 0], [0, 16], [0, 32], [0, 64]], lane = [[0, 1], [0, 2], [0, 4], [0, 8], [8, 0], [16, 0]], warp = [[32, 0], [64, 0], [128, 0]], block = []}>
+#ll2 = #ttg.linear<{register = [[1, 0], [2, 0], [4, 0]], lane = [[0, 1], [0, 2], [0, 4], [0, 8], [8, 0], [16, 0]], warp = [[32, 0], [64, 0], [128, 0]], block = []}>
+
+module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @extract_2d_linear_tensor(%arg0: tensor<256x128xi32, #ll1> {tt.divisibility = 16 : i32}) {
+    // CHECK-LABEL: llvm.func @extract_2d_linear_tensor
+    // CHECK-COUNT-64: %{{.*}} = llvm.extractvalue %arg0[{{[0-9]*}}] : !llvm.struct
+    // CHECK-COUNT-8: %{{.*}} = llvm.insertvalue %{{.*}} : !llvm.struct
+    %72 = amdgpu.extract_slice %arg0 [0,0] : tensor<256x128xi32, #ll1> to tensor<256x16xi32, #ll2>
+    tt.return
+  }
+}
+
+// -----
+
+#ll1 = #ttg.linear<{register = [[0, 1, 0], [0, 2, 0], [0, 4, 0], [0, 0, 16], [0, 0, 32], [0, 0, 64], [1, 0, 0]], lane = [[0, 0, 1], [0, 0, 2], [0, 0, 4], [0, 0, 8], [0, 8, 0], [0, 16, 0]], warp = [[0, 32, 0], [0, 64, 0], [0, 128, 0]], block = []}>
+#ll2 = #ttg.linear<{register = [[0, 1, 0], [0, 2, 0], [0, 4, 0], [0, 0, 16], [0, 0, 32], [0, 0, 64]], lane = [[0, 0, 1], [0, 0, 2], [0, 0, 4], [0, 0, 8], [0, 8, 0], [0, 16, 0]], warp = [[0, 32, 0], [0, 64, 0], [0, 128, 0]], block = []}>
+
+module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @extract_3d_linear_tensor(%arg0: tensor<2x256x128xi32, #ll1> {tt.divisibility = 16 : i32}) {
+    // CHECK-LABEL: llvm.func @extract_3d_linear_tensor
+    // CHECK-COUNT-128: %{{.*}} = llvm.extractvalue %arg0[{{.*}}] : !llvm.struct
+    // CHECK-COUNT-64: %{{[0-9]*}} = llvm.insertvalue %{{.*}} : !llvm.struct
+    %72 = amdgpu.extract_slice %arg0 [0,0,0] : tensor<2x256x128xi32, #ll1> to tensor<1x256x128xi32, #ll2>
+    tt.return
+  }
+}
+
+// -----
+
+#ll1 = #ttg.linear<{register=[[1], [256], [512]], lane=[[2], [4], [8], [16], [32], [64]], warp=[[128]], block=[]}>
+#ll2 = #ttg.linear<{register=[[1]], lane=[[2], [4], [8], [16], [32], [64]], warp=[[128]], block=[]}>
+module attributes {"ttg.compute-capability" = 0 : i32, "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @extract_1d_linear_tensor(%arg0: tensor<1024xi32, #ll1> {tt.divisibility = 16 : i32}) {
+    // CHECK-LABEL: llvm.func @extract_1d_linear_tensor
+    // CHECK-COUNT-8: %{{.*}} = llvm.extractvalue %arg0[{{.*}}] : !llvm.struct
+    // CHECK-COUNT-2: %{{[0-9]*}} = llvm.insertvalue %{{.*}} : !llvm.struct
+    %72 = amdgpu.extract_slice %arg0 [0] : tensor<1024xi32, #ll1> to tensor<256xi32, #ll2>
+    tt.return
+  }
+}

third_party/amd/include/Dialect/TritonAMDGPU/IR/TritonAMDGPUOps.td

Lines changed: 53 additions & 5 deletions
@@ -65,6 +65,53 @@ def ExtractSliceOp : TT_AMDGPU_Op<"extract_slice", [Pure]> {
     * source: the base tensor on which to create a view tensor
     * offsets: offsets into the base tensor at which to create the view
 
+    In distributed layouts, tensors are divided into CTA tiles.
+    A CTA tile represents the smallest contiguous portion of a tensor that is
+    distributed across all threads and warps within a workgroup.
+    The ExtractSlice operation extracts a portion of the tensor that is a
+    multiple of CTA tiles.
+
+    The source and destination must have matching linear layouts at the CTA
+    tile level. This ensures that the extract_slice is a no-op, meaning no data
+    rearrangement between threads is required to extract the destination tensor
+    with the given shape and layout.
+
+    +-------+-------+
+    |  W0   |  W1   |
+    |       |       |
+    |   +   |   +   |
+    |  W2   |  W3   |  <-- Single CTA tile (distributed across warps W0-W3)
+    |       |       |
+    |   +   |   +   |
+    |       |       |
+    +-------+-------+
+    |   Source Tensor       Extracted Slice
+    |         .             +--------------+
+    |         .             |  W0   |  W1  |
+    |         .             |       |      |
+    |                       |   +   |  +   |
+    |                       |  W2   |  W3  |
+    |                       |       |      |
+    |                       |   +   |  +   |
+    |                       |       |      |
+    |                       +-------+------+
+    |                       |  W0   |  W1  |
+    |                       |       |      |
+    |                       |   +   |  +   |
+    |                       |  W2      W3  |
+    |                       |       |      |
+    |                       |   +   |  +   |
+    |                       |       |      |
+    |                       +--------------+
+
+
+    This op is designed to work on logical tensors directly, avoiding the need
+    for complex layout reinterpretation or reshaping. For example, the tt.split
+    operation only supports splitting along the innermost dimension,
+    and requires that the resulting innermost dimension provide 2 elements per thread,
+    distributed across registers. In contrast, extract_slice op imposes no constraints
+    on the extraction dimension or the size of dimensions.
+
     Example 1:
 
     ```mlir
@@ -80,11 +127,11 @@ def ExtractSliceOp : TT_AMDGPU_Op<"extract_slice", [Pure]> {
     ```
 
     Example 1 shows how "extract_slice" operation may be used. In this example a
-    new slice of 128x32 is created. "extract_slice" works on tensors with layout
-    where the desired slice has the same layout as the source tensor.
-    "%0" cannot be sliced directly as the resulting slice cannot have the same
-    layout as "%0". Therefore it needs to be converted to a layout suitable
-    for slicing. "#blocked1" layout is appropriate for this as it keeps the
+    new slice of 128x32 is created. "extract_slice" works on tensors
+    where the desired slice has the same layout on a CTA tile as the source tensor.
+    "%0" cannot be sliced directly as the resulting slice does not satisfy this condition.
+    Therefore it needs to be converted to a layout suitable for slicing.
+    "#blocked1" layout is appropriate for this as it keeps the
     sizePerThread the same thus keeping coalescing properties the same.
     In order to utilize all threads in a warp, "threadsPerWarp" is set to
     [16,4] for this new layout. This layout conversion carried out before
@@ -117,6 +164,7 @@ def ExtractSliceOp : TT_AMDGPU_Op<"extract_slice", [Pure]> {
   }];
 
   let hasVerifier = 1;
+  let hasCanonicalizer = 1;
 }
 
 def ConcatOp : TT_AMDGPU_Op<"concat", [Pure]> {
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+#ifndef TRITON_THIRD_PARTY_AMD_INCLUDE_UTILS_UTILITY_H_
+#define TRITON_THIRD_PARTY_AMD_INCLUDE_UTILS_UTILITY_H_
+
+#include "llvm/ADT/ArrayRef.h"
+#include <cassert>
+#include <vector>
+namespace mlir::LLVM::AMD {
+
+template <typename T, typename U, typename BinaryOp>
+std::vector<unsigned> multiDimElementwise(const ArrayRef<T> &lhs,
+                                          const ArrayRef<U> &rhs, BinaryOp op) {
+  assert(lhs.size() == rhs.size() && "Input dimensions must match");
+  std::vector<unsigned> result;
+  result.reserve(lhs.size());
+  for (size_t i = 0, n = lhs.size(); i < n; ++i) {
+    unsigned a = static_cast<unsigned>(lhs[i]);
+    unsigned b = static_cast<unsigned>(rhs[i]);
+    result.push_back(op(a, b));
+  }
+  return result;
+}
+} // namespace mlir::LLVM::AMD
+#endif // TRITON_THIRD_PARTY_AMD_INCLUDE_UTILS_UTILITY_H_
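
The new helper is generic over the index element types. A usage sketch (ours, not part of the PR — the values and `std::divides` are illustrative, and the header path is inferred from the include guard) could count CTA tiles per dimension:

```cpp
#include <cstdint>
#include <functional>
#include <vector>

#include "llvm/ADT/ArrayRef.h"
// #include "third_party/amd/include/Utils/Utility.h"  // assumed path

int main() {
  // Tensor shape and CTA tile shape matching the 2D blocked tests above.
  std::vector<int64_t> shape{256, 128};
  std::vector<int64_t> ctaTile{256, 16};
  // Elementwise division: {256/256, 128/16} == {1, 8} CTA tiles per dim.
  std::vector<unsigned> tilesPerDim = mlir::LLVM::AMD::multiDimElementwise(
      llvm::ArrayRef<int64_t>(shape), llvm::ArrayRef<int64_t>(ctaTile),
      std::divides<unsigned>());
  return tilesPerDim[1] == 8 ? 0 : 1;  // sanity check
}
```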
