[VectorExt] Add generic vectorization to iree_vector_ext.transfer_gather (#20476)

Groverkss · web-flow · commit 73c74628ed81 · 2025-04-23T19:12:53.000+01:00
This patch teaches GenericVectorization to vectorize certain gather-like
linalg.generic operations to iree_vector_ext.transfer_gather. The support
is gated behind the new `vectorize-to-transfer-gather` pass option, which
defaults to false, so it should not affect any existing pipelines.
diff --git a/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel
@@ -191,6 +191,7 @@ iree_compiler_cc_library(
         "//compiler/src/iree/compiler/Codegen/Dialect/GPU/IR:IREEGPUDialect",
         "//compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils:KnownTargets",
         "//compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR:IREEVectorExtDialect",
+        "//compiler/src/iree/compiler/Codegen/Dialect/VectorExt/Transforms:VectorExtTransforms",
         "//compiler/src/iree/compiler/Codegen/Interfaces:BufferizationInterfaces",
         "//compiler/src/iree/compiler/Codegen/Interfaces:PartitionableLoopsInterface",
         "//compiler/src/iree/compiler/Codegen/Interfaces:UKernelOpInterface",
diff --git a/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt
@@ -226,6 +226,7 @@ iree_cc_library(
     iree::compiler::Codegen::Dialect::GPU::IR::IREEGPUDialect
     iree::compiler::Codegen::Dialect::GPU::TargetUtils::KnownTargets
     iree::compiler::Codegen::Dialect::VectorExt::IR::IREEVectorExtDialect
+    iree::compiler::Codegen::Dialect::VectorExt::Transforms::VectorExtTransforms
     iree::compiler::Codegen::Interfaces::BufferizationInterfaces
     iree::compiler::Codegen::Interfaces::PartitionableLoopsInterface
     iree::compiler::Codegen::Interfaces::UKernelOpInterface
diff --git a/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp b/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp
@@ -6,6 +6,8 @@
 
 #include "iree/compiler/Codegen/Common/Passes.h"
 #include "iree/compiler/Codegen/Common/TileSizeSelection.h"
+#include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.h"
+#include "iree/compiler/Codegen/Dialect/VectorExt/Transforms/Transforms.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
 #include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
@@ -99,8 +101,9 @@ class GenericVectorizationPass final
       GenericVectorizationPass>::GenericVectorizationPassBase;
 
   void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<tensor::TensorDialect, linalg::LinalgDialect,
-                    vector::VectorDialect>();
+    registry
+        .insert<tensor::TensorDialect, linalg::LinalgDialect,
+                vector::VectorDialect, IREE::VectorExt::IREEVectorExtDialect>();
   }
   void runOnOperation() override;
 };
@@ -156,8 +159,16 @@ void GenericVectorizationPass::runOnOperation() {
     }
     // Pad scalable dims with `false` to match the vector sizes.
     scalableVecDims.resize(vectorSizes.size());
-    (void)linalg::vectorize(rewriter, op, vectorSizes, scalableVecDims,
-                            vectorizeGatherAccesses);
+
+    // Try to vectorize to transfer_gather, if possible.
+    if (isa<linalg::GenericOp>(op) && vectorizeToTransferGather) {
+      (void)IREE::VectorExt::vectorizeGatherLikeGenericToTransferGather(
+          rewriter, cast<linalg::GenericOp>(op), vectorSizes, scalableVecDims,
+          vectorizeGatherAccesses);
+    } else {
+      (void)linalg::vectorize(rewriter, op, vectorSizes, scalableVecDims,
+                              vectorizeGatherAccesses);
+    }
   };
 
   {
diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.td b/compiler/src/iree/compiler/Codegen/Common/Passes.td
@@ -352,6 +352,8 @@ def GenericVectorizationPass :
       "Rewrite all tensor.pad ops in the function to vector form.">,
     Option<"vectorizeGatherAccesses", "vectorize-gather-accesses", "bool", /*default=*/"false",
       "Enable vectorizaiton of operations that may generate vector.gather operations.">,
+    Option<"vectorizeToTransferGather", "vectorize-to-transfer-gather", "bool", /*default=*/"false",
+      "Enables vectorization of gather-like operations that may generate iree_vector_ext.transfer_gather">,
     Option<"enableCleanup", "enable-cleanup", "bool",/*default=*/"true",
       "Enable cleanups after vectorization. The patterns touch the structure"
       "generated from tiling so it affects later steps like bufferization and vector hoisting.">,
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir b/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir
@@ -1,6 +1,7 @@
 // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-generic-vectorization))" --split-input-file %s | FileCheck %s
 // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-generic-vectorization{enable-vector-masking=true vectorize-padding=true}))" --split-input-file %s | FileCheck %s -check-prefix=CHECK-MASK
 // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-generic-vectorization{fold-cast-into-contract=true}))" --split-input-file %s | FileCheck %s -check-prefix=CHECK-FOLD
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-generic-vectorization{vectorize-to-transfer-gather=true}))" --split-input-file %s | FileCheck %s --check-prefix=CHECK-GATHER
 
 func.func @matmul(%lhs: tensor<3x4xf16>, %rhs: tensor<4x5xf16>, %acc: tensor<3x5xf32>) -> tensor<3x5xf32> {
   %result = linalg.matmul ins(%lhs, %rhs: tensor<3x4xf16>, tensor<4x5xf16>) outs(%acc: tensor<3x5xf32>) -> tensor<3x5xf32>
@@ -533,3 +534,172 @@ func.func @depthwise_conv_fold_away_masking(%arg0: tensor<1x68x120x96xf32>, %arg
 // CHECK-MASK:     vector.fma
 // CHECK-MASK-NOT: vector.create_mask
 // CHECK-MASK-NOT: vector.constant_mask
+
+// -----
+
+!storage = tensor<8192x8xf16>
+!ind     = tensor<128xi64>
+!x       = tensor<128x8xf16>
+
+#gather = {
+    indexing_maps = [affine_map<(page, vec) -> (page)>,
+                     affine_map<(page, vec) -> (page, vec)>],
+    iterator_types = ["parallel", "parallel"]
+}
+
+func.func @paged_gather_read(%storage : !storage, %ind: !ind) -> !x {
+  %x = tensor.empty() : !x
+  %x_g = linalg.generic #gather
+         ins(%ind : !ind)
+         outs(%x : !x) {
+  ^bb0(%page: i64, %out: f16):
+    %pageidx = arith.index_cast %page : i64 to index
+    %vec   = linalg.index 1 : index
+    %extracted = tensor.extract %storage[%pageidx, %vec] : !storage
+    linalg.yield %extracted : f16
+  } -> !x
+  return %x_g : !x
+}
+
+// CHECK-GATHER-LABEL: @paged_gather_read
+// CHECK-GATHER-SAME: %[[ARG0:.+]]: tensor<8192x8xf16>, %[[ARG1:.+]]: tensor<128xi64>
+// CHECK-GATHER: %[[INDEX_LOAD:.+]] = vector.transfer_read %[[ARG1]]
+// CHECK-GATHER: %[[INDEX_CAST:.+]] = arith.index_cast %[[INDEX_LOAD]] : vector<128xi64> to vector<128xindex>
+// CHECK-GATHER: %[[GATHER:.+]] = iree_vector_ext.transfer_gather %[[ARG0]]
+// CHECK-GATHER-SAME: [%[[INDEX_CAST]]: vector<128xindex>, None]
+// CHECK-GATHER: vector.transfer_write %[[GATHER]], %{{.*}}
+
+// -----
+
+!storage = tensor<8192x8xf16>
+!x       = tensor<128x8xf16>
+
+#gather = {
+    indexing_maps = [affine_map<(page, vec) -> (page, vec)>],
+    iterator_types = ["parallel", "parallel"]
+}
+
+func.func @contiguous_gather_read(%storage : !storage) -> !x {
+  %x = tensor.empty() : !x
+  %x_g = linalg.generic #gather
+         outs(%x : !x) {
+  ^bb0(%out: f16):
+    %pageidx = linalg.index 0 : index
+    %vec   = linalg.index 1 : index
+    %extracted = tensor.extract %storage[%pageidx, %vec] : !storage
+    linalg.yield %extracted : f16
+  } -> !x
+  return %x_g : !x
+}
+
+// CHECK-GATHER-LABEL: @contiguous_gather_read
+// CHECK-GATHER-SAME: %[[ARG0:.+]]: tensor<8192x8xf16>
+// CHECK-GATHER: %[[GATHER:.+]] = iree_vector_ext.transfer_gather %[[ARG0]]
+// CHECK-GATHER-SAME: [None, None]
+// CHECK-GATHER: vector.transfer_write %[[GATHER]], %{{.*}}
+
+// -----
+
+!storage = tensor<8192x8xf16>
+!ind     = tensor<128xi64>
+!x       = tensor<128x8xf16>
+
+#gather = {
+    indexing_maps = [affine_map<(page, vec) -> (page)>,
+                     affine_map<(page, vec) -> (page, vec)>],
+    iterator_types = ["parallel", "parallel"]
+}
+
+func.func @negative_strided_paged_gather_read(%storage : !storage, %ind: !ind) -> !x {
+  %x = tensor.empty() : !x
+  %c2 = arith.constant 2 : index
+  %x_g = linalg.generic #gather
+         ins(%ind : !ind)
+         outs(%x : !x) {
+  ^bb0(%page: i64, %out: f16):
+    %pageidx = arith.index_cast %page : i64 to index
+    %vec   = linalg.index 1 : index
+    %strided_vec = arith.muli %vec, %c2 : index
+    %extracted = tensor.extract %storage[%pageidx, %strided_vec] : !storage
+    linalg.yield %extracted : f16
+  } -> !x
+  return %x_g : !x
+}
+
+// For now, the vectorizer does not walk back through binary ops to find a
+// mapping from the iteration space to the memory space. This can be improved in the future.
+// CHECK-GATHER-LABEL: @negative_strided_paged_gather_read
+// CHECK-GATHER: linalg.generic
+
+// -----
+
+!storage = tensor<8192x8xf16>
+!ind0     = tensor<128xi64>
+!ind1     = tensor<8xi64>
+!x       = tensor<128x8xf16>
+
+#gather = {
+    indexing_maps = [affine_map<(d0, d1) -> (d0)>,
+                     affine_map<(d0, d1) -> (d1)>,
+                     affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+}
+
+func.func @full_gather_read(%storage : !storage, %ind0: !ind0, %ind1 : !ind1) -> !x {
+  %x = tensor.empty() : !x
+  %x_g = linalg.generic #gather
+         ins(%ind0, %ind1 : !ind0, !ind1)
+         outs(%x : !x) {
+   ^bb0(%id0: i64, %id1 : i64, %out: f16):
+    %idx0 = arith.index_cast %id0 : i64 to index
+    %idx1 = arith.index_cast %id1 : i64 to index
+    %extracted = tensor.extract %storage[%idx0, %idx1] : !storage
+    linalg.yield %extracted : f16
+  } -> !x
+  return %x_g : !x
+}
+
+// CHECK-GATHER-LABEL: @full_gather_read
+// CHECK-GATHER-SAME: %[[ARG0:.+]]: tensor<8192x8xf16>, %[[ARG1:.+]]: tensor<128xi64>, %[[ARG2:.+]]: tensor<8xi64>
+// CHECK-GATHER-DAG: %[[IDX0:.+]] = vector.transfer_read %[[ARG1]]
+// CHECK-GATHER-DAG: %[[IDX1:.+]] = vector.transfer_read %[[ARG2]]
+// CHECK-GATHER-DAG: %[[CAST0:.+]] = arith.index_cast %[[IDX0]] : vector<128xi64> to vector<128xindex>
+// CHECK-GATHER-DAG: %[[CAST1:.+]] = arith.index_cast %[[IDX1]] : vector<8xi64> to vector<8xindex>
+// CHECK-GATHER-DAG: %[[GATHER:.+]] = iree_vector_ext.transfer_gather %[[ARG0]]
+// CHECK-GATHER-SAME: [%[[CAST0]]: vector<128xindex>, %[[CAST1]]: vector<8xindex>]
+// CHECK-GATHER: vector.transfer_write %[[GATHER]], %{{.*}}
+
+// -----
+
+!storage = tensor<8192x8xf16>
+!ind0     = tensor<128xi64>
+!ind1     = tensor<8xi64>
+!x       = tensor<128x8xf16>
+
+#gather = {
+    indexing_maps = [affine_map<(d0, d1) -> (d0)>,
+                     affine_map<(d0, d1) -> (d1)>,
+                     affine_map<(d0, d1) -> (d0, d1)>,
+                     affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+}
+
+func.func @multi_extract(%storage : !storage, %storage2: !storage, %ind0: !ind0, %ind1 : !ind1) -> ( !x, !x ) {
+  %x = tensor.empty() : !x
+  %x_g, %x_g1 = linalg.generic #gather
+         ins(%ind0, %ind1 : !ind0, !ind1)
+         outs(%x, %x : !x, !x) {
+   ^bb0(%id0: i64, %id1 : i64, %out: f16, %out2: f16):
+    %idx0 = arith.index_cast %id0 : i64 to index
+    %idx1 = arith.index_cast %id1 : i64 to index
+    %extracted = tensor.extract %storage[%idx0, %idx1] : !storage
+    %idx2 = arith.index_cast %id0 : i64 to index
+    %idx3 = arith.index_cast %id1 : i64 to index
+    %extracted1 = tensor.extract %storage2[%idx2, %idx3] : !storage
+    linalg.yield %extracted, %extracted1 : f16, f16
+  } -> (!x, !x)
+  return %x_g, %x_g1 : !x, !x
+}
+
+// CHECK-GATHER-LABEL: @multi_extract
+// CHECK-GATHER-COUNT-2: transfer_gather
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/Transforms/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/Transforms/BUILD.bazel
@@ -37,14 +37,18 @@ iree_compiler_cc_library(
     hdrs = [
         "Passes.h",
         "Passes.h.inc",
+        "Transforms.h",
     ],
     deps = [
         ":PassesIncGen",
         "//compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR:IREEVectorExtDialect",
+        "//compiler/src/iree/compiler/Dialect/LinalgExt/Utils",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:ArithDialect",
         "@llvm-project//mlir:FunctionInterfaces",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:LinalgDialect",
+        "@llvm-project//mlir:LinalgTransforms",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TensorDialect",
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/Transforms/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/Transforms/CMakeLists.txt
@@ -25,6 +25,7 @@ iree_cc_library(
   HDRS
     "Passes.h"
     "Passes.h.inc"
+    "Transforms.h"
   SRCS
     "Passes.cpp"
     "VectorExtFoldUnitExtentDims.cpp"
@@ -35,6 +36,8 @@ iree_cc_library(
     MLIRArithDialect
     MLIRFunctionInterfaces
     MLIRIR
+    MLIRLinalgDialect
+    MLIRLinalgTransforms
     MLIRPass
     MLIRSupport
     MLIRTensorDialect
@@ -44,6 +47,7 @@ iree_cc_library(
     MLIRVectorTransforms
     MLIRVectorUtils
     iree::compiler::Codegen::Dialect::VectorExt::IR::IREEVectorExtDialect
+    iree::compiler::Dialect::LinalgExt::Utils
   PUBLIC
 )
 
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/Transforms/Transforms.h b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/Transforms/Transforms.h
@@ -0,0 +1,34 @@
+// Copyright 2025 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_COMPILER_CODEGEN_DIALECT_VECTOR_EXT_TRANSFORMS_TRANSFORMS_H_
+#define IREE_COMPILER_CODEGEN_DIALECT_VECTOR_EXT_TRANSFORMS_TRANSFORMS_H_
+
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+namespace mlir::iree_compiler::IREE::VectorExt {
+
+/// Vectorizes a gather-like `linalg.generic`, aiming to turn the
+/// `tensor.extract` ops in its body into `iree_vector_ext.transfer_gather`.
+/// `vectorSizes`, `scalableVecDims` and `vectorizeNDExtract` are the values a
+/// caller would otherwise hand to `linalg::vectorize`.
+/// NOTE(review): the exact failure conditions live in the implementation,
+/// which is not visible from this header -- confirm before relying on them.
+LogicalResult vectorizeGatherLikeGenericToTransferGather(
+    RewriterBase &rewriter, linalg::GenericOp linalgOp,
+    ArrayRef<int64_t> vectorSizes = {}, ArrayRef<bool> scalableVecDims = {},
+    bool vectorizeNDExtract = false);
+
+/// Adds rewrite patterns to `patterns`.
+/// NOTE(review): presumably these lower `iree_vector_ext.transfer_gather`;
+/// the definition is not visible from this header -- confirm.
+void populateVectorTransferGatherLoweringPatterns(RewritePatternSet &patterns);
+
+} // namespace mlir::iree_compiler::IREE::VectorExt
+
+#endif // IREE_COMPILER_CODEGEN_DIALECT_VECTOR_EXT_TRANSFORMS_TRANSFORMS_H_
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/Transforms/VectorizeIREEVectorExtOps.cpp b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/Transforms/VectorizeIREEVectorExtOps.cpp