
Commit 832c86a

WillFroom authored and Google-ML-Automation committed

[XLA:CPU][XTile] Add lowering for StableHLO DotGeneral.

PiperOrigin-RevId: 820214413

1 parent: 14a5144
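
For context: stablehlo.dot_general is a generalized matrix multiply whose batch and contracting dimensions are given explicitly as dimension numbers, and whose result dimensions are laid out as (batch..., lhs free..., rhs free...). A minimal NumPy sketch of these semantics (illustrative only, not XLA code):

import numpy as np

# One batch dim and one contracting dim per side:
# lhs_batch = [0], rhs_batch = [0], lhs_contracting = [2], rhs_contracting = [1].
lhs = np.random.rand(4, 8, 16).astype(np.float32)
rhs = np.random.rand(4, 16, 8).astype(np.float32)

result = np.einsum("bik,bkj->bij", lhs, rhs)
assert result.shape == (4, 8, 8)  # (batch, lhs free, rhs free)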

3 files changed (+245, -4)

xla/backends/cpu/codegen/tiled/tiled_kernel_test.py
Lines changed: 71 additions & 3 deletions
@@ -33,6 +33,7 @@ def compare_kernel(
     output_shape: tuple[int, ...],
     dtype,
     expected_output: Callable[[np.ndarray, ...], np.ndarray],
+    exact: bool = True,
 ) -> None:
   mlir_emitter = cpu_testlib.MlirTestKernelEmitter(
       ir, kernel_name, (num_workgroups, 1, 1)
@@ -49,9 +50,14 @@ def compare_kernel(
   output_tensor = create_literal(np.zeros(output_shape, dtype=dtype))
   runner.call(input_tensors + [output_tensor])

-  np.testing.assert_array_equal(
-      np.asarray(output_tensor), expected_output(*inputs)
-  )
+  if exact:
+    np.testing.assert_array_equal(
+        np.asarray(output_tensor), expected_output(*inputs)
+    )
+  else:
+    np.testing.assert_array_almost_equal(
+        np.asarray(output_tensor), expected_output(*inputs)
+    )


 class XtileLoweringTest(absltest.TestCase):
@@ -139,6 +145,68 @@ def test_add_tranpose(self):
         lambda arg: arg + arg.transpose(),
     )

+  def test_dot_single_tile(self):
+    ir = """
+      module @dot_single_tile {
+        xtile.entry_func @dot_single_tile(
+            %lhs: memref<8x16xf32>,
+            %rhs: memref<16x8xf32>,
+            %output: memref<8x8xf32>,
+            %tile_id: index) attributes {xtile.tiling_info = #xtile.tiling_info<tile_count:1, tiles_per_workgroup:1>} {
+          %offset = arith.constant 0 : index
+          %lhs_tile = xtile.extract %lhs[%offset, %offset][8, 16][1, 1] : memref<8x16xf32> -> tensor<8x16xf32>
+          %rhs_tile = xtile.extract %rhs[%offset, %offset][16, 8][1, 1] : memref<16x8xf32> -> tensor<16x8xf32>
+          %result = stablehlo.dot_general %lhs_tile, %rhs_tile, contracting_dims = [1] x [0] : (tensor<8x16xf32>, tensor<16x8xf32>) -> tensor<8x8xf32>
+          xtile.insert %result into %output[%offset, %offset][8, 8][1, 1] : tensor<8x8xf32> -> memref<8x8xf32>
+          xtile.return
+        }
+      }
+    """
+
+    compare_kernel(
+        ir,
+        "dot_single_tile",
+        1,
+        [(8, 16), (16, 8)],
+        (8, 8),
+        np.float32,
+        lambda lhs, rhs: lhs @ rhs,
+        False,
+    )
+
+  def test_dot_fusion_single_tile(self):
+    ir = """
+      module @dot_fusion_single_tile {
+        xtile.entry_func @dot_fusion_single_tile(
+            %lhs_0: memref<8x16xf32>,
+            %lhs_1: memref<8x16xf32>,
+            %rhs: memref<16x1xf32>,
+            %output: memref<8x1xf32>,
+            %tile_id: index) attributes {xtile.tiling_info = #xtile.tiling_info<tile_count:1, tiles_per_workgroup:1>} {
+          %offset = arith.constant 0 : index
+          %lhs_0_tile = xtile.extract %lhs_0[%offset, %offset][8, 16][1, 1] : memref<8x16xf32> -> tensor<8x16xf32>
+          %lhs_1_tile = xtile.extract %lhs_1[%offset, %offset][8, 16][1, 1] : memref<8x16xf32> -> tensor<8x16xf32>
+          %add_lhs = arith.addf %lhs_0_tile, %lhs_1_tile : tensor<8x16xf32>
+          %rhs_tile = xtile.extract %rhs[%offset, %offset][16, 1][1, 1] : memref<16x1xf32> -> tensor<16xf32>
+          %result = stablehlo.dot_general %add_lhs, %rhs_tile, contracting_dims = [1] x [0] : (tensor<8x16xf32>, tensor<16xf32>) -> tensor<8xf32>
+          %tanh_result = math.tanh %result : tensor<8xf32>
+          xtile.insert %tanh_result into %output[%offset, %offset][8, 1][1, 1] : tensor<8xf32> -> memref<8x1xf32>
+          xtile.return
+        }
+      }
+    """
+
+    compare_kernel(
+        ir,
+        "dot_fusion_single_tile",
+        1,
+        [(8, 16), (8, 16), (16, 1)],
+        (8, 1),
+        np.float32,
+        lambda lhs_0, lhs_1, rhs: np.tanh((lhs_0 + lhs_1) @ rhs),
+        False,
+    )
+

 if __name__ == "__main__":
   absltest.main()
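
A note on the new exact flag: the dot tests above pass exact=False because a vectorized float32 contraction may accumulate partial products in a different order than the NumPy reference, so bit-exact comparison is too strict. A minimal sketch of the effect (illustrative only; decimal=5 is chosen here for headroom, while assert_array_almost_equal defaults to 6 decimal places):

import numpy as np

lhs = np.random.rand(8, 16).astype(np.float32)
rhs = np.random.rand(16, 8).astype(np.float32)

# The same matmul computed with two different accumulation orders:
reference = lhs @ rhs
reordered = np.sum(lhs[:, :, None] * rhs[None, :, :], axis=1, dtype=np.float32)

# Close to machine precision, but not necessarily bit-identical:
np.testing.assert_array_almost_equal(reordered, reference, decimal=5)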

xla/backends/cpu/codegen/tiled/transforms/shlo_to_vector.cc
Lines changed: 158 additions & 1 deletion
@@ -14,13 +14,18 @@ limitations under the License.
 ==============================================================================*/

 #include <cassert>
+#include <cstdint>
 #include <memory>
 #include <utility>

+#include "llvm/ADT/ArrayRef.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"  // IWYU pragma: keep
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/BuiltinTypes.h"
@@ -42,6 +47,158 @@ namespace xla::cpu {

 namespace {

+mlir::VectorType GetVectorType(mlir::RankedTensorType tensor_type) {
+  return mlir::VectorType::get(tensor_type.getShape(),
+                               tensor_type.getElementType());
+}
+
+mlir::TypedValue<mlir::VectorType> CastToVector(
+    mlir::PatternRewriter& rewriter,
+    mlir::TypedValue<mlir::RankedTensorType> tensor_value) {
+  auto vector_type = GetVectorType(tensor_value.getType());
+  auto cast_op = rewriter.create<mlir::UnrealizedConversionCastOp>(
+      tensor_value.getLoc(), vector_type, tensor_value);
+  return mlir::cast<mlir::TypedValue<mlir::VectorType>>(cast_op.getResult(0));
+}
+
+mlir::AffineMapAttr GetOperandIndexingMap(
+    mlir::OpBuilder& builder, int64_t iterator_count, int64_t rank,
+    llvm::ArrayRef<int64_t> batch_dims,
+    llvm::ArrayRef<int64_t> contracting_dims, int64_t free_dim_offset) {
+  llvm::SmallVector<unsigned> targets(rank, -1);
+  unsigned idx = 0;
+  for (int64_t dim : batch_dims) {
+    targets[dim] = idx++;
+  }
+  for (int64_t dim : contracting_dims) {
+    targets[dim] = idx++;
+  }
+  for (unsigned& target : targets) {
+    if (target == -1) {
+      target = free_dim_offset + idx++;
+    }
+  }
+  auto affine_map = mlir::AffineMap::getMultiDimMapWithTargets(
+      iterator_count, targets, builder.getContext());
+
+  return mlir::AffineMapAttr::get(affine_map);
+}
+
+mlir::AffineMapAttr GetOutputIndexingMap(mlir::OpBuilder& builder,
+                                         int64_t iterator_count,
+                                         int64_t batch_dim_count,
+                                         int64_t contracting_dim_count) {
+  llvm::SmallVector<unsigned> targets(iterator_count - contracting_dim_count);
+  unsigned idx = 0;
+  for (int64_t dim = 0; dim != batch_dim_count; ++dim) {
+    targets[dim] = idx++;
+  }
+  idx += contracting_dim_count;
+  int64_t total_free_dims =
+      iterator_count - batch_dim_count - contracting_dim_count;
+  for (int64_t dim = 0; dim != total_free_dims; ++dim) {
+    targets[batch_dim_count + dim] = idx++;
+  }
+  auto affine_map = mlir::AffineMap::getMultiDimMapWithTargets(
+      iterator_count, targets, builder.getContext());
+
+  return mlir::AffineMapAttr::get(affine_map);
+}
+
+mlir::ArrayAttr GetIteratorTypes(mlir::OpBuilder& builder,
+                                 int64_t iterator_count,
+                                 int64_t batch_dim_count,
+                                 int64_t contracting_dim_count) {
+  llvm::SmallVector<mlir::Attribute> iterator_types;
+  iterator_types.reserve(iterator_count);
+  for (int64_t dim = 0; dim != batch_dim_count; ++dim) {
+    iterator_types.push_back(builder.getAttr<mlir::vector::IteratorTypeAttr>(
+        mlir::vector::IteratorType::parallel));
+  }
+  for (int64_t dim = 0; dim != contracting_dim_count; ++dim) {
+    iterator_types.push_back(builder.getAttr<mlir::vector::IteratorTypeAttr>(
+        mlir::vector::IteratorType::reduction));
+  }
+  int64_t free_dims = iterator_count - batch_dim_count - contracting_dim_count;
+  for (int64_t dim = 0; dim != free_dims; ++dim) {
+    iterator_types.push_back(builder.getAttr<mlir::vector::IteratorTypeAttr>(
+        mlir::vector::IteratorType::parallel));
+  }
+
+  return mlir::ArrayAttr::get(builder.getContext(), iterator_types);
+}
+
+// Lowers stablehlo.dot_general to vector.contract.
+// vector.contract is very general, as described here:
+// https://mlir.llvm.org/docs/Dialects/Vector/#vectorcontract-vectorcontractionop
+// In this lowering the iterator order passed via the attribute is of the form:
+// (batch..., contracting..., free_lhs..., free_rhs...)
+// TODO(willfroom): Check if there is any performance impact from the order.
+struct LowerDotGeneral : mlir::OpRewritePattern<mlir::stablehlo::DotGeneralOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  mlir::LogicalResult matchAndRewrite(
+      mlir::stablehlo::DotGeneralOp op,
+      mlir::PatternRewriter& rewriter) const override {
+    auto lhs_vector = CastToVector(rewriter, op.getLhs());
+    auto lhs_rank = lhs_vector.getType().getRank();
+
+    auto rhs_vector = CastToVector(rewriter, op.getRhs());
+    auto rhs_rank = rhs_vector.getType().getRank();
+
+    auto result_vector_type = GetVectorType(op.getResult().getType());
+    auto zero_const = rewriter.create<mlir::arith::ConstantOp>(
+        op->getLoc(), result_vector_type.getElementType(),
+        rewriter.getZeroAttr(result_vector_type.getElementType()));
+    // TODO(willfroom): Ensure this is being folded into the accumulator in the
+    // dot loop.
+    mlir::Value accumulator = rewriter.create<mlir::vector::BroadcastOp>(
+        op->getLoc(), result_vector_type, zero_const);
+
+    mlir::stablehlo::DotDimensionNumbersAttr dimension_numbers =
+        op.getDotDimensionNumbers();
+
+    llvm::ArrayRef<int64_t> lhs_batch =
+        dimension_numbers.getLhsBatchingDimensions();
+    llvm::ArrayRef<int64_t> lhs_contracting =
+        dimension_numbers.getLhsContractingDimensions();
+
+    llvm::ArrayRef<int64_t> rhs_batch =
+        dimension_numbers.getRhsBatchingDimensions();
+    llvm::ArrayRef<int64_t> rhs_contracting =
+        dimension_numbers.getRhsContractingDimensions();
+
+    int64_t lhs_free_dims =
+        lhs_rank - lhs_batch.size() - lhs_contracting.size();
+    int64_t rhs_free_dims =
+        rhs_rank - rhs_batch.size() - rhs_contracting.size();
+    int64_t iterator_count = lhs_batch.size() + lhs_contracting.size() +
+                             lhs_free_dims + rhs_free_dims;
+
+    mlir::Attribute lhs_indexing_map = GetOperandIndexingMap(
+        rewriter, iterator_count, lhs_rank, lhs_batch, lhs_contracting, 0);
+    mlir::Attribute rhs_indexing_map =
+        GetOperandIndexingMap(rewriter, iterator_count, rhs_rank, rhs_batch,
+                              rhs_contracting, lhs_free_dims);
+    mlir::Attribute output_indexing_map = GetOutputIndexingMap(
+        rewriter, iterator_count, lhs_batch.size(), lhs_contracting.size());
+
+    mlir::ArrayAttr indexing_maps = rewriter.getArrayAttr(
+        {lhs_indexing_map, rhs_indexing_map, output_indexing_map});
+    mlir::ArrayAttr iterator_types = GetIteratorTypes(
+        rewriter, iterator_count, lhs_batch.size(), lhs_contracting.size());
+
+    mlir::Value result_vector = rewriter.create<mlir::vector::ContractionOp>(
+        op->getLoc(), lhs_vector, rhs_vector, accumulator, indexing_maps,
+        iterator_types);
+
+    rewriter.replaceOpWithNewOp<mlir::UnrealizedConversionCastOp>(
+        op, op.getResult().getType(), result_vector);
+
+    return mlir::success();
+  }
+};
+
 struct LowerTranspose : mlir::OpRewritePattern<mlir::stablehlo::TransposeOp> {
   using OpRewritePattern::OpRewritePattern;

@@ -79,7 +236,7 @@ class ShloToVectorPass : public impl::ShloToVectorPassBase<ShloToVectorPass> {
   void runOnOperation() override {
     mlir::MLIRContext* context = &getContext();
     mlir::RewritePatternSet patterns(context);
-    patterns.add<LowerTranspose>(context);
+    patterns.add<LowerTranspose, LowerDotGeneral>(context);
     if (mlir::failed(
             mlir::applyPatternsGreedily(getOperation(), std::move(patterns)))) {
       signalPassFailure();
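
To make the indexing-map construction above concrete, here is a plain-Python mirror of the target assignment in GetOperandIndexingMap (a sketch under the iterator order (batch..., contracting..., free_lhs..., free_rhs...); the function name is ours, not from the commit):

def operand_indexing_targets(rank, batch_dims, contracting_dims, free_dim_offset):
  # Each operand dimension is mapped to one iterator: batch iterators first,
  # then contracting iterators, then free iterators shifted by free_dim_offset
  # (0 for the LHS, lhs_free_dims for the RHS).
  targets = [None] * rank
  idx = 0
  for dim in batch_dims:
    targets[dim] = idx
    idx += 1
  for dim in contracting_dims:
    targets[dim] = idx
    idx += 1
  for dim in range(rank):
    if targets[dim] is None:
      targets[dim] = free_dim_offset + idx
      idx += 1
  return targets

# A plain matmul (no batch dims) reproduces the maps in the .mlir test below:
assert operand_indexing_targets(2, [], [1], 0) == [1, 0]  # lhs: (d1, d0)
assert operand_indexing_targets(2, [], [0], 1) == [0, 2]  # rhs: (d0, d2)
# With a leading batch dim on a rank-3 LHS contracting its last dim:
assert operand_indexing_targets(3, [0], [2], 0) == [0, 2, 1]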

xla/backends/cpu/codegen/tiled/transforms/tests/shlo_to_vector.mlir
Lines changed: 16 additions & 0 deletions
@@ -6,3 +6,19 @@ func.func @transpose(%input : tensor<1024x32xf32>) -> tensor<32x1024xf32> {
   return %transposed : tensor<32x1024xf32>
 }
 // -----
+
+// CHECK-DAG: #[[LHS_MAP:.*]] = affine_map<(d0, d1, d2) -> (d1, d0)>
+// CHECK-DAG: #[[RHS_MAP:.*]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+// CHECK-DAG: #[[OUTPUT_MAP:.*]] = affine_map<(d0, d1, d2) -> (d1, d2)>
+func.func @dot_general(%lhs : tensor<1024x32xf32>, %rhs : tensor<32x1024xf32>) -> tensor<1024x1024xf32> {
+  // CHECK: %[[ACCUMULATOR:.*]] = arith.constant dense<0.000000e+00> : vector<1024x1024xf32>
+  // CHECK: vector.contract
+  // CHECK-SAME: {indexing_maps = [#[[LHS_MAP]], #[[RHS_MAP]], #[[OUTPUT_MAP]]],
+  // CHECK-SAME: iterator_types = ["reduction", "parallel", "parallel"],
+  // CHECK-SAME: kind = #vector.kind<add>}
+  // CHECK-SAME: %[[ACCUMULATOR]] : vector<1024x32xf32>, vector<32x1024xf32> into vector<1024x1024xf32>
+  %result = stablehlo.dot_general %lhs, %rhs, contracting_dims = [1] x [0] : (tensor<1024x32xf32>, tensor<32x1024xf32>) -> tensor<1024x1024xf32>
+  return %result : tensor<1024x1024xf32>
+}
+
+// -----
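
Reading the expected maps back: with d0 the reduction iterator and d1/d2 parallel, the contraction gathers lhs(d1, d0) and rhs(d0, d2) into out(d1, d2), i.e. the ordinary matmul out[i, j] = sum over k of lhs[i, k] * rhs[k, j]. A quick NumPy cross-check of that reading (illustrative only, not part of the commit):

import numpy as np

lhs = np.random.rand(1024, 32).astype(np.float32)
rhs = np.random.rand(32, 1024).astype(np.float32)

np.testing.assert_array_almost_equal(
    np.einsum("ik,kj->ij", lhs, rhs), lhs @ rhs, decimal=3
)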
