Commit 3a5c4da

bythew3i authored and Google-ML-Automation committed
[Mosaic TPU] Support i32 vector multi reduction except cross lane.
PiperOrigin-RevId: 707708236
1 parent 6bcec91 commit 3a5c4da
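
In practice, this commit lets a Pallas TPU kernel reduce int32 data along a non-minormost axis (cross-lane reductions over the minormost axis remain unsupported). Below is a minimal sketch of the kind of kernel this enables, mirroring the test added in this commit; the kernel and variable names are illustrative, not part of the change itself.

```python
import jax
import jax.numpy as jnp
from jax.experimental import pallas as pl

def kernel(x_ref, o_ref):
  # int32 sum over axis 0; reducing over the minormost (lane) axis
  # is still not supported for int32.
  o_ref[:] = jnp.sum(x_ref[:], axis=0, keepdims=True)

x = jnp.arange(2 * 16 * 128, dtype=jnp.int32).reshape(2, 16, 128)
out = pl.pallas_call(
    kernel,
    out_shape=jax.ShapeDtypeStruct((1, 16, 128), jnp.int32),
)(x)
```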

File tree

4 files changed: +91 lines, -22 lines


jax/_src/pallas/mosaic/lowering.py

Lines changed: 7 additions & 3 deletions
@@ -1459,10 +1459,14 @@ def _proxy_fun(val, *, axes):
       kind = type_to_kind[jnp.floating]
       val = type_to_identity[jnp.floating]
       val = ir.FloatAttr.get(aval_to_ir_type(x_aval, shape=()), val)
-    elif jnp.issubdtype(x_aval.dtype, jnp.signedinteger):
-      raise NotImplementedError("Reductions over integers not implemented.")
+    elif x_aval.dtype == jnp.int32:
+      kind = type_to_kind[jnp.signedinteger]
+      val = type_to_identity[jnp.signedinteger]
+      val = ir.IntegerAttr.get(ir.IntegerType.get_signless(32), val)
     elif jnp.issubdtype(x_aval.dtype, jnp.unsignedinteger):
-      raise NotImplementedError("Reductions over integers not implemented.")
+      raise NotImplementedError(
+          "Reductions over unsigned integers not implemented."
+      )
     else:
       raise NotImplementedError(
           f"Reductions over {x_aval.dtype} not implemented.")

jaxlib/mosaic/dialect/tpu/transforms/apply_vector_layout.cc

Lines changed: 51 additions & 19 deletions
@@ -51,9 +51,11 @@
 #include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/types/span.h"
+#include "llvm/include/llvm/ADT/APInt.h"
 #include "mlir/include/mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/include/mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/include/mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/include/mlir/IR/Attributes.h"
 #include "mlir/include/mlir/IR/Builders.h"
 #include "mlir/include/mlir/IR/ImplicitLocOpBuilder.h"
 #include "mlir/include/mlir/IR/OperationSupport.h"
@@ -554,7 +556,7 @@ VectorType getNativeVregType(Type elem_ty,
 FailureOr<Value> maskOOB(RewriteContext &ctx, OpBuilder &builder,
                          TypedValue<VectorType> value,
                          const VRegDataBounds &bounds,
-                         const TypedAttr neutral) {
+                         const Attribute neutral) {
   auto native_vreg_ty =
       getNativeVregType(value.getType().getElementType(), ctx.target_shape);
   TPU_ASSERT_LOC(value.getLoc(), llvm::equal(value.getType().getShape(),
@@ -3926,6 +3928,7 @@ LogicalResult vector_multi_reduction_rule(RewriteContext &ctx, Operation &op,
   ImplicitLocOpBuilder builder(op.getLoc(), &op);
   auto multi_reduction_op = cast<vector::MultiDimReductionOp>(op);
   const VectorType src_ty = multi_reduction_op.getSourceVectorType();
+  auto element_type = src_ty.getElementType();
   int64_t src_rank = src_ty.getRank();
   const auto res_ty = dyn_cast<VectorType>(multi_reduction_op.getDestType());
   if (res_ty == nullptr) {
@@ -3953,44 +3956,56 @@ LogicalResult vector_multi_reduction_rule(RewriteContext &ctx, Operation &op,
     return multi_reduction_op.emitOpError(
         "Not implemented: Only constant accumulator supported");
   }
-  if (!src_ty.getElementType().isF32() && !src_ty.getElementType().isBF16()) {
+  if (!element_type.isF32() && !element_type.isBF16() &&
+      !element_type.isSignlessInteger((32))) {
     return multi_reduction_op.emitOpError(
-        "Not implemented: Only FP32 and BF16 reductions supported, but "
-        "got ")
-           << src_ty;
+        "Not implemented: unsupported element type");
   }
-  auto element_type = cast<FloatType>(src_ty.getElementType());
-  const auto acc_def_value = dyn_cast<DenseFPElementsAttr>(acc_def.getValue());
+  bool is_int = element_type.isSignlessInteger(32);
+  const auto acc_def_value = dyn_cast<DenseElementsAttr>(acc_def.getValue());
   if (acc_def_value == nullptr || !acc_def_value.isSplat()) {
     return multi_reduction_op.emitOpError("Expected a splat constant");
   }
   TPU_ASSERT_OP(acc_def_value.getElementType() == element_type);
-  const auto val = acc_def_value.getSplatValue<FloatAttr>();
-  FloatAttr neutral;
+  Attribute neutral;
   switch (multi_reduction_op.getKind()) {
     case vector::CombiningKind::ADD:
-      neutral = builder.getFloatAttr(element_type, 0);
+      neutral = builder.getZeroAttr(element_type);
       break;
     case vector::CombiningKind::MAXIMUMF: {
       // TODO(b/322836633): The semantics of maximumf don't match the lowering
       // for older TPU versions because older TPU versions don't respect the
       // -0.0 vs +0.0 ordering.
       neutral = builder.getFloatAttr(
-          element_type, APFloat::getInf(element_type.getFloatSemantics(),
-                                        /*Negative=*/true));
+          element_type,
+          APFloat::getInf(cast<FloatType>(element_type).getFloatSemantics(),
+                          /*Negative=*/true));
     } break;
     case vector::CombiningKind::MINIMUMF: {
       neutral = builder.getFloatAttr(
-          element_type, APFloat::getInf(element_type.getFloatSemantics(),
-                                        /*Negative=*/false));
+          element_type,
+          APFloat::getInf(cast<FloatType>(element_type).getFloatSemantics(),
+                          /*Negative=*/false));
+    } break;
+    case vector::CombiningKind::MAXSI: {
+      neutral = builder.getIntegerAttr(
+          element_type,
+          APInt::getSignedMinValue(element_type.getIntOrFloatBitWidth()));
+    } break;
+    case vector::CombiningKind::MINSI: {
+      neutral = builder.getIntegerAttr(
+          element_type,
+          APInt::getSignedMaxValue(element_type.getIntOrFloatBitWidth()));
     } break;
     default:
       return multi_reduction_op.emitOpError(
          "Not implemented: unsupported kind");
   }
-  if (val != neutral) {
+  if (auto val = acc_def_value.getSplatValue<Attribute>(); val != neutral) {
     return multi_reduction_op.emitOpError(
-        "Not implemented: Only neutral accumulator supported");
+        "Not implemented: Only neutral accumulator supported for "
+        "float reduction. Expected ")
+           << neutral << ", but got " << val;
   }
 
   std::array<bool, 2> reduces;
@@ -4074,9 +4089,11 @@ LogicalResult vector_multi_reduction_rule(RewriteContext &ctx, Operation &op,
       tpu_kind = tpu::ReductionKind::SUM;
       break;
     case vector::CombiningKind::MAXIMUMF:
+    case vector::CombiningKind::MAXSI:
       tpu_kind = tpu::ReductionKind::MAX;
       break;
     case vector::CombiningKind::MINIMUMF:
+    case vector::CombiningKind::MINSI:
       tpu_kind = tpu::ReductionKind::MIN;
       break;
     default:
@@ -4103,14 +4120,29 @@ LogicalResult vector_multi_reduction_rule(RewriteContext &ctx, Operation &op,
           src_vregs.Slice(src_slice_start, src_slice_end);
       std::optional<Value> acc_vreg;
       auto reduce_elementwise = [&](Value lhs, Value rhs) -> Value {
+        Value result;
         switch (tpu_kind) {
          case tpu::ReductionKind::SUM:
-            return builder.create<arith::AddFOp>(loc, lhs, rhs);
+            result =
+                is_int
+                    ? builder.create<arith::AddIOp>(loc, lhs, rhs).getResult()
+                    : builder.create<arith::AddFOp>(loc, lhs, rhs)
+                          .getResult();
+            break;
          case tpu::ReductionKind::MAX:
-            return builder.create<arith::MaximumFOp>(loc, lhs, rhs);
+            result = is_int ? builder.create<arith::MaxSIOp>(loc, lhs, rhs)
+                                  .getResult()
+                            : builder.create<arith::MaximumFOp>(loc, lhs, rhs)
+                                  .getResult();
+            break;
          case tpu::ReductionKind::MIN:
-            return builder.create<arith::MinimumFOp>(loc, lhs, rhs);
+            result = is_int ? builder.create<arith::MinSIOp>(loc, lhs, rhs)
+                                  .getResult()
+                            : builder.create<arith::MinimumFOp>(loc, lhs, rhs)
+                                  .getResult();
+            break;
         }
+        return result;
       };
       auto reduction_status = reduced_vregs.EachStatus(
           [&](const absl::Span<const int64_t> red_idx,

jaxlib/mosaic/dialect/tpu/transforms/canonicalize_mosaic.cc

Lines changed: 7 additions & 0 deletions
@@ -382,7 +382,14 @@ LogicalResult canonicalize_multi_dim_reduction(int hardware_generation,
       op.erase();
     }
     return success();
+  } else if (element_type.isSignlessInteger(32) &&
+             // TODO(b/384774084): Add support for u32 reductions.
+             (op.getKind() == vector::CombiningKind::ADD ||
+              op.getKind() == vector::CombiningKind::MAXSI ||
+              op.getKind() == vector::CombiningKind::MINSI)) {
+    return success();
   }
+  op.emitOpError("Unsupported element type for the selected reduction");
   return failure();
 }

tests/pallas/tpu_ops_test.py

Lines changed: 26 additions & 0 deletions
@@ -282,6 +282,32 @@ def kernel(x_ref, mask_ref, o_ref):
     expected = jnp.where(mask, x, jnp.zeros_like(x))
     self.assertArraysEqual(out, expected)
 
+  @parameterized.product(
+      dtype = [jnp.float32, jnp.bfloat16, jnp.int32],
+      axis = [0, 1, 2],
+      reduce_func = [jnp.sum, jnp.max, jnp.min]
+  )
+  def test_reduction(self, dtype, axis, reduce_func):
+    if dtype == jnp.int32 and axis == 2:
+      self.skipTest("Int32 reduction on minor is not supported.")
+    # TODO(b/384127570): fix bfloat16 reduction.
+    if dtype == jnp.bfloat16 and reduce_func != jnp.sum:
+      self.skipTest("b/384127570")
+    in_shape = (2, 16, 128)
+    out_shape = list(in_shape)
+    out_shape[axis] = 1
+
+    def kernel(x, out):
+      out[:] = reduce_func(x[:], axis, keepdims=True)
+
+    x = jnp.arange(np.prod(in_shape), dtype=dtype).reshape(in_shape)
+    result = pl.pallas_call(
+        kernel,
+        out_shape=jax.ShapeDtypeStruct(out_shape, x.dtype),
+    )(x)
+    expected = reduce_func(x, axis, keepdims=True)
+    np.testing.assert_array_equal(result, expected)
+
 
 class OpsInterpretTest(OpsTest):
   INTERPRET = True
