
Commit 9e5edb7

bythew3i authored and Google-ML-Automation committed
[Mosaic TPU] Support packed type matmul with arbitrary shapes.
This CL removes all the shape constraints on matmul for all types. We only need to mask out subelements on the contracting dim. Instead of unpacking the data and applying masks, we create a VREG-sized i32 "mask" that carries the subelement mask info and logical-and it with the target vreg. This way, masking subelements costs each target vreg only 1 op (logical_and) instead of 3 ops (unpacking + select + packing).

PiperOrigin-RevId: 702480077
1 parent d990dcf · commit 9e5edb7
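To make the masking trick concrete, here is a small standalone C++ sketch (not part of the commit; the (8, 128) vreg shape, the parameter values, and all names are illustrative) that computes the per-sublane i32 mask words described above for a packed operand:

#include <cstdint>
#include <cstdio>

int main() {
  // Illustrative parameters: a vreg with 8 x32 sublanes holding bf16 data
  // (packing = 2, i.e. two 16-bit subelements per 32-bit lane), with 5 rows
  // of padding to mask out at the bottom -- the example used in the new code.
  const int kSublanes = 8;
  const int packing = 2;
  const int bitwidth = 32 / packing;
  const int padding_bottom = 5;

  const int x32_padding_bottom = padding_bottom / packing;  // fully-zeroed sublanes
  const int sub_padding = padding_bottom % packing;         // leftover subelements

  for (int s = 0; s < kSublanes; ++s) {
    uint32_t word;
    if (s >= kSublanes - x32_padding_bottom) {
      word = 0x00000000u;  // sublane lies entirely in the padding
    } else if (sub_padding > 0 && s == kSublanes - x32_padding_bottom - 1) {
      // Blended sublane: keep only the low (packing - sub_padding) subelements.
      word = 0xffffffffu >> (sub_padding * bitwidth);
    } else {
      word = 0xffffffffu;  // sublane holds only valid data
    }
    printf("sublane %d: 0x%08x\n", s, word);
  }
  return 0;
}

Each affected vreg is then bitcast to its i32 view, ANDed with this mask, and bitcast back, so sub-element masking costs one logical_and instead of an unpack/select/pack sequence.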

3 files changed: +73 -89 lines changed


jaxlib/mosaic/dialect/tpu/tpu_ops.cc

Lines changed: 9 additions & 0 deletions
@@ -544,6 +544,15 @@ LogicalResult MatmulOp::verify() {
   // however, a good start and the recommended place to add more invariants.
   const VectorType lhs_ty = getLhs().getType();
   const VectorType rhs_ty = getRhs().getType();
+  const VectorType acc_ty = getAcc().getType();
+  const VectorType res_ty = getResult().getType();
+  if (acc_ty != res_ty) {
+    return emitOpError(
+        "Not implemented: matmul acc and result have different types");
+  }
+  if (acc_ty.getElementTypeBitWidth() != 32) {
+    return emitOpError("Expected matmul acc to be 32-bit");
+  }
 
   if (getTransposeLhs()) {
     emitOpError(

jaxlib/mosaic/dialect/tpu/transforms/apply_vector_layout.cc

Lines changed: 49 additions & 29 deletions
@@ -1764,19 +1764,6 @@ LogicalResult tpu_matmul_rule(RewriteContext &ctx, Operation &op,
   // TODO(tlongeri): This should be part of the tpu::MatmulOp verifier
   TPU_ASSERT_EQ_OP(lhs_shape.size(), 2);
   TPU_ASSERT_EQ_OP(rhs_shape.size(), 2);
-  // The code below puts no constraints on the second dimension of both lhs and
-  // rhs. However, leading axis of lhs and rhs needs to be a multiple of native
-  // tiling for packed types.
-  if (layout_lhs.packing() != 1 && lhs_shape[0] % layout_lhs.tiling()[0] != 0) {
-    return op.emitOpError(
-        "Not implemented: Unsupported LHS shape with padded tiling and "
-        "narrower data type");
-  }
-  if (layout_rhs.packing() != 1 && rhs_shape[0] % layout_rhs.tiling()[0] != 0) {
-    return op.emitOpError(
-        "Not implemented: Unsupported RHS shape with padded tiling and "
-        "narrower data type");
-  }
 
   const int64_t padded_lhs_rows =
       llvm::alignTo(lhs_shape[0], layout_lhs.tiling()[0]);
@@ -1787,10 +1774,6 @@ LogicalResult tpu_matmul_rule(RewriteContext &ctx, Operation &op,
   const int64_t padded_rhs_cols =
       llvm::alignTo(rhs_shape[1], layout_rhs.tiling()[1]);
 
-  if (llvm::alignTo(lhs_shape[0], layout_acc.tiling()[0]) != padded_lhs_rows) {
-    return op.emitOpError(
-        "Not implemented: Matmul acc requires less padding than lhs");
-  }
   FAILUREOR_ASSIGN_OR_RETURN(
       xla::Array<Value> lhs_vregs,
       disassemble(builder, layout_lhs, lhs, ctx.target_shape));
@@ -1801,7 +1784,6 @@ LogicalResult tpu_matmul_rule(RewriteContext &ctx, Operation &op,
       xla::Array<Value> rhs_vregs,
       disassemble(builder, layout_rhs, rhs, ctx.target_shape));
   TPU_ASSERT_EQ_OP(padded_lhs_rows, lhs_vregs.dim(0) * layout_lhs.tiling()[0]);
-  TPU_ASSERT_EQ_OP(padded_lhs_rows, acc_vregs.dim(0) * layout_acc.tiling()[0]);
   TPU_ASSERT_EQ_OP(padded_rhs_rows, rhs_vregs.dim(0) * layout_rhs.tiling()[0]);
 
   const VectorType i32_vreg_ty =
@@ -1823,27 +1805,64 @@ LogicalResult tpu_matmul_rule(RewriteContext &ctx, Operation &op,
 
   // We can also extend this helper function with padding_top and padding_left
   // based on the offsets in vregs.
-  // TODO(b/341729764): Support mask subelements.
+  const Value i32_zeros_vreg = builder.create<arith::ConstantOp>(
+      op.getLoc(),
+      DenseElementsAttr::get(i32_vreg_ty, builder.getI32IntegerAttr(0)));
+  const Value i32_max_vreg = builder.create<arith::ConstantOp>(
+      op.getLoc(), DenseElementsAttr::get(
+                       i32_vreg_ty, builder.getI32IntegerAttr(0xffffffff)));
   auto maskVregs = [&](xla::Array<Value> &vregs, int64_t padding_bottom,
                        int64_t padding_right) {
-    const Value i32_zeros_vreg = builder.create<arith::ConstantOp>(
-        op.getLoc(),
-        DenseElementsAttr::get(i32_vreg_ty, builder.getI32IntegerAttr(0)));
    auto vreg_ty = cast<VectorType>(vregs.begin()->getType());
    int packing = vreg_ty.getRank() > 2 ? vreg_ty.getShape()[2] : 1;
    // Mask out the bottom.
    if (padding_bottom > 0) {
      // We have aligned the row sizes of LHS and RHS to a multiple of the
      // native tiling at the beginning of this rule. Therefore, it is safe to
      // bitcast to an x32 vreg for masking.
-      CHECK_EQ(padding_bottom % packing, 0);
-      padding_bottom /= packing;
-      auto mask_bottom = getX32VmaskByPaddingEnd(0, padding_bottom);
+      int sub_padding = padding_bottom % packing;
+      int x32_padding_bottom = padding_bottom / packing;
+      auto mask_bottom = getX32VmaskByPaddingEnd(0, x32_padding_bottom);
+      // Create an int32 vreg which contains the subelement masking and then
+      // logical_and it with the target vreg to mask out the unaligned padding.
+      // E.g. if padding_bottom = 5, packing = 2, and the vreg shape is
+      // [8, 128], then the mask will be:
+      //
+      // sublane 0: [0xffffffff, 0xffffffff, ..., 0xffffffff]
+      // sublane 1: [0xffffffff, 0xffffffff, ..., 0xffffffff]
+      // sublane 2: [0xffffffff, 0xffffffff, ..., 0xffffffff]
+      // sublane 3: [0xffffffff, 0xffffffff, ..., 0xffffffff]
+      // sublane 4: [0xffffffff, 0xffffffff, ..., 0xffffffff]
+      // sublane 5: [0x0000ffff, 0x0000ffff, ..., 0x0000ffff]
+      // sublane 6: [0x00000000, 0x00000000, ..., 0x00000000]
+      // sublane 7: [0x00000000, 0x00000000, ..., 0x00000000]
+      //
+      // This way, in order to mask sub-elements, each target vreg only
+      // needs to apply 1 op (logical_and) instead of 3 ops (unpacking + select
+      // + packing).
+      Value partial_sublane_mask = builder.create<arith::ConstantOp>(
+          op.getLoc(),
+          DenseElementsAttr::get(
+              i32_vreg_ty,
+              builder.getI32IntegerAttr(
+                  0xffffffff >>
+                  (sub_padding * vreg_ty.getElementTypeBitWidth()))));
+      // Insert 0xffffffff above the blended sublane.
+      Value sublane_mask = builder.create<arith::SelectOp>(
+          getX32VmaskByPaddingEnd(0, x32_padding_bottom + 1), i32_max_vreg,
+          partial_sublane_mask);
+      // Insert 0 below the blended sublane.
+      sublane_mask = builder.create<arith::SelectOp>(mask_bottom, sublane_mask,
+                                                     i32_zeros_vreg);
      for (int64_t i = 0; i < vregs.dim(1); ++i) {
        Value &vreg = vregs({vregs.dim(0) - 1, i});
        Value i32_vreg = builder.create<tpu::BitcastVregOp>(i32_vreg_ty, vreg);
-        i32_vreg = builder.create<arith::SelectOp>(mask_bottom, i32_vreg,
-                                                   i32_zeros_vreg);
+        if (sub_padding > 0) {
+          i32_vreg = builder.create<arith::AndIOp>(i32_vreg, sublane_mask);
+        } else {
+          i32_vreg = builder.create<arith::SelectOp>(mask_bottom, i32_vreg,
+                                                     i32_zeros_vreg);
+        }
        vreg = builder.create<tpu::BitcastVregOp>(vreg_ty, i32_vreg);
      }
    }
@@ -1929,8 +1948,9 @@ LogicalResult tpu_matmul_rule(RewriteContext &ctx, Operation &op,
                                  lhs_zeros_vreg);
   xla::Array<Value> target_rhs_vregs(
       {target_rhs_row_vregs, target_rhs_col_vregs}, rhs_zeros_vreg);
-  xla::Array<Value> target_acc_vregs({acc_vregs.dim(0), target_acc_col_vregs},
-                                     acc_zeros_vreg);
+  xla::Array<Value> target_acc_vregs(
+      {lhs_vregs.dim(0) * layout_lhs.packing(), target_acc_col_vregs},
+      acc_zeros_vreg);
   target_lhs_vregs.UpdateSlice(lhs_vregs, {0, 0});
   target_rhs_vregs.UpdateSlice(rhs_vregs, {0, 0});
   target_acc_vregs.UpdateSlice(acc_vregs, {0, 0});
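The new target_acc_vregs row count follows from vreg geometry: a packed lhs vreg covers packing times as many rows as a 32-bit acc vreg, so the acc needs packing times as many row-vregs. A quick sanity check with assumed numbers (bf16 lhs on an (8, 128) target shape; not taken from the commit):

#include <cstdint>

// Assumed example: bf16 lhs (packing = 2) with native tiling (16, 128);
// the 32-bit acc uses native tiling (8, 128).
constexpr int64_t kPacking = 2;
constexpr int64_t kPaddedLhsRows = 48;                   // aligned to 16
constexpr int64_t kLhsRowVregs = kPaddedLhsRows / 16;    // 3 lhs row-vregs
constexpr int64_t kAccRowVregs = kLhsRowVregs * kPacking;
static_assert(kAccRowVregs == kPaddedLhsRows / 8,
              "acc needs packing times as many row-vregs as the packed lhs");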

jaxlib/mosaic/dialect/tpu/transforms/infer_vector_layout.cc

Lines changed: 15 additions & 60 deletions
@@ -903,66 +903,21 @@ class VectorLayoutInferer {
   }
 
   LogicalResult infer(tpu::MatmulOp op) {
-    auto get_operand_layout =
-        [&](Value v, llvm::StringRef operand_name,
-            std::optional<int64_t> major_multiple = std::nullopt,
-            std::optional<int64_t> minor_multiple =
-                std::nullopt) -> std::optional<VectorLayout> {
-      auto layout = getLayout(v);
-      if (!layout.has_value()) {
-        op->emitOpError("Internal error: assert failed: Operand ")
-            << operand_name << " has no vector layout";
-        return std::nullopt;
-      }
-      auto vty = cast<VectorType>(v.getType());
-      auto tiling = nativeTiling(vty.getElementTypeBitWidth());
-      auto shape = vty.getShape().take_back(2);
-      if (shape[0] % major_multiple.value_or(tiling[0]) != 0 ||
-          shape[1] % minor_multiple.value_or(tiling[1]) != 0) {
-        op->emitOpError("Matmul operand ")
-            << operand_name << " must have a shape divisible by ("
-            << major_multiple.value_or(tiling[0]) << ", "
-            << minor_multiple.value_or(tiling[1]) << "), but got: (" << shape[0]
-            << ", " << shape[1] << ")";
-        return std::nullopt;
-      }
-      // Override tiling to match the native one.
-      return VectorLayout(layout->bitwidth(), {0, 0}, tiling,
-                          ImplicitDim::kNone);
-    };
-    auto res_ty = dyn_cast<VectorType>(op->getResult(0).getType());
-    TPU_CHECK_OP(res_ty, "only vector results supported");
-    TPU_CHECK_OP(res_ty.getElementTypeBitWidth() == kNativeBitwidth,
-                 "only 32-bit matmul results supported");
-    std::array<Layout, 3> in_layout;
-    CHECK_EQ(op->getNumOperands(), 3);
-    std::optional<int64_t> lhs_major_multiple;
-    std::optional<int64_t> rhs_major_multiple;
-    // We don't restrict the first lhs axis when the data is not packed.
-    if (cast<VectorType>(op->getOperand(0).getType())
-            .getElementTypeBitWidth() == kNativeBitwidth) {
-      lhs_major_multiple = 1;
-    }
-    // We don't restrict the first rhs axis when the data is not packed.
-    if (cast<VectorType>(op->getOperand(1).getType())
-            .getElementTypeBitWidth() == kNativeBitwidth) {
-      rhs_major_multiple = 1;
-    }
-    in_layout[0] =
-        get_operand_layout(op->getOperand(0), "lhs", lhs_major_multiple, 1);
-    if (!in_layout[0].has_value()) {
-      return failure();
-    }
-    in_layout[1] =
-        get_operand_layout(op->getOperand(1), "rhs", rhs_major_multiple, 1);
-    if (!in_layout[1].has_value()) {
-      return failure();
-    }
-    in_layout[2] = get_operand_layout(op->getOperand(2), "result", 1, 1);
-    if (!in_layout[2].has_value()) {
-      return failure();
-    }
-    setLayout(op, in_layout,
+    auto lhs_bitwidth = op.getLhs().getType().getElementTypeBitWidth();
+    auto rhs_bitwidth = op.getRhs().getType().getElementTypeBitWidth();
+    auto acc_bitwidth = op.getAcc().getType().getElementTypeBitWidth();
+    auto res_bitwidth = op.getResult().getType().getElementTypeBitWidth();
+    TPU_CHECK_OP(acc_bitwidth == kNativeBitwidth,
+                 "Expected 32-bit acc in tpu::MatmulOp");
+    TPU_CHECK_OP(res_bitwidth == kNativeBitwidth,
+                 "Expected 32-bit result in tpu::MatmulOp");
+    auto lhs_layout = VectorLayout(
+        lhs_bitwidth, {0, 0}, nativeTiling(lhs_bitwidth), ImplicitDim::kNone);
+    auto rhs_layout = VectorLayout(
+        rhs_bitwidth, {0, 0}, nativeTiling(rhs_bitwidth), ImplicitDim::kNone);
+    auto acc_layout = VectorLayout(
+        acc_bitwidth, {0, 0}, nativeTiling(acc_bitwidth), ImplicitDim::kNone);
+    setLayout(op, {lhs_layout, rhs_layout, acc_layout},
               VectorLayout(kNativeBitwidth, {0, 0}, default_tiling_,
                            ImplicitDim::kNone));
     return success();
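For reference, the inferred layouts now depend only on each operand's element bitwidth. A minimal sketch of the assumed nativeTiling behavior on an (8, 128) target shape (the real helper lives in the inferer; this is illustrative only):

#include <array>
#include <cstdint>

// Assumed semantics: narrower element types get proportionally taller
// native tiles, so one tile always fills a full (8, 128) x32 vreg.
std::array<int64_t, 2> native_tiling(int bitwidth) {
  const int packing = 32 / bitwidth;  // 1 for f32/i32, 2 for bf16, 4 for i8
  return {8 * packing, 128};
}

With the divisibility checks gone, inference unconditionally assigns each operand its native-tiled layout with zero offsets; any residual padding is handled in apply_vector_layout by the masking shown above.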
