
Commit 38d062d

bythew3i authored and Google-ML-Automation committed
[Mosaic TPU] Support dynamic DMA and ref slice on the 2nd minor when memref is untiled
* Generalize any untiled memref to have tiling (packing, 128).
* Support dynamic index on 2nd minor.
* Support dynamic shape on 2nd minor.

PiperOrigin-RevId: 695516124
1 parent 6892e62 commit 38d062d
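For context on the tiling named in the commit message: "packing" is the number of narrow elements that share one 32-bit word, computed in the diffs below as 32 / element bitwidth. A minimal illustrative Python sketch (the helper name is hypothetical, not part of this change):

# Hypothetical helper showing the (packing, 128) tiling this change assigns
# to untiled memrefs: narrower element types get taller sublane tiles.
def untiled_memref_tiling(element_bitwidth: int, lane_count: int = 128):
    packing = 32 // element_bitwidth
    return (packing, lane_count)

assert untiled_memref_tiling(32) == (1, 128)  # f32/i32
assert untiled_memref_tiling(16) == (2, 128)  # bf16
assert untiled_memref_tiling(8) == (4, 128)   # i8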

7 files changed: +165 −25 lines


jaxlib/mosaic/dialect/tpu/transforms/apply_vector_layout.cc

Lines changed: 2 additions & 2 deletions
@@ -2996,7 +2996,7 @@ LogicalResult vector_load_rule(RewriteContext &ctx, Operation &op,
       // TODO(b/295393167): need to support strided load for bitwidth < 32.
     } else if (layout_out.bitwidth() == 32 &&
                canReinterpretToUntiledMemref(
-                   memref_ty, ctx.target_shape,
+                   load_op.getBase(), ctx.target_shape,
                    /*allow_minormost_padding=*/true)) {
       // In this case, if the memref can be reinterpreted to untiled, it is
       // valid to use any tiling for output. But using native tiling can save us
@@ -4204,7 +4204,7 @@ LogicalResult vector_store_rule(RewriteContext &ctx, Operation &op,
           // We accept padding in the minormost dim, because
           // apply_vector_layout will properly mask stores.
           canReinterpretToUntiledMemref(
-              memref_ty, ctx.target_shape,
+              store_op.getBase(), ctx.target_shape,
               /*allow_minormost_padding=*/true)) {
       // In this case, if the memref can be reinterpreted to untiled, it is
       // valid to use any tiling for to_store. But using native tiling can save
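Both rules now pass the predicate the load/store base value rather than its MemRefType; as the util.cc change below shows, canReinterpretToUntiledMemref needs the SSA value so it can walk defining ops (eraseLayout, squeeze, slice) to recover any dynamic slice sizes.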

jaxlib/mosaic/dialect/tpu/transforms/infer_memref_layout.cc

Lines changed: 63 additions & 7 deletions
@@ -87,6 +87,16 @@ FailureOr<TiledLayoutAttr> inferLayout(MemRefType memref_ty,
                                        int64_t leading_tile_rows = 0) {
   if (auto tiled_layout_attr =
           dyn_cast<TiledLayoutAttr>(memref_ty.getLayout())) {
+    if (leading_tile_rows > 0 && !tiled_layout_attr.getTiles().empty() &&
+        tiled_layout_attr.getTiles().front().dimensions().size() == 2 &&
+        tiled_layout_attr.getTiles().front().dimensions()[0] !=
+            leading_tile_rows) {
+      return emitError(UnknownLoc::get(memref_ty.getContext()),
+                       "Trying to infer memref layout with sublane tiling ")
+             << leading_tile_rows
+             << ", but the memref already has sublane tiling "
+             << tiled_layout_attr.getTiles().front().dimensions()[0];
+    }
     return tiled_layout_attr;
   }
   if (auto affine_map_attr = dyn_cast<AffineMapAttr>(memref_ty.getLayout())) {
@@ -226,13 +236,25 @@ LogicalResult inferOp(Operation &op, const int hardware_generation,
   if (auto alloca_op = dyn_cast<memref::AllocaOp>(op)) {
     TypedValue<MemRefType> arg = alloca_op.getResult();
     const MemRefType memref_ty = alloca_op.getResult().getType();
-    FAILUREOR_ASSIGN_OR_RETURN(const MemRefType new_memref_ty,
-                               inferMemref(memref_ty, hardware_generation,
-                                           target_shape, tpu_tiling_flags));
+    // If the memref can be reinterpreted to untiled, force to use tiling
+    // {1, target.lane_count} for 32 bit.
+    int64_t leading_tile_rows = 0;
+    // TODO(b/375038685): generalize untiled memref with packed type which
+    // needs to update load/store rules.
+    if (memref_ty.getElementTypeBitWidth() == 32 && memref_ty.getRank() > 1 &&
+        *(memref_ty.getShape().end() - 1) <= target_shape[1]) {
+      leading_tile_rows = 1;
+    }
+    FAILUREOR_ASSIGN_OR_RETURN(
+        const MemRefType new_memref_ty,
+        inferMemref(memref_ty, hardware_generation, target_shape,
+                    tpu_tiling_flags, leading_tile_rows));
     alloca_op.getResult().setType(new_memref_ty);
     if (memref_ty != new_memref_ty) {
       OpBuilder builder(alloca_op->getContext());
       builder.setInsertionPointAfter(alloca_op);
+      // TODO(b/376130272): add a canonicalizer for EraseLayoutOp so that if we
+      // have erase(erase(x)) then we rewrite it to erase(x).
       auto erase_op = builder.create<tpu::EraseLayoutOp>(
           arg.getLoc(),
           MemRefType::get(new_memref_ty.getShape(), memref_ty.getElementType(),
@@ -296,22 +318,56 @@ LogicalResult inferFunc(func::FuncOp f, const int hardware_generation,
     }

     FAILUREOR_ASSIGN_OR_RETURN(
-        const MemRefType new_memref_ty,
+        MemRefType new_memref_ty,
         inferMemref(memref_ty, hardware_generation, target_shape,
                     tpu_tiling_flags, leading_tile_rows));
     arg.setType(new_memref_ty);
     new_arg_types.push_back(arg.getType());
     if (memref_ty != new_memref_ty) {
+      Value val = arg;
+      Operation *arg_use_op = nullptr;
+      // If the arg memref can be reinterpreted to untiled, we can insert
+      // ReinterpretCastOp to use tiling {packing, target.lane_count} before
+      // EraseLayoutOp for only the arg memrefs and expect the rest memref
+      // layout inference is based on the casted layout automatically. This
+      // would help lift many restrictions in alignment check when consuming
+      // this memref.
+      if (canReinterpretToUntiledMemref(cast<TypedValue<MemRefType>>(val),
+                                        target_shape,
+                                        /*allow_minormost_padding=*/true) &&
+          // TODO(b/375038685): generalize untiled memref with packed type
+          // which needs to update load/store rules.
+          new_memref_ty.getElementTypeBitWidth() == 32) {
+        auto tiled_layout =
+            cast<tpu::TiledLayoutAttr>(new_memref_ty.getLayout());
+        SmallVector<xla::Tile> tiles(tiled_layout.getTiles());
+        SmallVector<int64_t> new_tile_strides(tiled_layout.getTileStrides());
+        for (int i = 0; i < new_tile_strides.size() - 2; ++i) {
+          new_tile_strides[i] *= tiles[0].dimension(0);
+        }
+        tiles[0] = ::xla::Tile({1, target_shape[1]});
+        new_memref_ty = MemRefType::get(
+            new_memref_ty.getShape(), new_memref_ty.getElementType(),
+            TiledLayoutAttr::get(new_memref_ty.getContext(), tiles,
+                                 new_tile_strides),
+            new_memref_ty.getMemorySpace());
+        arg_use_op = builder.create<tpu::ReinterpretCastOp>(val.getLoc(),
+                                                            new_memref_ty, val);
+        val = arg_use_op->getResult(0);
+      }
       // Some standard MLIR ops have static checks that seems unreasonable,
       // and we know they hold in the way they are used in Mosaic. Still,
       // verification with layouts likes to fail, because it can't statically
       // prove the properties.
       auto erase_op = builder.create<tpu::EraseLayoutOp>(
-          arg.getLoc(),
+          val.getLoc(),
           MemRefType::get(new_memref_ty.getShape(), memref_ty.getElementType(),
                           /*layout=*/nullptr, new_memref_ty.getMemorySpace()),
-          arg);
-      arg.replaceAllUsesExcept(erase_op.getResult(), erase_op);
+          val);
+      if (!arg_use_op) {
+        arg_use_op = erase_op;
+      }
+      arg.replaceAllUsesExcept(erase_op.getResult(), arg_use_op);
     }
   }
   f.setFunctionType(
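To make the stride update in the ReinterpretCastOp branch concrete, here is a small Python sketch of what the loop over new_tile_strides computes (the helper name is hypothetical; strides are in units of tiles, minormost last):

# When the leading tile shrinks from (t, 128) to (1, 128), each tile covers
# 1/t as many rows, so every tile stride except the last two scales by t.
def retile_strides(tile_strides: list[int], t: int) -> list[int]:
    new_strides = list(tile_strides)
    for i in range(len(new_strides) - 2):
        new_strides[i] *= t
    return new_strides

# E.g. a (4, 16, 256) f32 memref tiled (8, 128) has tile strides (4, 2, 1):
# each leading slice spans ceil(16/8) * ceil(256/128) = 4 tiles. Retiled to
# (1, 128) it spans 16 * 2 = 32 tiles, hence strides (32, 2, 1).
assert retile_strides([4, 2, 1], 8) == [32, 2, 1]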

jaxlib/mosaic/dialect/tpu/transforms/infer_vector_layout.cc

Lines changed: 4 additions & 2 deletions
@@ -1283,7 +1283,8 @@ class VectorLayoutInferer {
                                      layout_tiling, ImplicitDim::kNone));
     } else if (bitwidth == 32 &&
                canReinterpretToUntiledMemref(
-                   src_ty, target_shape_, /*allow_minormost_padding=*/true) &&
+                   op.getBase(), target_shape_,
+                   /*allow_minormost_padding=*/true) &&
                *(src_ty.getShape().end() - 2) > 1) {
       // Since it is untiled, we can load from any arbitrary address which
       // means we can always set the sublane offset to 0.
@@ -1620,7 +1621,8 @@ class VectorLayoutInferer {
                // We accept padding in the minormost dim, because
                // apply_vector_layout will properly mask stores.
                canReinterpretToUntiledMemref(
-                   ref_ty, target_shape_, /*allow_minormost_padding=*/true)) {
+                   op.getBase(), target_shape_,
+                   /*allow_minormost_padding=*/true)) {
       // Since it is untiled, we can store to any arbitrary address which
       // means the sublane offset can be any value and we can fold it to
       // 2nd minor index.

jaxlib/mosaic/dialect/tpu/transforms/memory_space_specialization.cc

Lines changed: 4 additions & 0 deletions
@@ -70,6 +70,10 @@ LogicalResult specializeMemorySpace(TypedValue<MemRefType> value,
     to_update.pop_back();
     // Here we only have to handle the operations allowed on refs with
     // unspecified memory space.
+    if (auto op = dyn_cast<tpu::ReinterpretCastOp>(some_op)) {
+      updateResultFrom(op, op.getInput().getType());
+      continue;
+    }
     if (auto op = dyn_cast<tpu::MemRefSliceOp>(some_op)) {
      updateResultFrom(op, op.getMemRef().getType());
      continue;

jaxlib/mosaic/dialect/tpu/util.cc

Lines changed: 56 additions & 11 deletions
@@ -23,6 +23,8 @@ limitations under the License.
 #include "llvm/Support/MathExtras.h"
 #include "absl/types/span.h"
 #include "mlir/include/mlir/IR/BuiltinTypes.h"
+#include "mlir/include/mlir/IR/Value.h"
+#include "mlir/include/mlir/IR/ValueRange.h"
 #include "mlir/include/mlir/Support/LLVM.h"
 #include "jaxlib/mosaic/dialect/tpu/tpu_dialect.h"

@@ -69,31 +71,74 @@ std::optional<std::pair<bool, bool>> isTransposedMatmul(
   return std::pair<bool, bool>{lhs_transposed, rhs_transposed};
 }

-bool canReinterpretToUntiledMemref(MemRefType tiled_memref_ty,
+bool canReinterpretToUntiledMemref(TypedValue<MemRefType> tiled_memref,
                                    const std::array<int64_t, 2>& target_shape,
                                    bool allow_minormost_padding) {
+  MemRefType tiled_memref_ty = tiled_memref.getType();
   auto tiled_layout =
       dyn_cast<tpu::TiledLayoutAttr>(tiled_memref_ty.getLayout());
+  ValueRange dynamic_sizes = {};
+  if (!tiled_layout) {
+    if (auto erase_op = tiled_memref.getDefiningOp<tpu::EraseLayoutOp>()) {
+      tiled_memref = erase_op.getOperand();
+      tiled_memref_ty = tiled_memref.getType();
+      tiled_layout =
+          dyn_cast<tpu::TiledLayoutAttr>(tiled_memref_ty.getLayout());
+      // TODO(b/375641258): Currently we rely on the pattern `slice ->
+      // (squeeze)* -> eraseLayout` to get the dynamic sizes, but other
+      // patterns may not work here: e.g., `slice -> eraseLayout -> reshape ->
+      // eraseLayout`. We should fix this! For now, if we can not get the
+      // expected dynamic sizes, we consider the memref cannot be
+      // reinterpreted to untiled.
+      Value ref = tiled_memref;
+      while (auto squeeze_op = ref.getDefiningOp<tpu::MemRefSqueezeOp>()) {
+        ref = squeeze_op.getInput();
+      }
+      if (auto slice_op = ref.getDefiningOp<tpu::MemRefSliceOp>()) {
+        dynamic_sizes = slice_op.getDynamicSizes();
+      }
+    }
+  }
   if (!tiled_layout) {
     // We expect the tiled memref to have a tiled layout.
     return false;
   }
+  if (tiled_memref_ty.getNumDynamicDims() != dynamic_sizes.size()) {
+    return false;
+  }
   if (tiled_layout.getTiles().empty() ||
       tiled_layout.getTiles().front().dimensions().size() != 2 ||
       tiled_memref_ty.getRank() < 2) {
-    // TODO(jevinjiang): Currently we only support >= 2D memref, we might
+    // TODO(b/375642202): Currently we only support >= 2D memref, we might
     // need to handle 1D memref if we find a use case.
     return false;
   }
-  if (!allow_minormost_padding &&
-      *(tiled_memref_ty.getShape().end() - 1) != target_shape[1]) {
-    return false;
-  }
+  auto rank = tiled_memref_ty.getRank();
   auto packing = 32 / tiled_memref_ty.getElementTypeBitWidth();
-  return (*(tiled_memref_ty.getShape().end() - 1) <= target_shape[1] &&
-          *(tiled_memref_ty.getShape().end() - 2) % packing == 0 &&
-          *(tiled_layout.getTileStrides().end() - 1) == 1 &&
-          *(tiled_layout.getTileStrides().end() - 2) == 1);
+  if (tiled_memref_ty.isDynamicDim(rank - 1)) {
+    // TODO(jevinjiang): we can still allow the minormost padding if we know
+    // the max bound of the dynamic size is not larger than the
+    // target_shape[1].
+    if (!isGuaranteedDivisible(dynamic_sizes.back(), target_shape[1])) {
+      return false;
+    }
+    dynamic_sizes = dynamic_sizes.drop_back();
+  } else {
+    if (!allow_minormost_padding &&
+        tiled_memref_ty.getShape()[rank - 1] != target_shape[1]) {
+      return false;
+    }
+  }
+  if (tiled_memref_ty.isDynamicDim(rank - 2)) {
+    if (!isGuaranteedDivisible(dynamic_sizes.back(), packing)) {
+      return false;
+    }
+  } else {
+    if (tiled_memref_ty.getShape()[rank - 2] % packing != 0) {
+      return false;
+    }
+  }
+  // Check if the minormost dim has a single tile.
+  return *(tiled_layout.getTileStrides().end() - 1) == 1 &&
+         *(tiled_layout.getTileStrides().end() - 2) == 1;
 }
-
 }  // namespace mlir::tpu
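A simplified model of the shape checks above, as a non-authoritative Python sketch: dynamic dims are written as None, and divisible(dim, n) stands in for isGuaranteedDivisible on the dynamic size bound to that dim; the single-tile stride checks are omitted.

# Simplified model of the new canReinterpretToUntiledMemref shape logic.
def can_reinterpret(shape, divisible, bitwidth,
                    lane_count=128, allow_minormost_padding=False):
    if len(shape) < 2:
        return False
    packing = 32 // bitwidth
    if shape[-1] is None:
        # Dynamic minormost dim: must be provably a multiple of the lane
        # count, since the padding bound is unknown.
        if not divisible(-1, lane_count):
            return False
    elif not allow_minormost_padding and shape[-1] != lane_count:
        return False
    if shape[-2] is None:
        # Dynamic 2nd minor dim: must be provably a multiple of the packing.
        if not divisible(-2, packing):
            return False
    elif shape[-2] % packing != 0:
        return False
    return True

# E.g. an i32 ref sliced to a dynamic 2nd-minor size always passes that
# check, because packing is 32 // 32 == 1.
assert can_reinterpret((None, 128), lambda dim, n: n == 1, bitwidth=32)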

jaxlib/mosaic/dialect/tpu/util.h

Lines changed: 2 additions & 3 deletions
@@ -2,7 +2,6 @@
 #define THIRD_PARTY_PY_JAX_JAXLIB_MOSAIC_DIALECT_TPU_UTIL_H_

 #include <array>
-#include <cstddef>
 #include <cstdint>
 #include <sstream>
 #include <string>
@@ -17,7 +16,7 @@
 #include "mlir/Support/LogicalResult.h"
 #include "absl/types/span.h"
 #include "jaxlib/mosaic/dialect/tpu/tpu_dialect.h"
-#include "tsl/platform/statusor.h"
+#include "mlir/include/mlir/IR/Value.h"

 // TODO: Instead of CHECK_EQs, can we do something like TF_RET_CHECK but with
 // MLIR diagnostics?
@@ -112,7 +111,7 @@ std::optional<std::pair<bool, bool>> isTransposedMatmul(
 // considered as an untiled memref, except for potential padding in the
 // minormost dimension up to target_shape[1] (if allow_minormost_padding is
 // true).
-bool canReinterpretToUntiledMemref(MemRefType tiled_memref_ty,
+bool canReinterpretToUntiledMemref(TypedValue<MemRefType> tiled_memref,
                                    const std::array<int64_t, 2> &target_shape,
                                    bool allow_minormost_padding = false);

tests/pallas/tpu_pallas_test.py

Lines changed: 34 additions & 0 deletions
@@ -1472,6 +1472,40 @@ def kernel(index, x, y, sem):
     np.testing.assert_array_equal(y, i)
     del y

+  def test_dynamic_dma_on_2nd_minor(self):
+    def kernel(array, data, index, size, _, sem):
+      pltpu.async_copy(
+          data.at[pl.ds(0, size[0])], array.at[pl.ds(index[0], size[0])], sem
+      ).wait()
+
+    def run(array, data, index, size):
+      return pl.pallas_call(
+          kernel,
+          out_shape=array,
+          in_specs=[
+              pl.BlockSpec(memory_space=pltpu.ANY),
+              pl.BlockSpec(memory_space=pltpu.VMEM),
+              pl.BlockSpec(memory_space=pltpu.SMEM),
+              pl.BlockSpec(memory_space=pltpu.SMEM),
+          ],
+          scratch_shapes=[
+              pltpu.SemaphoreType.DMA,
+          ],
+          out_specs=pl.BlockSpec(memory_space=pltpu.ANY),
+          input_output_aliases={0: 0},
+      )(array, data, index, size)
+
+    array = jnp.zeros((1024, 128), jnp.int32)
+    data = jnp.ones((8, 128), jnp.int32)
+    index = jnp.array([3], jnp.int32)
+    size = jnp.array([5], jnp.int32)
+
+    expected = array.at[index[0] : index[0] + size[0]].set(
+        data[index[0] : index[0] + size[0]]
+    )
+    result = run(array, data, index, size)
+    np.testing.assert_array_equal(result, expected)
+

 class PallasCallDMAInterpretTest(PallasCallDMATest):
   INTERPRET = True
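A note on the test's constants: index = 3 starts the copy at a sublane offset that is not aligned to the (8, 128) native tile, and size = 5 makes the extent on the 2nd minor dynamic, which is exactly the case this change enables; with int32 data the packing is 32 / 32 = 1, so the dynamic size trivially satisfies the divisibility requirement described in util.cc above.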
