
Commit 6bb9119

olegshyshkov authored and Google-ML-Automation committed
[XLA:GPU] Add xla.get_dynamic_dim_size op and its lowering.
The new op is needed to implement the PadToStatic custom call.

PiperOrigin-RevId: 837107619
1 parent 431a46d commit 6bb9119

6 files changed: +141, -14 lines


xla/codegen/emitters/ir/tests/ops.mlir

Lines changed: 9 additions & 0 deletions
@@ -167,3 +167,12 @@ func.func @workgroup_id_op() -> (index, index, index) {
 // CHECK: [[WORKGROUP_ID_X:.*]] = xla.workgroup_id x {xla.range = [0 : index, 1023 : index]}
 // CHECK: [[WORKGROUP_ID_Y:.*]] = xla.workgroup_id y
 // CHECK: [[WORKGROUP_ID_Z:.*]] = xla.workgroup_id z
+
+// -----
+
+func.func @get_dynamic_dim_size(%in: tensor<16x8x4xf32>) -> (i32) {
+  %out = xla.get_dynamic_dim_size %in 1 : tensor<16x8x4xf32>
+  func.return %out : i32
+}
+// CHECK-LABEL: @get_dynamic_dim_size
+// CHECK: xla.get_dynamic_dim_size

xla/codegen/emitters/ir/xla_ops.td

Lines changed: 13 additions & 0 deletions
@@ -298,6 +298,19 @@ def WorkGroupIdOp : XLA_Op<"workgroup_id", [
   let results = (outs Index);
 }
 
+def GetDynamicDimSizeOp : XLA_Op<"get_dynamic_dim_size", [
+  Pure,
+]> {
+  let summary = "Returns the dynamic size of a dimension. The dynamic sizes are "
+                "stored in the same buffer, after the main values as an array "
+                "of s32. The `dim` argument can be larger than `tensor`'s rank, "
+                "because XLA has passes like flatten_tensors that only change "
+                "the view of the memory.";
+  let arguments = (ins AnyStaticShapeTensor:$tensor, I64Attr:$dim);
+  let results = (outs I32:$result);
+
+  let assemblyFormat = "$tensor $dim attr-dict `:` type($tensor)";
+}
 
 #endif // XLA_CODEGEN_EMITTERS_IR_XLA_OPS
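For orientation, here is a minimal sketch of the layout the summary describes, assuming a tensor<16x8x4xf32> whose dimension sizes are all dynamic; the byte offsets are illustrative and follow the lowering in lower_tensors.cc below:

// Assumed buffer layout for a tensor<16x8x4xf32> with dynamic dimension sizes:
//   bytes [0, 2048)    : 512 f32 elements (16 * 8 * 4)
//   bytes [2048, 2052) : s32 dynamic size of dim 0
//   bytes [2052, 2056) : s32 dynamic size of dim 1
//   bytes [2056, 2060) : s32 dynamic size of dim 2
%size = xla.get_dynamic_dim_size %in 1 : tensor<16x8x4xf32>  // reads the s32 at byte offset 2052

The same offset, 2052, is the constant checked in lower_tensors.mlir once the tensor has been flattened to tensor<512xf32>.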

xla/codegen/emitters/transforms/flatten_tensors.cc

Lines changed: 28 additions & 4 deletions
@@ -31,11 +31,11 @@ limitations under the License.
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/Attributes.h"
+#include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/BuiltinTypeInterfaces.h"
 #include "mlir/IR/BuiltinTypes.h"
-#include "mlir/IR/ImplicitLocOpBuilder.h"
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/TypeRange.h"
@@ -46,11 +46,12 @@ limitations under the License.
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Support/LogicalResult.h"
+#include "mlir/Support/WalkResult.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "xla/backends/cpu/codegen/emitters/ir/xla_cpu_ops.h"
 #include "xla/backends/gpu/codegen/emitters/ir/xla_gpu_ops.h"
+#include "xla/codegen/emitters/ir/xla_ops.h"
 #include "xla/hlo/analysis/indexing_analysis.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/layout_util.h"
 #include "xla/shape_util.h"
 #include "xla/xla_data.pb.h"
@@ -748,6 +749,28 @@ struct RewriteSyncThreads : OpRewritePattern<gpu::SyncThreadsOp> {
   }
 };
 
+struct RewriteGetDynamicDimSizeOp : OpRewritePattern<GetDynamicDimSizeOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(GetDynamicDimSizeOp op,
+                                PatternRewriter& rewriter) const override {
+    auto tensor = op.getTensor();
+    auto tensor_type = tensor.getType();
+    if (tensor_type.getRank() < 2) {
+      return rewriter.notifyMatchFailure(op, "the tensor is already flat");
+    }
+
+    auto tensor_1D = rewriter
+                         .create<UnrealizedConversionCastOp>(
+                             op.getLoc(), GetFlattenedType(tensor_type), tensor)
+                         .getResult(0);
+    rewriter.replaceOpWithNewOp<GetDynamicDimSizeOp>(op, tensor_1D,
+                                                     op.getDim());
+
+    return mlir::success();
+  }
+};
+
 class FlattenTensorsPass
     : public impl::FlattenTensorsPassBase<FlattenTensorsPass> {
  public:
@@ -760,8 +783,10 @@ class FlattenTensorsPass
         RewriteAllocateShared,
         RewriteAtomicRMW,
         RewriteConstant,
+        RewriteCpuLoad,
         RewriteFor,
         RewriteFunctionSignatures,
+        RewriteGetDynamicDimSizeOp,
         RewriteIf,
         RewriteIndexSwitch,
         RewritePureCall,
@@ -771,8 +796,7 @@ class FlattenTensorsPass
         RewriteVectorExtract,
         RewriteVectorFromElements,
         RewriteVectorInsert,
-        RewriteVectorTransferRead,
-        RewriteCpuLoad
+        RewriteVectorTransferRead
        >(mlir_context);
     // clang-format on
     ApplyIndexingOp::getCanonicalizationPatterns(patterns, mlir_context);
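As a rough sketch (not verbatim pass output), RewriteGetDynamicDimSizeOp rewrites the rank-3 example from the tests by casting the operand to its flattened type and recreating the op with the same dim attribute; this is why the op definition allows `dim` to exceed the tensor's rank:

// Before:
%out = xla.get_dynamic_dim_size %in 1 : tensor<16x8x4xf32>
// After the pattern (sketch); the cast is expected to fold away once the
// function signature is flattened as well, as in flatten_tensors.mlir below:
%flat = builtin.unrealized_conversion_cast %in : tensor<16x8x4xf32> to tensor<512xf32>
%out = xla.get_dynamic_dim_size %flat 1 : tensor<512xf32>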

xla/codegen/emitters/transforms/lower_tensors.cc

Lines changed: 54 additions & 10 deletions
@@ -1325,6 +1325,55 @@ class RewriteAtomicRMW : public OpRewritePattern<AtomicRMWOp> {
   const DeviceSpec& device_spec_;
 };
 
+class RewriteGetDynamicDimSize : public OpRewritePattern<GetDynamicDimSizeOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(
+      GetDynamicDimSizeOp op, mlir::PatternRewriter& rewriter) const override {
+    mlir::ImplicitLocOpBuilder b(op.getLoc(), rewriter);
+
+    auto tensor = op.getTensor();
+    auto tensor_type = mlir::dyn_cast<mlir::RankedTensorType>(tensor.getType());
+
+    Type element_type = tensor_type.getElementType();
+    int64_t num_elements = tensor_type.getNumElements();
+    std::optional<int> sub_byte_width = GetSubByteBitWidth(element_type);
+    if (sub_byte_width) {
+      element_type = b.getI8Type();
+      // Elements are packed.
+      num_elements = CeilOfRatio<int64_t>(num_elements, 8 / *sub_byte_width);
+    }
+
+    // The offset of the dim size from the start of the buffer. The dynamic dim
+    // sizes are stored after the tensor data as a tail-allocated metadata of
+    // s32 type.
+    int64_t dynamic_size_offset_in_bytes =
+        num_elements * element_type.getIntOrFloatBitWidth() / 8 +
+        op.getDim() * b.getI32Type().getWidth() / 8;
+
+    int64_t alignment = dynamic_size_offset_in_bytes % 4;
+    // TODO(b/463569416): Support unaligned loads.
+    if (alignment != 0) {
+      return op->emitOpError("dynamic size offset is not 4-byte aligned");
+    }
+
+    auto ptr_type = ml::LLVMPointerType::get(b.getContext());
+    Value tensor_ptr =
+        b.create<UnrealizedConversionCastOp>(ptr_type, tensor).getResult(0);
+
+    Value addr_offset =
+        b.create<ml::ConstantOp>(b.getI64Type(), dynamic_size_offset_in_bytes);
+
+    Value addr_int = b.create<ml::PtrToIntOp>(b.getI64Type(), tensor_ptr);
+    Value metadata_addr_int = b.create<ml::AddOp>(addr_int, addr_offset);
+    Value metadata_addr = b.create<ml::IntToPtrOp>(ptr_type, metadata_addr_int);
+
+    rewriter.replaceOpWithNewOp<ml::LoadOp>(op, b.getI32Type(), metadata_addr);
+
+    return success();
+  }
+};
+
 class LowerTensorsPass : public impl::LowerTensorsPassBase<LowerTensorsPass> {
  public:
   explicit LowerTensorsPass(const LowerTensorsPassOptions& options)
@@ -1351,10 +1400,11 @@ class LowerTensorsPass : public impl::LowerTensorsPassBase<LowerTensorsPass> {
     mlir::RewritePatternSet tensor_patterns(mlir_context);
 
     tensor_patterns.add<RewriteAtomicRMW>(mlir_context, device_spec_);
-    tensor_patterns
-        .add<RewriteAllocateShared, RewriteNonScalarConstants,
-             RewriteSyncThreads, RewriteTensorExtract, RewriteTransferRead,
-             RewriteTensorInsert, RewriteTransferWrite>(mlir_context);
+    tensor_patterns.add<RewriteAllocateShared, RewriteGetDynamicDimSize,
+                        RewriteNonScalarConstants, RewriteSyncThreads,
+                        RewriteTensorExtract, RewriteTensorInsert,
+                        RewriteTransferRead, RewriteTransferWrite>(
+        mlir_context);
     if (mlir::failed(mlir::applyPatternsGreedily(getOperation(),
                                                  std::move(tensor_patterns)))) {
       signalPassFailure();
@@ -1396,14 +1446,8 @@ class LowerTensorsPass : public impl::LowerTensorsPassBase<LowerTensorsPass> {
         if (func.getArgAttr(base.getArgNumber(), "xla.invariant")) {
           load.setInvariant(true);
         }
-        return;
       }
     }
-    if (!device_spec_.IsCpu()) {
-      load.emitOpError(
-          "load op address is not (a GEP of) a function argument");
-      signalPassFailure();
-    }
    });
  }
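As a rough sketch (assumed shape of the output, not verbatim), RewriteGetDynamicDimSize lowers the tensor<512xf32> test case below to LLVM-dialect IR along these lines, with the offset constant computed as 512 elements * 4 bytes of data plus dim 1 * 4 bytes per s32 size = 2052:

// Approximate lowering of `%0 = xla.get_dynamic_dim_size %arg0 1 : tensor<512xf32>`:
%ptr = builtin.unrealized_conversion_cast %arg0 : tensor<512xf32> to !llvm.ptr
%off = llvm.mlir.constant(2052 : i64) : i64
%base = llvm.ptrtoint %ptr : !llvm.ptr to i64
%addr = llvm.add %base, %off : i64
%md_ptr = llvm.inttoptr %addr : i64 to !llvm.ptr
%size = llvm.load %md_ptr : !llvm.ptr -> i32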

xla/codegen/emitters/transforms/tests/flatten_tensors.mlir

Lines changed: 10 additions & 0 deletions
@@ -398,3 +398,13 @@ func.func @constant_vector() -> vector<2x3xf32> {
 // CHECK-LABEL: func.func @constant_vector
 // CHECK-SAME: -> vector<6xf32>
 // CHECK-NOT: builtin.unrealized_conversion_cast
+
+// -----
+
+func.func @get_dynamic_dim_size(%in: tensor<16x8x4xf32>) -> (i32) {
+  %out = xla.get_dynamic_dim_size %in 1 : tensor<16x8x4xf32>
+  func.return %out : i32
+}
+// CHECK-LABEL: func.func @get_dynamic_dim_size(
+// CHECK-SAME: %[[TENSOR:.*]]: tensor<512xf32>) -> i32 {
+// CHECK: xla.get_dynamic_dim_size %[[TENSOR]] 1 : tensor<512xf32>

xla/codegen/emitters/transforms/tests/lower_tensors.mlir

Lines changed: 27 additions & 0 deletions
@@ -1106,3 +1106,30 @@ func.func @transfer_write_f4(%arg0: tensor<43xf4E2M1FN> {xla.slice_index = 1},
 // CHECK-LABEL: @transfer_write_f4
 // CHECK: %[[PTR:.*]] = llvm.getelementptr inbounds %arg0[0, 5] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<22 x i8>
 // CHECK: %[[OUT:.*]] = builtin.unrealized_conversion_cast %{{.*}} : vector<2xf4E2M1FN> to vector<2xi4>
+
+// -----
+
+func.func @get_dynamic_dim_size(%arg0: tensor<512xf32>) -> i32 {
+  %0 = xla.get_dynamic_dim_size %arg0 1 : tensor<512xf32>
+  func.return %0 : i32
+}
+// CHECK-LABEL: @get_dynamic_dim_size
+// CHECK: llvm.mlir.constant(2052 : i64) : i64
+
+// -----
+
+func.func @get_dynamic_dim_size_sub_byte_width(%arg0: tensor<512xi4>) -> i32 {
+  %0 = xla.get_dynamic_dim_size %arg0 1 : tensor<512xi4>
+  func.return %0 : i32
+}
+// CHECK-LABEL: @get_dynamic_dim_size_sub_byte_width
+// CHECK: llvm.mlir.constant(260 : i64) : i64
+
+// // -----
+
+func.func @get_dynamic_dim_size_unaligned(%arg0: tensor<7xf16>) -> i32 {
+  // expected-error @+1 {{'xla.get_dynamic_dim_size' op dynamic size offset is not 4-byte aligned}}
+  %0 = xla.get_dynamic_dim_size %arg0 1 : tensor<7xf16>
+  func.return %0 : i32
+}
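Following the offset computation in lower_tensors.cc, the constants in these tests work out as: 512 * 4 + 1 * 4 = 2052 for the f32 case; 512 / 2 + 1 * 4 = 260 for the i4 case, since two i4 elements are packed per byte; and 7 * 2 + 1 * 4 = 18 for the f16 case, which is not a multiple of 4 and therefore triggers the expected error.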
