Fixing definition of ShardShapeOp and lowering it to MPI

fschlimb · fschlimb · commit 3e7d98dcdec0 · 2025-02-27T17:59:12.000+01:00
diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td
@@ -345,24 +345,32 @@ def Mesh_GetShardingOp : Mesh_Op<"get_sharding", [Pure]> {
   }];
 }
 
-def Mesh_ShardShapeOp : Mesh_Op<"shard_shape", [Pure]> {
-  let summary = "Get the shard shape of a given process/device.";
+def Mesh_ShardShapeOp : Mesh_Op<"shard_shape", [
+    Pure, AttrSizedOperandSegments,
+    DeclareOpInterfaceMethods<OpAsmOpInterface, ["getAsmResultNames"]>
+  ]> {
+  let summary = "Get the shard shape for a given process/device.";
   let description = [{
-    The device/process id is a linearized id of the device/process in the mesh.
+    The device/process id is a multi-index of the device/process in the mesh.
     This operation might be used during spmdization when the shard shape depends
     on (non-constant) values used in `mesh.sharding`.
   }];
   let arguments = (ins
-    DenseI64ArrayAttr:$shape,
+    DenseI64ArrayAttr:$dims,
+    Variadic<Index>:$dims_dynamic,
     Mesh_Sharding:$sharding,
-    Index:$device
+    DenseI64ArrayAttr:$device,
+    Variadic<Index>:$device_dynamic
   );
   let results = (outs Variadic<Index>:$result);
   let assemblyFormat = [{
-      custom<DimensionList>($shape) $sharding $device attr-dict `:` type($result)
+      `dims` `=` custom<DynamicIndexList>($dims_dynamic, $dims)
+      `sharding` `=` $sharding
+      `device` `=` custom<DynamicIndexList>($device_dynamic, $device)
+      attr-dict `:` type(results)
   }];
   let builders = [
-    OpBuilder<(ins "ArrayRef<int64_t>":$shape, "Value":$sharding, "Value":$device)>
+    OpBuilder<(ins "ArrayRef<int64_t>":$dims, "ArrayRef<Value>":$dims_dyn, "Value":$sharding, "ValueRange":$device)>
   ];
 }
 
diff --git a/mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp b/mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp
@@ -380,23 +380,159 @@ struct ConvertNeighborsLinearIndicesOp
         [&](OpBuilder &builder, Location loc) {
           SmallVector<Value> tmp = mIdx;
           tmp[axes[0]] =
-              rewriter.create<arith::AddIOp>(op.getLoc(), orgIdx, one)
-                  .getResult();
+              rewriter.create<arith::AddIOp>(op.getLoc(), orgIdx, one);
           builder.create<scf::YieldOp>(
               loc, multiToLinearIndex(loc, rewriter, tmp, dims));
         });
     rewriter.replaceOp(op, ValueRange{down.getResult(0), up.getResult(0)});
-    return mlir::success();
+    return success();
   }
 };
 
-struct ConvertUpdateHaloOp
-    : public mlir::OpRewritePattern<mlir::mesh::UpdateHaloOp> {
-  using OpRewritePattern::OpRewritePattern;
+struct ConvertShardShapeOp : public OpConversionPattern<ShardShapeOp> {
+  using OpConversionPattern::OpConversionPattern;
 
-  mlir::LogicalResult
-  matchAndRewrite(mlir::mesh::UpdateHaloOp op,
-                  mlir::PatternRewriter &rewriter) const override {
+  LogicalResult
+  matchAndRewrite(ShardShapeOp op, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto sharding = op.getSharding().getDefiningOp<ShardingOp>();
+    if (!sharding) {
+      return op->emitError()
+             << "Expected SharingOp as defining op for sharding"
+             << " but found " << adaptor.getSharding()[0].getDefiningOp();
+    }
+
+    // Compute the sharded shape by applying the sharding to the input shape.
+    // Without shardedDimsOffsets in the sharding, the shard shape is computed
+    // by dividing the dimension size by the number of shards in that dimension
+    // (which is given by the size of the mesh axes provided in split-axes).
+    // Odd elements get distributed to trailing shards.
+    // If a shardedDimsOffsets is provided, the shard shape is computed by
+    // subtracting the offset of the current shard from the offset of the next
+    // shard.
+
+    Location loc = op.getLoc();
+    Type index = rewriter.getIndexType();
+
+    // This is a 1:N conversion because the sharding op is a 1:3 conversion.
+    // The operands in the adaptor are a vector<ValeRange>. For dims and device
+    // we have a 1:1 conversion.
+    // For simpler access fill a vector with the dynamic dims.
+    SmallVector<Value> dynDims, dynDevice;
+    for (auto dim : adaptor.getDimsDynamic()) {
+      // type conversion should be 1:1 for ints
+      assert(dim.size() == 1);
+      dynDims.emplace_back(dim[0]);
+    }
+    // same for device
+    for (auto device : adaptor.getDeviceDynamic()) {
+      assert(device.size() == 1);
+      dynDevice.emplace_back(device[0]);
+    }
+
+    // To keep the code simple, convert dims/device to values when they are
+    // attributes. Count on canonicalization to fold static values.
+    auto shape = getMixedAsValues(rewriter, loc, op.getDims(), dynDims, index);
+    auto multiIdx =
+        getMixedAsValues(rewriter, loc, adaptor.getDevice(), dynDevice, index);
+
+    // Get the MeshOp, the mesh shape is needed to compute the sharded shape.
+    SymbolTableCollection symbolTableCollection;
+    auto meshOp = getMesh(sharding, symbolTableCollection);
+    // For now we only support static mesh shapes
+    if (ShapedType::isDynamicShape(meshOp.getShape()))
+      return failure();
+
+    auto splitAxes = sharding.getSplitAxes().getAxes();
+    // shardedDimsOffsets are optional and might be Values (not attributes).
+    // Also, the shardId might be dynamic which means the position in the
+    // shardedDimsOffsets is not statically known. Create a tensor of the
+    // shardedDimsOffsets and later extract the offsets for computing the
+    // local shard-size.
+    Value shardedDimsOffs;
+    {
+      auto tmp = getMixedAsValues(
+          rewriter, loc, sharding.getStaticShardedDimsOffsets(),
+          sharding.getDynamicShardedDimsOffsets(), index);
+      if (!tmp.empty())
+        shardedDimsOffs = rewriter.create<tensor::FromElementsOp>(
+            loc, RankedTensorType::get({(int64_t)tmp.size()}, index), tmp);
+    }
+
+    // With static mesh shape the sizes of the split axes are known.
+    // Hence the start/pos for each split axes in shardDimsOffsets can be
+    // computed statically.
+    int64_t pos = 0;
+    SmallVector<Value> shardShape;
+    Value zero =
+        rewriter.create<arith::ConstantOp>(loc, rewriter.getZeroAttr(index));
+    Value one =
+        rewriter.create<arith::ConstantOp>(loc, rewriter.getOneAttr(index));
+
+    // Iterate over the dimensions of the tensor shape, get their split Axes,
+    // and compute the sharded shape.
+    for (auto [i, dim] : llvm::enumerate(shape)) {
+      // Trailing dimensions might not be annotated.
+      if (i < splitAxes.size() && !splitAxes[i].empty()) {
+        auto axes = splitAxes[i];
+        // The current dimension might not be sharded.
+        // Create a value from the static position in shardDimsOffsets.
+        Value posVal =
+            rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(pos));
+        // Get the index of the local shard in the mesh axis.
+        Value idx = multiIdx[axes[0]];
+        auto _numShards =
+            collectiveProcessGroupSize(axes.asArrayRef(), meshOp.getShape());
+        if (shardedDimsOffs) {
+          // If sharded dims offsets are provided, use them to compute the
+          // sharded shape.
+          if (axes.size() > 1) {
+            return op->emitError() << "Only single axis sharding is "
+                                   << "supported for each dimension.";
+          }
+          idx = rewriter.create<arith::AddIOp>(loc, posVal, idx);
+          // Compute size = shardedDimsOffs[idx+1] - shardedDimsOffs[idx].
+          Value off =
+              rewriter.create<tensor::ExtractOp>(loc, shardedDimsOffs, idx);
+          idx = rewriter.create<arith::AddIOp>(loc, idx, one);
+          Value nextOff =
+              rewriter.create<tensor::ExtractOp>(loc, shardedDimsOffs, idx);
+          Value sz = rewriter.create<arith::SubIOp>(loc, nextOff, off);
+          shardShape.emplace_back(sz);
+        } else {
+          auto numShards = rewriter.create<arith::ConstantOp>(
+              loc, rewriter.getIndexAttr(_numShards));
+          // Compute shard dim size by distributing odd elements to trailing
+          // shards:
+          // sz = dim / numShards
+          //      + (idx >= (numShards - (dim % numShards)) ? 1 : 0)
+          Value sz = rewriter.create<arith::DivSIOp>(loc, dim, numShards);
+          Value sz1 = rewriter.create<arith::RemSIOp>(loc, dim, numShards);
+          sz1 = rewriter.create<arith::SubIOp>(loc, numShards, sz1);
+          auto cond = rewriter.create<arith::CmpIOp>(
+              loc, arith::CmpIPredicate::sge, idx, sz1);
+          Value odd = rewriter.create<arith::SelectOp>(loc, cond, one, zero);
+          sz = rewriter.create<arith::AddIOp>(loc, sz, odd);
+          shardShape.emplace_back(sz);
+        }
+        pos += _numShards + 1; // add one for the total size.
+      } // else no sharding if split axis is empty or no split axis
+      // If no size was added -> no sharding in this dimension.
+      if (shardShape.size() <= i)
+        shardShape.emplace_back(dim);
+    }
+    assert(shardShape.size() == shape.size());
+    rewriter.replaceOp(op, shardShape);
+    return success();
+  }
+};
+
+struct ConvertUpdateHaloOp : public OpConversionPattern<UpdateHaloOp> {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(UpdateHaloOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
 
     // The input/output memref is assumed to be in C memory order.
     // Halos are exchanged as 2 blocks per dimension (one for each side: down
diff --git a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp
@@ -831,12 +831,19 @@ MeshSharding MeshSharding::get(::mlir::FlatSymbolRefAttr mesh_,
 // mesh.shard_shape
 //===----------------------------------------------------------------------===//
 
+void ShardShapeOp::getAsmResultNames(
+    function_ref<void(Value, StringRef)> setNameFn) {
+  setNameFn(getResult()[0], "shard_shape");
+}
+
 void ShardShapeOp::build(::mlir::OpBuilder &odsBuilder,
                          ::mlir::OperationState &odsState,
-                         ::llvm::ArrayRef<int64_t> shape,
-                         ::mlir::Value sharding, ::mlir::Value device) {
-  SmallVector<mlir::Type> resType(shape.size(), odsBuilder.getIndexType());
-  build(odsBuilder, odsState, resType, shape, sharding, device);
+                         ::llvm::ArrayRef<int64_t> dims,
+                         ArrayRef<Value> dims_dyn, ::mlir::Value sharding,
+                         ::mlir::ValueRange device) {
+  SmallVector<mlir::Type> resType(dims.size(), odsBuilder.getIndexType());
+  build(odsBuilder, odsState, resType, dims, dims_dyn, sharding,
+        SmallVector<int64_t>(device.size(), ShapedType::kDynamic), device);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Tensor/Extensions/MeshShardingExtensions.cpp b/mlir/lib/Dialect/Tensor/Extensions/MeshShardingExtensions.cpp
@@ -50,10 +50,10 @@ struct CreatorOpShardingInterface
                         IRMapping &spmdizationMap,
                         SymbolTableCollection &symbolTable,
                         OpBuilder &builder) const {
-    auto shardType = cast<ShapedType>(mesh::shardType(
-        op->getResult(0).getType(),
-        mesh::getMesh(op, resultShardings[0].getMeshAttr(), symbolTable),
-        resultShardings[0]));
+    auto mesh =
+        mesh::getMesh(op, resultShardings[0].getMeshAttr(), symbolTable);
+    auto shardType = cast<ShapedType>(
+        mesh::shardType(op->getResult(0).getType(), mesh, resultShardings[0]));
     Operation *newOp = nullptr;
     // if the sharding introduces a new dynamic dimension, we take it from
     // the dynamic sharding info. For now bail out if it's not
@@ -66,18 +66,19 @@ struct CreatorOpShardingInterface
       assert(oldType.getRank() == shardType.getRank());
       int currOldOprndNum = -1;
       mesh::ShardShapeOp shapeForDevice;
-      Value device;
+      ValueRange device;
       Operation *newSharding = nullptr;
       for (auto i = 0; i < oldType.getRank(); ++i) {
         if (!oldType.isDynamicDim(i) && shardType.isDynamicDim(i)) {
           if (!newSharding) {
             newSharding =
                 builder.create<ShardingOp>(op->getLoc(), resultShardings[0]);
-            device = builder.create<mesh::ProcessLinearIndexOp>(
-                op->getLoc(), resultShardings[0].getMesh());
+            device =
+                builder.create<mesh::ProcessMultiIndexOp>(op->getLoc(), mesh)
+                    .getResults();
             shapeForDevice = builder.create<mesh::ShardShapeOp>(
-                op->getLoc(), oldType.getShape(), newSharding->getResult(0),
-                device);
+                op->getLoc(), oldType.getShape(), spmdizedOperands,
+                newSharding->getResult(0), device);
           }
           newOperands.emplace_back(shapeForDevice.getResult()[i]);
         } else if (oldType.isDynamicDim(i)) {
diff --git a/mlir/test/Conversion/MeshToMPI/convert-shardshape-to-mpi.mlir b/mlir/test/Conversion/MeshToMPI/convert-shardshape-to-mpi.mlir
@@ -0,0 +1,75 @@
+// RUN: mlir-opt %s --convert-mesh-to-mpi -canonicalize | FileCheck %s
+
+module attributes { mpi.dlti = #dlti.map<"MPI:comm_world_rank" = 24> } {
+
+  // CHECK: mesh.mesh @mesh0
+  mesh.mesh @mesh0(shape = 3x4x5)
+  
+  // Notice: comm_world_rank/linear index 24 is multiindex [1, 0, 4] in @mesh0
+
+  // all shards are equal
+  // CHECK-LABEL: func.func @shard_shape_equal() -> (index, index, index) {
+  func.func @shard_shape_equal() -> (index, index, index) {
+    %sharding = mesh.sharding @mesh0 split_axes = [[0], [1], [2]] : !mesh.sharding
+    %0:3 = mesh.process_multi_index on @mesh0 : index, index, index
+    %c9 = arith.constant 9 : index
+    %c12 = arith.constant 12 : index
+    // CHECK: [[vc3:%.*]] = arith.constant 3 : index
+    %1:3 = mesh.shard_shape dims = [%c9, %c12, 15] sharding = %sharding device = [%0#0, %0#1, %0#2] : index, index, index
+    // CHECK: return [[vc3]], [[vc3]], [[vc3]] : index, index, index
+    return %1#0, %1#1, %1#2 : index, index, index
+  }
+
+  // last shard in last dim gets an extra element
+  // CHECK-LABEL: func.func @shard_shape_odd_1() -> (index, index, index) {
+  func.func @shard_shape_odd_1() -> (index, index, index) {
+    %sharding = mesh.sharding @mesh0 split_axes = [[0], [1], [2]] : !mesh.sharding
+    %0:3 = mesh.process_multi_index on @mesh0 : index, index, index
+    %c9 = arith.constant 9 : index
+    %c12 = arith.constant 12 : index
+    // CHECK-DAG: [[vc3:%.*]] = arith.constant 3 : index
+    // CHECK-DAG: [[vc4:%.*]] = arith.constant 4 : index
+    %1:3 = mesh.shard_shape dims = [%c9, %c12, 16] sharding = %sharding device = [%0#0, %0#1, %0#2] : index, index, index
+    // CHECK: return [[vc3]], [[vc3]], [[vc4]] : index, index, index
+    return %1#0, %1#1, %1#2 : index, index, index
+  }
+
+  // all except first shard in second dim get an extra element
+  // CHECK-LABEL: func.func @shard_shape_odd_2() -> (index, index, index) {
+  func.func @shard_shape_odd_2() -> (index, index, index) {
+    %sharding = mesh.sharding @mesh0 split_axes = [[0], [1], [2]] : !mesh.sharding
+    %0:3 = mesh.process_multi_index on @mesh0 : index, index, index
+    %c9 = arith.constant 9 : index
+    // CHECK: [[vc3:%.*]] = arith.constant 3 : index
+    %1:3 = mesh.shard_shape dims = [%c9, 15, 15] sharding = %sharding device = [%0#0, %0#1, %0#2] : index, index, index
+    // CHECK: return [[vc3]], [[vc3]], [[vc3]] : index, index, index
+    return %1#0, %1#1, %1#2 : index, index, index
+  }
+
+  // all except first shard in first dim get an extra element
+  // CHECK-LABEL: func.func @shard_shape_odd_3() -> (index, index, index) {
+  func.func @shard_shape_odd_3() -> (index, index, index) {
+    %sharding = mesh.sharding @mesh0 split_axes = [[0], [1], [2]] : !mesh.sharding
+    %0:3 = mesh.process_multi_index on @mesh0 : index, index, index
+    // CHECK-DAG: [[vc3:%.*]] = arith.constant 3 : index
+    // CHECK-DAG: [[vc4:%.*]] = arith.constant 4 : index
+    %1:3 = mesh.shard_shape dims = [11, 12, 15] sharding = %sharding device = [%0#0, %0#1, %0#2] : index, index, index
+    // CHECK: return [[vc4]], [[vc3]], [[vc3]] : index, index, index
+    return %1#0, %1#1, %1#2 : index, index, index
+  }
+
+  // extract from sharded_dims_offsets
+  // CHECK-LABEL: func.func @shard_shape_sharded_dims_offs() -> (index, index, index) {
+  func.func @shard_shape_sharded_dims_offs() -> (index, index, index) {
+    %sharding = mesh.sharding @mesh0 split_axes = [[0], [1], [2]]
+        sharded_dims_offsets = [0, 1, 4, 9, 0, 2, 6, 12, 12, 0, 3, 6, 9, 12, 15]: !mesh.sharding
+    %0:3 = mesh.process_multi_index on @mesh0 : index, index, index
+    %c9 = arith.constant 9 : index
+    %c12 = arith.constant 12 : index
+    // CHECK: [[vc3:%.*]] = arith.constant 3 : index
+    // CHECK: [[vc2:%.*]] = arith.constant 2 : index
+    %1:3 = mesh.shard_shape dims = [%c9, %c12, 15] sharding = %sharding device = [%0#0, %0#1, %0#2] : index, index, index
+    // CHECK: return [[vc3]], [[vc2]], [[vc3]] : index, index, index
+    return %1#0, %1#1, %1#2 : index, index, index
+  }
+}
diff --git a/mlir/test/Dialect/Mesh/ops.mlir b/mlir/test/Dialect/Mesh/ops.mlir
@@ -157,10 +157,12 @@ func.func @mesh_shard_shape() {
   %c3 = arith.constant 3 : index
   // CHECK-NEXT: %[[S:.*]] = mesh.sharding @mesh0 split_axes = {{\[\[}}]] : !mesh.sharding
   %s = mesh.sharding @mesh0 split_axes = [[]] : !mesh.sharding
-  // CHECK-NEXT: mesh.shard_shape 8x? %[[S]] %[[C3]] : index, index
-  %shp:2 = mesh.shard_shape 8x? %s %c3 : index, index
-  // CHECK-NEXT: mesh.shard_shape 8x4 %[[S]] %[[C3]] : index, index
-  %shp1:2 = mesh.shard_shape 8x4 %s %c3 : index, index
+  // CHECK-NEXT: mesh.shard_shape dims = [8, %[[C3]]
+  // CHECK-SAME: ] sharding = %[[S]] device = [%[[C3]]
+  // CHECK-SAME: ] : index, index
+  %shp:2 = mesh.shard_shape dims = [8, %c3] sharding = %s device = [%c3] : index, index
+  // CHECK-NEXT: mesh.shard_shape dims = [8, 4] sharding = %[[S]] device = [3] : index, index
+  %shp1:2 = mesh.shard_shape dims = [8, 4] sharding = %s device = [3] : index, index
   return
 }
 
diff --git a/mlir/test/Dialect/Tensor/mesh-spmdization.mlir b/mlir/test/Dialect/Tensor/mesh-spmdization.mlir
@@ -10,8 +10,9 @@ func.func @tensor_empty_static_sharded_dims_offsets() -> () {
   %sharding = mesh.sharding @mesh_1d_4 split_axes = [[0]] sharded_dims_offsets = [0, 1, 4, 7, 8] : !mesh.sharding
   %sharded= mesh.shard %b to %sharding : tensor<8x16xf32>
   // CHECK:  %[[sharding:.*]] = mesh.sharding @mesh_1d_4 split_axes = {{\[\[}}0]] sharded_dims_offsets = [0, 1, 4, 7, 8] : !mesh.sharding
-  // CHECK:  %[[proc_linear_idx:.*]] = mesh.process_linear_index on @mesh_1d_4 : index
-  // CHECK:  %[[V0:.*]]:2 = mesh.shard_shape 8x16 %[[sharding]] %[[proc_linear_idx]] : index, index
+  // CHECK:  %[[proc_multi_idx:.*]] = mesh.process_multi_index on @mesh_1d_4 : index
+  // CHECK:  %[[V0:.*]]:2 = mesh.shard_shape dims = [8, 16] sharding = %[[sharding]] device = [%[[proc_multi_idx]]
+  // CHECK-SAME: ] : index, index
   // CHECK:  tensor.empty(%[[V0]]#0) : tensor<?x16xf32>
 
   return
@@ -24,8 +25,10 @@ func.func @tensor_empty_dynamic_sharded_dims_offsets(%arg0 : index) -> () {
   %sharding = mesh.sharding @mesh_1d_4 split_axes = [[0]] sharded_dims_offsets = [0, 1, 4, 7, 8] : !mesh.sharding
   %sharded= mesh.shard %b to %sharding : tensor<8x?xf32>
   // CHECK:  %[[sharding:.*]] = mesh.sharding @mesh_1d_4 split_axes = {{\[\[}}0]] sharded_dims_offsets = [0, 1, 4, 7, 8] : !mesh.sharding
-  // CHECK:  %[[proc_linear_idx:.*]] = mesh.process_linear_index on @mesh_1d_4 : index
-  // CHECK:  %[[V0:.*]]:2 = mesh.shard_shape 8x? %[[sharding]] %[[proc_linear_idx]] : index, index
+  // CHECK:  %[[proc_multi_idx:.*]] = mesh.process_multi_index on @mesh_1d_4 : index
+  // CHECK:  %[[V0:.*]]:2 = mesh.shard_shape dims = [8, %[[A0]]
+  // CHECK-SAME: ] sharding = %[[sharding]] device = [%[[proc_multi_idx]]
+  // CHECK-SAME: ] : index, index
   // CHECK:  tensor.empty(%[[V0]]#0, %[[A0]]) : tensor<?x?xf32>
 
   return