Commit 57d59ba

[Auto Parallel] Add co_shard spmd_rule for bmm (PaddlePaddle#75555)
1 parent 1f1b56d commit 57d59ba
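
For context, "co_shard" means a single tensor axis is sharded across more than one mesh axis at once: in the new tests below, the batch axis of a [4, 16, 8] tensor is mapped to mesh axes {0, 1} of a [2, 2, 2] mesh. A minimal standalone C++ sketch of how a co-sharded axis divides across the mesh (illustrative only, not part of this commit; LocalDim is a hypothetical helper):

#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative only -- not part of this commit. A co-sharded tensor axis is
// divided by the product of the sizes of every mesh axis it is mapped to.
int64_t LocalDim(int64_t global_dim,
                 const std::vector<int64_t>& mesh_axes,
                 const std::vector<int64_t>& mesh_shape) {
  int64_t shards = 1;
  for (int64_t axis : mesh_axes) shards *= mesh_shape[axis];
  return global_dim / shards;
}

int main() {
  const std::vector<int64_t> mesh_shape = {2, 2, 2};  // as in the tests below
  // x has global shape [4, 16, 8] and dims mapping {{0, 1}, {2}, {}}.
  std::cout << LocalDim(4, {0, 1}, mesh_shape) << "\n";  // batch: 4 / (2*2) = 1
  std::cout << LocalDim(16, {2}, mesh_shape) << "\n";    // m: 16 / 2 = 8
  std::cout << LocalDim(8, {}, mesh_shape) << "\n";      // k: replicated -> 8
  return 0;
}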

File tree

7 files changed: +257 -1 lines
paddle/phi/infermeta/spmd_rules/bmm.cc

Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@
/* Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/phi/infermeta/spmd_rules/bmm.h"

#include "glog/logging.h"

#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"
#include "paddle/phi/core/distributed/auto_parallel/utils.h"
#include "paddle/phi/infermeta/spmd_rules/matmul.h"
#include "paddle/phi/infermeta/spmd_rules/utils.h"

namespace phi {
namespace distributed {

namespace {

std::vector<int64_t> CheckBmmTensorMeta(const DistMetaTensor& tensor,
                                        const char* tensor_name,
                                        const char* rule_name) {
  const auto shape = common::vectorize(tensor.dims());
  const auto& dims_mapping = tensor.dist_attr().multi_dims_mapping();

  PADDLE_ENFORCE_EQ(shape.size(),
                    3,
                    common::errors::InvalidArgument(
                        "%s expects %s to be a 3-D tensor, but it has rank %d.",
                        rule_name,
                        tensor_name,
                        static_cast<int>(shape.size())));
  PADDLE_ENFORCE_EQ(
      dims_mapping.size(),
      shape.size(),
      common::errors::InvalidArgument(
          "%s expects dims_mapping length of %s (%d) to match its rank (%d).",
          rule_name,
          tensor_name,
          static_cast<int>(dims_mapping.size()),
          static_cast<int>(shape.size())));

  return shape;
}

inline void CheckDimEqual(int64_t lhs,
                          int64_t rhs,
                          const char* lhs_desc,
                          const char* rhs_desc,
                          const char* rule_name) {
  if (lhs != -1 && rhs != -1) {
    PADDLE_ENFORCE_EQ(lhs,
                      rhs,
                      common::errors::InvalidArgument(
                          "%s expects %s (%d) to be equal to %s (%d).",
                          rule_name,
                          lhs_desc,
                          lhs,
                          rhs_desc,
                          rhs));
  }
}

}  // namespace

SpmdInfo BmmInferSpmd(const DistMetaTensor& x, const DistMetaTensor& y) {
  const auto x_shape = CheckBmmTensorMeta(x, "Input(X)", "BmmInferSpmd");
  const auto y_shape = CheckBmmTensorMeta(y, "Input(Y)", "BmmInferSpmd");

  CheckDimEqual(x_shape[2],
                y_shape[1],
                "the last dimension of Input(X)",
                "the second dimension of Input(Y)",
                "BmmInferSpmd");
  CheckDimEqual(x_shape[0],
                y_shape[0],
                "the batch dimension of Input(X)",
                "the batch dimension of Input(Y)",
                "BmmInferSpmd");

  VLOG(6) << "BmmInferSpmd delegates to MatmulInferSpmd (trans_x=false, "
             "trans_y=false).";

  return MatmulInferSpmd(x, y, false, false);
}

SpmdInfo BmmGradInferSpmd(const DistMetaTensor& x,
                          const DistMetaTensor& y,
                          const DistMetaTensor& out_grad) {
  const auto x_shape = CheckBmmTensorMeta(x, "Input(X)", "BmmGradInferSpmd");
  const auto y_shape = CheckBmmTensorMeta(y, "Input(Y)", "BmmGradInferSpmd");
  const auto out_grad_shape =
      CheckBmmTensorMeta(out_grad, "Output@Grad", "BmmGradInferSpmd");

  CheckDimEqual(x_shape[2],
                y_shape[1],
                "the last dimension of Input(X)",
                "the second dimension of Input(Y)",
                "BmmGradInferSpmd");
  CheckDimEqual(x_shape[0],
                y_shape[0],
                "the batch dimension of Input(X)",
                "the batch dimension of Input(Y)",
                "BmmGradInferSpmd");
  CheckDimEqual(x_shape[0],
                out_grad_shape[0],
                "the batch dimension of Input(X)",
                "the batch dimension of Output@Grad",
                "BmmGradInferSpmd");
  CheckDimEqual(x_shape[1],
                out_grad_shape[1],
                "the second dimension of Input(X)",
                "the second dimension of Output@Grad",
                "BmmGradInferSpmd");
  CheckDimEqual(y_shape[2],
                out_grad_shape[2],
                "the last dimension of Input(Y)",
                "the last dimension of Output@Grad",
                "BmmGradInferSpmd");

  VLOG(6)
      << "BmmGradInferSpmd delegates to MatmulGradInferSpmd (trans_x=false, "
         "trans_y=false).";

  return MatmulGradInferSpmd(x, y, out_grad, false, false);
}
}  // namespace distributed
}  // namespace phi
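
The rule is deliberately thin: bmm is matmul with rank-3 operands and no transposes, so after validating ranks and dimension compatibility it forwards to the existing matmul rule. A toy, self-contained C++ sketch of the axis-inheritance idea behind the einsum "bmk,bkn->bmn" (illustrative only, not Paddle's matmul implementation):

#include <cstdint>
#include <iostream>
#include <vector>

// Toy sketch, not Paddle code: each tensor axis carries a list of mesh axes
// (empty = replicated). For "bmk,bkn->bmn", each output axis inherits the
// mesh axes of the input axis it comes from; a sharded contracted axis k
// would instead leave the output partial along those mesh axes.
using AxisMapping = std::vector<int64_t>;

int main() {
  // Mirrors the new unit test: x = [b, m, k], y = [b, k, n] on a 2x2x2 mesh.
  std::vector<AxisMapping> x = {{0, 1}, {2}, {}};  // b co-sharded on {0, 1}
  std::vector<AxisMapping> y = {{0, 1}, {}, {}};

  // out = [b, m, n]: b from either input, m from x, n from y.
  std::vector<AxisMapping> out = {x[0], x[1], y[2]};

  for (const auto& axis : out) {  // prints: {0,1} {2} {}
    std::cout << "{";
    for (size_t i = 0; i < axis.size(); ++i)
      std::cout << (i ? "," : "") << axis[i];
    std::cout << "} ";
  }
  std::cout << "\n";
  return 0;
}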
paddle/phi/infermeta/spmd_rules/bmm.h

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
/* Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"
#include "paddle/phi/core/distributed/type_defs.h"

namespace phi {
namespace distributed {

SpmdInfo BmmInferSpmd(const DistMetaTensor& x, const DistMetaTensor& y);

SpmdInfo BmmGradInferSpmd(const DistMetaTensor& x,
                          const DistMetaTensor& y,
                          const DistMetaTensor& out_grad);

}  // namespace distributed
}  // namespace phi

paddle/phi/infermeta/spmd_rules/rules.cc

Lines changed: 3 additions & 1 deletion
@@ -40,7 +40,9 @@ PD_REGISTER_SPMD_RULE(matmul,
 PD_REGISTER_SPMD_RULE(matmul_v2,  // static mode
                       PD_INFER_SPMD(phi::distributed::MatmulInferSpmd),
                       PD_INFER_SPMD(phi::distributed::MatmulInferSpmdReverse));
-
+PD_REGISTER_SPMD_RULE(bmm,
+                      PD_INFER_SPMD(phi::distributed::BmmInferSpmd),
+                      PD_INFER_SPMD(phi::distributed::BmmGradInferSpmd));
 PD_REGISTER_SPMD_RULE(
     elementwise_unary,
     PD_INFER_SPMD(phi::distributed::ElementwiseUnaryInferSpmd),
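
Once registered, static-mode code can look the rule up by op name. A hedged sketch, assuming the SpmdRuleFactory singleton and ContainsSpmdRule helper from Paddle's inferspmd utilities (their availability and exact signatures are an assumption here, not confirmed by this diff):

// Hedged sketch: SpmdRuleFactory::Instance() and ContainsSpmdRule are assumed
// to come from paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h.
#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"

bool BmmRuleIsRegistered() {
  return phi::distributed::SpmdRuleFactory::Instance().ContainsSpmdRule("bmm");
}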

paddle/phi/infermeta/spmd_rules/rules.h

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "paddle/phi/infermeta/spmd_rules/argmin.h"
 #include "paddle/phi/infermeta/spmd_rules/argsort.h"
 #include "paddle/phi/infermeta/spmd_rules/batch_norm.h"
+#include "paddle/phi/infermeta/spmd_rules/bmm.h"
 #include "paddle/phi/infermeta/spmd_rules/c_embedding.h"
 #include "paddle/phi/infermeta/spmd_rules/c_softmax_with_cross_entropy.h"
 #include "paddle/phi/infermeta/spmd_rules/c_softmax_with_multi_label_cross_entropy.h"

paddle/phi/ops/yaml/backward.yaml

Lines changed: 1 addition & 0 deletions
@@ -323,6 +323,7 @@
   output : Tensor(x_grad), Tensor(y_grad)
   infer_meta :
     func : BmmGradInferMeta
+    spmd_rule : BmmGradInferSpmd
   kernel :
     func : bmm_grad
     data_type : out_grad

paddle/phi/ops/yaml/ops.yaml

Lines changed: 1 addition & 0 deletions
@@ -773,6 +773,7 @@
   output : Tensor(out)
   infer_meta :
     func : BmmInferMeta
+    spmd_rule: BmmInferSpmd
   kernel :
     func : bmm
   backward : bmm_grad
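
Together with the rules.cc registration above (which covers static mode, per the existing "// static mode" note on matmul_v2), these two spmd_rule entries attach BmmInferSpmd and BmmGradInferSpmd to the generated infer_meta paths of bmm and bmm_grad.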

test/cpp/auto_parallel/matmul_co_shard_spmd_rule_test.cc

Lines changed: 89 additions & 0 deletions
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <set>
+#include "paddle/phi/infermeta/spmd_rules/bmm.h"
 #include "test/cpp/auto_parallel/spmd_rule_test_util.h"
 
 namespace paddle {
@@ -411,6 +412,94 @@ TEST(MatmulGradInferSpmd, Ctor) {
   }
 }
 
+TEST(BmmInferSpmd, CoShard) {
+  std::vector<int64_t> mesh_shape = {2, 2, 2};
+  std::vector<int64_t> process_ids = {0, 1, 2, 3, 4, 5, 6, 7};
+  std::vector<std::string> dim_names = {"x", "y", "z"};
+  ProcessMesh process_mesh(mesh_shape, process_ids, dim_names);
+
+  std::vector<int64_t> x_shape = {4, 16, 8};
+  std::vector<std::vector<int64_t>> x_dims_mapping = {{0, 1}, {2}, {}};
+  TensorDistAttr x_dist_attr;
+  x_dist_attr.set_process_mesh(process_mesh);
+  x_dist_attr.set_dims_mapping(x_dims_mapping);
+  x_dist_attr.set_dynamic_dims(std::vector<bool>(x_shape.size(), false));
+  phi::distributed::DistMetaTensor x(common::make_ddim(x_shape), x_dist_attr);
+
+  std::vector<int64_t> y_shape = {4, 8, 32};
+  std::vector<std::vector<int64_t>> y_dims_mapping = {{0, 1}, {}, {}};
+  TensorDistAttr y_dist_attr;
+  y_dist_attr.set_process_mesh(process_mesh);
+  y_dist_attr.set_dims_mapping(y_dims_mapping);
+  y_dist_attr.set_dynamic_dims(std::vector<bool>(y_shape.size(), false));
+  phi::distributed::DistMetaTensor y(common::make_ddim(y_shape), y_dist_attr);
+
+  auto bmm_spmd_info = phi::distributed::BmmInferSpmd(x, y);
+
+  ASSERT_EQ(bmm_spmd_info.first.size(), static_cast<size_t>(2));
+  ASSERT_EQ(bmm_spmd_info.second.size(), static_cast<size_t>(1));
+
+  check_multi_dims_mapping(bmm_spmd_info.first[0], x_dims_mapping);
+  EXPECT_FALSE(is_partial(bmm_spmd_info.first[0]));
+  check_multi_dims_mapping(bmm_spmd_info.first[1], y_dims_mapping);
+  EXPECT_FALSE(is_partial(bmm_spmd_info.first[1]));
+
+  const std::vector<std::vector<int64_t>> expected_out_dims_mapping = {
+      {0, 1}, {2}, {}};
+  check_multi_dims_mapping(bmm_spmd_info.second[0], expected_out_dims_mapping);
+  EXPECT_FALSE(is_partial(bmm_spmd_info.second[0]));
+}
+
+TEST(BmmGradInferSpmd, CoShard) {
+  std::vector<int64_t> mesh_shape = {2, 2, 2};
+  std::vector<int64_t> process_ids = {0, 1, 2, 3, 4, 5, 6, 7};
+  std::vector<std::string> dim_names = {"x", "y", "z"};
+  ProcessMesh process_mesh(mesh_shape, process_ids, dim_names);
+
+  std::vector<int64_t> x_shape = {4, 16, 8};
+  std::vector<std::vector<int64_t>> x_dims_mapping = {{0, 1}, {2}, {}};
+  TensorDistAttr x_dist_attr;
+  x_dist_attr.set_process_mesh(process_mesh);
+  x_dist_attr.set_dims_mapping(x_dims_mapping);
+  x_dist_attr.set_dynamic_dims(std::vector<bool>(x_shape.size(), false));
+  phi::distributed::DistMetaTensor x(common::make_ddim(x_shape), x_dist_attr);
+
+  std::vector<int64_t> y_shape = {4, 8, 32};
+  std::vector<std::vector<int64_t>> y_dims_mapping = {{0, 1}, {}, {}};
+  TensorDistAttr y_dist_attr;
+  y_dist_attr.set_process_mesh(process_mesh);
+  y_dist_attr.set_dims_mapping(y_dims_mapping);
+  y_dist_attr.set_dynamic_dims(std::vector<bool>(y_shape.size(), false));
+  phi::distributed::DistMetaTensor y(common::make_ddim(y_shape), y_dist_attr);
+
+  std::vector<int64_t> out_grad_shape = {4, 16, 32};
+  std::vector<std::vector<int64_t>> out_grad_dims_mapping = {{0, 1}, {2}, {}};
+  TensorDistAttr out_grad_dist_attr;
+  out_grad_dist_attr.set_process_mesh(process_mesh);
+  out_grad_dist_attr.set_dims_mapping(out_grad_dims_mapping);
+  out_grad_dist_attr.set_dynamic_dims(
+      std::vector<bool>(out_grad_shape.size(), false));
+  phi::distributed::DistMetaTensor out_grad(common::make_ddim(out_grad_shape),
+                                            out_grad_dist_attr);
+
+  auto bmm_grad_spmd_info = phi::distributed::BmmGradInferSpmd(x, y, out_grad);
+
+  ASSERT_EQ(bmm_grad_spmd_info.first.size(), static_cast<size_t>(3));
+  ASSERT_EQ(bmm_grad_spmd_info.second.size(), static_cast<size_t>(2));
+
+  check_multi_dims_mapping(bmm_grad_spmd_info.first[0], x_dims_mapping);
+  EXPECT_FALSE(is_partial(bmm_grad_spmd_info.first[0]));
+  check_multi_dims_mapping(bmm_grad_spmd_info.first[1], y_dims_mapping);
+  EXPECT_FALSE(is_partial(bmm_grad_spmd_info.first[1]));
+  check_multi_dims_mapping(bmm_grad_spmd_info.first[2], out_grad_dims_mapping);
+  EXPECT_FALSE(is_partial(bmm_grad_spmd_info.first[2]));
+
+  check_multi_dims_mapping(bmm_grad_spmd_info.second[0], x_dims_mapping);
+  EXPECT_FALSE(is_partial(bmm_grad_spmd_info.second[0]));
+  check_multi_dims_mapping(bmm_grad_spmd_info.second[1], y_dims_mapping);
+  EXPECT_TRUE(is_partial(bmm_grad_spmd_info.second[1]));
+  check_partial_dims(bmm_grad_spmd_info.second[1], {2});
+}
 }  // namespace auto_parallel
 }  // namespace distributed
 }  // namespace paddle
