Commit f8807a8

a120092009 and Gossity authored
feat: support dp and ep and fix rope_forward on mlu device. (#285)
Co-authored-by: guoxueting <[email protected]>
1 parent 3a84d60 · commit f8807a8


44 files changed: +677 −298 lines
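In brief: dp, tp, and ep here denote data-, tensor-, and expert-parallel process groups. The commit adds moe_tp/moe_ep groups and a `trans` flag to process-group creation, and folds the per-device (MLU/CUDA) process-group implementations into their headers. Only the changed parallel-state files are shown below; the remaining files, including the mlu rope_forward fix, are among the 44 touched.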

xllm/core/framework/parallel_state/CMakeLists.txt

Lines changed: 2 additions & 3 deletions
@@ -16,9 +16,8 @@ cc_library(
   SRCS
     mapping_npu.cpp
     parallel_state.cpp
+    process_group.cpp
     $<$<BOOL:${USE_NPU}>:npu_process_group.cpp>
-    $<$<BOOL:${USE_MLU}>:mlu_process_group.cpp>
-    $<$<BOOL:${USE_CUDA}>:cuda_process_group.cpp>
     collective_communicator.cpp
   DEPS
     :common
@@ -45,4 +44,4 @@ if(USE_NPU)
     c_sec
     spdlog::spdlog
   )
-endif()
+endif()
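A note on this build change: the MLU and CUDA process-group implementations move out of their own translation units and into their headers (see cuda_process_group.h below; both mlu_process_group.cpp and cuda_process_group.cpp are deleted), with the shared logic collected in the new process_group.cpp. The `$<$<BOOL:...>:...>` generator expressions for MLU and CUDA therefore disappear from SRCS, while the NPU entry stays.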

xllm/core/framework/parallel_state/collective_communicator.cpp

Lines changed: 54 additions & 16 deletions
@@ -22,8 +22,6 @@ limitations under the License.
 #include "xllm_kernels/core/include/atb_speed/utils/singleton.h"
 #include "xllm_kernels/models/base/param/mapping.h"
 #elif defined(USE_MLU)
-#include <torch_mlu/csrc/framework/distributed/process_group_cncl.hpp>
-
 #include "mlu_process_group.h"
 #elif defined(USE_CUDA)
 #include "cuda_process_group.h"
@@ -33,25 +31,20 @@ limitations under the License.
 #include "util/net.h"
 
 namespace {
+#if defined(USE_NPU)
 std::unique_ptr<xllm::ProcessGroup> create_process_group(
     int rank,
     int world_size,
     int rank_size,
     int port,
+    bool trans,
     const std::string& host,
     const std::string& group_name,
     const torch::Device& device) {
-#if defined(USE_MLU)
-  return std::make_unique<xllm::ProcessGroupCncl>(
-      rank, world_size, rank_size, port, host, group_name, device);
-#elif defined(USE_CUDA)
-  return std::make_unique<xllm::ProcessGroupNccl>(
-      rank, world_size, rank_size, port, host, group_name, device);
-#else
   LOG(FATAL) << "Unsupported device type";
   return nullptr;
-#endif
 }
+#endif
 }  // namespace
 
 namespace xllm {
@@ -130,24 +123,69 @@ void CollectiveCommunicator::create_process_groups(
   int global_rank = parallel_args_->rank();
   int world_size = parallel_args_->world_size();
   int dp_size = parallel_args_->dp_size();
-
-  process_group_ = create_process_group(
-      global_rank, world_size, world_size, ++port, host, "world_group", device);
+  int ep_size = parallel_args_->ep_size();
+  process_group_ = create_process_group(global_rank,
+                                        world_size,
+                                        world_size,
+                                        ++port,
+                                        false,
+                                        host,
+                                        "world_group",
+                                        device);
+  parallel_args_->process_group_ = process_group_.get();
 
   int tp_size = world_size / dp_size;
   CHECK_EQ(tp_size * dp_size, world_size);
   int port_offset = global_rank / tp_size + 1;
-
   tp_group_ = create_process_group(global_rank,
                                    world_size,
                                    tp_size,
                                    port + port_offset,
+                                   false,
                                    host,
                                    "tp_group",
                                    device);
-
-  parallel_args_->process_group_ = process_group_.get();
   parallel_args_->tp_group_ = tp_group_.get();
+  port += dp_size;
+
+  if (dp_size > 1) {
+    port_offset = global_rank % tp_size + 1;
+    dp_local_process_group_ = create_process_group(global_rank,
+                                                   world_size,
+                                                   dp_size,
+                                                   port + port_offset,
+                                                   true,
+                                                   host,
+                                                   "dp_group",
+                                                   device);
+    parallel_args_->dp_local_process_group_ = dp_local_process_group_.get();
+    port += tp_size;
+  }
+
+  if (ep_size > 1) {
+    int moe_tp_size = world_size / ep_size;
+    port_offset = global_rank / moe_tp_size + 1;
+    moe_tp_group_ = create_process_group(global_rank,
+                                         world_size,
+                                         moe_tp_size,
+                                         port + port_offset,
+                                         false,
+                                         host,
+                                         "moe_tp_group",
+                                         device);
+    parallel_args_->moe_tp_group_ = moe_tp_group_.get();
+    port += ep_size;
+    port_offset = global_rank % moe_tp_size + 1;
+    moe_ep_group_ = create_process_group(global_rank,
+                                         world_size,
+                                         ep_size,
+                                         port + port_offset,
+                                         true,
+                                         host,
+                                         "moe_ep_group",
+                                         device);
+    parallel_args_->moe_ep_group_ = moe_ep_group_.get();
+  }
 }
 
 const ParallelArgs* CollectiveCommunicator::parallel_args() {
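A note on the grouping and port logic above, since it is easy to misread: each call reserves a distinct rendezvous port per group via port_offset, and the `trans` flag selects how ranks are partitioned. `get_group_rank` is not part of this diff, so the following self-contained sketch encodes an assumption about its semantics (`group_rank_sketch` is a hypothetical stand-in): trans == false groups consecutive ranks, as for tp and moe_tp; trans == true groups strided ranks, as for dp and moe_ep.

// Hypothetical, self-contained illustration (not repo code) of the grouping
// the trans flag appears to select. Assumption: trans == false builds groups
// of consecutive ranks (tp-style); trans == true builds strided groups
// (dp/ep-style).
#include <cstdio>
#include <utility>
#include <vector>

std::pair<int, std::vector<int>> group_rank_sketch(int world_size,
                                                   int global_rank,
                                                   int rank_size,
                                                   bool trans) {
  std::vector<int> group_ranks;
  int local_rank = 0;
  if (!trans) {
    // Consecutive: {0..n-1}, {n..2n-1}, ... where n = rank_size.
    int group = global_rank / rank_size;
    for (int i = 0; i < rank_size; ++i) {
      group_ranks.push_back(group * rank_size + i);
    }
    local_rank = global_rank % rank_size;
  } else {
    // Strided: {k, k+s, k+2s, ...} where s = world_size / rank_size.
    int stride = world_size / rank_size;
    int offset = global_rank % stride;
    for (int i = 0; i < rank_size; ++i) {
      group_ranks.push_back(offset + i * stride);
    }
    local_rank = global_rank / stride;
  }
  return {local_rank, group_ranks};
}

int main() {
  // world_size = 8, dp_size = 2 => tp_size = 4.
  // Expected: tp groups {0,1,2,3} {4,5,6,7}; dp groups {0,4} {1,5} {2,6} {3,7}.
  for (int r = 0; r < 8; ++r) {
    auto [tp_local, tp_group] = group_rank_sketch(8, r, 4, false);
    auto [dp_local, dp_group] = group_rank_sketch(8, r, 2, true);
    std::printf("rank %d: tp peers %d..%d (local %d), dp peers {%d,%d} (local %d)\n",
                r, tp_group.front(), tp_group.back(), tp_local,
                dp_group[0], dp_group[1], dp_local);
  }
  return 0;
}

Under that reading, with world_size = 8 and dp_size = 2 there are dp_size = 2 tp groups ({0,1,2,3} and {4,5,6,7}) and tp_size = 4 dp groups ({0,4}, {1,5}, {2,6}, {3,7}), which is why the code advances port by dp_size after the tp groups and by tp_size after the dp groups; the moe_tp/moe_ep pair repeats the same pattern with moe_tp_size = world_size / ep_size.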

xllm/core/framework/parallel_state/collective_communicator.h

Lines changed: 2 additions & 0 deletions
@@ -42,6 +42,8 @@ class CollectiveCommunicator {
   std::unique_ptr<ProcessGroup> process_group_;
   std::unique_ptr<ProcessGroup> dp_local_process_group_;
   std::unique_ptr<ProcessGroup> tp_group_;
+  std::unique_ptr<ProcessGroup> moe_tp_group_;
+  std::unique_ptr<ProcessGroup> moe_ep_group_;
 };
 
 }  // namespace xllm

xllm/core/framework/parallel_state/cuda_process_group.cpp

Lines changed: 0 additions & 69 deletions
This file was deleted.

xllm/core/framework/parallel_state/cuda_process_group.h

Lines changed: 35 additions & 17 deletions
@@ -23,30 +23,48 @@ namespace xllm {
 
 class ProcessGroupNccl : public ProcessGroup {
  public:
-  ProcessGroupNccl(int rank,
+  ProcessGroupNccl(int global_rank,
                    int world_size,
                    int rank_size,
                    int port,
+                   bool trans,
                    const std::string& host,
                    const std::string& group_name,
-                   const torch::Device& device);
+                   const torch::Device& device)
+      : ProcessGroup(device) {
+    c10::intrusive_ptr<c10d::ProcessGroupNCCL::Options> pg_options =
+        c10d::ProcessGroupNCCL::Options::create();
+    pg_options->group_name = group_name;
+    int rank = global_rank;
+    if (world_size != rank_size) {
+      auto [local_rank, group_ranks] =
+          get_group_rank(world_size, global_rank, rank_size, trans);
+      pg_options->global_ranks_in_group = group_ranks;
+      rank = local_rank;
+    }
 
-  ~ProcessGroupNccl() override;
+    auto store = create_tcp_store(host, port, rank);
+    pg_ = std::make_unique<c10d::ProcessGroupNCCL>(
+        store, rank, rank_size, pg_options);
+  }
 
-  void allreduce(torch::Tensor& input) override;
-
-  void allgather(torch::Tensor input,
-                 std::vector<torch::Tensor>& outputs) override;
-
- private:
-  // rank of current process
-  int rank_ = 0;
-
-  // number of processes
-  int world_size_ = 0;
-
-  // nccl process group
-  std::unique_ptr<c10d::ProcessGroupNCCL> nccl_pg_;
+  ~ProcessGroupNccl() override {
+    if (pg_) {
+      pg_->shutdown();
+    }
+  }
 };
 
+std::unique_ptr<xllm::ProcessGroup> create_process_group(
+    int rank,
+    int world_size,
+    int rank_size,
+    int port,
+    bool trans,
+    const std::string& host,
+    const std::string& group_name,
+    const torch::Device& device) {
+  return std::make_unique<ProcessGroupNccl>(
+      rank, world_size, rank_size, port, trans, host, group_name, device);
+}
 }  // namespace xllm
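The constructor now builds the c10d::ProcessGroupNCCL inline, relying on two helpers outside this diff (`create_tcp_store` and `get_group_rank`, presumably declared alongside the ProcessGroup base). For readers unfamiliar with the underlying libtorch API, here is a minimal sketch of the same rendezvous pattern; it is not the repo's code, `make_nccl_group` is a hypothetical name, and the exact Options fields depend on the libtorch version in use.

#include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
#include <torch/csrc/distributed/c10d/TCPStore.hpp>

c10::intrusive_ptr<c10d::ProcessGroupNCCL> make_nccl_group(
    const std::string& host, int port, int rank, int size,
    const std::string& name) {
  // Rendezvous: rank 0 serves the TCPStore, the other ranks connect to
  // host:port, which is what the host/port arguments above imply.
  c10d::TCPStoreOptions store_opts;
  store_opts.port = static_cast<uint16_t>(port);
  store_opts.isServer = (rank == 0);
  store_opts.numWorkers = size;
  auto store = c10::make_intrusive<c10d::TCPStore>(host, store_opts);

  // Same Options usage as the constructor above; for a sub-group one would
  // also fill pg_options->global_ranks_in_group with the member ranks.
  auto pg_options = c10d::ProcessGroupNCCL::Options::create();
  pg_options->group_name = name;
  return c10::make_intrusive<c10d::ProcessGroupNCCL>(store, rank, size, pg_options);
}

Note the shape of the refactor: the per-backend allreduce/allgather overrides and the rank_/world_size_/nccl_pg_ members are gone, the destructor only shuts the group down, and the collectives are presumably provided by the ProcessGroup base over the shared pg_ member introduced by this commit.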

xllm/core/framework/parallel_state/mlu_process_group.cpp

Lines changed: 0 additions & 67 deletions
This file was deleted.
