jd-opensource · yingxudeng · Nov 3, 2025 · Nov 6, 2025 · Nov 6, 2025
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -298,6 +298,7 @@ else()
 endif()
 
 if(USE_NPU)
+  # add_definitions(-DUSE_NPU_TORCH)
   add_definitions(-DUSE_NPU)
   add_definitions(-DBUILD_LIBTORCH)
   add_definitions(-DTORCH_SETCUSTOMHANDLER=ON)
@@ -309,6 +310,7 @@ if(USE_NPU)
       $ENV{PYTORCH_INSTALL_PATH}/include
       $ENV{PYTORCH_INSTALL_PATH}/include/torch/csrc/api/include
       $ENV{PYTORCH_NPU_INSTALL_PATH}/include
+      $ENV{PYTORCH_INSTALL_PATH}/include/torch/csrc/distributed
       $ENV{NPU_HOME_PATH}/include
       $ENV{ATB_HOME_PATH}/include
       $ENV{NPU_HOME_PATH}/opp/vendors/xllm/op_api/include/

diff --git a/cmake/cc_test.cmake b/cmake/cc_test.cmake
@@ -69,6 +69,14 @@ function(cc_test)
     PRIVATE ${CC_TEST_LINKOPTS}
   )
 
+  if(USE_NPU)
+    set(COMMON_LIBS Python::Python torch_npu torch_python)
+  endif()
+
+  if(USE_NPU AND DEFINED COMMON_LIBS)
+    target_link_libraries(${CC_TEST_NAME} PRIVATE ${COMMON_LIBS})
+  endif()
+
   add_dependencies(all_tests ${CC_TEST_NAME})
 
   gtest_add_tests(

diff --git a/xllm/CMakeLists.txt b/xllm/CMakeLists.txt
@@ -34,7 +34,7 @@ target_link_libraries(xllm PRIVATE glog::glog brpc leveldb::leveldb ZLIB::ZLIB p
 add_dependencies(xllm brpc-static)
 
 if(USE_NPU)
-  set(COMMON_LIBS Python::Python ascendcl hccl c_sec nnopbase ms_tools_ext)
+  set(COMMON_LIBS Python::Python ascendcl hccl c_sec nnopbase ms_tools_ext torch_npu torch_python)
 elseif(USE_MLU)
   set(COMMON_LIBS Python::Python)
 endif()

diff --git a/xllm/core/common/CMakeLists.txt b/xllm/core/common/CMakeLists.txt
@@ -28,6 +28,7 @@ cc_library(
     absl::random_random
     absl::strings
     torch
+    $<$<BOOL:${USE_NPU}>:torch_python>
     $<$<BOOL:${USE_NPU}>:torch_npu>
     $<$<BOOL:${USE_MSPTI}>:mspti>
     $<$<BOOL:${USE_NPU}>:ms_tools_ext>

diff --git a/xllm/core/common/global_flags.cpp b/xllm/core/common/global_flags.cpp
@@ -389,3 +389,5 @@ DEFINE_string(reasoning_parser,
 
 // --- qwen3 reranker config ---
 DEFINE_bool(enable_qwen3_reranker, false, "Whether to enable qwen3 reranker.");
+
+DEFINE_bool(enable_native_npu, true, "Whether to enable native NPU support.");
diff --git a/xllm/core/common/global_flags.h b/xllm/core/common/global_flags.h
@@ -202,3 +202,5 @@ DECLARE_bool(enable_qwen3_reranker);
 DECLARE_string(reasoning_parser);
 
 DECLARE_bool(enable_shm);
+
+DECLARE_bool(enable_native_npu);
diff --git a/xllm/core/distributed_runtime/spawn_worker_server/CMakeLists.txt b/xllm/core/distributed_runtime/spawn_worker_server/CMakeLists.txt
@@ -12,6 +12,7 @@ cc_binary(
     :models
     :model
     :distributed_runtime
+    :parallel_state
     absl::strings
     xllm_kernels
     ascendcl

diff --git a/xllm/core/distributed_runtime/worker_server.cpp b/xllm/core/distributed_runtime/worker_server.cpp
@@ -100,6 +100,12 @@ void WorkerServer::create_server(
   const ParallelArgs* parallel_args = comm.parallel_args();
 #if defined(USE_MLU) || defined(USE_CUDA)
   comm.create_process_groups(master_node_addr, device);
+#elif defined(USE_NPU)
+  // TODO: Decide this from model_type (or another suitable enum) rather
+  // than the global enable_native_npu flag.
+  if (FLAGS_enable_native_npu) {
+    comm.create_process_groups(master_node_addr, device);
+  }
 #endif
 
   WorkerType worker_type =

diff --git a/xllm/core/framework/model/CMakeLists.txt b/xllm/core/framework/model/CMakeLists.txt
@@ -17,10 +17,10 @@ set(BASE_DEPS
 if(USE_NPU)
   list(APPEND BASE_DEPS :npu_layers)
   list(APPEND BASE_DEPS :platform_npu)
-else()
-  list(APPEND BASE_DEPS :common_layers)
 endif()
 
+list(APPEND BASE_DEPS :common_layers)
+
 
 # Define the library
 cc_library(

diff --git a/xllm/core/framework/parallel_state/collective_communicator.cpp b/xllm/core/framework/parallel_state/collective_communicator.cpp
@@ -18,6 +18,9 @@ limitations under the License.
 #include "mapping_npu.h"
 
 #if defined(USE_NPU)
+#include <torch_npu/csrc/distributed/ProcessGroupHCCL.hpp>
+
+#include "npu_process_group.h"
 #include "xllm_kernels/core/include/atb_speed/base/external_comm_manager.h"
 #include "xllm_kernels/core/include/atb_speed/utils/singleton.h"
 #include "xllm_kernels/models/base/param/mapping.h"
@@ -30,23 +33,6 @@ limitations under the License.
 #include "parallel_args.h"
 #include "util/net.h"
 
-namespace {
-#if defined(USE_NPU)
-std::unique_ptr<xllm::ProcessGroup> create_process_group(
-    int rank,
-    int world_size,
-    int rank_size,
-    int port,
-    bool trans,
-    const std::string& host,
-    const std::string& group_name,
-    const torch::Device& device) {
-  LOG(FATAL) << "Unsupported device type";
-  return nullptr;
-}
-#endif
-}  // namespace
-
 namespace xllm {
 
 CollectiveCommunicator::CollectiveCommunicator(int global_rank,

diff --git a/xllm/core/framework/parallel_state/npu_process_group.cpp b/xllm/core/framework/parallel_state/npu_process_group.cpp
@@ -14,6 +14,16 @@ limitations under the License.
 ==============================================================================*/
 
 #include "npu_process_group.h"
+#ifdef TORCH_HIGHER_THAN_PTA6
+#include <torch_npu/csrc/framework/OpCommand.h>
+#else
+#include <torch_npu/csrc/aten/NPUNativeFunctions.h>
+#include <torch_npu/csrc/framework/utils/OpPreparation.h>
+#endif
+
+#include <c10d/ProcessGroup.hpp>
+#include <c10d/TCPStore.hpp>
+#include <torch_npu/csrc/distributed/ProcessGroupHCCL.hpp>
 
 namespace {
 
@@ -24,113 +34,65 @@ namespace {
       LOG(FATAL) << "Failed, HCCL error :" << HcclGetErrorString(r); \
     }                                                                \
   } while (0)
+}  // namespace
 
-inline bool is_npu(const at::Tensor& tensor) {
-  if (!tensor.defined()) {
-    return false;
-  }
-  return tensor.device().is_privateuseone();
-}
-
-inline bool is_npu(const at::TensorOptions& options) {
-  return options.device().is_privateuseone();
-}
+namespace xllm {
 
-inline bool is_npu(const at::Device& device) {
-  return device.is_privateuseone();
-}
+ProcessGroupHCCL::ProcessGroupHCCL(int global_rank,
+                                   int world_size,
+                                   int rank_size,
+                                   int port,
+                                   bool trans,
+                                   const std::string& host,
+                                   const std::string& group_name,
+                                   const torch::Device& device)
+    : ProcessGroup(device) {
+  c10::intrusive_ptr<c10d_npu::ProcessGroupHCCL::Options> hccl_pg_options =
+      c10d_npu::ProcessGroupHCCL::Options::create();
+  // hccl_pg_options->group_name = group_name;
+  int rank = global_rank;
+  if (world_size != rank_size) {
+    auto [local_rank, group_ranks] =
+        get_group_rank(world_size, global_rank, rank_size, trans);
+    std::vector<uint32_t> uint32_ranks;
+    for (auto rank : group_ranks) {
+      uint32_ranks.push_back(static_cast<uint32_t>(rank));
+    }
+    hccl_pg_options->global_ranks_in_group = uint32_ranks;
+    rank = local_rank;
+  }
 
-at::Tensor flatten_for_scatter_gather(std::vector<at::Tensor>& tensors) {
-  auto& t = tensors[0];
-  std::vector<int64_t> sizes{static_cast<int64_t>(tensors.size())};
-  sizes.insert(sizes.end(), t.sizes().begin(), t.sizes().end());
-  return at::empty(sizes, t.options());
+  auto store = create_tcp_store(host, port, rank);
+  pg_ = std::make_unique<c10d_npu::ProcessGroupHCCL>(
+      store, rank, rank_size, hccl_pg_options);
 }
 
-HcclDataType to_hccl_data_type(const torch::Tensor& input) {
-  const auto type = input.scalar_type();
-  switch (type) {
-    case at::kFloat:
-      return HCCL_DATA_TYPE_FP32;
-    case at::kHalf:
-      return HCCL_DATA_TYPE_FP16;
-    case at::kDouble:
-      return HCCL_DATA_TYPE_FP64;
-    case at::kLong:
-      return HCCL_DATA_TYPE_INT64;
-    case at::kInt:
-      return HCCL_DATA_TYPE_INT32;
-    case at::kChar:
-      return HCCL_DATA_TYPE_INT8;
-    case at::kByte:
-      return HCCL_DATA_TYPE_UINT8;
-    case at::kBool:
-      return HCCL_DATA_TYPE_UINT8;
-    case at::kBFloat16:
-      return HCCL_DATA_TYPE_BFP16;
-    default:
-      TORCH_CHECK(false, "Unconvertible HCCL type ", type);
+// Destructor: shuts down pg_ when set, else destroys the raw HCCL comm_.
+ProcessGroupHCCL::~ProcessGroupHCCL() {
+  if (pg_) {
+    pg_->shutdown();
+  } else {
+    HCCLCHECK(HcclCommDestroy(comm_));
   }
 }
 
-void check_input(torch::Tensor input) {
-  CHECK(is_npu(input)) << "input should be npu tensor";
-  CHECK(input.is_contiguous()) << "input should be contiguous";
-  CHECK(!input.is_sparse()) << "input have to be npu dense tensor";
-}
-
-}  // namespace
-
-namespace xllm {
-
 ProcessGroupHCCL::ProcessGroupHCCL(int rank,
                                    int world_size,
                                    const torch::Device& device,
                                    HcclComm comm)
     : ProcessGroup(device), comm_(comm) {}
-// Destructor.
-ProcessGroupHCCL::~ProcessGroupHCCL() { HCCLCHECK(HcclCommDestroy(comm_)); }
 
-void ProcessGroupHCCL::allreduce(torch::Tensor& input) {
-  DCHECK(input.device() == device())
-      << "input should be on the same device as the process group";
-  check_input(input);
-  // inplace all reduce
-  // const auto count = input.numel();
-  // const auto data_type = to_hccl_data_type(input);
-  // auto stream = c10_npu::getCurrentNPUStream();
-  // torch::DeviceGuard device_guard(device());
-  // HCCLCHECK(HcclAllReduce(
-  //     /*sendbuff=*/input.data_ptr(),
-  //     /*recvbuff=*/input.data_ptr(),
-  //     /*count=*/count,
-  //     /*datatype=*/data_type,
-  //     /*op=*/HCCL_REDUCE_SUM,
-  //     /*comm=*/comm_,
-  //     /*stream=*/stream));
-}
-void ProcessGroupHCCL::allgather(const torch::Tensor& input,
-                                 std::vector<torch::Tensor>& outputs) {
-  check_input(input);
-  // CHECK(outputs.size() == world_size())
-  //     << "outputs should have the same size as world_size";
-  // DCHECK(input.device() == device())
-  //     << "input should be on the same device as the process group";
-  // torch::DeviceGuard device_guard(device());
-  // torch::Tensor flattened_output = flatten_for_scatter_gather(outputs);
-  // const auto count = input.numel();
-  // const auto data_type = to_hccl_data_type(input);
-  // auto stream = c10_npu::getCurrentNPUStream();
-  // HCCLCHECK(HcclAllGather(
-  //     /*sendbuff=*/input.data_ptr(),
-  //     /*recvbuff=*/flattened_output.data_ptr(),
-  //     /*sendcount=*/count,
-  //     /*datatype=*/data_type,
-  //     /*comm=*/comm_,
-  //     /*stream=*/stream));
-  // // copy the flattened output tensors to the outputs.
-  // for (int i = 0; i < outputs.size(); ++i) {
-  //   outputs[i].copy_(flattened_output[i], /*non_blocking=*/true);
-  // }
+std::unique_ptr<xllm::ProcessGroup> create_process_group(
+    int rank,
+    int world_size,
+    int rank_size,
+    int port,
+    bool trans,
+    const std::string& host,
+    const std::string& group_name,
+    const torch::Device& device) {
+  return std::make_unique<ProcessGroupHCCL>(
+      rank, world_size, rank_size, port, trans, host, group_name, device);
 }
+
 }  // namespace xllm
diff --git a/xllm/core/framework/parallel_state/npu_process_group.h b/xllm/core/framework/parallel_state/npu_process_group.h
@@ -18,6 +18,10 @@ limitations under the License.
 #include "hccl/hccl.h"
 #include "process_group.h"
 
+namespace c10d_npu {
+class ProcessGroupHCCL;
+}
+
 namespace xllm {
 
 class ProcessGroupHCCL : public ProcessGroup {
@@ -28,16 +32,30 @@ class ProcessGroupHCCL : public ProcessGroup {
                    const torch::Device& device,
                    HcclComm comm);
 
+  ProcessGroupHCCL(int rank,
+                   int world_size,
+                   int rank_size,
+                   int port,
+                   bool trans,
+                   const std::string& host,
+                   const std::string& group_name,
+                   const torch::Device& device);
+
   // Destructor.
   ~ProcessGroupHCCL() override;
 
-  void allreduce(torch::Tensor& input) override;
-
-  void allgather(const torch::Tensor& input,
-                 std::vector<torch::Tensor>& outputs) override;
-
  private:
   HcclComm comm_ = nullptr;
 };
 
+std::unique_ptr<xllm::ProcessGroup> create_process_group(
+    int rank,
+    int world_size,
+    int rank_size,
+    int port,
+    bool trans,
+    const std::string& host,
+    const std::string& group_name,
+    const torch::Device& device);
+
 }  // namespace xllm
diff --git a/xllm/core/framework/parallel_state/process_group.h b/xllm/core/framework/parallel_state/process_group.h
@@ -19,6 +19,11 @@ limitations under the License.
 
 #include <torch/csrc/distributed/c10d/Backend.hpp>
 #include <torch/csrc/distributed/c10d/TCPStore.hpp>
+
+#if defined(USE_NPU)
+#include <torch_npu/csrc/distributed/ProcessGroupHCCL.hpp>
+#endif
+
 namespace xllm {
 std::pair<int, std::vector<uint64_t>> get_group_rank(int world_size,
                                                      int global_rank,
@@ -60,7 +65,11 @@ class ProcessGroup {
   torch::Device device_;
 
  protected:
+#if defined(USE_NPU)
+  std::unique_ptr<c10d_npu::ProcessGroupHCCL> pg_{nullptr};
+#else
   std::unique_ptr<c10d::Backend> pg_{nullptr};
+#endif
 };
 
 }  // namespace xllm
Original file line number	Diff line number	Diff line change
Expand Up		@@ -389,3 +389,5 @@ DEFINE_string(reasoning_parser,

		// --- qwen3 reranker config ---
		DEFINE_bool(enable_qwen3_reranker, false, "Whether to enable qwen3 reranker.");

		DEFINE_bool(enable_native_npu, true, "Whether to enable native NPU support.");
Original file line number	Diff line number	Diff line change
Expand Up		@@ -202,3 +202,5 @@ DECLARE_bool(enable_qwen3_reranker);
		DECLARE_string(reasoning_parser);

		DECLARE_bool(enable_shm);

		DECLARE_bool(enable_native_npu);