jd-opensource
diff --git a/CMakeLists.txt b/CMakeLists.txt
Lines changed: 2 additions & 0 deletions
diff --git a/setup.py b/setup.py
Lines changed: 1 addition & 1 deletion
diff --git a/xllm/CMakeLists.txt b/xllm/CMakeLists.txt
Lines changed: 1 addition & 1 deletion
diff --git a/xllm/core/common/CMakeLists.txt b/xllm/core/common/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
diff --git a/xllm/core/common/global_flags.cpp b/xllm/core/common/global_flags.cpp
Lines changed: 2 additions & 0 deletions
diff --git a/xllm/core/common/global_flags.h b/xllm/core/common/global_flags.h
Lines changed: 2 additions & 0 deletions
diff --git a/xllm/core/distributed_runtime/spawn_worker_server/CMakeLists.txt b/xllm/core/distributed_runtime/spawn_worker_server/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
diff --git a/xllm/core/distributed_runtime/worker_server.cpp b/xllm/core/distributed_runtime/worker_server.cpp
Lines changed: 6 additions & 0 deletions
diff --git a/xllm/core/framework/model/CMakeLists.txt b/xllm/core/framework/model/CMakeLists.txt
Lines changed: 2 additions & 2 deletions
diff --git a/xllm/core/framework/parallel_state/collective_communicator.cpp b/xllm/core/framework/parallel_state/collective_communicator.cpp
Lines changed: 3 additions & 17 deletions
@@ -298,6 +298,7 @@ else()
 endif()
 
 if(USE_NPU)
+  add_definitions(-DUSE_NPU_TORCH)
   add_definitions(-DUSE_NPU)
   add_definitions(-DBUILD_LIBTORCH)
   add_definitions(-DTORCH_SETCUSTOMHANDLER=ON)
@@ -309,6 +310,7 @@ if(USE_NPU)
       $ENV{PYTORCH_INSTALL_PATH}/include
       $ENV{PYTORCH_INSTALL_PATH}/include/torch/csrc/api/include
       $ENV{PYTORCH_NPU_INSTALL_PATH}/include
+      $ENV{PYTORCH_INSTALL_PATH}/include/torch/csrc/distributed
       $ENV{NPU_HOME_PATH}/include
       $ENV{ATB_HOME_PATH}/include
       $ENV{NPU_HOME_PATH}/opp/vendors/xllm/op_api/include/
 
@@ -17,7 +17,7 @@
 from setuptools.command.bdist_wheel import bdist_wheel
 from setuptools.command.build_ext import build_ext
 
-BUILD_TEST_FILE = True
+BUILD_TEST_FILE = False
 BUILD_EXPORT = True
 
 # get cpu architecture
 
@@ -34,7 +34,7 @@ target_link_libraries(xllm PRIVATE glog::glog brpc leveldb::leveldb ZLIB::ZLIB p
 add_dependencies(xllm brpc-static)
 
 if(USE_NPU)
-  set(COMMON_LIBS Python::Python ascendcl hccl c_sec nnopbase ms_tools_ext)
+  set(COMMON_LIBS Python::Python ascendcl hccl c_sec nnopbase ms_tools_ext torch_npu torch_python)
 elseif(USE_MLU)
   set(COMMON_LIBS Python::Python)
 endif()
 
@@ -28,6 +28,7 @@ cc_library(
     absl::random_random
     absl::strings
     torch
+    $<$<BOOL:${USE_NPU}>:torch_python>
     $<$<BOOL:${USE_NPU}>:torch_npu>
     $<$<BOOL:${USE_MSPTI}>:mspti>
     $<$<BOOL:${USE_NPU}>:ms_tools_ext>
 
@@ -389,3 +389,5 @@ DEFINE_string(reasoning_parser,
 
 // --- qwen3 reranker config ---
 DEFINE_bool(enable_qwen3_reranker, false, "Whether to enable qwen3 reranker.");
+
+DEFINE_bool(enable_native_npu, true, "Whether to enable native NPU support.");
@@ -202,3 +202,5 @@ DECLARE_bool(enable_qwen3_reranker);
 DECLARE_string(reasoning_parser);
 
 DECLARE_bool(enable_shm);
+
+DECLARE_bool(enable_native_npu);
@@ -12,6 +12,7 @@ cc_binary(
     :models
     :model
     :distributed_runtime
+    :parallel_state
     absl::strings
     xllm_kernels
     ascendcl
 
@@ -100,6 +100,12 @@ void WorkerServer::create_server(
   const ParallelArgs* parallel_args = comm.parallel_args();
 #if defined(USE_MLU) || defined(USE_CUDA)
   comm.create_process_groups(master_node_addr, device);
+#elif defined(USE_NPU)
+  // TODO: Refactor to use model_type or other appropriate enumeration for
+  // condition checking
+  if (FLAGS_enable_native_npu) {
+    comm.create_process_groups(master_node_addr, device);
+  }
 #endif
 
   WorkerType worker_type =
 
@@ -17,10 +17,10 @@ set(BASE_DEPS
 if(USE_NPU)
   list(APPEND BASE_DEPS :npu_layers)
   list(APPEND BASE_DEPS :platform_npu)
-else()
-  list(APPEND BASE_DEPS :common_layers)
 endif()
 
+list(APPEND BASE_DEPS :common_layers)
+
 
 # Define the library
 cc_library(
 
@@ -18,6 +18,9 @@ limitations under the License.
 #include "mapping_npu.h"
 
 #if defined(USE_NPU)
+#include <torch_npu/csrc/distributed/ProcessGroupHCCL.hpp>
+
+#include "npu_process_group.h"
 #include "xllm_kernels/core/include/atb_speed/base/external_comm_manager.h"
 #include "xllm_kernels/core/include/atb_speed/utils/singleton.h"
 #include "xllm_kernels/models/base/param/mapping.h"
@@ -30,23 +33,6 @@ limitations under the License.
 #include "parallel_args.h"
 #include "util/net.h"
 
-namespace {
-#if defined(USE_NPU)
-std::unique_ptr<xllm::ProcessGroup> create_process_group(
-    int rank,
-    int world_size,
-    int rank_size,
-    int port,
-    bool trans,
-    const std::string& host,
-    const std::string& group_name,
-    const torch::Device& device) {
-  LOG(FATAL) << "Unsupported device type";
-  return nullptr;
-}
-#endif
-}  // namespace
-
 namespace xllm {
 
 CollectiveCommunicator::CollectiveCommunicator(int global_rank,