bugfix: fix unsupported stream synchronization on NPU. (jd-opensource#351)

yq33victor · web-flow · commit 47bcce57af59 · 2025-11-11T17:14:27.000+08:00
Signed-off-by: Tao Peng <pengtao.156@jd.com>
diff --git a/xllm/core/platform/device.cpp b/xllm/core/platform/device.cpp
@@ -101,7 +101,7 @@ int64_t Device::free_memory() { return get_device_mem().free_memory; }
 
 int Device::synchronize_default_stream() {
 #if defined(USE_NPU)
-  c10_npu::getCurrentNPUStream(index()).synchronize();
+  return aclrtSynchronizeStream(c10_npu::getCurrentNPUStream(index()).stream());
 #elif defined(USE_MLU)
   torch_mlu::getCurrentMLUStream(index()).synchronize();
 #elif defined(USE_CUDA)
diff --git a/xllm/core/platform/stream.cpp b/xllm/core/platform/stream.cpp
@@ -26,8 +26,12 @@ Stream::Stream() : stream_(c10::cuda::getStreamFromPool()) {}
 #endif
 
 int Stream::synchronize() const {
+#if defined(USE_NPU)
+  return aclrtSynchronizeStream(stream_.stream());
+#else
   stream_.unwrap().synchronize();
   return 0;
+#endif
 }
 
 c10::StreamGuard Stream::set_stream_guard() const {
diff --git a/xllm/core/scheduler/continuous_scheduler.cpp b/xllm/core/scheduler/continuous_scheduler.cpp
@@ -960,7 +960,8 @@ void ContinuousScheduler::generate() {
   while (num_pending_requests() > 0 || !batch_empty ||
          request_queue_.size() > 0) {
     // build a batch of requests/sequences
-    auto batch = prepare_batch();
+    const auto timeout = absl::Milliseconds(500);
+    std::vector<Batch> batch = schedule_request(timeout);
     batch_empty = true;
     for (auto& b : batch) {
       batch_empty &= b.empty();
diff --git a/xllm/models/llm/llm_model_base.h b/xllm/models/llm/llm_model_base.h
@@ -96,7 +96,6 @@ class LlmDecoderLayerImplBase : public torch::nn::Module {
                                 int node_id,
                                 std::vector<aclrtEvent*> event,
                                 std::vector<std::atomic<bool>*> event_flag) {
-#if defined(USE_NPU)
     auto micro_batch_num = x.size();
     for (auto i = 0; i < micro_batch_num; ++i) {
       if (input_params[i].src_block_indices.numel() > 0) {
@@ -108,7 +107,7 @@ class LlmDecoderLayerImplBase : public torch::nn::Module {
                     0);
       }
     }
-#endif
+
     return decoder_layer_(x,
                           cos_pos,
                           sin_pos,