Skip to content

Commit a533db2

Browse files
committed
bugfix: add workspace for flashinfer.
1 parent f745add commit a533db2

File tree

8 files changed

+122
-5
lines changed

8 files changed

+122
-5
lines changed

xllm/core/common/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ cc_library(
1515
rate_limiter.h
1616
types.h
1717
device_monitor.h
18+
flashinfer_workspace.h
1819
SRCS
1920
etcd_client.cpp
2021
global_flags.cpp
@@ -23,6 +24,7 @@ cc_library(
2324
options.cpp
2425
rate_limiter.cpp
2526
device_monitor.cpp
27+
flashinfer_workspace.cpp
2628
DEPS
2729
util
2830
absl::random_random
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/* Copyright 2025 The xLLM Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
https://github.com/jd-opensource/xllm/blob/main/LICENSE
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
==============================================================================*/
15+
16+
#include "flashinfer_workspace.h"
17+
18+
#include "global_flags.h"
19+
20+
namespace xllm {
21+
22+
// Allocates the flashinfer workspace buffers on `device` (plus a pinned host
// mirror), each FLAGS_workspace_buffer_size bytes of uint8 storage.
//
// Only the first call takes effect. Every LLMWorkerImpl constructor calls
// this on the process-wide singleton; without a guard, a second worker would
// re-allocate the buffers (possibly on a different device) while attention
// layers already hold handles to the old storage.
void FlashinferWorkspace::initialize(const torch::Device& device) {
  if (float_workspace_buffer_.defined()) {
    // Already initialized — keep the existing buffers and device binding.
    return;
  }

  // Device scratch used to store intermediate attention results in the
  // split-k algorithm (see FLAGS_workspace_buffer_size description).
  float_workspace_buffer_ =
      torch::empty({FLAGS_workspace_buffer_size},
                   torch::dtype(torch::kUInt8).device(device));
  // Device scratch for flashinfer's integer workspace.
  int_workspace_buffer_ =
      torch::empty({FLAGS_workspace_buffer_size},
                   torch::dtype(torch::kUInt8).device(device));
  // Page-locked (pinned) host mirror of the int workspace, enabling fast
  // host-to-device transfers of flashinfer's planning data.
  page_locked_int_workspace_buffer_ = torch::empty(
      {FLAGS_workspace_buffer_size},
      torch::dtype(torch::kUInt8).device(torch::kCPU).pinned_memory(true));
}
33+
34+
// Returns a shallow handle to the device float workspace buffer; the
// returned tensor shares storage with the singleton's buffer.
torch::Tensor FlashinferWorkspace::get_float_workspace_buffer() {
  return float_workspace_buffer_;
}
37+
38+
// Returns a shallow handle to the device int workspace buffer; the returned
// tensor shares storage with the singleton's buffer.
torch::Tensor FlashinferWorkspace::get_int_workspace_buffer() {
  return int_workspace_buffer_;
}
41+
42+
// Returns a shallow handle to the pinned-host mirror of the int workspace
// buffer; the returned tensor shares storage with the singleton's buffer.
torch::Tensor FlashinferWorkspace::get_page_locked_int_workspace_buffer() {
  return page_locked_int_workspace_buffer_;
}
45+
46+
} // namespace xllm
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
/* Copyright 2025 The xLLM Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
https://github.com/jd-opensource/xllm/blob/main/LICENSE
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
==============================================================================*/
15+
16+
#pragma once
17+
18+
#include <torch/torch.h>
19+
20+
#include <cstdint>
21+
22+
#include "macros.h"
23+
24+
namespace xllm {
25+
26+
class FlashinferWorkspace {
27+
public:
28+
static FlashinferWorkspace& get_instance() {
29+
static FlashinferWorkspace instance;
30+
return instance;
31+
};
32+
33+
void initialize(const torch::Device& device);
34+
35+
torch::Tensor get_float_workspace_buffer();
36+
torch::Tensor get_int_workspace_buffer();
37+
torch::Tensor get_page_locked_int_workspace_buffer();
38+
39+
private:
40+
FlashinferWorkspace() = default;
41+
~FlashinferWorkspace() = default;
42+
DISALLOW_COPY_AND_ASSIGN(FlashinferWorkspace);
43+
44+
torch::Tensor float_workspace_buffer_;
45+
torch::Tensor int_workspace_buffer_;
46+
torch::Tensor page_locked_int_workspace_buffer_;
47+
};
48+
49+
} // namespace xllm

xllm/core/common/global_flags.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -343,7 +343,7 @@ DEFINE_string(store_metadata_connstring,
343343
"",
344344
"The address of the kv cache store metadata service.");
345345

346-
// --- for computation communication parallel ---
346+
// --- computation communication parallel config ---
347347

348348
DEFINE_bool(
349349
enable_multi_stream_parallel,
@@ -355,7 +355,7 @@ DEFINE_int32(default_micro_batch_num,
355355
2,
356356
"Default use two micro batches for multi-stream parallel.");
357357

358-
// --- for dit ---
358+
// --- dit config ---
359359
DEFINE_int32(max_requests_per_batch, 1, "Max number of request per batch.");
360360

361361
// --- continuous kv cache config ---
@@ -378,3 +378,9 @@ DEFINE_int64(cache_size_per_token,
378378
DEFINE_int64(buffer_size_per_seq,
379379
0,
380380
"Buffer size per sequence in bytes, default 0.");
381+
382+
// --- flashinfer config ---
383+
DEFINE_int32(workspace_buffer_size,
384+
512 * 1024 * 1024,
385+
"The user reserved workspace buffer used to store intermediate "
386+
"attention results in split-k algorithm.");

xllm/core/common/global_flags.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,6 @@ DECLARE_int32(max_global_ttft_ms);
189189

190190
DECLARE_int32(max_global_tpot_ms);
191191

192-
// dit
193192
DECLARE_int32(max_requests_per_batch);
194193

195194
DECLARE_bool(enable_continuous_kvcache);
@@ -199,3 +198,5 @@ DECLARE_int64(granularity_size);
199198
DECLARE_int64(cache_size_per_token);
200199

201200
DECLARE_int64(buffer_size_per_seq);
201+
202+
DECLARE_int32(workspace_buffer_size);

xllm/core/kernels/ops_api.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ void batch_decode(AttentionParams& params) {
140140
params.k_cache,
141141
params.output,
142142
params.block_table,
143-
params.seq_lens,
143+
params.kv_seq_lens,
144144
params.v_cache,
145145
params.output_lse,
146146
params.q_quant_scale,

xllm/core/layers/mlu/attention.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ limitations under the License.
1515

1616
#include "attention.h"
1717

18+
#include "common/flashinfer_workspace.h"
1819
#include "kernels/ops_api.h"
1920

2021
DECLARE_bool(enable_chunked_prefill);
@@ -99,6 +100,14 @@ std::tuple<torch::Tensor, std::optional<torch::Tensor>> AttentionImpl::forward(
99100
attention_params.window_size_left = sliding_window_;
100101
attention_params.scale = scale_;
101102
attention_params.compute_dtype = attn_metadata.compute_dtype;
103+
// for flashinfer
104+
attention_params.float_workspace_buffer =
105+
FlashinferWorkspace::get_instance().get_float_workspace_buffer();
106+
attention_params.int_workspace_buffer =
107+
FlashinferWorkspace::get_instance().get_int_workspace_buffer();
108+
attention_params.page_locked_int_workspace_buffer =
109+
FlashinferWorkspace::get_instance()
110+
.get_page_locked_int_workspace_buffer();
102111
attention_params.kv_cu_seq_lens = attn_metadata.kv_cu_seq_lens;
103112
attention_params.q_cu_seq_lens = attn_metadata.q_cu_seq_lens;
104113

xllm/core/runtime/llm_worker_impl.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ limitations under the License.
2626
#include <utility>
2727

2828
#include "common/device_monitor.h"
29+
#include "common/flashinfer_workspace.h"
2930
#include "common/metrics.h"
3031
#include "common/types.h"
3132
#include "core/common/global_flags.h"
@@ -41,7 +42,10 @@ namespace xllm {
4142
LLMWorkerImpl::LLMWorkerImpl(const ParallelArgs& parallel_args,
4243
const torch::Device& device,
4344
const runtime::Options& options)
44-
: WorkerImpl(parallel_args, device, options) {}
45+
: WorkerImpl(parallel_args, device, options) {
46+
// initialize flashinfer workspace
47+
FlashinferWorkspace::get_instance().initialize(device_);
48+
}
4549

4650
bool LLMWorkerImpl::init_model(ModelContext& context) {
4751
CHECK(model_ == nullptr) << "Model is already initialized.";

0 commit comments

Comments
 (0)