jd-opensource
diff --git a/‎.gitmodules‎
Lines changed: 12 additions & 0 deletions b/‎.gitmodules‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 17 additions & 1 deletion b/‎CMakeLists.txt‎
Lines changed: 17 additions & 1 deletion
diff --git a/‎setup.py‎
Lines changed: 17 additions & 5 deletions b/‎setup.py‎
Lines changed: 17 additions & 5 deletions
diff --git a/‎third_party/CMakeLists.txt‎
Lines changed: 38 additions & 0 deletions b/‎third_party/CMakeLists.txt‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎third_party/cutlass‎ b/‎third_party/cutlass‎
diff --git a/‎third_party/dlpack‎ b/‎third_party/dlpack‎
diff --git a/‎third_party/flashinfer‎ b/‎third_party/flashinfer‎
diff --git a/‎third_party/tvm-ffi‎ b/‎third_party/tvm-ffi‎
diff --git a/‎xllm/core/common/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions b/‎xllm/core/common/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎xllm/core/common/flashinfer_workspace.cpp‎
Lines changed: 46 additions & 0 deletions b/‎xllm/core/common/flashinfer_workspace.cpp‎
Lines changed: 46 additions & 0 deletions
@@ -28,3 +28,15 @@
 [submodule "third_party/Mooncake"]
 	path = third_party/Mooncake
 	url = https://gitcode.com/xLLM-AI/Mooncake.git
+[submodule "third_party/flashinfer"]
+	path = third_party/flashinfer
+	url = https://gitcode.com/xLLM-AI/flashinfer.git
+[submodule "third_party/cutlass"]
+	path = third_party/cutlass
+	url = https://gitcode.com/xLLM-AI/cutlass.git
+[submodule "third_party/tvm-ffi"]
+	path = third_party/tvm-ffi
+	url = https://gitcode.com/xLLM-AI/tvm-ffi.git
+[submodule "third_party/dlpack"]
+	path = third_party/dlpack
+	url = https://gitcode.com/xLLM-AI/dlpack.git
@@ -101,7 +101,7 @@ set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS ON)
 
-if(USE_NPU)
+if(USE_NPU OR USE_CUDA)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
   add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
 elseif(USE_MLU)
@@ -352,6 +352,22 @@ if(USE_MLU)
   )
 endif()
 
+if(USE_CUDA)  # configure-time setup applied only when the CUDA backend is selected
+  add_definitions(-DUSE_CUDA)  # directory-scoped: defines the USE_CUDA macro for targets declared below
+  set(CMAKE_VERBOSE_MAKEFILE ON)  # emit full compile/link command lines during the build
+  include_directories(  # header search paths; $ENV{...} is read once, at configure time
+      $ENV{PYTHON_INCLUDE_PATH}
+      $ENV{PYTORCH_INSTALL_PATH}/include
+      $ENV{PYTORCH_INSTALL_PATH}/include/torch/csrc/api/include
+  )
+
+  link_directories(  # library search paths, searched in the order listed
+    $ENV{PYTHON_LIB_PATH}
+    $ENV{PYTORCH_INSTALL_PATH}/lib
+    $ENV{CUDA_TOOLKIT_ROOT_DIR}/lib64  # NOTE(review): not exported by set_cuda_envs() in setup.py; expands to /lib64 when unset
+  )
+endif()
+
 # check if USE_CXX11_ABI is set correctly
 # if (DEFINED USE_CXX11_ABI)
 #   parse_make_options(${TORCH_CXX_FLAGS} "TORCH_CXX_FLAGS")
 
@@ -212,7 +212,13 @@ def set_mlu_envs():
     os.environ["LIBTORCH_ROOT"] = get_torch_root_path()
     os.environ["PYTORCH_INSTALL_PATH"] = get_torch_root_path()
     os.environ["PYTORCH_MLU_INSTALL_PATH"] = get_torch_mlu_root_path()
-
+    
+def set_cuda_envs():  # export the Python/PyTorch locations that the CMake configure step reads via $ENV{...}
+    os.environ["PYTHON_INCLUDE_PATH"] = get_python_include_path()
+    # NOTE(review): PYTHON_LIB_PATH is pointed at the torch root like the two torch variables - confirm intended.
+    for torch_root_var in ("PYTHON_LIB_PATH", "LIBTORCH_ROOT", "PYTORCH_INSTALL_PATH"):
+        os.environ[torch_root_var] = get_torch_root_path()
+    
 class CMakeExtension(Extension):
     def __init__(self, name: str, path: str, sourcedir: str = "") -> None:
         super().__init__(name, sources=[])
@@ -223,7 +229,7 @@ def __init__(self, name: str, path: str, sourcedir: str = "") -> None:
 class ExtBuild(build_ext):
     user_options = build_ext.user_options + [
         ("base-dir=", None, "base directory of xLLM project"),
-        ("device=", None, "target device type (a3 or a2 or mlu)"),
+        ("device=", None, "target device type (a3 or a2 or mlu or cuda)"),
         ("arch=", None, "target arch type (x86 or arm)"),
         ("install-xllm-kernels=", None, "install xllm_kernels RPM package (true/false)"),
     ]
@@ -302,8 +308,14 @@ def build_extension(self, ext: CMakeExtension):
             cmake_args += ["-DUSE_MLU=ON"]
             # set mlu environment variables
             set_mlu_envs()
+        elif self.device == "cuda":
+            cuda_architectures = "80;89;90"
+            cmake_args += ["-DUSE_CUDA=ON", 
+                           f"-DCMAKE_CUDA_ARCHITECTURES={cuda_architectures}"]
+            # set cuda environment variables
+            set_cuda_envs()
         else:
-            raise ValueError("Please set --device to a2 or a3 or mlu.")
+            raise ValueError("Please set --device to a2 or a3 or mlu or cuda.")
 
 
         # Adding CMake arguments set as environment variable
@@ -353,7 +365,7 @@ def build_extension(self, ext: CMakeExtension):
 
 class BuildDistWheel(bdist_wheel):
     user_options = bdist_wheel.user_options + [
-        ("device=", None, "target device type (a3 or a2 or mlu)"),
+        ("device=", None, "target device type (a3 or a2 or mlu or cuda)"),
         ("arch=", None, "target arch type (x86 or arm)"),
     ]
 
@@ -530,7 +542,7 @@ def apply_patch():
         idx = sys.argv.index('--device')
         if idx + 1 < len(sys.argv):
             device = sys.argv[idx+1].lower()
-            if device not in ('a2', 'a3', 'mlu'):
-                print("Error: --device must be a2 or a3 or mlu (case-insensitive)")
+            if device not in ('a2', 'a3', 'mlu', 'cuda'):  # keep this tuple and the message below in sync
+                print("Error: --device must be a2 or a3 or mlu or cuda (case-insensitive)")
                 sys.exit(1)
             # Remove the arguments so setup() doesn't see them
 
@@ -20,3 +20,41 @@ target_include_directories(mooncake_store PUBLIC
 )
 
 target_link_libraries(mooncake_store PUBLIC transfer_engine cachelib_memory_allocator)
+
+
+if(USE_CUDA)  # include-path-only targets (no SRCS) for the CUDA build's third-party submodules
+  cc_library(
+    NAME
+      cutlass
+    INCLUDES
+      cutlass/include
+      cutlass/tools/util/include
+    DEPS
+      torch # TODO: depends on CUDA instead of torch
+  )
+  cc_library(
+    NAME
+      dlpack
+    INCLUDES
+      dlpack/include
+  )
+  cc_library(
+    NAME
+      tvm_ffi
+    INCLUDES
+      tvm-ffi/include  # submodule lives at third_party/tvm-ffi (hyphen); only the target name uses an underscore
+    DEPS
+      dlpack
+  )
+  cc_library(
+    NAME
+      flashinfer
+    INCLUDES
+      flashinfer/include
+      flashinfer/csrc
+    DEPS
+      cutlass
+      tvm_ffi
+      dlpack
+  )
+endif()
@@ -15,6 +15,7 @@ cc_library(
     rate_limiter.h
     types.h
     device_monitor.h
+    flashinfer_workspace.h
   SRCS
     etcd_client.cpp
     global_flags.cpp
@@ -23,6 +24,7 @@ cc_library(
     options.cpp
     rate_limiter.cpp
     device_monitor.cpp
+    flashinfer_workspace.cpp
   DEPS
     util
     absl::random_random
 
@@ -0,0 +1,46 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "flashinfer_workspace.h"
+
+#include "global_flags.h"
+
+namespace xllm {
+
+// Allocates three uint8 tensors of FLAGS_workspace_buffer_size elements: two
+// on `device` and one in pinned (page-locked) CPU memory.
+void FlashinferWorkspace::initialize(const torch::Device& device) {
+  const auto bytes = torch::dtype(torch::kUInt8);
+  const auto on_dev = bytes.device(device);
+  const auto pinned = bytes.device(torch::kCPU).pinned_memory(true);
+  float_workspace_buffer_ = torch::empty({FLAGS_workspace_buffer_size}, on_dev);
+  int_workspace_buffer_ = torch::empty({FLAGS_workspace_buffer_size}, on_dev);
+  page_locked_int_workspace_buffer_ =
+      torch::empty({FLAGS_workspace_buffer_size}, pinned);
+}
+
+torch::Tensor FlashinferWorkspace::get_float_workspace_buffer() {
+  return float_workspace_buffer_;  // device-side buffer set by initialize()
+}
+
+torch::Tensor FlashinferWorkspace::get_int_workspace_buffer() {
+  return int_workspace_buffer_;  // device-side buffer set by initialize()
+}
+
+torch::Tensor FlashinferWorkspace::get_page_locked_int_workspace_buffer() {
+  return page_locked_int_workspace_buffer_;  // pinned CPU buffer set by initialize()
+}
+
+}  // namespace xllm