/* Copyright 2025 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "mlu_ops_api.h"
#include "torch_mlu_ops.h"

19+ namespace xllm ::mlu {
20+
21+ void reshape_paged_cache (const torch::Tensor& key,
22+ const torch::Tensor& value,
23+ torch::Tensor& k_cache,
24+ torch::Tensor& v_cache,
25+ const torch::Tensor& slot_mapping,
26+ bool direction) {
27+ tmo::torch_api::reshape_paged_cache (
28+ key, value, k_cache, v_cache, slot_mapping, direction);
29+ }
30+
31+ void flash_attention (const torch::Tensor& query,
32+ const torch::Tensor& key,
33+ const torch::Tensor& value,
34+ torch::Tensor& output,
35+ torch::Tensor& output_lse,
36+ int query_start_loc,
37+ int seq_start_loc,
38+ const std::optional<torch::Tensor>& alibi_slope,
39+ const std::optional<torch::Tensor>& attn_bias,
40+ const std::optional<torch::Tensor>& q_quant_scale,
41+ const std::optional<torch::Tensor>& k_quant_scale,
42+ const std::optional<torch::Tensor>& v_quant_scale,
43+ const std::optional<torch::Tensor>& out_quant_scale,
44+ const std::optional<torch::Tensor>& block_tables,
45+ int max_query_len,
46+ int max_seq_len,
47+ float scale,
48+ bool is_causal,
49+ int window_size_left,
50+ int window_size_right,
51+ const std::string& compute_dtype,
52+ bool return_lse) {
53+ tmo::torch_api::flash_attention (query,
54+ key,
55+ value,
56+ output,
57+ output_lse,
58+ query_start_loc,
59+ seq_start_loc,
60+ alibi_slope,
61+ attn_bias,
62+ q_quant_scale,
63+ k_quant_scale,
64+ v_quant_scale,
65+ out_quant_scale,
66+ block_tables,
67+ max_query_len,
68+ max_seq_len,
69+ scale,
70+ is_causal,
71+ window_size_left,
72+ window_size_right,
73+ compute_dtype,
74+ return_lse);
75+ }
76+
77+ void single_query_cached_kv_attn (
78+ const torch::Tensor& query,
79+ const torch::Tensor& k_cache,
80+ torch::Tensor& output,
81+ const torch::Tensor& block_table,
82+ const torch::Tensor& seq_lens,
83+ const torch::Tensor& v_cache,
84+ torch::Tensor& output_lse,
85+ const std::optional<torch::Tensor>& q_quant_scale,
86+ const std::optional<torch::Tensor>& k_cache_quant_scale,
87+ const std::optional<torch::Tensor>& v_cache_quant_scale,
88+ const std::optional<torch::Tensor>& out_quant_scale,
89+ const std::optional<torch::Tensor>& alibi_slope,
90+ const std::optional<torch::Tensor>& mask,
91+ const std::string& compute_dtype,
92+ int max_seq_len,
93+ int window_size_left,
94+ int window_size_right,
95+ float scale,
96+ bool return_lse,
97+ int kv_cache_quant_bit_size) {
98+ tmo::torch_api::single_query_cached_kv_attn (query,
99+ k_cache,
100+ output,
101+ block_table,
102+ seq_lens,
103+ v_cache,
104+ output_lse,
105+ q_quant_scale,
106+ k_cache_quant_scale,
107+ v_cache_quant_scale,
108+ out_quant_scale,
109+ alibi_slope,
110+ mask,
111+ compute_dtype,
112+ max_seq_len,
113+ window_size_left,
114+ window_size_right,
115+ scale,
116+ return_lse,
117+ kv_cache_quant_bit_size);
118+ }
119+
120+ } // namespace xllm::mlu