Skip to content

Commit 2c32009

Browse files
authored
refactor: stop building the flash_infer kernel (#386)
1 parent 90f5415 commit 2c32009

File tree

5 files changed

+3
-13
lines changed

5 files changed

+3
-13
lines changed

scalellm/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ pybind_extension(
1818
DEPS
1919
:llm_handler
2020
:marlin.kernels
21-
:flash_infer.kernels
2221
torch
2322
torch_python
2423
absl::strings

scalellm/csrc/kernels.cu

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
#include <pybind11/pybind11.h>
22
#include <torch/extension.h>
33

4-
#include "kernels/attention/flash_infer/attention_wrapper.h"
54
#include "kernels/quantization/marlin.h"
65

76
namespace llm::csrc {
@@ -53,16 +52,6 @@ void init_kernels(py::module_& m) {
5352
py::arg("q_weight"),
5453
py::arg("out"),
5554
py::arg("num_bits"));
56-
57-
// flashinfer kernels
58-
py::class_<flashinfer::BatchPrefillWrapper>(m, "BatchPrefillWrapper")
59-
.def(py::init<bool>())
60-
.def("plan", &flashinfer::BatchPrefillWrapper::Plan)
61-
.def("is_cuda_graph_enabled",
62-
&flashinfer::BatchPrefillWrapper::IsCUDAGraphEnabled)
63-
.def("update_page_locked_buffer_size",
64-
&flashinfer::BatchPrefillWrapper::UpdatePageLockedBufferSize)
65-
.def("run", &flashinfer::BatchPrefillWrapper::Run);
6655
}
6756

6857
} // namespace llm::csrc

src/kernels/attention/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,5 +80,5 @@ cc_binary(
8080
)
8181

8282
add_subdirectory(flash_attn)
83-
add_subdirectory(flash_infer)
83+
# add_subdirectory(flash_infer)
8484
add_subdirectory(tools)

tests/kernels/attention/flash_infer_kv_fp8_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import scalellm._C.kernels as kernels # type: ignore
88

99

10+
@pytest.mark.skip(reason="Not implemented")
1011
@pytest.mark.parametrize("seq_lens", [[(1, 100)], [(100, 100)], [(1, 100), (15, 15), (111, 234), (1000, 10000)]])
1112
@pytest.mark.parametrize("num_heads", [(8, 8), (8, 4), (8, 2), (8, 1)])
1213
@pytest.mark.parametrize("head_size", [64, 128, 256])

tests/kernels/attention/flash_infer_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import scalellm._C.kernels as kernels # type: ignore
88

99

10+
@pytest.mark.skip(reason="Not implemented")
1011
@pytest.mark.parametrize("seq_lens", [[(1, 100)], [(100, 100)], [(1, 100), (15, 15), (111, 234), (1000, 10000)]])
1112
@pytest.mark.parametrize("num_heads", [(8, 8), (8, 4), (8, 2), (8, 1)])
1213
@pytest.mark.parametrize("head_size", [64, 128, 256])

0 commit comments

Comments (0)