@@ -7,15 +7,14 @@
 #include <boost/algorithm/string.hpp>
 #include <memory>
 
-#include "flash_attn_handler.h"
 #include "scale_attn_handler.h"
 #include "layers/pos_embedding.h"
 #include "ref_handler.h"
 
 // decide which attention implementation to use
 DEFINE_string(attention_handler,
               "auto",
-              "attention handler, e.g. auto, pytorch, flash_attn");
+              "attention handler, e.g. auto, pytorch, scale_attn");
 
 namespace llm {
 
@@ -64,15 +63,15 @@ std::unique_ptr<AttentionHandler> AttentionHandler::create_handler_with_alibi(
   }
 
   const bool is_cuda = options.device().is_cuda();
-  if (boost::iequals(FLAGS_attention_handler, "flash_attn")) {
-    CHECK(is_cuda) << "flash_attn only supports cuda device";
-    return std::make_unique<FlashAttnHandler>(
+  if (boost::iequals(FLAGS_attention_handler, "scale_attn")) {
+    CHECK(is_cuda) << "scale_attn only supports cuda device";
+    return std::make_unique<ScaleAttnHandler>(
         sm_scale, args.attn_logit_soft_cap(), alibi_slopes);
   }
 
   // choose the best handler based on device type
   if (is_cuda) {
-    // use flash_attn for cuda device
+    // use scale_attn for cuda device
     return std::make_unique<ScaleAttnHandler>(
         sm_scale, args.attn_logit_soft_cap(), alibi_slopes);
   }
@@ -111,9 +110,9 @@ std::unique_ptr<AttentionHandler> AttentionHandler::create_handler_with_rope(
   }
 
   const bool is_cuda = options.device().is_cuda();
-  if (boost::iequals(FLAGS_attention_handler, "flash_attn")) {
-    CHECK(is_cuda) << "flash_attn only supports cuda device";
-    return std::make_unique<FlashAttnHandler>(sm_scale,
+  if (boost::iequals(FLAGS_attention_handler, "scale_attn")) {
+    CHECK(is_cuda) << "scale_attn only supports cuda device";
+    return std::make_unique<ScaleAttnHandler>(sm_scale,
                                               args.attn_logit_soft_cap(),
                                               rotary_dim,
                                               args.max_position_embeddings(),
@@ -124,7 +123,7 @@ std::unique_ptr<AttentionHandler> AttentionHandler::create_handler_with_rope(
 
   // choose the best handler based on device type
   if (is_cuda) {
-    // use flash_attn for cuda device
+    // use scale_attn for cuda device
     return std::make_unique<ScaleAttnHandler>(sm_scale,
                                               args.attn_logit_soft_cap(),
                                               rotary_dim,
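
Not part of the commit itself: a minimal sketch of how the renamed attention_handler flag would typically be set by a caller, assuming a gflags-based entry point. The main() and binary name below are hypothetical; only DECLARE_string and ParseCommandLineFlags are standard gflags usage.

#include <gflags/gflags.h>

// Refers to the DEFINE_string(attention_handler, ...) in the handler source above.
DECLARE_string(attention_handler);

int main(int argc, char* argv[]) {
  // Honors e.g. `./server --attention_handler=scale_attn` on the command line
  // (binary name hypothetical); the default "auto" lets the create_handler_*
  // factories pick ScaleAttnHandler on CUDA devices.
  gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);

  // Hypothetical explicit override; the factory compares the value
  // case-insensitively via boost::iequals.
  // FLAGS_attention_handler = "scale_attn";
  return 0;
}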