5 | 5 | utils::{destruct, distinct, offset_ptr, strides}, |
6 | 6 | }; |
7 | 7 | use cuda::{CaptureStream, GraphExec, Module, Stream, VirByte}; |
8 | | -use flash_attn::attention::{FlashAttnCfg, KVPage, KernelReq, Strides2D}; |
| 8 | +use flash_attn::attention::{AttnType, FlashAttnCfg, KVPage, KernelReq, Strides2D}; |
9 | 9 | use ggus::ggml_quants::f16; |
10 | 10 | use nn::{Arg, Named, Tensor, digit_layout::types}; |
11 | 11 | use regex::Regex; |
12 | | -use std::{fmt, sync::LazyLock}; |
| 12 | +use std::{fmt, ptr::null, sync::LazyLock}; |
13 | 13 |
14 | 14 | pub(super) enum Step<'ctx> { |
15 | 15 | Graph(GraphExec<'ctx>, Box<[Tensor<*const VirByte, 2>]>), |
@@ -144,31 +144,28 @@ impl<'ctx> Handle<'ctx> { |
144 | 144 | reqs: &[Req<Tensor<*const VirByte, 2>>], |
145 | 145 | stream: &Stream, |
146 | 146 | ) { |
| 147 | + use ::flash_attn::attention::cuda::code as flash_attn_code; |
147 | 148 | let Attention { q, k, v, o, .. } = attn; |
148 | 149 | let dt = distinct(&[q.dt(), k.dt(), v.dt(), o.dt()]).unwrap(); |
149 | 150 | // compile |
150 | 151 | let key = [ModuleKey::Text("flash-attn"), ModuleKey::Type(dt)].into_iter(); |
151 | | - let [t_compute, t_data] = match dt { |
152 | | - types::F16 => ["float", "half"], |
153 | | - _ => todo!(), |
154 | | - }; |
155 | | - let module = self.compile(key.collect(), || { |
156 | | - ::flash_attn::attention::cuda::code(t_compute, t_data) |
157 | | - }); |
158 | 152 | match dt { |
159 | | - types::F16 => launch_attn_typed::<f16>(attn, reqs, module, stream), |
| 153 | + types::F16 => { |
| 154 | + let module = self.compile(key.collect(), || flash_attn_code::<f16>()); |
| 155 | + launch_attn_typed::<f16>(attn, reqs, module, stream) |
| 156 | + } |
160 | 157 | _ => todo!(), |
161 | 158 | } |
162 | 159 | } |
163 | 160 | } |
164 | 161 |
165 | | -fn launch_attn_typed<T: Copy>( |
| 162 | +fn launch_attn_typed<T: ::flash_attn::attention::cuda::NVDT>( |
166 | 163 | attn: &Attention, |
167 | 164 | reqs: &[Req<Tensor<*const VirByte, 2>>], |
168 | 165 | module: &Module, |
169 | 166 | stream: &Stream, |
170 | 167 | ) { |
171 | | - const TILE_SEQ: usize = 32; |
| 168 | + const TILE_SEQ: usize = 8; |
172 | 169 | const TILE_CTX: usize = 32; |
173 | 170 |
174 | 171 | let Attention { iblk, q, k, v, o } = attn; |
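Note on the hunk above: compilation moves inside the per-type match arm, and the string pair ("float", "half") that used to parameterize `code(t_compute, t_data)` is replaced by the monomorphized `flash_attn_code::<f16>()`, with `launch_attn_typed` bounded by `NVDT` instead of `Copy`. The crate's actual trait definition is not shown in this diff; the following is only a hedged sketch of how such a bound could carry the CUDA type names at the type level (every name here is hypothetical, and `F16` is a stand-in for `ggus::ggml_quants::f16`):

// Hypothetical sketch, not the flash_attn crate's real API.
trait NvdtLike: Copy {
    const COMPUTE: &'static str; // accumulation type in the generated CUDA source
    const DATA: &'static str;    // storage type in the generated CUDA source
}

// Stand-in for the f16 type used in the diff (ggus::ggml_quants::f16).
#[derive(Clone, Copy)]
struct F16;

impl NvdtLike for F16 {
    const COMPUTE: &'static str = "float";
    const DATA: &'static str = "half";
}

fn code<T: NvdtLike>() -> String {
    // splice the type names into the kernel template, as the old
    // code(t_compute, t_data) call did with explicit string arguments
    format!("/* flash-attn kernel for {} / {} */", T::COMPUTE, T::DATA)
}

The practical effect is that the Rust type parameter alone selects the generated CUDA source, so the per-type match arm only names the type once.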
@@ -230,25 +227,10 @@ fn launch_attn_typed<T: Copy>( |
230 | 227 | }) |
231 | 228 | }) |
232 | 229 | .collect::<Box<_>>(); |
233 | | - // generate the mask |
234 | | - let masks = reqs |
235 | | - .iter() |
236 | | - .map(|req| { |
237 | | - let Req { pos, seq: n, .. } = req; |
238 | | - let s = pos + n; |
239 | | - let s_ceil = s.div_ceil(TILE_CTX) * TILE_CTX; |
240 | | - // attention mask |
241 | | - let mask = (0..n * s_ceil) |
242 | | - .map(|i| i % s_ceil <= s - n + i / s_ceil) |
243 | | - .collect::<Box<_>>(); |
244 | | - stream.from_host(&mask) |
245 | | - }) |
246 | | - .collect::<Box<_>>(); |
247 | 230 | // generate a block for each head of each request |
248 | 231 | let reqs_ = reqs |
249 | 232 | .iter() |
250 | | - .zip(&masks) |
251 | | - .scan((0, 0), |(seq, page), (req, mask)| { |
| 233 | + .scan((0, 0), |(seq, page), req| { |
252 | 234 | let &Req { |
253 | 235 | ref cache, |
254 | 236 | pos, |
@@ -288,9 +270,10 @@ fn launch_attn_typed<T: Copy>( |
288 | 270 | kv_strides, |
289 | 271 | o: offset_ptr(&o).cast_mut().cast(), |
290 | 272 | o_strides, |
291 | | - mask: mask.as_ptr().cast(), |
292 | 273 | n, |
293 | 274 | s: pos + n, |
| 275 | + ty: AttnType::Causal, |
| 276 | + mask: null(), |
294 | 277 | }) |
295 | 278 | }) |
296 | 279 | .collect::<Box<_>>(); |
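The hunks above drop the host-side mask buffers: instead of building and uploading a boolean mask per request, the kernel request now carries `ty: AttnType::Causal` with a null `mask` pointer. For reference, a minimal sketch of the pattern the removed code used to materialize (the helper name `causal_mask` is hypothetical; `n` is the number of new tokens, `pos` the number of cached tokens, and rows are padded to a multiple of `TILE_CTX` exactly as the old `s_ceil` computation did):

fn causal_mask(n: usize, pos: usize, tile_ctx: usize) -> Box<[bool]> {
    let s = pos + n;                              // total context length for this request
    let s_ceil = s.div_ceil(tile_ctx) * tile_ctx; // pad each row to whole tiles
    (0..n * s_ceil)
        .map(|i| {
            let (row, col) = (i / s_ceil, i % s_ceil);
            // query token `row` may attend to all cached tokens and to the
            // new tokens up to and including itself
            col <= s - n + row
        })
        .collect()
}

With `AttnType::Causal`, the kernel can derive the same predicate from `n` and `s` on the fly, which removes one host allocation and one host-to-device copy (`stream.from_host`) per request.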