Commit e73b0fb

YdrMaster committed
fix(llama-cuda): support computing the possible kv cache capacity from free memory
Signed-off-by: YdrMaster <ydrml@hotmail.com>
1 parent 5622cb8 commit e73b0fb

File tree:

Cargo.toml
models/llama/common/src/lib.rs
models/llama/cuda/src/infer.rs

3 files changed: +15 −4 lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ itertools = "0.13"
 env_logger = "0.11"
 build-script-cfg = "0.0"
 
-operators = { git = "https://github.com/YdrMaster/operators-rs", rev = "7886d54", default-features = false }
+operators = { git = "https://github.com/YdrMaster/operators-rs", rev = "359b86a", default-features = false }
 
 search-cl-tools = { git = "https://github.com/InfiniTensor/clrt", rev = "f69b160" }
 search-infini-tools = { git = "https://github.com/InfiniTensor/infini-rt", rev = "e8362c3" }

models/llama/common/src/lib.rs

Lines changed: 4 additions & 0 deletions
@@ -106,6 +106,10 @@ impl LlamaMeta {
         Tensor::new(dt_embd, &[buf, nblk, 2, nkvh, dh])
     }
 
+    pub fn kv_cache_in_size(&self, max: usize, size: usize) -> Tensor<usize> {
+        self.kv_cache((size / self.kv_cache(1).take()).min(max))
+    }
+
     pub fn embd(&self, nt: usize) -> Tensor<usize> {
         let &Self { dt_embd, d, .. } = self;
         Tensor::new(dt_embd, &[nt, d])
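For context, `kv_cache_in_size` turns a byte budget into a token capacity: `self.kv_cache(1).take()` gives the byte size of a one-token cache, the budget is divided by it, and the result is capped at `max`. A minimal standalone sketch of the same arithmetic follows; the shape mirrors the `[buf, nblk, 2, nkvh, dh]` layout above, but the function name and the f16 element size are illustrative assumptions, not the crate's API:

// Hypothetical stand-in for the capacity formula behind kv_cache_in_size.
fn kv_cache_capacity(
    nblk: usize,    // transformer blocks (layers)
    nkvh: usize,    // kv attention heads
    dh: usize,      // head dimension
    dt_size: usize, // bytes per element, e.g. 2 for f16
    max: usize,     // context-length cap (nctx)
    size: usize,    // byte budget for the whole cache
) -> usize {
    // one token costs nblk * 2 (k and v) * nkvh * dh elements
    let bytes_per_token = nblk * 2 * nkvh * dh * dt_size;
    (size / bytes_per_token).min(max)
}

fn main() {
    // 32 blocks, 8 kv heads, head dim 128, f16: 131072 bytes (128 KiB)
    // per token, so a 4 GiB budget holds 32768 tokens before the cap.
    assert_eq!(kv_cache_capacity(32, 8, 128, 2, 65536, 4 << 30), 32768);
    assert_eq!(kv_cache_capacity(32, 8, 128, 2, 8192, 4 << 30), 8192);
}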

models/llama/cuda/src/infer.rs

Lines changed: 10 additions & 3 deletions
@@ -72,13 +72,20 @@ fn test_infer() {
     println!("load weights: {:?}", time.elapsed());
 
     let (free, _) = ctx.mem_info();
-    let queue_alloc = StreamMemPool::new(stream);
-    queue_alloc.put((free.0 >> 30) << 30);
+    let mut cache = meta
+        // use half of the remaining free space for the kv cache
+        .kv_cache_in_size(nctx, free.0 / 2)
+        .map(|len| ctx.malloc::<u8>(len));
+    println!("cache len = {}", cache.shape()[0]);
 
+    let queue_alloc = StreamMemPool::new(stream);
     let alloc = |size| -> MemPoolBlob { queue_alloc.alloc(size) };
 
+    let (free, _) = ctx.mem_info();
+    // drop the remainder below 64 MiB
+    queue_alloc.put(free.0 & !((64 << 20) - 1));
+
     let mut worker = Worker::new(0, &gpu, meta.clone(), weights);
-    let mut cache = meta.kv_cache(nctx).map(alloc);
     let sin_cos =
         <Operators as llama::Operators>::build_sin_cos(dt_embd, nctx, dh, &queue_alloc);
     let indices = RandomSample::build_indices(nvoc, &queue_alloc);
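The removed code handed the pool whole GiB only (`(free.0 >> 30) << 30`), which could leave almost 1 GiB unused; the new expression `free.0 & !((64 << 20) - 1)` rounds the remaining free memory down to a 64 MiB boundary instead. A quick sketch of the mask trick, with made-up values for illustration:

fn main() {
    // rounding down to a power-of-two boundary by clearing the low bits
    let align: usize = 64 << 20; // 64 MiB, a power of two
    let mask = !(align - 1);     // ANDing clears bits 0..=25

    let free: usize = 10_000_000_000; // pretend ~9.31 GiB is free
    let usable = free & mask;

    assert_eq!(usable % align, 0);  // a whole number of 64 MiB chunks
    assert!(free - usable < align); // at most 64 MiB is left unused
    println!("put {usable} of {free} bytes into the pool");
}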
