refactor(llama.cu): 优化重复惩罚

YdrMaster · YdrMaster · commit 5e8be0432072 · 2025-07-09T10:30:49.000+08:00
Signed-off-by: YdrMaster <ydrml@hotmail.com>
diff --git a/llama.cu/src/exec/engine.rs b/llama.cu/src/exec/engine.rs
@@ -225,7 +225,6 @@ impl<T: IntoIterator<Item = usize>> Worker<T> {
         let gpu = Gpu::new(dev.retain_primary(), Default::default());
         let attn = Attn::new(&gpu);
         gpu.apply(|ctx| {
-            let mut manager = EngineManager::new(chunked_prefill_len, max_toks);
             let mut handle = handle(ctx);
             let mut models = ModelGroup::new(
                 llama,
@@ -237,6 +236,7 @@ impl<T: IntoIterator<Item = usize>> Worker<T> {
                 barrier.as_deref(),
             );
 
+            let mut manager = EngineManager::new(chunked_prefill_len, max_toks);
             let mut output_head = OutputHead::new(output_head, ctx);
             let mut sample_manager = SampleManager::new(output_head.nvoc(), eos, ctx);
 
@@ -253,7 +253,9 @@ impl<T: IntoIterator<Item = usize>> Worker<T> {
             let mut out_idx_buf = BufN::<utok>::new(len, BUF_LEVEL, ctx);
             let mut fast_embd_buf = BufN::<(utok, utok)>::new(len, BUF_LEVEL, ctx);
             if outputs.send(Output::Ready).is_ok() {
-                while manager.receive(&commands, &outputs).is_ok() {
+                while let Ok(removed) = manager.receive(&commands, &outputs) {
+                    // 处理已移除会话
+                    sample_manager.remove(removed);
                     // 组织请求
                     let Round {
                         overflow,
@@ -264,11 +266,14 @@ impl<T: IntoIterator<Item = usize>> Worker<T> {
                         fast_map,
                         finished,
                     } = manager.prepare();
+                    // 处理缓存溢出
+                    sample_manager.remove(overflow.iter().map(|s| s.id));
                     if !overflow.is_empty()
                         && outputs.send(Output::Overflow(overflow.into())).is_err()
                     {
                         break;
                     }
+                    // 如果不需要推理
                     if tokens.is_empty() {
                         assert!(
                             reqs.is_empty()
@@ -279,6 +284,7 @@ impl<T: IntoIterator<Item = usize>> Worker<T> {
                         );
                         continue;
                     }
+                    // 更新 host 多级缓存
                     let out_idx = out_idx(&reqs, output.iter().map(|(_, len)| *len));
                     events[out_idx_buf.index()].synchronize();
                     tok_buf.save(&tokens);
@@ -322,6 +328,8 @@ impl<T: IntoIterator<Item = usize>> Worker<T> {
                     let kv_pairs = sample_manager.sample(logits, &input, &sample, &stream);
                     stream.free(input);
                     stream.memcpy_d2d(&mut pre_kv_pairs[..kv_pairs.len()], &kv_pairs);
+                    // 处理推理结束
+                    sample_manager.remove(finished.iter().map(|s| s.id));
                     // 生成并发送输出
                     let output = output
                         .into_iter()
diff --git a/llama.cu/src/exec/engine_manager.rs b/llama.cu/src/exec/engine_manager.rs
@@ -1,9 +1,12 @@
 ﻿use super::{Command, Output};
 use crate::{
-    CacheParts,
+    CacheParts, SessionId,
     batch::{BatchStrategy, DefaultStrategy, Round, SessionStub},
 };
-use std::sync::mpsc::{Receiver, Sender, TryRecvError};
+use std::{
+    collections::BTreeSet,
+    sync::mpsc::{Receiver, Sender, TryRecvError},
+};
 
 pub(super) struct EngineManager(DefaultStrategy<CacheParts>);
 
@@ -26,12 +29,13 @@ impl EngineManager {
         &mut self,
         commands: &Receiver<Command>,
         outputs: &Sender<Output>,
-    ) -> Result<(), E> {
+    ) -> Result<BTreeSet<SessionId>, E> {
+        let mut removed = BTreeSet::new();
         loop {
             // 总是尝试进行非阻塞接收
             loop {
                 match commands.try_recv() {
-                    Ok(cmd) => self.apply(cmd, outputs)?,
+                    Ok(cmd) => self.apply(cmd, outputs, &mut removed)?,
                     Err(TryRecvError::Disconnected) => return Err(E::ReceiveError),
                     Err(TryRecvError::Empty) => break,
                 }
@@ -40,14 +44,15 @@ impl EngineManager {
             if self.0.is_empty() {
                 // 也没有待处理的任务，阻塞等待
                 match commands.recv() {
-                    Ok(cmd) => self.apply(cmd, outputs)?,
-                    Err(_) => break Err(E::ReceiveError),
+                    Ok(cmd) => self.apply(cmd, outputs, &mut removed)?,
+                    Err(_) => return Err(E::ReceiveError),
                 }
             } else {
                 // 有待处理的任务，退出循环
-                break Ok(());
+                break;
             }
         }
+        Ok(removed)
     }
 
     /// 准备推理
@@ -59,22 +64,26 @@ impl EngineManager {
         self.0.take_stubs()
     }
 
-    fn apply(&mut self, cmd: Command, outputs: &Sender<Output>) -> Result<(), CommandError> {
+    fn apply(
+        &mut self,
+        cmd: Command,
+        outputs: &Sender<Output>,
+        removed: &mut BTreeSet<SessionId>,
+    ) -> Result<(), CommandError> {
         match cmd {
             Command::ShutDown => Err(CommandError::ShutDown),
             Command::Insert(req) => {
                 self.0.insert(req.into_stub());
                 Ok(())
             }
             Command::Remove(id) => {
-                if self
-                    .0
-                    .remove(&id)
-                    .is_none_or(|stub| outputs.send(Output::Removed(stub.session)).is_ok())
-                {
-                    Ok(())
+                if let Some(stub) = self.0.remove(&id) {
+                    removed.insert(stub.session.id);
+                    outputs
+                        .send(Output::Removed(stub.session))
+                        .map_err(|_| CommandError::SendError)
                 } else {
-                    Err(CommandError::SendError)
+                    Ok(())
                 }
             }
         }
diff --git a/llama.cu/src/exec/sample_manager.rs b/llama.cu/src/exec/sample_manager.rs
@@ -39,6 +39,7 @@ impl<'ctx> SampleManager<'ctx> {
         let logits = logits_.as_mut().map(|mem| mem.as_ptr().cast());
         dims!([out_len, _nvoc] = logits);
 
+        let kv_pair_template = Tensor::from_dim_slice(KV_PAIR, []);
         let kv_pair = stream.malloc::<KVPair>(out_len);
         for (i, (id, info)) in config.into_iter().enumerate() {
             let logits = logits.clone().transform(|layout| layout.index(0, i));
@@ -48,7 +49,7 @@ impl<'ctx> SampleManager<'ctx> {
                 decode_len,
             } = info;
 
-            let scale = state
+            let state = state
                 .entry(*id)
                 .or_insert_with(|| modifier.new_state(stream));
             let tok = if *decode_len == 0 {
@@ -60,23 +61,30 @@ impl<'ctx> SampleManager<'ctx> {
             unsafe {
                 modifier.next(
                     &logits,
-                    scale.as_mut_ptr(),
+                    state.as_mut_ptr(),
                     tok,
                     args.temperature,
                     args.repetition_penalty,
                     stream,
                 )
-            };
+            }
 
-            let kv_pair = Tensor::from_dim_slice(KV_PAIR, [])
+            let kv_pair = kv_pair_template
+                .as_ref()
                 .map(|_| kv_pair[i * size_of::<KVPair>()..].as_ptr().cast());
             if args.is_argmax() {
-                sample.argmax(kv_pair.clone(), logits, stream)
+                sample.argmax(kv_pair, logits, stream)
             } else {
-                sample.sample(kv_pair.clone(), logits, *args, rand::random(), stream)
+                sample.sample(kv_pair, logits, *args, rand::random(), stream)
             }
         }
         stream.free(logits_.take());
         kv_pair
     }
+
+    pub fn remove(&mut self, id: impl IntoIterator<Item = SessionId>) {
+        for id in id {
+            self.state.remove(&id);
+        }
+    }
 }
diff --git a/llama.cu/src/op/random_sample/modifier.rs b/llama.cu/src/op/random_sample/modifier.rs
@@ -1,7 +1,6 @@
 ﻿//! <https://zhuanlan.zhihu.com/p/667025336>
 
 use crate::utils::offset_ptr;
-use ggus::ggml_quants::f16;
 use log::warn;
 use nn::Tensor;
 use operators::cuda::{CurrentCtx, DevByte, DevMem, Module, Ptx, Stream, VirByte, params};
@@ -26,13 +25,13 @@ impl<'ctx> LogitsModifier<'ctx> {
 
 impl LogitsModifier<'_> {
     pub fn new_state<'ctx>(&self, stream: &Stream<'ctx>) -> DevMem<'ctx> {
-        stream.malloc::<f16>(self.n)
+        stream.malloc::<u32>(self.n)
     }
 
     pub unsafe fn next<const N: usize>(
         &self,
         logits: &Tensor<*const VirByte, N>,
-        scale: *mut DevByte,
+        records: *mut DevByte,
         tok: *const DevByte,
         mut temperature: f32,
         penalty: f32,
@@ -47,12 +46,12 @@ impl LogitsModifier<'_> {
             (n.div_ceil(256), 256, 0),
             &params![
                 offset_ptr(logits),
-                scale,
+                records,
                 n,
+                self.eos,
                 temperature,
-                penalty.recip(),
-                tok,
-                self.eos
+                penalty,
+                tok
             ]
             .to_ptrs(),
         );
@@ -61,19 +60,21 @@ impl LogitsModifier<'_> {
     fn compile<'ctx>(ctx: &'ctx CurrentCtx) -> Module<'ctx> {
         const CODE: &str = include_str!("modify.cuh");
         let code = format!(
-            r#"
-{CODE}
+            r#"{CODE}
 
 extern "C" __global__ void next(
-    half *logits,
-    half *scale,
-    unsigned int const n,
-    float const temperature,
-    float const penalty,
-    unsigned int const *tok,
-    unsigned int const eos
+    // 采样分布和状态
+    half *logits,          // 概率分布
+    unsigned int *records, // 每个 token 的出现次数
+    // 词表信息
+    unsigned int const n,   // 词表长度
+    unsigned int const eos, // 结束符
+    // 采样参数
+    float const temperature, // 温度
+    float const penalty,     // 重复惩罚
+    unsigned int const *tok  // 上一次采样结果
 ) {{
-    next_kernel(logits, scale, n, temperature, penalty, tok, eos);
+    next_kernel(logits, records, n, eos, temperature, penalty, tok);
 }}"#
         );
         let (ptx, log) = Ptx::compile(code, ctx.dev().compute_capability());
diff --git a/llama.cu/src/op/random_sample/modify.cuh b/llama.cu/src/op/random_sample/modify.cuh
@@ -1,32 +1,36 @@
 template <typename T>
 __global__ void next_kernel(
-    T *logits,
-    T *scale_,
-    unsigned int const n,
-    float const temperature,
-    float const penalty,
-    unsigned int const *tok,
-    unsigned int const eos) {
+    // Distribution being sampled and per-session state
+    T *logits,             // logits over the vocabulary, modified in place
+    unsigned int *records, // occurrence count of each token
+    // Vocabulary information
+    unsigned int const n,   // vocabulary size
+    unsigned int const eos, // end-of-sequence token
+    // Sampling parameters
+    float const temperature, // temperature
+    float const penalty,     // repetition penalty
+    unsigned int const *tok  // previously sampled token; null on the first round
+) {
     unsigned int const i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i >= n) {
         return;
     }
-    float scale;
+    // Update the occurrence counts
     if (!tok) {
-        // 初始化惩罚权重
-        scale = i == eos ? 0 : 1;
-        scale_[i] = 1;
-    } else {
-        // 更新惩罚权重
-        scale = (float)scale_[i];
-        if (i == *tok) {
-            scale *= penalty;
-            scale_[i] = scale;
-        }
+        records[i] = 0;
+    } else if (i == *tok) {
+        ++records[i];
     }
-    if (((float)logits[i]) > .0) {
-        logits[i] *= (T)(temperature * scale);
+    // Adjust the distribution
+    if (!tok && i == eos) {
+        // Never produce eos on the first decoding round
+        logits[i] = (T)(-__int_as_float(0x7f800000)); // -inf, written as one T element
     } else {
-        logits[i] /= (T)(temperature * scale);
+        T scale = temperature * powf(penalty, records[i]);
+        if (((float)logits[i]) > .0) {
+            logits[i] /= scale;
+        } else {
+            logits[i] *= scale;
+        }
     }
 }