Skip to content

Commit 8140ee1

Browse files
committed
feat: 支持重复惩罚
Signed-off-by: YdrMaster <ydrml@hotmail.com>
1 parent 06f9de0 commit 8140ee1

File tree

15 files changed

+356
-107
lines changed

15 files changed

+356
-107
lines changed

Cargo.lock

Lines changed: 6 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

llama.cu/src/batch/default.rs

Lines changed: 33 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,13 @@
11
use super::{BatchStrategy, Req, Round, SessionId, SessionStub};
2+
use crate::batch::SampleInfo;
23
use log::warn;
3-
use std::{cmp::min, collections::BTreeMap, iter::repeat_n, mem::take};
4+
use std::{collections::BTreeMap, mem::take};
45

56
pub(crate) struct DefaultStrategy<T> {
67
sess: BTreeMap<SessionId, SessionStub<T>>,
78
pre_output: BTreeMap<SessionId, usize>,
89
// 每次 prefill 的最大长度
9-
chunked_prefill_max_len: Option<usize>,
10+
chunked_prefill_max_len: usize,
1011
max_toks: usize,
1112
}
1213

@@ -15,7 +16,7 @@ impl<T> DefaultStrategy<T> {
1516
Self {
1617
sess: Default::default(),
1718
pre_output: Default::default(),
18-
chunked_prefill_max_len: chunked_prefill_len,
19+
chunked_prefill_max_len: chunked_prefill_len.unwrap_or(usize::MAX),
1920
max_toks,
2021
}
2122
}
@@ -61,32 +62,27 @@ impl<T: 'static + Clone> BatchStrategy<T> for DefaultStrategy<T> {
6162
let remain_tok_num = self.max_toks - ans.tokens.len();
6263
assert!(remain_tok_num > 0);
6364

65+
let input_idx = ans.tokens.len();
6466
if let Some(prompt) = &stub.prompt {
65-
seq = self
66-
.chunked_prefill_max_len
67-
.map_or(min(remain_tok_num, seq), |chunked_prefill_max_len| {
68-
remain_tok_num.min(seq).min(chunked_prefill_max_len)
69-
});
67+
seq = self.chunked_prefill_max_len.min(seq).min(remain_tok_num);
68+
let (prompt, tail) = prompt[prompt.len() - stub.state.seq..].split_at(seq);
7069

71-
if seq < stub.state.seq {
72-
// chunked prefill
73-
out = 0;
74-
end = pos + seq;
75-
76-
ans.tokens
77-
.extend(prompt.iter().skip(prompt.len() - stub.state.seq).take(seq));
78-
79-
//更新stub信息
80-
stub.state.seq -= seq
81-
} else {
70+
if tail.is_empty() {
8271
// 正常 prefill
8372
if seq != prompt.len() {
8473
log::debug!("{id:?} chunked prefil finished")
8574
}
86-
ans.tokens.extend(prompt[prompt.len() - seq..].to_owned());
87-
75+
ans.tokens.extend(prompt);
76+
// 更新 stub 信息
8877
stub.state.seq = 1;
8978
stub.prompt = None
79+
} else {
80+
// chunked prefill
81+
out = 0;
82+
end = pos + seq;
83+
ans.tokens.extend(prompt);
84+
// 更新 stub 信息
85+
stub.state.seq = tail.len()
9086
}
9187
} else {
9288
// decode
@@ -100,25 +96,36 @@ impl<T: 'static + Clone> BatchStrategy<T> for DefaultStrategy<T> {
10096
// 尝试填充缓存
10197
stub.session.cache.len = end;
10298
// 填充推理信息
103-
ans.sample.extend(repeat_n(stub.session.sample_args, out));
99+
ans.sample
100+
.extend((input_idx..input_idx + out).map(|input_idx| {
101+
(
102+
id,
103+
SampleInfo {
104+
args: stub.session.sample_args,
105+
input_idx,
106+
decode_len: stub.state.decode_len,
107+
},
108+
)
109+
}));
104110
ans.output.push((id, out));
105111
ans.reqs.push(Req {
106112
cache: stub.session.cache.cache.clone(),
107113
pos,
108114
seq,
109115
});
116+
if out > 0 {
117+
stub.state.decode_len += 1
118+
}
110119

111120
// 输出处理
112-
// 不会溢出 因为 out <= 1
113-
stub.state.remain_steps -= out;
114-
if stub.state.remain_steps == 0 {
121+
if stub.state.decode_len == stub.state.max_steps {
115122
// 生成结束
116123
ans.finished.push(stub.session)
117124
} else {
118125
// 回填
119126
assert!(write_back_sessions.insert(id, stub).is_none());
120127
if out != 0 {
121-
assert!(self.pre_output.insert(id, out_idx).is_none());
128+
assert!(self.pre_output.insert(id, out_idx).is_none())
122129
}
123130
}
124131
out_idx += out;

llama.cu/src/batch/mod.rs

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@ pub struct Round<T> {
2828
pub overflow: Vec<Session<T>>,
2929
pub tokens: Vec<utok>,
3030
pub reqs: Vec<Req<T>>,
31-
pub sample: Vec<SampleArgs>,
31+
pub sample: Vec<(SessionId, SampleInfo)>,
3232
pub output: Vec<(SessionId, usize)>,
3333
pub fast_map: Vec<(utok, utok)>,
3434
pub finished: Vec<Session<T>>,
@@ -48,6 +48,13 @@ impl<T> Default for Round<T> {
4848
}
4949
}
5050

51+
#[derive(Clone, Copy)]
52+
pub struct SampleInfo {
53+
pub args: SampleArgs,
54+
pub input_idx: usize,
55+
pub decode_len: usize,
56+
}
57+
5158
pub struct Session<T> {
5259
pub id: SessionId,
5360
pub sample_args: SampleArgs,
@@ -64,7 +71,8 @@ pub struct Cache<T> {
6471
pub(super) struct State {
6572
pub seq: usize,
6673
pub out: usize,
67-
pub remain_steps: usize,
74+
pub decode_len: usize,
75+
pub max_steps: usize,
6876
}
6977

7078
#[derive(Clone)]

llama.cu/src/exec/engine.rs

Lines changed: 34 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
use crate::{
66
CacheParts,
77
batch::{Req, Round, SessionStub, State},
8-
exec::{group::ModelGroupConfig, upos},
8+
exec::{group::ModelGroupConfig, sample_manager::SampleManager, upos},
99
handle::Handle,
1010
op::{FastEmbedding, random_sample::KVPair},
1111
};
@@ -47,7 +47,8 @@ impl Request {
4747
state: State {
4848
seq: prompt.len(),
4949
out,
50-
remain_steps: max_steps,
50+
decode_len: 0,
51+
max_steps,
5152
},
5253
prompt: Some(prompt),
5354
}
@@ -72,6 +73,7 @@ pub struct Progress {
7273

7374
pub(crate) fn engine(
7475
llama: LLaMA<Tensor<&[u8], 2>>,
76+
eos: utok,
7577
workers: &[(c_int, Option<Arc<Progress>>)],
7678
commands: Receiver<Command>,
7779
outputs: Sender<Output>,
@@ -80,6 +82,7 @@ pub(crate) fn engine(
8082
if let &[(gpu, progress)] = &workers {
8183
return mono(
8284
llama,
85+
eos,
8386
Device::new(*gpu),
8487
progress.clone(),
8588
commands,
@@ -146,6 +149,7 @@ pub(crate) fn engine(
146149

147150
fn mono(
148151
mut llama: LLaMA<Tensor<&[u8], 2>>,
152+
eos: utok,
149153
dev: Device,
150154
progress: Option<Arc<Progress>>,
151155
commands: Receiver<Command>,
@@ -171,7 +175,7 @@ fn mono(
171175
task_box: Default::default(),
172176
chunked_prefill_len: CHUNKED_PREFILL_LEN,
173177
}
174-
.lead(llama, output_head, commands, outputs, |ctx| {
178+
.lead(llama, eos, output_head, commands, outputs, |ctx| {
175179
Handle::new(ctx)
176180
})
177181
}
@@ -200,6 +204,7 @@ impl<T: IntoIterator<Item = usize>> Worker<T> {
200204
fn lead(
201205
self,
202206
llama: LLaMA<Tensor<&[u8], 2>>,
207+
eos: utok,
203208
output_head: nn::OutputHead<Tensor<&[u8], 2>>,
204209
commands: Receiver<Command>,
205210
outputs: Sender<Output>,
@@ -233,6 +238,7 @@ impl<T: IntoIterator<Item = usize>> Worker<T> {
233238
);
234239

235240
let mut output_head = OutputHead::new(output_head, ctx);
241+
let mut sample_manager = SampleManager::new(output_head.nvoc(), eos, ctx);
236242

237243
let max_tok = max_toks;
238244
let mut fast_embd = FastEmbedding::new(max_tok, ctx);
@@ -246,7 +252,6 @@ impl<T: IntoIterator<Item = usize>> Worker<T> {
246252
let mut pos_buf = BufN::<upos>::new(len, BUF_LEVEL, ctx);
247253
let mut out_idx_buf = BufN::<utok>::new(len, BUF_LEVEL, ctx);
248254
let mut fast_embd_buf = BufN::<(utok, utok)>::new(len, BUF_LEVEL, ctx);
249-
250255
if outputs.send(Output::Ready).is_ok() {
251256
while manager.receive(&commands, &outputs).is_ok() {
252257
// 组织请求
@@ -302,33 +307,34 @@ impl<T: IntoIterator<Item = usize>> Worker<T> {
302307
barrier.wait();
303308
models.share_inputs(key, &mut handle, &stream);
304309
}
310+
let mut input = stream.malloc::<utok>(tok.len() / size_of::<utok>());
311+
stream.memcpy_d2d(&mut input, tok);
305312
// 推理
306313
let x = models.launch(key, &reqs, &mut handle, &stream);
307-
308314
// 如果没有输出,则跳过
309-
if !out_idx.is_empty() {
310-
let output = output
311-
.into_iter()
312-
.filter_map(|(id, len)| if len > 0 { Some((id, len)) } else { None })
313-
.collect::<Vec<_>>();
314-
let kv_pairs = output_head.launch(
315-
x,
316-
&out_idx_buf[..out_idx.len()],
317-
sample,
318-
&mut handle,
319-
&stream,
320-
);
321-
stream.memcpy_d2d(&mut pre_kv_pairs[..kv_pairs.len()], &kv_pairs);
322-
323-
let output = Output::Complete {
324-
output: output.into(),
325-
kv_pair: kv_pairs.sporulate(),
326-
event: stream.record().sporulate(),
327-
finished: finished.into(),
328-
};
329-
if outputs.send(output).is_err() {
330-
break;
331-
}
315+
if out_idx.is_empty() {
316+
continue;
317+
}
318+
// 计算输出头
319+
let logits =
320+
output_head.launch(x, &out_idx_buf[..out_idx.len()], &mut handle, &stream);
321+
// 采样
322+
let kv_pairs = sample_manager.sample(logits, &input, &sample, &stream);
323+
stream.free(input);
324+
stream.memcpy_d2d(&mut pre_kv_pairs[..kv_pairs.len()], &kv_pairs);
325+
// 生成并发送输出
326+
let output = output
327+
.into_iter()
328+
.filter_map(|(id, len)| if len > 0 { Some((id, len)) } else { None })
329+
.collect();
330+
let output = Output::Complete {
331+
output,
332+
kv_pair: kv_pairs.sporulate(),
333+
event: stream.record().sporulate(),
334+
finished: finished.into(),
335+
};
336+
if outputs.send(output).is_err() {
337+
break;
332338
}
333339
}
334340
}

llama.cu/src/exec/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@ mod group;
44
mod kv_cache;
55
mod model;
66
mod output_head;
7+
mod sample_manager;
78
mod step;
89

910
use crate::{

0 commit comments

Comments (0)