Commit 495e4b8

feat(llama.cu): support mamba model inference

1 parent 71c1e35 commit 495e4b8

File tree

19 files changed: +1543 −10 lines

Cargo.lock

Lines changed: 6 additions & 6 deletions
Generated lockfile; diff not rendered by default.

llama.cu/Cargo.toml

Lines changed: 2 additions & 2 deletions
@@ -8,9 +8,9 @@ cuda = { git = "https://github.com/YdrMaster/cuda-driver", rev = "6a97931" }
 cublas = { git = "https://github.com/YdrMaster/cuda-driver", rev = "6a97931" }
 nccl = { git = "https://github.com/YdrMaster/cuda-driver", rev = "6a97931" }
 flash-attn = { git = "https://github.com/YdrMaster/learn-flash-attn", rev = "616bbac" }
-nn = { git = "https://github.com/YdrMaster/InfiniNN", rev = "fa8aaf6" }
+nn = { git = "https://github.com/CearX/InfiniNN", rev = "6caef2" }
 ggus = { git = "https://github.com/InfiniTensor/gguf", rev = "23c362f" }
-tokeneer = { git = "https://github.com/InfiniTensor/tokeneer", rev = "c48f39f" }
+tokeneer = { git = "https://github.com/CearX/tokeneer.git", rev = "2546d72" }

 bytesize = "2.0"
 log.workspace = true

llama.cu/src/exec/group.rs

Lines changed: 133 additions & 1 deletion
@@ -1,8 +1,10 @@
+use super::mamba_cache::MambaCache;
 use super::{CacheParts, Progress, model::ModelExec, upos};
 use crate::{batch::Req, handle::Handle, load::load_weight, memory::MemPages};
 use cuda::{DevByte, DevMem, Stream, VirByte};
 use nn::{
-    Distribution, Graph, GraphBuilder, LLaMA, NNGraph, Tensor, TensorMeta, digit_layout::types, op,
+    Distribution, Graph, GraphBuilder, LLaMA, Mamba, NNGraph, Tensor, TensorMeta,
+    digit_layout::types, op,
 };
 use std::{
     collections::BTreeMap,
@@ -240,3 +242,133 @@ fn builder() -> GraphBuilder {
         .register_op("all-reduce", op::all_reduce::AllReduce);
     ans
 }
+
+// Mamba GraphBuilder
+fn builder_mamba() -> GraphBuilder {
+    let mut ans = GraphBuilder::default();
+    ans.register_op("embedding", op::embedding::Embedding)
+        .register_op("rms-norm", op::normalization::RmsNorm)
+        .register_op("linear", op::linear::Linear)
+        .register_op("silu", op::activation::SiLU)
+        .register_op("element-mul", op::element_mul::ElementMul)
+        .register_op("split", op::split::Split)
+        .register_op("mamba-causal-conv1d", op::mamba::CausalConv1d)
+        .register_op("mamba-selective-scan", op::mamba::SelectiveScan);
+    ans
+}
+
+pub(crate) struct ModelGroupMamba<'ctx> {
+    internal: Internal<'ctx>,
+    pages: MemPages,
+    _weight: DevMem<'ctx>,
+    next_pos: u32,
+}
+
+impl<'ctx> ModelGroupMamba<'ctx> {
+    pub fn new<T: IntoIterator<Item = usize>>(
+        mamba: Mamba<Tensor<&[u8], 2>>,
+        dist: Distribution,
+        progress: Option<Arc<Progress>>,
+        config: ModelGroupConfig<T>,
+        handle: &mut Handle<'ctx>,
+        barrier: Option<&Barrier>,
+    ) -> Self {
+        let ModelGroupConfig {
+            static_model_keys,
+            mut dyn_cache_size,
+            use_cuda_graph,
+        } = config;
+
+        let NNGraph(Graph { topo, nodes, edges }) = builder_mamba()
+            .build(
+                mamba.tensor_parallel(dist),
+                [
+                    TensorMeta::new(types::U32, ["n_tok".into()]),
+                    TensorMeta::new(types::U32, ["n_tok".into()]),
+                    TensorMeta::new(types::U32, ["n_tok".into()]),
+                ],
+            )
+            .unwrap();
+        handle.ctx.stream().synchronize();
+
+        let dev = handle.ctx.dev();
+        let mut pages = MemPages::new(dev);
+        let (_weight, edges) = load_weight(edges, progress, handle.ctx);
+        let graph = NNGraph(Graph { topo, nodes, edges });
+        let static_models = if use_cuda_graph {
+            static_model_keys
+                .into_iter()
+                .map(|n_tok| {
+                    if let Some(b) = barrier {
+                        b.wait();
+                    }
+                    let key = NonZeroUsize::new(n_tok).unwrap();
+                    let exec = ModelExec::new(graph.clone(), n_tok, handle, &mut pages, true);
+                    (key, exec)
+                })
+                .collect::<BTreeMap<_, _>>()
+        } else {
+            dyn_cache_size += static_model_keys.into_iter().count();
+            Default::default()
+        };
+
+        let internal = Internal::new(graph, static_models, dyn_cache_size);
+        Self {
+            internal,
+            pages,
+            _weight,
+            next_pos: 0,
+        }
+    }
+
+    pub fn load_inputs_mamba_prefill(
+        &mut self,
+        handle: &mut Handle<'ctx>,
+        len: usize,
+        tok: &[utok],
+        stream: &Stream<'ctx>,
+    ) -> (NonZeroUsize, &mut [DevByte]) {
+        let key = self.internal.get_key(NonZeroUsize::new(len).unwrap());
+        let model = self.internal.map_exec(key, handle, &mut self.pages, stream);
+        stream.memcpy_h2d(model.tok_buf(), &tok[..key.get()]);
+        let pos: Vec<upos> = (0..key.get()).map(|i| i as upos).collect();
+        stream.memcpy_h2d(model.pos_buf(), &pos);
+        self.next_pos = key.get() as u32;
+        let out_idx: Vec<utok> = (0..key.get()).map(|i| i as utok).collect();
+        let buf = model.input_buf_at(2);
+        stream.memcpy_h2d(buf, &out_idx);
+        (key, model.tok_buf())
+    }
+
+    pub fn load_input_mamba_decode(
+        &mut self,
+        handle: &mut Handle<'ctx>,
+        tok: utok,
+        stream: &Stream<'ctx>,
+    ) -> (NonZeroUsize, &mut [DevByte]) {
+        let key = self.internal.get_key(NonZeroUsize::new(1).unwrap());
+        let model = self.internal.map_exec(key, handle, &mut self.pages, stream);
+        let tok_buf = model.tok_buf();
+        stream.memcpy_h2d(tok_buf, &[tok]);
+        let pos_buf = model.pos_buf();
+        let cur = self.next_pos;
+        stream.memcpy_h2d(pos_buf, &[cur]);
+        // advance next_pos
+        self.next_pos = cur.saturating_add(1);
+        // out_idx is fixed to 0 during decode
+        let out_idx_buf = model.input_buf_at(2);
+        stream.memcpy_h2d(out_idx_buf, &[0u32]);
+        (key, model.tok_buf())
+    }
+
+    pub fn launch_mamba(
+        &mut self,
+        key: NonZeroUsize,
+        cache: &mut MambaCache,
+        handle: &mut Handle,
+        stream: &Stream<'ctx>,
+    ) -> Tensor<*const VirByte, 2> {
+        let model = self.internal.get_mut(&key).unwrap();
+        model.launch_with_mamba_cache(handle, cache, stream)
+    }
+}
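
The two new graph ops, "mamba-causal-conv1d" and "mamba-selective-scan", do the layer's sequence mixing. As a reference for what the scan computes, here is a minimal single-channel CPU sketch of the standard Mamba selective-scan recurrence; the function name and signature are illustrative only, not the actual op::mamba::SelectiveScan kernel interface:

// Reference semantics of the selective scan for one channel (sketch).
// Per step t, with recurrent state h of length d_state:
//   h[s] = exp(dt[t] * a[s]) * h[s] + dt[t] * x[t] * b[t][s]
//   y[t] = sum_s c[t][s] * h[s] + d_skip * x[t]
fn selective_scan_ref(
    x: &[f32],       // input sequence for one channel
    dt: &[f32],      // per-step discretization Δ, same length as x
    a: &[f32],       // state decay parameters, length d_state
    b: &[Vec<f32>],  // per-step input projections, each of length d_state
    c: &[Vec<f32>],  // per-step output projections, each of length d_state
    d_skip: f32,     // skip-connection weight
) -> Vec<f32> {
    let mut h = vec![0.0f32; a.len()]; // state carried across time steps
    (0..x.len())
        .map(|t| {
            let mut y = 0.0;
            for s in 0..a.len() {
                h[s] = (dt[t] * a[s]).exp() * h[s] + dt[t] * x[t] * b[t][s];
                y += c[t][s] * h[s];
            }
            y + d_skip * x[t]
        })
        .collect()
}

Because the recurrence only carries this fixed-size h (plus the short causal-conv window), decoding needs no KV cache; ModelGroupMamba just tracks a scalar next_pos and reuses the per-layer state held in MambaCache.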

llama.cu/src/exec/mamba.rs

Lines changed: 178 additions & 0 deletions
@@ -0,0 +1,178 @@
+use crate::exec::group::{ModelGroupConfig, ModelGroupMamba};
+use crate::exec::mamba_cache::MambaCache;
+use crate::exec::output_head::OutputHead;
+use crate::exec::sample_manager::SampleManager;
+use crate::memory::MemPages;
+use crate::op::random_sample::{KVPair, SampleArgs};
+use crate::utils::{self, meta};
+use crate::{handle::Handle, model::map_files};
+use cuda::Device;
+use ggus::GGufMetaMapExt;
+use nn::Distribution;
+use std::env;
+use tokeneer::Bpe;
+
+#[allow(dead_code)]
+pub fn mamba_infer(
+    model_path: std::path::PathBuf,
+    text: &str,
+    use_cuda_graph: bool,
+) -> (Vec<u8>, [usize; 2]) {
+    use crate::model::GGufModel;
+    // initialize CUDA
+    assert!(cuda::init().is_ok());
+
+    // load the model
+    let maps = map_files(model_path);
+    let gguf = GGufModel::read(maps.iter().map(|x| &**x));
+    let tokenizer = Bpe::from_gguf(&gguf);
+    let mut tokens = tokenizer.encode(text);
+
+    let n_tok = tokens.len();
+
+    // take the output head for computing logits
+    let mut mamba = gguf.mamba();
+    let output_head_nn = mamba
+        .output_head
+        .take()
+        .expect("mamba model missing output_head");
+
+    let n_layer: usize = meta![gguf => llm_block_count];
+    let d_inner: usize = 5120; // TODO: ggus
+    let d_conv: usize = 4; // kernel size
+    let d_state: usize = 16; // ssm state size
+
+    // single GPU
+    let device = Device::new(0);
+    device.retain_primary().apply(|ctx| {
+        let mut handle = Handle::new(ctx);
+        let dist = Distribution {
+            start: 0,
+            len: 1,
+            total: 1,
+        };
+        let mut models = ModelGroupMamba::new(
+            mamba,
+            dist,
+            None,
+            ModelGroupConfig {
+                static_model_keys: [n_tok],
+                dyn_cache_size: 1,
+                use_cuda_graph,
+            },
+            &mut handle,
+            None,
+        );
+
+        // components: output head and sampler
+        let stream = ctx.stream();
+        let mut output_head = OutputHead::new(output_head_nn, ctx);
+
+        // read the vocabulary size to build the sampler, plus the eos id
+        let eos: tokeneer::utok = meta![gguf => tokenizer_ggml_eos_token_id];
+        let nvoc = output_head.nvoc();
+        let mut sample_manager = SampleManager::new(nvoc, eos, ctx);
+
+        // initialize the MambaCache
+        let mut pages = MemPages::new(device);
+        let mut mcache = MambaCache::new(n_layer, d_inner, d_conv, d_state, &mut pages);
+
+        // prefill
+        let (key, _tok_buf) =
+            models.load_inputs_mamba_prefill(&mut handle, tokens.len(), &tokens, &stream);
+
+        let mut x = models.launch_mamba(key, &mut mcache, &mut handle, &stream);
+
+        let last_idx: [tokeneer::utok; 1] = [(tokens.len() - 1) as tokeneer::utok];
+        let logits_prefill_last = output_head.launch(x.clone(), &last_idx, &mut handle, &stream);
+
+        let logits_prefill_last_vir = logits_prefill_last.as_ref().map(|mem| mem.as_ptr().cast());
+        utils::fmt(&logits_prefill_last_vir, stream.ctx());
+        // check prefill logits
+
+        let mut next_id: tokeneer::utok;
+        {
+            let mut input = stream.malloc::<tokeneer::utok>(tokens.len());
+            stream.memcpy_h2d(&mut input, &tokens);
+            let cfg0 = vec![(
+                crate::batch::SessionId(0),
+                crate::batch::SampleInfo {
+                    args: SampleArgs::new(0.8, 0.95, 50, 1.2).unwrap(),
+                    input_idx: tokens.len(),
+                    decode_len: tokens.len(),
+                },
+            )];
+            let kv_pairs0 = sample_manager.sample(logits_prefill_last, &input, &cfg0, &stream);
+            stream.free(input);
+            let mut host_kv0 = vec![KVPair::ZERO; 1];
+            stream.memcpy_d2h(&mut host_kv0, &kv_pairs0).free(kv_pairs0);
+            next_id = host_kv0[0].idx as tokeneer::utok;
+        }
+        let mut generated: Vec<tokeneer::utok> = Vec::new();
+        if next_id != eos {
+            tokens.push(next_id);
+            generated.push(next_id);
+            let (key, _tok_buf) = models.load_input_mamba_decode(&mut handle, next_id, &stream);
+            x = models.launch_mamba(key, &mut mcache, &mut handle, &stream);
+        }
+
+        let max_decode_steps: usize = env::var("MAMBA_STEPS")
+            .ok()
+            .and_then(|s| s.parse().ok())
+            .unwrap_or(100);
+        for _step in 1..max_decode_steps {
+            let out_idx: [tokeneer::utok; 1] = [0];
+
+            let logits = output_head.launch(x.clone(), &out_idx, &mut handle, &stream);
+
+            let mut input = stream.malloc::<tokeneer::utok>(tokens.len());
+            stream.memcpy_h2d(&mut input, &tokens);
+            let cfg = vec![(
+                crate::batch::SessionId(0),
+                crate::batch::SampleInfo {
+                    args: SampleArgs::new(0.8, 0.95, 50, 1.2).unwrap(),
+                    input_idx: tokens.len(),
+                    decode_len: tokens.len(),
+                },
+            )];
+            let kv_pairs = sample_manager.sample(logits, &input, &cfg, &stream);
+            stream.free(input);
+            let mut host_kv = vec![KVPair::ZERO; 1];
+            stream.memcpy_d2h(&mut host_kv, &kv_pairs).free(kv_pairs);
+            next_id = host_kv[0].idx as tokeneer::utok;
+
+            if next_id == eos {
+                break;
+            }
+
+            tokens.push(next_id);
+            generated.push(next_id);
+            let (key, _tok_buf) = models.load_input_mamba_decode(&mut handle, next_id, &stream);
+            x = models.launch_mamba(key, &mut mcache, &mut handle, &stream);
+        }
+
+        println!("tokens = {:?}", tokens);
+        let mut text_buf = tokeneer::TextBuf::new();
+        let s = tokenizer.decode(&generated, &mut text_buf);
+        let buf = s.into_bytes();
+
+        let shape = <[usize; 2]>::try_from(x.shape().to_vec()).unwrap();
+        (buf, shape)
+    })
+}
+
+// #[cfg(test)]
+// mod tests {
+//     use super::*;
+//     use std::path::PathBuf;
+
+//     #[test]
+//     fn test_mamba_infer_decode() {
+//         let model = PathBuf::from("../model/Mamba_adf32-2.8B-hf-v1.0-F16.gguf");
+//         let prompt = "Once upon a time,";
+//         let (bytes, _shape) = mamba_infer(model, prompt, false);
+//         let text = String::from_utf8_lossy(&bytes);
+//         println!("prompt = {}", prompt);
+//         println!("mamba infer text = {}", text);
+//     }
+// }
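
A consequence of the design above: the MambaCache built from (n_layer, d_inner, d_conv, d_state) occupies the same memory no matter how long the generation runs. A back-of-the-envelope sketch, assuming f32 states (the actual layout is whatever exec/mamba_cache.rs defines):

// Rough per-sequence Mamba state size (sketch; assumes f32 elements).
fn mamba_state_bytes(n_layer: usize, d_inner: usize, d_conv: usize, d_state: usize) -> usize {
    let conv_state = d_inner * d_conv;  // rolling window for the causal conv1d
    let ssm_state = d_inner * d_state;  // recurrent selective-scan state
    n_layer * (conv_state + ssm_state) * std::mem::size_of::<f32>()
}

With the values hard-coded above (d_inner = 5120, d_conv = 4, d_state = 16) and, say, 64 layers, that is 64 × 5120 × 20 × 4 B ≈ 25 MiB, independent of context length.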
