|
| 1 | +use super::mamba_cache::MambaCache; |
1 | 2 | use super::{CacheParts, Progress, model::ModelExec, upos}; |
2 | 3 | use crate::{batch::Req, handle::Handle, load::load_weight, memory::MemPages}; |
3 | 4 | use cuda::{DevByte, DevMem, Stream, VirByte}; |
4 | 5 | use nn::{ |
5 | | - Distribution, Graph, GraphBuilder, LLaMA, NNGraph, Tensor, TensorMeta, digit_layout::types, op, |
| 6 | + Distribution, Graph, GraphBuilder, LLaMA, Mamba, NNGraph, Tensor, TensorMeta, |
| 7 | + digit_layout::types, op, |
6 | 8 | }; |
7 | 9 | use std::{ |
8 | 10 | collections::BTreeMap, |
@@ -233,10 +235,208 @@ fn builder() -> GraphBuilder { |
233 | 235 | .register_op("rope", op::rope::Rope) |
234 | 236 | .register_op("attention", op::attention::Attention) |
235 | 237 | .register_op("swiglu", op::activation::SwiGLU) |
| 238 | + .register_op("silu", op::activation::SiLU) |
236 | 239 | .register_op("concat", op::concat::Concat) |
237 | 240 | .register_op("split", op::split::Split) |
238 | 241 | .register_op("tile", op::tile::Tile) |
239 | 242 | .register_op("merge", op::merge::Merge) |
240 | 243 | .register_op("all-reduce", op::all_reduce::AllReduce); |
241 | 244 | ans |
242 | 245 | } |
| 246 | + |
| 247 | +// 针对 Mamba 的 GraphBuilder(注册其所需算子) |
| 248 | +fn builder_mamba() -> GraphBuilder { |
| 249 | + let mut ans = GraphBuilder::default(); |
| 250 | + ans.register_op("embedding", op::embedding::Embedding) |
| 251 | + .register_op("rms-norm", op::normalization::RmsNorm) |
| 252 | + .register_op("linear", op::linear::Linear) |
| 253 | + .register_op("silu", op::activation::SiLU) |
| 254 | + .register_op("element-mul", op::element_mul::ElementMul) |
| 255 | + .register_op("split", op::split::Split) |
| 256 | + .register_op("mamba-causal-conv1d", op::mamba::CausalConv1d) |
| 257 | + .register_op("mamba-selective-scan", op::mamba::SelectiveScan); |
| 258 | + ans |
| 259 | +} |
| 260 | + |
// Mamba inference group: takes token inputs only; no KV cache is involved
// (Mamba carries its recurrent state separately — see `MambaCache`).
pub(crate) struct ModelGroupMamba<'ctx> {
    // Compiled graph plus the per-`n_tok` executor cache (static CUDA-graph
    // entries and dynamically instantiated ones).
    internal: Internal<'ctx>,
    // Device memory pages backing the executors' buffers.
    pages: MemPages,
    // Keeps the loaded weights alive for the lifetime of the group.
    _weight: DevMem<'ctx>,
    // Position of the next write into pos_buf (drives single-step
    // incremental decode/prefill).
    next_pos: u32,
}
| 269 | + |
/// Driving methods for a Mamba model group: graph construction, input
/// staging (prefill and single-step decode), and launch variants.
impl<'ctx> ModelGroupMamba<'ctx> {
    /// Builds the group from model weights.
    ///
    /// Constructs the Mamba compute graph (three `U32["n_tok"]` inputs —
    /// used below as tok / pos / out_idx), loads weights to device memory,
    /// and, when `use_cuda_graph` is set, pre-instantiates one executor per
    /// static `n_tok` key, waiting on `barrier` (if any) before each so
    /// multi-rank setups stay in lockstep. Otherwise the static keys are
    /// folded into the dynamic cache budget.
    pub fn new<T: IntoIterator<Item = usize>>(
        mamba: Mamba<Tensor<&[u8], 2>>,
        dist: Distribution,
        progress: Option<Arc<Progress>>, // reserved for load-progress reporting
        config: ModelGroupConfig<T>,
        handle: &mut Handle<'ctx>,
        barrier: Option<&Barrier>,
    ) -> Self {
        let ModelGroupConfig {
            static_model_keys,
            mut dyn_cache_size,
            use_cuda_graph,
        } = config;

        let NNGraph(Graph { topo, nodes, edges }) = builder_mamba()
            .build(
                mamba.tensor_parallel(dist),
                [
                    TensorMeta::new(types::U32, ["n_tok".into()]),
                    TensorMeta::new(types::U32, ["n_tok".into()]),
                    TensorMeta::new(types::U32, ["n_tok".into()]),
                ],
            )
            .unwrap();
        // Ensure any device work issued during graph building has completed
        // before weights are loaded.
        handle.ctx.stream().synchronize();

        let dev = handle.ctx.dev();
        let mut pages = MemPages::new(dev);
        let (_weight, edges) = load_weight(edges, progress, handle.ctx);
        let graph = NNGraph(Graph { topo, nodes, edges });
        let static_models = if use_cuda_graph {
            static_model_keys
                .into_iter()
                .map(|n_tok| {
                    if let Some(b) = barrier {
                        b.wait();
                    }
                    let key = NonZeroUsize::new(n_tok).unwrap();
                    let exec = ModelExec::new(graph.clone(), n_tok, handle, &mut pages, true);
                    (key, exec)
                })
                .collect::<BTreeMap<_, _>>()
        } else {
            // No CUDA graphs: let the dynamic cache also cover the would-be
            // static sizes.
            dyn_cache_size += static_model_keys.into_iter().count();
            Default::default()
        };

        let internal = Internal::new(graph, static_models, dyn_cache_size);
        Self {
            internal,
            pages,
            _weight,
            next_pos: 0,
        }
    }

    /// Stages prefill inputs: copies `len` tokens, positions `0..len`, and
    /// an out_idx covering every position into the executor's input buffers.
    /// Returns the chosen model key and the token buffer.
    ///
    /// NOTE(review): `tok` is sliced to `key.get()` elements — assumes the
    /// key chosen by `get_key` never exceeds `tok.len()`; confirm against
    /// `Internal::get_key`'s rounding behavior.
    pub fn load_inputs_mamba(
        &mut self,
        handle: &mut Handle<'ctx>,
        len: usize,
        tok: &[utok],
        stream: &Stream<'ctx>,
    ) -> (NonZeroUsize, &mut [DevByte]) {
        let key = self.internal.get_key(NonZeroUsize::new(len).unwrap());
        let model = self.internal.map_exec(key, handle, &mut self.pages, stream);
        stream.memcpy_h2d(model.tok_buf(), &tok[..key.get()]);
        let pos: Vec<upos> = (0..key.get()).map(|i| i as upos).collect();
        stream.memcpy_h2d(model.pos_buf(), &pos);
        // Align next_pos to the end of the prefill so subsequent decode
        // steps continue incrementing from here.
        self.next_pos = key.get() as u32;
        // out_idx: during prefill, compute the output head for every position.
        let out_idx: Vec<utok> = (0..key.get()).map(|i| i as utok).collect();
        let buf = model.input_buf_at(2);
        stream.memcpy_h2d(buf, &out_idx);
        (key, model.tok_buf())
    }

    /// NCCL builds only: broadcasts the token buffer from rank 0 to all
    /// ranks so every device executes on the same input.
    #[cfg(nccl)]
    pub fn share_inputs(
        &mut self,
        key: NonZeroUsize,
        handle: &mut Handle<'ctx>,
        stream: &Stream<'ctx>,
    ) {
        let model = self.internal.map_exec(key, handle, &mut self.pages, stream);
        if let Some(comm) = &handle.comm {
            comm.broadcast(model.tok_buf(), None, 0, stream);
        }
    }

    /// Launches full-graph execution for the executor at `key`, returning
    /// the output tensor.
    pub fn launch(
        &mut self,
        key: NonZeroUsize,
        handle: &mut Handle,
        stream: &Stream<'ctx>,
    ) -> Tensor<*const VirByte, 2> {
        self.internal
            .get_mut(&key)
            .unwrap()
            .launch(handle, &[], stream)
    }

    /// Single-step incremental: stages one token. The position comes from
    /// `next_pos` (auto-incremented); out_idx is fixed to 0.
    pub fn append_input_mamba(
        &mut self,
        handle: &mut Handle<'ctx>,
        tok: utok,
        stream: &Stream<'ctx>,
    ) -> (NonZeroUsize, &mut [DevByte]) {
        // Use the n_tok = 1 model.
        let key = self.internal.get_key(NonZeroUsize::new(1).unwrap());
        let model = self.internal.map_exec(key, handle, &mut self.pages, stream);
        // tok
        let tok_buf = model.tok_buf();
        stream.memcpy_h2d(tok_buf, &[tok]);
        // pos increments (prefill starts at 0; decode continues from the
        // end of the prefill).
        let pos_buf = model.pos_buf();
        let cur = self.next_pos;
        stream.memcpy_h2d(pos_buf, &[cur]);
        self.next_pos = cur.saturating_add(1);
        // out_idx fixed to 0 (only one position produces output).
        let out_idx_buf = model.input_buf_at(2);
        stream.memcpy_h2d(out_idx_buf, &[0u32]);
        (key, model.tok_buf())
    }

    /// Explicitly sets the position of the next staged token (used to align
    /// the prefill → decode transition).
    pub fn set_decode_start_pos(&mut self, start: u32) {
        self.next_pos = start;
    }

    /// Single-step incremental: executes one step and returns the hidden
    /// state (a single position).
    pub fn launch_step(
        &mut self,
        key: NonZeroUsize,
        handle: &mut Handle,
        stream: &Stream<'ctx>,
    ) -> Tensor<*const VirByte, 2> {
        // Currently reuses the regular graph execution path (n_tok = 1);
        // a dedicated Step::Mamba kernel updating state in place is planned.
        self.internal
            .get_mut(&key)
            .unwrap()
            .launch(handle, &[], stream)
    }

    /// Single-step incremental (reserved): variant that carries an explicit
    /// Mamba cache, resuming from `cache_pos`.
    pub fn launch_step_with_cache(
        &mut self,
        key: NonZeroUsize,
        cache: &mut MambaCache,
        cache_pos: usize,
        handle: &mut Handle,
        stream: &Stream<'ctx>,
    ) -> Tensor<*const VirByte, 2> {
        // Dedicated single-step path: only the Mamba step execution is
        // swapped out; everything else still follows the original graph.
        let model = self.internal.get_mut(&key).unwrap();
        model.launch_with_mamba_cache(handle, cache, cache_pos, stream)
    }

    /// Prefill variant with a Mamba cache: runs the prefill and writes the
    /// resulting state back into `cache` (cache position starts at 0).
    pub fn launch_prefill_with_cache(
        &mut self,
        key: NonZeroUsize,
        cache: &mut MambaCache,
        handle: &mut Handle,
        stream: &Stream<'ctx>,
    ) -> Tensor<*const VirByte, 2> {
        // Dedicated prefill path: only the Mamba step execution is swapped
        // out; everything else still follows the original graph.
        let model = self.internal.get_mut(&key).unwrap();
        model.launch_with_mamba_cache(handle, cache, 0, stream)
    }
}
0 commit comments