Commit e984da5

feat(llama): implement MoE-capable distributed partitioning and inter-thread tensor parallelism
Signed-off-by: YdrMaster <ydrml@hotmail.com>
1 parent 2896798 commit e984da5

File tree

12 files changed: +156 -77 lines changed


models/llama/common-cpu/src/infer.rs

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@ fn test_infer() {
             Some(s.spawn(move || {
                 let WorkerSeed { node, tasks } = seed;
                 let weights = Weights::new(model, range, count);
-                let mut worker = Worker::new(id, &node, meta.clone(), weights, id == 0);
+                let mut worker = Worker::new(id, &node, meta.clone(), weights);
                 let mut cache = meta.kv_cache(meta.nctx).map(Blob::new);
                 let sin_cos = <Operators as llama::Operators>::build_sin_cos(
                     meta.dt_embd,
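
Note on the call above: the `residual` flag (`id == 0`) is no longer passed to `Worker::new`; the worker now derives residual handling from its own `id` (see compute.rs below). As a rough illustration of the inter-thread partitioning this test exercises, here is a minimal sketch of how a per-thread weight shard could be derived; `shard`, `di`, and the sizes are hypothetical and not this crate's API:

use std::ops::Range;

/// Hypothetical helper: the contiguous slice of the FFN width `di`
/// owned by thread `id` out of `count` tensor-parallel threads.
fn shard(di: usize, id: usize, count: usize) -> Range<usize> {
    assert_eq!(di % count, 0); // assume the width splits evenly
    let step = di / count;
    step * id..step * (id + 1)
}

fn main() {
    let di = 11008; // illustrative FFN intermediate size
    for id in 0..4 {
        println!("thread {id} owns columns {:?}", shard(di, id, 4));
    }
}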

models/llama/common-cpu/src/lib.rs

Lines changed: 12 additions & 11 deletions
@@ -17,6 +17,7 @@ use std::{
     marker::PhantomData,
     mem::size_of,
     ops::{Deref, Range, RangeBounds},
+    ptr::copy_nonoverlapping,
     slice::{from_raw_parts, from_raw_parts_mut},
 };
 
@@ -69,7 +70,7 @@ where
     where
         T: Deref<Target = [ByteOf<Self::Hardware>]>,
     {
-        println!("{tensor}");
+        println!("{tensor}")
     }
 
     fn memcpy_d2h<T: Copy>(
@@ -79,7 +80,7 @@ where
     ) {
         let count = size_of_val(dst);
         assert_eq!(size_of_val(src), count);
-        unsafe { std::ptr::copy_nonoverlapping(src.as_ptr(), dst.as_mut_ptr().cast::<u8>(), count) }
+        unsafe { copy_nonoverlapping(src.as_ptr(), dst.as_mut_ptr().cast::<u8>(), count) }
     }
 }
 
@@ -236,13 +237,13 @@ impl WeightLoader for Weights<'_> {
 
         #[rustfmt::skip]
         match which {
-            AttnNorm                        => return Borrowed(attn_norm   ),
-            AttnQKV    if dt_mat == dt_embd => return Borrowed(attn_qkv    ),
-            AttnO      if dt_mat == dt_embd => return Borrowed(attn_o      ),
-            FfnNorm                         => return Borrowed(ffn_norm    ),
-            FfnGateInp if dt_mat == dt_embd => return Borrowed(ffn_gate_inp.as_ref().unwrap()),
-            FfnGateUp  if dt_mat == dt_embd => return Borrowed(ffn_gate_up),
-            FfnDown    if dt_mat == dt_embd => return Borrowed(ffn_down    ),
+            AttnNorm                        => return Borrowed(attn_norm   ),
+            AttnQKV    if dt_mat == dt_embd => return Borrowed(attn_qkv    ),
+            AttnO      if dt_mat == dt_embd => return Borrowed(attn_o      ),
+            FfnNorm                         => return Borrowed(ffn_norm    ),
+            FfnGateInp if dt_mat == dt_embd => return Borrowed(ffn_gate_inp),
+            FfnGateUp  if dt_mat == dt_embd => return Borrowed(ffn_gate_up ),
+            FfnDown    if dt_mat == dt_embd => return Borrowed(ffn_down    ),
             _ => {}
         };
 
@@ -265,7 +266,7 @@ impl WeightLoader for Weights<'_> {
             match which {
                 AttnQKV => dequant(dt_mat, dt_embd, attn_qkv, &mut cache[..size_qkv]),
                 AttnO => dequant(dt_mat, dt_embd, attn_o, &mut cache[..size_o]),
-                FfnGateInp => todo!(),
+                FfnGateInp => todo!("dequant ffn gate inp"),
                 FfnGateUp | FfnDown => {
                     dequant(dt_mat, dt_embd, ffn_gate_up, &mut cache[..size_gate_up]);
                     dequant(
@@ -284,7 +285,7 @@ impl WeightLoader for Weights<'_> {
             match which {
                 AttnQKV => 0..size_qkv,
                 AttnO => 0..size_o,
-                FfnGateInp => todo!(),
+                FfnGateInp => todo!("dequant ffn gate inp"),
                 FfnGateUp => 0..size_gate_up,
                 FfnDown => size_gate_up..size_gate_up + size_down,
                 AttnNorm | FfnNorm => unreachable!(),
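
The hunk above only hoists the `copy_nonoverlapping` import and drops the `Option` wrapper around `ffn_gate_inp`. For reference, a self-contained sketch of the same host-side byte-copy pattern that the CPU `memcpy_d2h` relies on (standalone names, not the crate's trait):

use std::{mem::size_of_val, ptr::copy_nonoverlapping};

// Copy raw bytes into a typed slice; the byte counts must match and the
// regions must not overlap, mirroring the asserts in the diff above.
fn memcpy_bytes<T: Copy>(dst: &mut [T], src: &[u8]) {
    let count = size_of_val(dst);
    assert_eq!(src.len(), count);
    unsafe { copy_nonoverlapping(src.as_ptr(), dst.as_mut_ptr().cast::<u8>(), count) }
}

fn main() {
    let src = 0x1234_5678_u32.to_ne_bytes();
    let mut dst = [0u32; 1];
    memcpy_bytes(&mut dst, &src);
    assert_eq!(dst[0], 0x1234_5678);
}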

models/llama/common/src/compute.rs

Lines changed: 16 additions & 24 deletions
@@ -33,12 +33,10 @@ pub trait Operators {
         T: Deref<Target = [ByteOf<Self::Hardware>]>;
 
     fn memcpy_d2h<T: Copy>(
-        _dst: &mut [T],
-        _src: &[ByteOf<Self::Hardware>],
-        _queue: &QueueOf<Self::Hardware>,
-    ) {
-        todo!()
-    }
+        dst: &mut [T],
+        src: &[ByteOf<Self::Hardware>],
+        queue: &QueueOf<Self::Hardware>,
+    );
 
     fn build_sin_cos<QA>(
         dt: DigitLayout,
@@ -81,13 +79,11 @@ pub trait WeightLoader {
 
     fn load_moe<'a>(
         &'a self,
-        _which: BlkWeight,
-        _iblk: usize,
-        _iexp: usize,
-        _queue: &'a QueueOf<Self::Hardware>,
-    ) -> Self::Weight<'a> {
-        todo!()
-    }
+        which: BlkWeight,
+        iblk: usize,
+        iexp: usize,
+        queue: &'a QueueOf<Self::Hardware>,
+    ) -> Self::Weight<'a>;
 
     fn output_norm<'a>(&'a self, queue: &'a QueueOf<Self::Hardware>) -> Self::Weight<'a>;
     fn output<'a>(&'a self, queue: &'a QueueOf<Self::Hardware>) -> Self::Weight<'a>;
@@ -105,17 +101,10 @@ pub struct LlamaWorker<Ops: Operators, W> {
     swiglu: Ops::Swiglu,
     rearrange: Ops::Rearrange,
     all_reduce: Ops::AllReduce,
-    residual: bool,
 }
 
 impl<Ops: Operators, W> LlamaWorker<Ops, W> {
-    pub fn new(
-        id: usize,
-        node: &Ops::TopoNode,
-        meta: LlamaMeta,
-        weights: W,
-        residual: bool,
-    ) -> Self {
+    pub fn new(id: usize, node: &Ops::TopoNode, meta: LlamaMeta, weights: W) -> Self {
         let processor = node.processor();
         Self {
             id,
@@ -128,7 +117,6 @@ impl<Ops: Operators, W> LlamaWorker<Ops, W> {
             swiglu: Ops::Swiglu::new(processor),
             rearrange: Ops::Rearrange::new(processor),
             all_reduce: Ops::AllReduce::new(node),
-            residual,
         }
     }
 
@@ -199,7 +187,6 @@ where
             di,
             ..
         } = self.meta;
-        let residual = if self.residual { 1. } else { 0. };
 
         let workspace_size = self.workspace_size(nt, max_seq_len, max_att_len);
         let mut workspace = Workspace::new(queue_alloc, workspace, workspace_size);
@@ -289,6 +276,7 @@ where
 
                 let o = q.merge(1..3).unwrap();
                 let w = self.weights.attn_o(iblk, queue);
+                let residual = if self.id == 0 { 1. } else { 0. };
                 self.mat_mul(&mut x, residual, &o, &w, 1., workspace, queue_alloc)?
             }
             self.all_reduce(&mut x, workspace, queue_alloc)?;
@@ -310,6 +298,7 @@ where
                 self.swiglu(&mut gate, &up, workspace, queue_alloc)?;
 
                 let w = self.weights.ffn_down(iblk, 0, queue);
+                let residual = if self.id == 0 { 1. } else { 0. };
                 self.mat_mul(&mut x, residual, &gate, &w, 1., workspace, queue_alloc)?
             } else {
                 let mut routes_host = routes.clone().map(Blob::new).take();
@@ -336,6 +325,7 @@ where
                 for (mut x, x1) in izip!(x, x1) {
                     let (line, tail) = routes.split_at(nexp);
                     routes = tail;
+                    let mut first = true;
                     for (iexp, kexp) in self.topk_with_index(line) {
                         let w = self.weights.ffn_gate_up(iblk, iexp, queue);
                         self.mat_mul(&mut gate_up, 0., &x1, &w, 1., workspace, queue_alloc)?;
@@ -346,7 +336,9 @@ where
                         self.swiglu(&mut gate, &up, workspace, queue_alloc)?;
 
                         let w = self.weights.ffn_down(iblk, iexp, queue);
-                        self.mat_mul(&mut x, residual, &gate, &w, kexp, workspace, queue_alloc)?
+                        let residual = if self.id == 0 || !first { 1. } else { 0. };
+                        self.mat_mul(&mut x, residual, &gate, &w, kexp, workspace, queue_alloc)?;
+                        first = false
                     }
                 }
             }
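
Why the residual scale moved next to each `mat_mul`: with inter-thread tensor parallelism every rank produces only a partial projection and `all_reduce` sums the partials, so the residual already stored in `x` must survive on exactly one rank (rank 0); in the MoE loop the other ranks zero `x` on their first expert and accumulate afterwards. A minimal numeric sketch of that convention, with plain floats standing in for tensors:

// Each rank contributes residual * x + partial; only rank 0 keeps the residual.
fn partial_output(id: usize, x_residual: f32, partial: f32) -> f32 {
    let residual = if id == 0 { 1. } else { 0. };
    residual * x_residual + partial
}

fn main() {
    let x = 1.5_f32; // residual activation held by every rank
    let partials = [0.2_f32, 0.3, 0.1, 0.4]; // per-rank partial matmul results
    // all_reduce sums the per-rank contributions
    let reduced: f32 = partials
        .iter()
        .enumerate()
        .map(|(id, &p)| partial_output(id, x, p))
        .sum();
    // the residual is counted exactly once, the partials are fully summed
    assert!((reduced - (x + 1.0)).abs() < 1e-6);
}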

models/llama/common/src/lib.rs

Lines changed: 26 additions & 5 deletions
@@ -109,27 +109,48 @@ impl LlamaMeta {
 
     pub fn ffn_gate_up(&self, usage: TensorUsage) -> Tensor<usize> {
         let &Self { d, di, .. } = self;
-        self.mat(di + di, d, usage)
+        self.mat_ffn(di + di, d, usage)
     }
 
     pub fn ffn_down(&self, usage: TensorUsage) -> Tensor<usize> {
         let &Self { d, di, .. } = self;
-        self.mat(d, di, usage)
+        self.mat_ffn(d, di, usage)
     }
 
     pub fn output(&self) -> Tensor<usize> {
         self.token_embd().transpose(&[1, 0])
     }
 
     fn mat(&self, row: usize, col: usize, usage: TensorUsage) -> Tensor<usize> {
+        let &Self {
+            dt_embd, dt_mat, ..
+        } = self;
+        // NOTICE: weight matrices are stored as the mat dtype but take part in computation as the embd dtype
+        match usage {
+            TensorUsage::Storage => Tensor::new(dt_mat, &[row, col / dt_mat.group_size()]),
+            TensorUsage::Computation => {
+                assert_eq!(dt_embd.group_size(), 1);
+                Tensor::new(dt_embd, &[row, col]).transpose(&[1, 0])
+            }
+        }
+    }
+
+    fn mat_ffn(&self, row: usize, col: usize, usage: TensorUsage) -> Tensor<usize> {
+        let &Self {
+            nexp,
+            dt_embd,
+            dt_mat,
+            ..
+        } = self;
         // NOTICE: weight matrices are stored as the mat dtype but take part in computation as the embd dtype
         match usage {
             TensorUsage::Storage => {
-                Tensor::new(self.dt_mat, &[row, col / self.dt_mat.group_size()])
+                let nexp = if nexp == 0 { 1 } else { nexp };
+                Tensor::new(dt_mat, &[nexp, row, col / dt_mat.group_size()])
             }
             TensorUsage::Computation => {
-                assert_eq!(self.dt_embd.group_size(), 1);
-                Tensor::new(self.dt_embd, &[row, col]).transpose(&[1, 0])
+                assert_eq!(dt_embd.group_size(), 1);
+                Tensor::new(dt_embd, &[row, col]).transpose(&[1, 0])
             }
         }
     }
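
The new `mat_ffn` gives FFN weights a leading expert dimension so dense and MoE models share one storage layout (a dense model keeps a degenerate expert axis of 1). A minimal shape sketch, using plain arrays instead of the crate's `Tensor` and illustrative sizes:

// Storage shape implied by mat_ffn: [experts, rows, columns / group_size].
fn ffn_storage_shape(nexp: usize, row: usize, col: usize, group_size: usize) -> [usize; 3] {
    let nexp = if nexp == 0 { 1 } else { nexp };
    [nexp, row, col / group_size]
}

fn main() {
    let (d, di) = (4096, 11008); // hypothetical hidden / FFN intermediate sizes
    // dense (nexp == 0): ffn_gate_up is stored as [1, 2*di, d]
    assert_eq!(ffn_storage_shape(0, di + di, d, 1), [1, 22016, 4096]);
    // MoE with 8 experts: ffn_gate_up is stored as [8, 2*di, d]
    assert_eq!(ffn_storage_shape(8, di + di, d, 1), [8, 22016, 4096]);
}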

models/llama/common/src/storage.rs

Lines changed: 16 additions & 21 deletions
@@ -19,7 +19,7 @@ pub struct BlkStorage<T> {
     pub attn_qkv: T,
     pub attn_o: T,
     pub ffn_norm: T,
-    pub ffn_gate_inp: Option<T>,
+    pub ffn_gate_inp: T,
     pub ffn_gate_up: T,
     pub ffn_down: T,
 }
@@ -58,12 +58,12 @@ impl<'a> Storage<&'a [u8]> {
             attn_qkv:     tensor![gguf => format!("blk.{i}.attn_qkv.weight"    )].data,
             attn_o:       tensor![gguf => format!("blk.{i}.attn_output.weight" )].data,
             ffn_norm:     tensor![gguf => format!("blk.{i}.ffn_norm.weight"    )].data,
-            ffn_gate_inp: if !meta.is_moe() { None }
-                          else { Some(tensor![gguf => format!("blk.{i}.ffn_gate_inp.weight"   )].data) },
-            ffn_gate_up : if !meta.is_moe() { tensor![gguf => format!("blk.{i}.ffn_gate_up.weight"    )].data }
-                          else { tensor![gguf => format!("blk.{i}.ffn_gate_up_exps.weight")].data },
-            ffn_down    : if !meta.is_moe() { tensor![gguf => format!("blk.{i}.ffn_down.weight"       )].data }
-                          else { tensor![gguf => format!("blk.{i}.ffn_down_exps.weight"   )].data },
+            ffn_gate_inp: if !meta.is_moe() { &[] }
+                          else { tensor![gguf => format!("blk.{i}.ffn_gate_inp.weight"   )].data },
+            ffn_gate_up : if !meta.is_moe() { tensor![gguf => format!("blk.{i}.ffn_gate_up.weight"    )].data }
+                          else { tensor![gguf => format!("blk.{i}.ffn_gate_up_exps.weight")].data },
+            ffn_down    : if !meta.is_moe() { tensor![gguf => format!("blk.{i}.ffn_down.weight"       )].data }
+                          else { tensor![gguf => format!("blk.{i}.ffn_down_exps.weight"   )].data },
         })
         .collect();
 
@@ -84,7 +84,7 @@ impl<T> BlkStorage<T> {
             attn_qkv: f(self.attn_qkv),
             attn_o: f(self.attn_o),
             ffn_norm: f(self.ffn_norm),
-            ffn_gate_inp: self.ffn_gate_inp.map(&mut f),
+            ffn_gate_inp: f(self.ffn_gate_inp),
             ffn_gate_up: f(self.ffn_gate_up),
             ffn_down: f(self.ffn_down),
         }
@@ -96,7 +96,7 @@ impl<T> BlkStorage<T> {
             attn_qkv: &self.attn_qkv,
             attn_o: &self.attn_o,
             ffn_norm: &self.ffn_norm,
-            ffn_gate_inp: self.ffn_gate_inp.as_ref(),
+            ffn_gate_inp: &self.ffn_gate_inp,
             ffn_gate_up: &self.ffn_gate_up,
             ffn_down: &self.ffn_down,
         }
@@ -174,27 +174,22 @@ impl<'w> BlkStorage<&'w [u8]> {
                 own(o_.take())
             },
             ffn_norm: borrow(self.ffn_norm),
-            ffn_gate_inp: if len == count {
-                self.ffn_gate_inp.map(borrow)
-            } else {
-                todo!()
-            },
+            ffn_gate_inp: borrow(self.ffn_gate_inp),
             ffn_gate_up: if len == count {
                 borrow(self.ffn_gate_up)
             } else {
                 let gu = meta.ffn_gate_up(TensorMem).map(|_| self.ffn_gate_up);
-                split!(gu => g, u; [di, di] @ 0);
+                split!(gu => g, u; [di, di] @ 1);
 
                 let di = di / count;
 
-                let g = g.slice(0, di * start, 1, di * len);
-                let u = u.slice(0, di * start, 1, di * len);
-                debug_assert!(g.is_contiguous() && u.is_contiguous());
+                let g = g.slice(1, di * start, 1, di * len);
+                let u = u.slice(1, di * start, 1, di * len);
 
                 let mut ans = dis.ffn_gate_up(TensorMem).map(&mut f);
                 {
                     let ans = ans.map_slice_mut();
-                    split!(ans => g_, u_; [di * len , di * len] @ 0);
+                    split!(ans => g_, u_; [di * len , di * len] @ 1);
                     let mut g_ = g_;
                     let mut u_ = u_;
                     rearrange(&mut g_, &g);
@@ -207,8 +202,8 @@
             } else {
                 let down = meta.ffn_down(TensorMem).map(|_| self.ffn_down);
 
-                let d = down.shape()[1] / count;
-                let down = down.slice(1, d * start, 1, d * len);
+                let d = down.shape()[2] / count;
+                let down = down.slice(2, d * start, 1, d * len);
 
                 let mut down_ = Tensor::new(down.dt(), down.shape()).map(&mut f);
                 rearrange(&mut down_, &down);
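
With the leading expert axis in place, the tensor-parallel split of `ffn_gate_up` moves to axis 1 and that of `ffn_down` to axis 2 (hence the `@ 1` splits and `slice(1, ...)` / `slice(2, ...)` calls above). A minimal index sketch of the per-rank gate/up slicing; the helper is hypothetical, not the crate's `split!`/`slice` API:

use std::ops::Range;

// Along the gate_up axis of length 2*di, gate occupies the first di rows and
// up the second di; each rank takes its di/count slab of both halves.
fn gate_up_slice(di: usize, start: usize, len: usize, count: usize) -> (Range<usize>, Range<usize>) {
    let step = di / count; // rows owned per rank in each half
    let gate = step * start..step * (start + len);
    let up = di + step * start..di + step * (start + len);
    (gate, up)
}

fn main() {
    let (di, count) = (11008, 4);
    let (g, u) = gate_up_slice(di, 1, 1, count); // rank 1, a single shard
    assert_eq!(g, 2752..5504);
    assert_eq!(u, 13760..16512);
}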

models/llama/infini/src/infer.rs

Lines changed: 1 addition & 1 deletion
@@ -90,7 +90,7 @@ fn test_infer() {
             let device = node.processor();
             let stream = device.stream();
             let weights = Weights::new(model, range, count, &stream);
-            let mut worker = Worker::new(id, &node, meta.clone(), weights, id == 0);
+            let mut worker = Worker::new(id, &node, meta.clone(), weights);
             let mut cache = meta
                 .kv_cache(meta.nctx)
                 .map(|size| stream.malloc::<u8>(size));
