@@ -3,6 +3,8 @@ use gguf::ggml_quants::digit_layout::types as ty;
 use gguf::ggml_quants::digit_layout::DigitLayout;
 use half::f16;
 use itertools::Itertools;
+use operators::fuesd_softmax;
+use operators::fuesd_softmax::FusedSoftmax;
 use operators::scale;
 use operators::scale::Scale;
 use operators::{
@@ -19,6 +21,7 @@ use operators::{
     ByteOf, Hardware, LaunchError, Operator, QueueAlloc, QueueOf, TopoNode, Workspace,
 };
 use std::ops::{Deref, DerefMut};
+use std::process::Output;
 use tensor::split_mut;
 use tensor::{split, Tensor};
 
@@ -33,6 +36,7 @@ pub trait Operators {
     type MatMul: MatMul<Self::Hardware>;
     type Swiglu: Swiglu<Self::Hardware>;
     type Scale: Scale<Self::Hardware>;
+    type FuesdSoftmax: FusedSoftmax<Self::Hardware>;
     type Rearrange: Rearrange<Self::Hardware>;
     type AllReduce: AllReduce<Self::Hardware, Self::TopoNode>;
 
@@ -87,6 +91,7 @@ pub struct Minicpm3Worker<Ops: Operators, W> {
     mat_mul: Ops::MatMul,
     scale: Ops::Scale,
     swiglu: Ops::Swiglu,
+    fuesd_softmax: Ops::FuesdSoftmax,
     rearrange: Ops::Rearrange,
     all_reduce: Ops::AllReduce,
 }
@@ -109,6 +114,7 @@ impl<Ops: Operators, W> Minicpm3Worker<Ops, W> {
             all_reduce: Ops::AllReduce::new(node),
             dt_pos: ty::U64,
             attention: Ops::Attention::new(processor),
+            fuesd_softmax: Ops::FuesdSoftmax::new(processor),
         }
     }
 
@@ -165,12 +171,11 @@ where
 
         let gate_up = tensor(&[nt, di * 2]);
         // workspace layout: x + x1 + q (can probably be removed) + q3 + kv_pe + attn
-        let workspace_size = *x1.get() * 3 + *gate_up.get();
+        let workspace_size = *x1.get() * 20 + *gate_up.get();
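+        // (assumption) the `* 20` factor looks like a deliberately generous upper bound so that this
+        // single arena can also hold the new MLA intermediates (q1, kv_pe, kv, attn_weights,
+        // attn_output, ...) rather than an exact sum of their sizes.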
         let mut workspace = Workspace::new(queue_alloc, workspace, workspace_size);
         let (buf, workspace) = workspace.split_at_mut(*x1.get());
         let mut x1 = x1.map(|_| buf);
 
-
         let queue = queue_alloc.queue();
 
         let sin = sin_cos.clone().index(0, 0);
@@ -205,17 +210,15 @@ where
             let w = self.weights.attn_qa_norm(iblk, queue);
             self.rms_norm(&mut q, &inplace, &w, workspace, queue_alloc)?;
             {
-                // q [1, 768] q1 [1, 3840] kv_pe [1, 288] kv [1, 5120] k [1, 3840] attn [1, 2560]
                 let q1 = tensor(&[nt, nh * dk]);
                 let (buf, workspace) = workspace.split_at_mut(*q1.get());
                 let mut q1 = q1.map(|_| buf);
                 let w = self.weights.attn_qb(iblk, queue).transpose(&[1, 0]);
                 self.mat_mul(&mut q1, 0., &q, &w, 1., workspace, queue_alloc)?;
-                drop(q);
-                // q3 is the data needed to compute attn, but the rotary embedding still has to be applied to part of it
+
                 let mut q3 = q1.tile(1, &[nh, dk]);
                 let q2 = unsafe { q3.map_slice_static_mut() };
-                split_mut!(q2 => _q, q_rope; [dnope, dh] @ 2);
+                split_mut!(q2 => q_nope, q_rope; [dnope, dh] @ 2);
 
                 // kv_pe [1, 288]
                 let kv_pe = tensor(&[nt, dkv_lora + dh]);
@@ -224,65 +227,125 @@ where
 
                 let w = self.weights.attn_kva(iblk, queue).transpose(&[1, 0]);
                 self.mat_mul(&mut kv_pe, 0., &x1, &w, 1., workspace, queue_alloc)?;
-
+                drop(q);
                 split_mut!(kv_pe => kv_lora, k_rope; [dkv_lora, dh] @ 1);
 
+                self.rope(&mut q_rope, &pos, &sin, &cos, workspace, queue_alloc)?;
+                let mut k_rope = k_rope.tile(1, &[1, dh]);
+                self.rope(&mut k_rope, &pos, &sin, &cos, workspace, queue_alloc)?;
+                let k_rope = k_rope.broadcast(1, nh);
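+                // RoPE is applied only to the dh-wide "rope" slices of q and to the single shared
+                // k_rope row, which is then broadcast across all nh heads rather than copied per head.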
+
                 let inplace = unsafe { kv_lora.map_slice_static() };
                 let w = self.weights.attn_kva_norm(iblk, queue);
                 self.rms_norm(&mut kv_lora, &inplace, &w, workspace, queue_alloc)?;
                 // kv [1, 5120]
                 let kv = tensor(&[nt, nh * (dnope + dv)]);
                 let (buf, workspace) = workspace.split_at_mut(*kv.get());
                 let mut kv = kv.map(|_| buf);
-                let w = self.weights.attn_kvb(iblk, queue).transpose(&[1, 0]);
-
-                self.mat_mul(&mut kv, 0., &kv_lora, &w, 1., workspace, queue_alloc)?;
-
-                let kv = kv.tile(1, &[nh, dnope + dv]);
-
-                split_mut!(kv => k_nope, v; [dnope, dv] @ 2);
 
-                // k [1, 3840]
-                let k = tensor(&[nt, nh, dk]);
-                let (buf, workspace) = workspace.split_at_mut(*k.get());
-                let k = k.map(|_| buf);
-
-                split_mut!(k => k_nope_r, k_rope_r; [dnope, dh] @ 2);
-
-                self.rope(&mut q_rope, &pos, &sin, &cos, workspace, queue_alloc)?;
-                let mut k_rope = k_rope.tile(1, &[1, dh]);
-                self.rope(&mut k_rope, &pos, &sin, &cos, workspace, queue_alloc)?;
-                let k_rope = k_rope.broadcast(1, nh);
-                self.rearrange(&mut k_rope_r, &k_rope, workspace, queue_alloc)?;
-                self.rearrange(&mut k_nope_r, &k_nope, workspace, queue_alloc)?;
-
-                let pos = requests.last().unwrap().pos as f32;
-                let mut q = q3.transpose(&[1, 0]);
-                let k = k.map_slice().transpose(&[1, 0]);
-                let v = v.map_slice_mut().transpose(&[1, 0]);
-                // run attention
-                let attn = tensor(&[nt, nh, dv]);
-                let (buf, workspace) = workspace.split_at_mut(*attn.get());
-                let mut attn = attn.map(|_| buf);
-
-                let mut attn = unsafe { attn.map_slice_mut().transpose(&[1, 0]) };
-                let pos = requests.last().unwrap().pos as f32;
-                self.attnention(
-                    &mut q,
-                    &k,
-                    &v,
-                    &mut attn,
-                    pos as usize,
+                let kv_b_proj = unsafe {
+                    self.weights
+                        .attn_kvb(iblk, queue)
+                        .tile(0, &[nh, dnope + dv])
+                        .map_slice_static()
+                };
+                split!(kv_b_proj => q_absorb, out_absorb; [dnope, dv] @ 1);
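+                // Weight absorption (presumably the DeepSeek-V2-style MLA trick): kv_b_proj, the
+                // up-projection from the compressed KV latent, is split into q_absorb
+                // ([nh, dnope, dkv_lora]), folded into the query path below, and out_absorb
+                // ([nh, dv, dkv_lora]), applied after attention, so full per-head K/V are never
+                // materialized.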
+                let inplace = unsafe { q_nope.map_slice_static() };
+
+                let q_nope_0 = q_nope.map_slice().transpose(&[1, 0]);
+                let q_nope_1 = tensor(&[nh, nt, dkv_lora]);
+                let (buf, workspace) = workspace.split_at_mut(*q_nope_1.get());
+                let mut q_nope = q_nope_1.map(|_| buf);
+                self.mat_mul(
+                    &mut q_nope,
+                    0.,
+                    &q_nope_0,
+                    &q_absorb,
+                    1.,
                     workspace,
                     queue_alloc,
                 )?;
 
-                let o = attn.transpose(&[1, 0]).merge(1..3).unwrap();
+                drop(q3);
+                // compute attn_weights
+                // todo: DeepSeek applies a softmax_scale here
+                // reference Python:
+                // attn_weights = (torch.matmul(q_pe, k_pe.mT) + torch.matmul(q_nope, compressed_kv.unsqueeze(-3).mT)) * self.softmax_scale
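+                // Mapping to the line above: the first mat_mul below computes the positional term
+                // q_pe · k_pe^T, and the second accumulates (beta = 1) the content term
+                // q_nope · compressed_kv^T into the same [nh, nt, nt] buffer; softmax_scale is not
+                // applied yet, hence the todo.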
+                let attn_weights = tensor(&[nh, nt, nt]);
+                let (buf, workspace) = workspace.split_at_mut(*attn_weights.get());
+                let mut attn_weights = attn_weights.map(|_| buf);
+                {
+                    let q_rope = q_rope.transpose(&[1, 0]);
+                    let k_rope = k_rope.transpose(&[1, 2, 0]);
+
+                    self.mat_mul(
+                        &mut attn_weights,
+                        0.,
+                        &q_rope,
+                        &k_rope,
+                        1.,
+                        workspace,
+                        queue_alloc,
+                    )?;
+                    let kv_lora = kv_lora
+                        .map_slice()
+                        .tile(0, &[1, 1])
+                        .broadcast(0, nh)
+                        .transpose(&[2, 1]);
+                    self.mat_mul(
+                        &mut attn_weights,
+                        1.,
+                        &q_nope,
+                        &kv_lora,
+                        1.,
+                        workspace,
+                        queue_alloc,
+                    )?;
+                }
+                // softmax
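+                // The fused softmax helper below normalizes each row of the [nh, nt, nt] scores in
+                // place and applies a causal mask (AttnMask::Causal).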
+                self.softmax(&mut attn_weights, workspace, queue_alloc)?;
+                // attn_output
+                let attn_output_r = tensor(&[nt, nh, dv]);
+                let (buf, workspace) = workspace.split_at_mut(*attn_output_r.get());
+                let mut attn_output_r = attn_output_r.map(|_| buf);
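+                // Second half of the absorption: attn_weights × kv_lora gives the per-head output in
+                // the compressed latent space ([nh, nt, dkv_lora]); out_absorb then maps it back to
+                // the value dimension dv, written into attn_output_r ([nt, nh, dv]).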
+                {
+                    let attn_output = tensor(&[nh, nt, dkv_lora]);
+                    let (buf, workspace) = workspace.split_at_mut(*attn_output.get());
+                    let mut attn_output = attn_output.map(|_| buf);
+                    let kv_lora = kv_lora.tile(0, &[1, 1]).broadcast(0, nh);
+                    self.mat_mul(
+                        &mut attn_output,
+                        0.,
+                        &attn_weights,
+                        &kv_lora,
+                        1.,
+                        workspace,
+                        queue_alloc,
+                    )?;
+                    let mut attn_output_r = attn_output_r.map_slice_mut().transpose(&[1, 0]);
+                    let out_absorb = out_absorb.transpose(&[2, 1]);
+
+                    self.mat_mul(
+                        &mut attn_output_r,
+                        0.,
+                        &attn_output,
+                        &out_absorb,
+                        1.,
+                        workspace,
+                        queue_alloc,
+                    )?;
+                }
+                Ops::debug(&attn_output_r, queue);
+                todo!();
+                // let o = attn_output_r;
                 let w = self.weights.attn_o(iblk, queue);
-
-                self.mat_mul(&mut x1, 0., &o, &w, s, workspace, queue_alloc)?;
-                let inplace = unsafe { x.map_slice_static() };
-                self.add(&mut x, &inplace, &x1, workspace, queue_alloc)?;
+                println!("{:?}", attn_output_r.shape());
+                println!("{:?}", w.shape());
+                // println!("{:?}", out_absorb.shape());
+                todo!();
+                // self.mat_mul(&mut x1, 0., &o, &w, s, workspace, queue_alloc)?;
+                // let inplace = unsafe { x.map_slice_static() };
+                // self.add(&mut x, &inplace, &x1, workspace, queue_alloc)?;
             }
             let w = self.weights.ffn_norm(iblk, queue);
             self.rms_norm(&mut x1, &x, &w, workspace, queue_alloc)?;
@@ -594,6 +657,26 @@ where
             queue_alloc,
         )
     }
+    fn softmax<A, QA>(
+        &self,
+        a: &mut Tensor<A>,
+        workspace: &mut [ByteOf<Ops::Hardware>],
+        queue_alloc: &QA,
+    ) -> Result<(), LaunchError>
+    where
+        A: DerefMut<Target = [ByteOf<Ops::Hardware>]>,
+        QA: QueueAlloc<Hardware = Ops::Hardware>,
+    {
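+        // Launch the fused softmax in place over the attention scores: `a` is both input and
+        // output, and a causal mask is applied to each row.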
+        self.fuesd_softmax.launch(
+            &fuesd_softmax::Args {
+                att_mask: AttnMask::Causal,
+                att_layout: a.layout(),
+                att_base: a.base_mut(),
+            },
+            workspace,
+            queue_alloc,
+        )
+    }
     fn all_reduce<X, QA>(
         &self,
         x: &mut Tensor<X>,