feat(clip): 添加 pos_embd 计算

YdrMaster · YdrMaster · commit 26504300e79b · 2024-11-29T14:15:23.000+08:00
Signed-off-by: YdrMaster <ydrml@hotmail.com>
diff --git a/Cargo.toml b/Cargo.toml
@@ -27,8 +27,8 @@ ggus = "0.3"
 itertools = "0.13"
 build-script-cfg = "0.0"
 
-ndarray-layout = { git = "https://github.com/YdrMaster/ndarray-layout", rev = "48d36c5" }
-operators = { git = "https://github.com/YdrMaster/operators-rs", rev = "d73a53e", default-features = false }
+ndarray-layout = { git = "https://github.com/YdrMaster/ndarray-layout", rev = "f1fdd24" }
+operators = { git = "https://github.com/YdrMaster/operators-rs", rev = "02f95bc", default-features = false }
 
 search-cl-tools = { git = "https://github.com/InfiniTensor/clrt", rev = "6846d52" }
 search-cuda-tools = { git = "https://github.com/YdrMaster/cuda-driver", rev = "e2ec203" }
diff --git a/models/clip/common-cpu/src/lib.rs b/models/clip/common-cpu/src/lib.rs
@@ -1,12 +1,13 @@
 use clip::{ClipStorage, WeightLoader};
-use operators::{common_cpu::Cpu, conv, QueueOf, TopoNode};
+use operators::{add_rows, common_cpu::Cpu, conv, QueueOf, TopoNode};
 use std::marker::PhantomData;
 
 pub struct Operators<N = Cpu>(PhantomData<N>);
 
 pub struct Weights<'w> {
     patch_embd_w: &'w [u8],
     patch_embd_b: &'w [u8],
+    pos_embd: &'w [u8],
 }
 
 impl<N> clip::Operators for Operators<N>
@@ -16,13 +17,15 @@ where
     type Hardware = Cpu;
     type TopoNode = Cpu;
     type Conv = conv::common_cpu::ConvIm2Col;
+    type AddRows = add_rows::common_cpu::Operator;
 }
 
 impl<'w> Weights<'w> {
     pub fn new(model: &'w ClipStorage<&'w [u8]>) -> Self {
         Self {
             patch_embd_w: model.patch_embd_w,
             patch_embd_b: model.patch_embd_b,
+            pos_embd: model.pos_embd,
         }
     }
 }
@@ -35,6 +38,11 @@ impl WeightLoader for Weights<'_> {
     fn patch_embd<'a>(&'a self, _queue: &'a QueueOf<Self::Hardware>) -> [Self::Weight<'a>; 2] {
         [self.patch_embd_w, self.patch_embd_b]
     }
+
+    #[inline]
+    fn pos_embd<'a>(&'a self, _queue: &'a QueueOf<Self::Hardware>) -> Self::Weight<'a> {
+        self.pos_embd
+    }
 }
 
 #[cfg(test)]
diff --git a/models/clip/common-cpu/src/test_infer.rs b/models/clip/common-cpu/src/test_infer.rs
@@ -1,5 +1,5 @@
 ﻿use crate::{Operators, Weights};
-use clip::{ClipArgs, ClipMeta, ClipStorage, ClipWorker, Image, Tensor};
+use clip::{ClipArgs, ClipMeta, ClipStorage, ClipWorker, Image, Tensor, D_POS_EMBD};
 use gguf::{ggml_quants::digit_layout::types as ty, GGufModel};
 use operators::{
     common_cpu::{Cpu, ThisThread},
@@ -53,22 +53,22 @@ fn test_infer() {
         .launch(
             ClipArgs {
                 raw: whole.to_nchw(),
-                pos: pos70(whole.shape(), d_patch).map_slice(),
+                pos: pos70(1, whole.shape(), d_patch).map_slice(),
             },
             &mut [],
             &ThisThread,
         )
         .unwrap();
 
     if let Some(patches) = slices.patches_nchw() {
-        let &[_, 3, h, w] = patches.shape() else {
+        let &[n, 3, h, w] = patches.shape() else {
             unreachable!()
         };
         worker
             .launch(
                 ClipArgs {
                     raw: patches.map_slice(),
-                    pos: pos70([w, h], d_patch).map_slice(),
+                    pos: pos70(n, [w, h], d_patch).map_slice(),
                 },
                 &mut [],
                 &ThisThread,
@@ -77,26 +77,21 @@ fn test_infer() {
     }
 }
 
-fn pos70([w, h]: [usize; 2], d_patch: usize) -> Tensor<Blob> {
+fn pos70(n: usize, [w, h]: [usize; 2], d_patch: usize) -> Tensor<Blob> {
     let pos_w = w / d_patch;
     let pos_h = h / d_patch;
-    let mut bucket_corrds_h = [0; 70];
-    let mut bucket_corrds_w = [0; 70];
-    for i in 0..pos_w {
-        bucket_corrds_w[i] = ((70 * i) as f64 / pos_w as f64) as _;
-    }
-    for i in 0..pos_h {
-        bucket_corrds_h[i] = ((70 * i) as f64 / pos_h as f64) as _;
-    }
 
-    let mut ans = Tensor::new(ty::U32, &[pos_w * pos_h]).map(Blob::new);
+    let mut ans = Tensor::new(ty::U32, &[1, pos_w * pos_h])
+        .broadcast(0, n)
+        .map(Blob::new);
     let (&mut [], data, &mut []) = (unsafe { ans.get_mut().align_to_mut::<u32>() }) else {
         panic!()
     };
 
-    let f = |i, d| ((70 * i) as f64 / d as f64) as u32;
     for i in 0..pos_h * pos_w {
-        data[i] = f(i / pos_w, pos_h) * 70 + f(i % pos_w, pos_w);
+        let y = (i / pos_w) * D_POS_EMBD / pos_h;
+        let x = (i % pos_w) * D_POS_EMBD / pos_w;
+        data[i] = (y * D_POS_EMBD + x) as _;
     }
 
     ans
diff --git a/models/clip/common/src/args.rs b/models/clip/common/src/args.rs
@@ -4,6 +4,6 @@ use tensor::Tensor;
 pub struct Args<'a, H: Hardware> {
     /// shape: [n, c, h, w]
     pub raw: Tensor<&'a [H::Byte]>,
-    /// shape: [h x w]
+    /// shape: [n, h x w]
     pub pos: Tensor<&'a [H::Byte]>,
 }
diff --git a/models/clip/common/src/compute.rs b/models/clip/common/src/compute.rs
@@ -1,5 +1,6 @@
 use super::{args::Args, ClipMeta};
 use operators::{
+    add_rows::{self, AddRows},
     conv::{self, Conv},
     ByteOf, Hardware, LaunchError, Operator, QueueAlloc, QueueOf, TopoNode,
 };
@@ -13,6 +14,7 @@ pub trait Operators {
     type Hardware: Hardware;
     type TopoNode: TopoNode<Self::Hardware>;
     type Conv: Conv<Self::Hardware>;
+    type AddRows: AddRows<Self::Hardware>;
 }
 
 pub trait WeightLoader {
@@ -22,12 +24,14 @@ pub trait WeightLoader {
         Self: 's;
 
     fn patch_embd<'a>(&'a self, queue: &'a QueueOf<Self::Hardware>) -> [Self::Weight<'a>; 2];
+    fn pos_embd<'a>(&'a self, queue: &'a QueueOf<Self::Hardware>) -> Self::Weight<'a>;
 }
 
 pub struct ClipWorker<Ops: Operators, W> {
     meta: ClipMeta,
     weights: WeightDecorator<W>,
     conv: Ops::Conv,
+    add_rows: Ops::AddRows,
     pub debug: bool,
 }
 
@@ -38,6 +42,7 @@ impl<Ops: Operators, W> ClipWorker<Ops, W> {
             weights: meta.decorator(weights),
             meta,
             conv: Ops::Conv::new(processor),
+            add_rows: Ops::AddRows::new(processor),
             debug: true,
         }
     }
@@ -64,7 +69,7 @@ where
         QA: QueueAlloc<Hardware = Ops::Hardware>,
     {
         let time = Instant::now();
-        let Args { raw, .. } = args;
+        let Args { raw, pos } = args;
         let queue = queue_alloc.queue();
 
         let ClipMeta { dt_embd, .. } = self.meta;
@@ -80,7 +85,10 @@ where
         let mut embd = Tensor::new(dt_embd, &[n, m, h / hk, w / wk]).map(|s| queue_alloc.alloc(s));
         self.conv(&mut embd, &raw, &k, &b, workspace, queue_alloc)?;
 
-        let _embd = embd.merge(2..4).unwrap().transpose(&[2, 1]);
+        let mut embd = embd.merge(2..4).unwrap().transpose(&[2, 1]);
+
+        let pos_embd = self.weights.pos_embd(queue);
+        self.add_rows(&mut embd, &pos_embd, &pos, workspace, queue_alloc)?;
 
         if self.debug {
             println!("encode {n} x {h} x {w} image in {:?}", time.elapsed());
@@ -130,19 +138,49 @@ where
             queue_alloc,
         )
     }
+
+    fn add_rows<Dst, Src, Idx, QA>(
+        &self,
+        dst: &mut Tensor<Dst>,
+        src: &Tensor<Src>,
+        idx: &Tensor<Idx>,
+        workspace: &mut [ByteOf<Ops::Hardware>],
+        queue_alloc: &QA,
+    ) -> Result<(), LaunchError>
+    where
+        Dst: DerefMut<Target = [ByteOf<Ops::Hardware>]>,
+        Src: Deref<Target = [ByteOf<Ops::Hardware>]>,
+        Idx: Deref<Target = [ByteOf<Ops::Hardware>]>,
+        QA: QueueAlloc<Hardware = Ops::Hardware>,
+    {
+        self.add_rows.launch(
+            &add_rows::Args {
+                dst_layout: dst.layout(),
+                dst_base: dst.base_mut(),
+                src_layout: src.layout(),
+                src_base: src.base(),
+                idx_layout: idx.layout(),
+                idx_base: idx.base(),
+            },
+            workspace,
+            queue_alloc,
+        )
+    }
 }
 
 struct WeightDecorator<W> {
     weights: W,
     patch_embd_w: Tensor<usize>,
     patch_embd_b: Tensor<usize>,
+    pos_embd: Tensor<usize>,
 }
 
 impl ClipMeta {
     fn decorator<W>(&self, weights: W) -> WeightDecorator<W> {
         WeightDecorator {
             patch_embd_w: self.patch_embd_w(),
             patch_embd_b: self.patch_embd_b(),
+            pos_embd: self.pos_embd(),
             weights,
         }
     }
@@ -157,4 +195,10 @@ impl<W: WeightLoader> WeightDecorator<W> {
             self.patch_embd_b.clone().map(|_| b),
         ]
     }
+
+    #[inline]
+    pub fn pos_embd<'a>(&'a self, queue: &'a QueueOf<W::Hardware>) -> Tensor<W::Weight<'a>> {
+        let pos_embd = self.weights.pos_embd(queue);
+        self.pos_embd.clone().map(|_| pos_embd)
+    }
 }
diff --git a/models/clip/common/src/lib.rs b/models/clip/common/src/lib.rs
@@ -38,6 +38,8 @@ pub struct ClipMeta {
     pub epsilon: f32,
 }
 
+pub const D_POS_EMBD: usize = 70;
+
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
 #[repr(u8)]
 pub enum ProjectorType {
@@ -86,4 +88,9 @@ impl ClipMeta {
         let &Self { d, .. } = self;
         Tensor::new(self.dt_bias, &[d])
     }
+
+    pub fn pos_embd(&self) -> Tensor<usize> {
+        let &Self { d, .. } = self;
+        Tensor::new(self.dt_embd, &[D_POS_EMBD.pow(2), d])
+    }
 }
diff --git a/models/clip/common/src/storage.rs b/models/clip/common/src/storage.rs
@@ -6,11 +6,12 @@ pub struct Storage<T> {
     pub meta: ClipMeta,
     pub patch_embd_w: T,
     pub patch_embd_b: T,
+    pub pos_embd: T,
 }
 
 impl<'a> Storage<&'a [u8]> {
     pub fn from_gguf(gguf: &GGufModel<'a>) -> Self {
-        let position_embd = &gguf.tensors["v.position_embd.weight"];
+        let pos_embd = &gguf.tensors["v.position_embd.weight"];
         let patch_embd_w = &gguf.tensors["v.patch_embd.weight"];
         let patch_embd_b = &gguf.tensors["v.patch_embd.bias"];
 
@@ -27,7 +28,7 @@ impl<'a> Storage<&'a [u8]> {
             projector,
             minicpmv_version: gguf.get_usize("clip.minicpmv_version").unwrap() as _,
 
-            dt_embd: position_embd.ty,
+            dt_embd: pos_embd.ty,
             dt_mat :  patch_embd_w.ty,
             dt_bias:  patch_embd_b.ty,
 
@@ -47,6 +48,7 @@ impl<'a> Storage<&'a [u8]> {
             meta,
             patch_embd_w: patch_embd_w.data,
             patch_embd_b: patch_embd_b.data,
+            pos_embd: pos_embd.data,
         }
     }
 }
diff --git a/tensor/src/lib.rs b/tensor/src/lib.rs
@@ -292,6 +292,14 @@ impl<T> Tensor<T> {
         }
     }
 
+    #[inline]
+    pub fn broadcast(self, axis: usize, times: usize) -> Self {
+        Self {
+            layout: self.layout.broadcast(axis, times),
+            ..self
+        }
+    }
+
     #[inline]
     pub fn merge(self, range: Range<usize>) -> Option<Self> {
         self.layout

Original file line number	Diff line number	Diff line change
`@@ -1,12 +1,13 @@`
`1`	`1`	`use clip::{ClipStorage, WeightLoader};`
`2`		`-use operators::{common_cpu::Cpu, conv, QueueOf, TopoNode};`
	`2`	`+use operators::{add_rows, common_cpu::Cpu, conv, QueueOf, TopoNode};`
`3`	`3`	`use std::marker::PhantomData;`
`4`	`4`
`5`	`5`	`pub struct Operators<N = Cpu>(PhantomData<N>);`
`6`	`6`
`7`	`7`	`pub struct Weights<'w> {`
`8`	`8`	`patch_embd_w: &'w [u8],`
`9`	`9`	`patch_embd_b: &'w [u8],`
	`10`	`+ pos_embd: &'w [u8],`
`10`	`11`	`}`
`11`	`12`
`12`	`13`	`impl<N> clip::Operators for Operators<N>`
`@@ -16,13 +17,15 @@ where`
`16`	`17`	`type Hardware = Cpu;`
`17`	`18`	`type TopoNode = Cpu;`
`18`	`19`	`type Conv = conv::common_cpu::ConvIm2Col;`
	`20`	`+ type AddRows = add_rows::common_cpu::Operator;`
`19`	`21`	`}`
`20`	`22`
`21`	`23`	`impl<'w> Weights<'w> {`
`22`	`24`	`pub fn new(model: &'w ClipStorage<&'w [u8]>) -> Self {`
`23`	`25`	`Self {`
`24`	`26`	`patch_embd_w: model.patch_embd_w,`
`25`	`27`	`patch_embd_b: model.patch_embd_b,`
	`28`	`+ pos_embd: model.pos_embd,`
`26`	`29`	`}`
`27`	`30`	`}`
`28`	`31`	`}`
`@@ -35,6 +38,11 @@ impl WeightLoader for Weights<'_> {`
`35`	`38`	`fn patch_embd<'a>(&'a self, _queue: &'a QueueOf<Self::Hardware>) -> [Self::Weight<'a>; 2] {`
`36`	`39`	`[self.patch_embd_w, self.patch_embd_b]`
`37`	`40`	`}`
	`41`	`+`
	`42`	`+ #[inline]`
	`43`	`+ fn pos_embd<'a>(&'a self, _queue: &'a QueueOf<Self::Hardware>) -> Self::Weight<'a> {`
	`44`	`+ self.pos_embd`
	`45`	`+ }`
`38`	`46`	`}`
`39`	`47`
`40`	`48`	`#[cfg(test)]`
Original file line number	Diff line number	Diff line change
`@@ -4,6 +4,6 @@ use tensor::Tensor;`
`4`	`4`	`pub struct Args<'a, H: Hardware> {`
`5`	`5`	`/// shape: [n, c, h, w]`
`6`	`6`	`pub raw: Tensor<&'a [H::Byte]>,`
`7`		`- /// shape: [h x w]`
	`7`	`+ /// shape: [n, h x w]`
`8`	`8`	`pub pos: Tensor<&'a [H::Byte]>,`
`9`	`9`	`}`
Original file line number	Diff line number	Diff line change
`@@ -38,6 +38,8 @@ pub struct ClipMeta {`
`38`	`38`	`pub epsilon: f32,`
`39`	`39`	`}`
`40`	`40`
	`41`	`+pub const D_POS_EMBD: usize = 70;`
	`42`	`+`
`41`	`43`	`#[derive(Clone, Copy, PartialEq, Eq, Debug)]`
`42`	`44`	`#[repr(u8)]`
`43`	`45`	`pub enum ProjectorType {`
`@@ -86,4 +88,9 @@ impl ClipMeta {`
`86`	`88`	`let &Self { d, .. } = self;`
`87`	`89`	`Tensor::new(self.dt_bias, &[d])`
`88`	`90`	`}`
	`91`	`+`
	`92`	`+ pub fn pos_embd(&self) -> Tensor<usize> {`
	`93`	`+ let &Self { d, .. } = self;`
	`94`	`+ Tensor::new(self.dt_embd, &[D_POS_EMBD.pow(2), d])`
	`95`	`+ }`
`89`	`96`	`}`
Original file line number	Diff line number	Diff line change
`@@ -292,6 +292,14 @@ impl<T> Tensor<T> {`
`292`	`292`	`}`
`293`	`293`	`}`
`294`	`294`
	`295`	`+ #[inline]`
	`296`	`+ pub fn broadcast(self, axis: usize, times: usize) -> Self {`
	`297`	`+ Self {`
	`298`	`+ layout: self.layout.broadcast(axis, times),`
	`299`	`+ ..self`
	`300`	`+ }`
	`301`	`+ }`
	`302`	`+`
`295`	`303`	`#[inline]`
`296`	`304`	`pub fn merge(self, range: Range<usize>) -> Option<Self> {`
`297`	`305`	`self.layout`