Skip to content

Commit c7ef588

Browse files
committed
todo: 创建 llama-nv (create the llama-nv crate)
Signed-off-by: YdrMaster <ydrml@hotmail.com>
1 parent cd61335 commit c7ef588

File tree

5 files changed

+111
-17
lines changed

5 files changed

+111
-17
lines changed

Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ members = [
44
"tensor",
55
"models/llama/common",
66
"models/llama/common-cpu",
7+
"models/llama/nvidia-gpu",
78
"test-utils",
89
]
910
resolver = "2"
@@ -14,6 +15,6 @@ tensor.path = "tensor"
1415
causal-lm.path = "causal-lm"
1516
test-utils.path = "test-utils"
1617

17-
ggus = { git = "https://github.com/YdrMaster/gguf", rev = "e64d758" }
18+
ggus = { git = "https://github.com/YdrMaster/gguf", rev = "c676bcc" }
1819
ndarray-layout = { git = "https://github.com/YdrMaster/ndarray-layout", rev = "5c6b969" }
1920
operators = { git = "https://github.com/YdrMaster/operators-rs", rev = "64419f0", default-features = false }

models/llama/common-cpu/src/lib.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -142,15 +142,15 @@ struct Weights {
142142

143143
impl WeightLoader for Weights {
144144
type Hardware = Cpu;
145-
type Memory = &'static [u8];
145+
type Memory<'s> = &'s [u8];
146146

147147
#[inline]
148148
fn load_blk(
149149
&self,
150150
which: BlkWeight,
151151
iblk: usize,
152152
_queue: &QueueOf<Self::Hardware>,
153-
) -> Self::Memory {
153+
) -> Self::Memory<'_> {
154154
let blk = &self.blks[iblk];
155155
match which {
156156
BlkWeight::AttnNorm => blk.attn_norm,
@@ -163,12 +163,12 @@ impl WeightLoader for Weights {
163163
}
164164

165165
#[inline]
166-
fn output_norm(&self, _queue: &QueueOf<Self::Hardware>) -> Self::Memory {
166+
fn output_norm(&self, _queue: &QueueOf<Self::Hardware>) -> Self::Memory<'_> {
167167
self.output_norm
168168
}
169169

170170
#[inline]
171-
fn output(&self, _queue: &QueueOf<Self::Hardware>) -> Self::Memory {
171+
fn output(&self, _queue: &QueueOf<Self::Hardware>) -> Self::Memory<'_> {
172172
self.output
173173
}
174174
}

models/llama/common/src/compute.rs

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -54,17 +54,19 @@ pub enum BlkWeight {
5454

5555
pub trait WeightLoader {
5656
type Hardware: Hardware;
57-
type Memory: Deref<Target = [ByteOf<Self::Hardware>]>;
57+
type Memory<'s>: Deref<Target = [ByteOf<Self::Hardware>]> + 's
58+
where
59+
Self: 's;
5860

5961
fn load_blk(
6062
&self,
6163
which: BlkWeight,
6264
iblk: usize,
6365
queue: &QueueOf<Self::Hardware>,
64-
) -> Self::Memory;
66+
) -> Self::Memory<'_>;
6567

66-
fn output_norm(&self, queue: &QueueOf<Self::Hardware>) -> Self::Memory;
67-
fn output(&self, queue: &QueueOf<Self::Hardware>) -> Self::Memory;
68+
fn output_norm(&self, queue: &QueueOf<Self::Hardware>) -> Self::Memory<'_>;
69+
fn output(&self, queue: &QueueOf<Self::Hardware>) -> Self::Memory<'_>;
6870
}
6971

7072
pub struct LlamaWorker<Ops: Operators, W> {
@@ -544,60 +546,60 @@ impl LlamaMeta {
544546

545547
impl<W: WeightLoader> WeightDecorator<W> {
546548
#[inline]
547-
pub fn attn_norm(&self, iblk: usize, queue: &QueueOf<W::Hardware>) -> Tensor<W::Memory> {
549+
pub fn attn_norm(&self, iblk: usize, queue: &QueueOf<W::Hardware>) -> Tensor<W::Memory<'_>> {
548550
combine(
549551
&self.attn_norm,
550552
self.weights.load_blk(BlkWeight::AttnNorm, iblk, queue),
551553
)
552554
}
553555

554556
#[inline]
555-
pub fn attn_qkv(&self, iblk: usize, queue: &QueueOf<W::Hardware>) -> Tensor<W::Memory> {
557+
pub fn attn_qkv(&self, iblk: usize, queue: &QueueOf<W::Hardware>) -> Tensor<W::Memory<'_>> {
556558
combine(
557559
&self.attn_qkv,
558560
self.weights.load_blk(BlkWeight::AttnQKV, iblk, queue),
559561
)
560562
}
561563

562564
#[inline]
563-
pub fn attn_o(&self, iblk: usize, queue: &QueueOf<W::Hardware>) -> Tensor<W::Memory> {
565+
pub fn attn_o(&self, iblk: usize, queue: &QueueOf<W::Hardware>) -> Tensor<W::Memory<'_>> {
564566
combine(
565567
&self.attn_o,
566568
self.weights.load_blk(BlkWeight::AttnO, iblk, queue),
567569
)
568570
}
569571

570572
#[inline]
571-
pub fn ffn_norm(&self, iblk: usize, queue: &QueueOf<W::Hardware>) -> Tensor<W::Memory> {
573+
pub fn ffn_norm(&self, iblk: usize, queue: &QueueOf<W::Hardware>) -> Tensor<W::Memory<'_>> {
572574
combine(
573575
&self.ffn_norm,
574576
self.weights.load_blk(BlkWeight::FfnNorm, iblk, queue),
575577
)
576578
}
577579

578580
#[inline]
579-
pub fn ffn_gate_up(&self, iblk: usize, queue: &QueueOf<W::Hardware>) -> Tensor<W::Memory> {
581+
pub fn ffn_gate_up(&self, iblk: usize, queue: &QueueOf<W::Hardware>) -> Tensor<W::Memory<'_>> {
580582
combine(
581583
&self.ffn_gate_up,
582584
self.weights.load_blk(BlkWeight::FfnGateUp, iblk, queue),
583585
)
584586
}
585587

586588
#[inline]
587-
pub fn ffn_down(&self, iblk: usize, queue: &QueueOf<W::Hardware>) -> Tensor<W::Memory> {
589+
pub fn ffn_down(&self, iblk: usize, queue: &QueueOf<W::Hardware>) -> Tensor<W::Memory<'_>> {
588590
combine(
589591
&self.ffn_down,
590592
self.weights.load_blk(BlkWeight::FfnDown, iblk, queue),
591593
)
592594
}
593595

594596
#[inline]
595-
pub fn output_norm(&self, queue: &QueueOf<W::Hardware>) -> Tensor<W::Memory> {
597+
pub fn output_norm(&self, queue: &QueueOf<W::Hardware>) -> Tensor<W::Memory<'_>> {
596598
combine(&self.output_norm, self.weights.output_norm(queue))
597599
}
598600

599601
#[inline]
600-
pub fn output(&self, queue: &QueueOf<W::Hardware>) -> Tensor<W::Memory> {
602+
pub fn output(&self, queue: &QueueOf<W::Hardware>) -> Tensor<W::Memory<'_>> {
601603
combine(&self.output, self.weights.output(queue))
602604
}
603605
}

models/llama/nvidia-gpu/Cargo.toml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Manifest for `llama-nv`, the NVIDIA-GPU backend crate of the llama model
# (workspace member at models/llama/nvidia-gpu).
[package]
name = "llama-nv"
version = "0.0.0"
edition = "2021"
authors = ["YdrMaster <ydrml@hotmail.com>"]

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
# Hardware-agnostic llama implementation shared by all backends.
llama.path = "../common"
# Operator kernels from the workspace, with the CUDA backend enabled.
operators = { workspace = true, features = ["nvidia-gpu"] }

[dev-dependencies]
test-utils.workspace = true
gguf.workspace = true

models/llama/nvidia-gpu/src/lib.rs

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
use llama::{ext::Mmap, LlamaStorage, Tensor, WeightLoader};
2+
use operators::{
3+
all_reduce::NonAllReduce,
4+
cuda::{memcpy_d2h, DevByte},
5+
nvidia_gpu::Gpu,
6+
ByteOf,
7+
};
8+
use std::ops::Deref;
9+
10+
pub struct Llama {}
11+
12+
impl Llama {
13+
pub fn new(_storage: Box<[Mmap]>, _model: LlamaStorage<&'static [u8]>) -> Self {
14+
Self {}
15+
}
16+
17+
pub fn infer(&mut self, input: &[u32], cache: &mut [u8], pos: usize) -> u32 {
18+
todo!()
19+
}
20+
}
21+
22+
/// Marker type binding the generic llama worker to NVIDIA-GPU operator
/// implementations.
struct Operators;

/// Expands an operator module name to its `nvidia_gpu` implementation,
/// e.g. `op!(rms_norm)` -> `operators::rms_norm::nvidia_gpu::Operator`.
macro_rules! op {
    ($name:ident) => {
        operators::$name::nvidia_gpu::Operator
    };
}

impl llama::Operators for Operators {
    type Hardware = Gpu;
    type TopoNode = Gpu;
    type RmsNorm = op!(rms_norm);
    type MatMul = op!(mat_mul);
    type Rope = op!(rope);
    type AttnKVCached = op!(attention_kv_cached);
    type Mlp = op!(mlp);
    type Rearrange = op!(rearrange);
    // No-op all-reduce: single-device execution, nothing to reduce across.
    type AllReduce = NonAllReduce<Gpu>;

    /// Prints a tensor for debugging by first copying its device memory
    /// back into a host buffer (device pointers cannot be read directly).
    fn debug<T>(tensor: &Tensor<T>)
    where
        T: Deref<Target = [ByteOf<Self::Hardware>]>,
    {
        let tensor = tensor.as_ref().map(|mem| {
            let mut buf = vec![0u8; mem.len()];
            memcpy_d2h(&mut buf, mem);
            buf
        });
        println!("{tensor}");
    }
}
53+
54+
struct Weights {}
55+
56+
impl WeightLoader for Weights {
57+
type Hardware = Gpu;
58+
type Memory<'s> = &'s [DevByte];
59+
60+
fn load_blk(
61+
&self,
62+
which: llama::BlkWeight,
63+
iblk: usize,
64+
queue: &operators::QueueOf<Self::Hardware>,
65+
) -> Self::Memory<'_> {
66+
todo!()
67+
}
68+
69+
fn output_norm(&self, queue: &operators::QueueOf<Self::Hardware>) -> Self::Memory<'_> {
70+
todo!()
71+
}
72+
73+
fn output(&self, queue: &operators::QueueOf<Self::Hardware>) -> Self::Memory<'_> {
74+
todo!()
75+
}
76+
}

0 commit comments

Comments
 (0)