@@ -2,25 +2,28 @@
 use gguf::ggml_quants::digit_layout::{types as ty, DigitLayout};
 use itertools::izip;
 use operators::{
+    all_reduce::{AllReduce, ReduceOp},
     attention_kv_cached::AttnKVCached,
     mat_mul::MatMul,
     mlp::Mlp,
     rearrange::Rearrange,
     rms_norm::RmsNorm,
     rope::{Rope, Seq},
-    ByteOf, Hardware, LaunchError, Operator, QueueAlloc, QueueOf, Workspace,
+    ByteOf, Hardware, LaunchError, Operator, QueueAlloc, QueueOf, TopoNode, Workspace,
 };
 use std::ops::{Deref, DerefMut};
 use tensor::{dt_size, split, Tensor};
 
 pub trait Operators {
     type Hardware: Hardware;
+    type TopoNode: TopoNode<Self::Hardware>;
     type RmsNorm: RmsNorm<Self::Hardware>;
     type MatMul: MatMul<Self::Hardware>;
     type Rope: Rope<Self::Hardware>;
     type AttnKVCached: AttnKVCached<Self::Hardware>;
     type Mlp: Mlp<Self::Hardware>;
     type Rearrange: Rearrange<Self::Hardware>;
+    type AllReduce: AllReduce<Self::Hardware, Self::TopoNode>;
 
     fn debug<T>(tensor: &Tensor<T>)
     where
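This first hunk threads a communication topology through the `Operators` trait: alongside the per-device operator types, an implementation now also names the `TopoNode` it runs on and an `AllReduce` collective bound to both the hardware and that node. A toy reduction of the shape (the trait names mirror the real ones, but these bodies are illustrative stand-ins, not the `operators` crate's actual definitions):

```rust
// Illustrative only: minimal stand-ins for the real `operators` traits,
// showing why `AllReduce` is parameterized over both the hardware and
// the topology node.
trait Hardware {}

trait TopoNode<H: Hardware> {
    // A node in the communication topology exposes the local processor
    // it runs on (see `node.processor()` in the next hunk).
    fn processor(&self) -> &H;
}

trait AllReduce<H: Hardware, N: TopoNode<H>> {
    // A collective is constructed from the whole node, not just the
    // processor: it needs to know its peers, not only its device.
    fn new(node: &N) -> Self;
}
```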
@@ -73,19 +76,21 @@ pub struct LlamaWorker<Ops: Operators, W> {
     attn_kv_cached: Ops::AttnKVCached,
     mlp: Ops::Mlp,
     rearrange: Ops::Rearrange,
+    all_reduce: Ops::AllReduce,
 }
 
 impl<Ops: Operators, W> LlamaWorker<Ops, W> {
-    pub fn new(processor: &Ops::Hardware, meta: LlamaMeta, weights: W) -> Self {
+    pub fn new(node: &Ops::TopoNode, meta: LlamaMeta, weights: W) -> Self {
         Self {
             weights: meta.decorator(weights),
             meta,
-            rms_norm: Ops::RmsNorm::new(processor),
-            mat_mul: Ops::MatMul::new(processor),
-            rope: Ops::Rope::new(processor),
-            attn_kv_cached: Ops::AttnKVCached::new(processor),
-            mlp: Ops::Mlp::new(processor),
-            rearrange: Ops::Rearrange::new(processor),
+            rms_norm: Ops::RmsNorm::new(node.processor()),
+            mat_mul: Ops::MatMul::new(node.processor()),
+            rope: Ops::Rope::new(node.processor()),
+            attn_kv_cached: Ops::AttnKVCached::new(node.processor()),
+            mlp: Ops::Mlp::new(node.processor()),
+            rearrange: Ops::Rearrange::new(node.processor()),
+            all_reduce: Ops::AllReduce::new(node),
         }
     }
 
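The constructor now takes the topology node: each single-device operator is still built from the local processor, reached through `node.processor()`, while the collective is built from the node itself. Continuing the stand-in traits from the previous sketch, a hypothetical single-process backend where the collective degenerates to a no-op (all names here are invented for illustration):

```rust
// Hypothetical single-process backend, continuing the stand-in traits
// from the previous sketch.
struct Cpu;
impl Hardware for Cpu {}

struct SingleNode {
    cpu: Cpu,
}
impl TopoNode<Cpu> for SingleNode {
    fn processor(&self) -> &Cpu {
        &self.cpu
    }
}

// With a group of size one there are no peers holding partial sums,
// so the collective can be a unit struct that does nothing.
struct NoAllReduce;
impl AllReduce<Cpu, SingleNode> for NoAllReduce {
    fn new(_node: &SingleNode) -> Self {
        NoAllReduce
    }
}
```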
@@ -240,7 +245,7 @@ where
             self.mat_mul(&mut x, 1., &x1, &w, 1., workspace, queue_alloc)?;
 
             if distribute > 1 {
-                todo!("all reduce")
+                self.all_reduce(&mut x, workspace, queue_alloc)?;
             }
 
             let w = self.weights.ffn_norm(iblk, queue);
@@ -250,7 +255,7 @@ where
             self.mlp(&mut x, &x1, iblk, mlp_alpha, true, workspace, queue_alloc)?;
 
             if distribute > 1 {
-                todo!("all reduce")
+                self.all_reduce(&mut x, workspace, queue_alloc)?;
             }
         }
 
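Both `todo!("all reduce")` placeholders become real launches. The call sites are the two points in each block where sharded weights leave every rank with only a partial result of `x`: after the attention output projection and after the MLP, guarded by `distribute > 1`. A sum reduction merges the partials so all ranks see the full activation. In plain sequential Rust, the arithmetic the collective performs looks like the stand-in below (for exposition only; the real operator communicates between devices instead of looping over host buffers):

```rust
/// What a sum all-reduce computes across a group: every rank starts with
/// a partial result and ends with the elementwise sum over all ranks.
/// Sequential stand-in for exposition only.
fn all_reduce_sum(ranks: &mut [Vec<f32>]) {
    let len = ranks[0].len();
    let mut total = vec![0.0_f32; len];
    for part in ranks.iter() {
        for (t, p) in total.iter_mut().zip(part) {
            *t += *p;
        }
    }
    // Broadcast the reduced result back so every rank holds the full sum.
    for part in ranks.iter_mut() {
        part.copy_from_slice(&total);
    }
}
```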
@@ -483,6 +488,29 @@ where
             queue_alloc,
         )
     }
+
+    fn all_reduce<X, QA>(
+        &self,
+        x: &mut Tensor<X>,
+        workspace: &mut [ByteOf<Ops::Hardware>],
+        queue_alloc: &QA,
+    ) -> Result<(), LaunchError>
+    where
+        X: DerefMut<Target = [ByteOf<Ops::Hardware>]>,
+        QA: QueueAlloc<Hardware = Ops::Hardware>,
+    {
+        self.all_reduce.launch(
+            &operators::all_reduce::Args {
+                dst_layout: x.layout(),
+                dst_base: x.base_mut(),
+                src_layout: x.layout(),
+                src_base: x.base(),
+                op: ReduceOp::Sum,
+            },
+            workspace,
+            queue_alloc,
+        )
+    }
 }
 
 struct WeightDecorator<W> {
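The new private `all_reduce` helper wraps the operator's `launch` in the worker's usual workspace/queue plumbing. Note the aliasing convention in `Args`: both the `dst` and `src` layout/base pairs point at the same tensor `x`, so the reduction is in place, which is why the call sites above can pass `&mut x` and keep using it immediately afterwards. The same arithmetic as before, restated in that in-place flavor (again an illustrative stand-in, not the `operators` crate API):

```rust
// In-place flavor matching the helper's dst == src convention: each rank
// reduces its peers' partials directly into its own buffer.
fn all_reduce_sum_inplace(mine: &mut [f32], peers: &[&[f32]]) {
    for peer in peers {
        for (m, p) in mine.iter_mut().zip(*peer) {
            *m += *p;
        }
    }
}
```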