
Commit 06c78ad

YdrMaster committed:

feat: export model loading progress, add a progress bar

Signed-off-by: YdrMaster <ydrml@hotmail.com>
1 parent 357da40 · commit 06c78ad

File tree

15 files changed: +419 −211 lines


Cargo.lock

Lines changed: 56 additions & 0 deletions
Generated file; diff not rendered by default.

llama.cu/src/exec/engine.rs

Lines changed: 54 additions & 14 deletions
@@ -23,7 +23,8 @@ use std::{
     num::NonZeroUsize,
     ops::Deref,
     sync::{
-        Arc, Barrier, Mutex, RwLock,
+        Arc, Barrier, Mutex, OnceLock, RwLock,
+        atomic::AtomicUsize,
         mpsc::{Receiver, Sender},
     },
 };
@@ -76,23 +77,41 @@ const NTOKS: [usize; 7] = [1, 8, 32, 64, 128, 256, 1024];
 const CHUNKED_PREFILL_LEN: Option<usize> = Some(256);
 const MAX_TOKS: usize = 1024;
 
+#[derive(Default)]
+pub struct Progress {
+    pub(crate) weight_size: OnceLock<usize>,
+    pub(crate) weight_loaded: AtomicUsize,
+}
+
 pub(crate) fn engine(
     llama: LLaMA<Tensor<&[u8], 2>>,
-    gpus: &[c_int],
+    workers: &[(c_int, Option<Arc<Progress>>)],
     commands: Receiver<Command>,
     outputs: Sender<Output>,
     use_cuda_graph: bool,
 ) {
-    if let &[dev] = gpus {
-        return mono(llama, Device::new(dev), commands, outputs, use_cuda_graph);
+    if let &[(gpu, progress)] = &workers {
+        return mono(
+            llama,
+            Device::new(*gpu),
+            progress.clone(),
+            commands,
+            outputs,
+            use_cuda_graph,
+        );
     }
 
     #[cfg(not(nccl))]
     unreachable!();
 
     #[cfg(nccl)]
     {
-        let mut comms = CommunicatorGroup::new(gpus).into_vec().into_iter();
+        use std::collections::HashMap;
+
+        let devlist = workers.iter().map(|(gpu, _)| *gpu).collect::<Vec<_>>();
+        let mut workers = workers.iter().cloned().collect::<HashMap<_, _>>();
+
+        let mut comms = CommunicatorGroup::new(&devlist).into_vec().into_iter();
         let first = comms.next().unwrap();
 
         let mut llama = llama;
@@ -102,26 +121,28 @@ pub(crate) fn engine(
             dist: Distribution {
                 start: 0,
                 len: 1,
-                total: gpus.len(),
+                total: devlist.len(),
             },
+            progress: workers.remove(&first.device().index()).unwrap(),
             config: ModelGroupConfig {
                 static_model_keys: NTOKS,
                 dyn_cache_size: 1,
                 use_cuda_graph,
             },
             max_toks: MAX_TOKS,
-            barrier: Some(Arc::new(Barrier::new(gpus.len()))),
+            barrier: Some(Arc::new(Barrier::new(devlist.len()))),
             task_box: Default::default(),
             chunked_prefill_len: CHUNKED_PREFILL_LEN,
         };
-
         std::thread::scope(|s| {
             let _threads = comms
                 .map(|comm| {
-                    let dist = Distribution::new(comm.rank(), 1, gpus.len());
+                    let dev = comm.device();
+                    let dist = Distribution::new(comm.rank(), 1, devlist.len());
                     let worker = Worker {
-                        dev: comm.device(),
+                        dev,
                         dist,
+                        progress: workers.remove(&dev.index()).unwrap(),
                         ..worker.clone()
                     };
                     let llama = llama.clone();
@@ -139,6 +160,7 @@ pub(crate) fn engine(
 fn mono(
     mut llama: LLaMA<Tensor<&[u8], 2>>,
     dev: Device,
+    progress: Option<Arc<Progress>>,
     commands: Receiver<Command>,
     outputs: Sender<Output>,
     use_cuda_graph: bool,
@@ -151,6 +173,7 @@ fn mono(
             len: 1,
             total: 1,
         },
+        progress,
         config: ModelGroupConfig {
             static_model_keys: NTOKS,
             dyn_cache_size: 1,
@@ -170,6 +193,7 @@ fn mono(
 struct Worker<T> {
     dev: Device,
     dist: Distribution,
+    progress: Option<Arc<Progress>>,
     config: ModelGroupConfig<T>,
     max_toks: usize,
     barrier: Option<Arc<Barrier>>,
@@ -197,6 +221,7 @@ impl<T: IntoIterator<Item = usize>> Worker<T> {
         let Self {
             dev,
             dist,
+            progress,
             config,
             max_toks,
             barrier,
@@ -210,8 +235,15 @@ impl<T: IntoIterator<Item = usize>> Worker<T> {
         gpu.apply(|ctx| {
             let mut manager = EngineManager::new(chunked_prefill_len, max_toks);
             let mut handle = handle(ctx);
-            let mut models =
-                ModelGroup::new(llama, dist, config, attn, &mut handle, barrier.as_deref());
+            let mut models = ModelGroup::new(
+                llama,
+                dist,
+                progress,
+                config,
+                attn,
+                &mut handle,
+                barrier.as_deref(),
+            );
 
             let mut output_head = OutputHead::new(output_head, ctx);
 
@@ -332,6 +364,7 @@ impl<T: IntoIterator<Item = usize>> Worker<T> {
         let Self {
             dev,
             dist,
+            progress,
             config,
             max_toks: _max_toks,
             barrier,
@@ -345,8 +378,15 @@ impl<T: IntoIterator<Item = usize>> Worker<T> {
         let barrier = barrier.unwrap();
         gpu.apply(|ctx| {
             let mut handle = Handle::with_comm(ctx, comm);
-            let mut models =
-                ModelGroup::new(llama, dist, config, attn, &mut handle, Some(&barrier));
+            let mut models = ModelGroup::new(
+                llama,
+                dist,
+                progress,
+                config,
+                attn,
+                &mut handle,
+                Some(&barrier),
+            );
 
             let stream = ctx.stream();
             loop {
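
The new Progress type is deliberately minimal: weight_size is a OnceLock<usize> that the loader publishes once the total byte count is known, and weight_loaded is an AtomicUsize that grows monotonically as weights land on the device. Below is a sketch of how crate-internal code could poll a set of trackers to drive the progress bar the commit message mentions; the watch function, its polling cadence, and the commented-out workers construction are illustrative assumptions, not code from this commit (both fields are pub(crate), so this only works inside the crate).

use std::sync::{Arc, atomic::Ordering};
use std::time::Duration;

// Pairing devices with trackers for `engine` would look roughly like:
//     let workers: Vec<(c_int, Option<Arc<Progress>>)> = gpus
//         .iter()
//         .map(|&g| (g, Some(Arc::new(Progress::default()))))
//         .collect();

// Hypothetical crate-internal poller, one `Progress` per worker GPU.
fn watch(trackers: Vec<Arc<Progress>>) {
    std::thread::spawn(move || loop {
        let mut done = true;
        for (i, p) in trackers.iter().enumerate() {
            // Until the loader publishes the total, there is nothing to show.
            let Some(&total) = p.weight_size.get() else {
                done = false;
                continue;
            };
            let loaded = p.weight_loaded.load(Ordering::Relaxed);
            eprint!("gpu[{i}] {:>3}%  ", loaded * 100 / total.max(1));
            done &= loaded >= total;
        }
        eprintln!();
        if done {
            break;
        }
        std::thread::sleep(Duration::from_millis(100));
    });
}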

llama.cu/src/exec/group.rs

Lines changed: 5 additions & 15 deletions
@@ -1,6 +1,5 @@
-use super::{KVCache, model::ModelExec};
+use super::{KVCache, Progress, model::ModelExec};
 use crate::{exec::upos, handle::Handle, load::load_weight, memory::MemPages};
-use log::debug;
 use nn::{
     Distribution, Graph, GraphBuilder, LLaMA, NNGraph, Tensor, TensorMeta, digit_layout::types, op,
 };
@@ -12,7 +11,6 @@ use std::{
     collections::BTreeMap,
     num::{NonZero, NonZeroUsize},
     sync::{Arc, Barrier, Mutex},
-    time::Instant,
 };
 use tokeneer::utok;
 
@@ -41,6 +39,7 @@ impl<'ctx> ModelGroup<'ctx> {
     pub fn new<T: IntoIterator<Item = usize>>(
         llama: LLaMA<Tensor<&[u8], 2>>,
         dist: Distribution,
+        progress: Option<Arc<Progress>>,
 
         config: ModelGroupConfig<T>,
 
@@ -67,13 +66,11 @@ impl<'ctx> ModelGroup<'ctx> {
         // load the weights
         let dev = handle.ctx.dev();
         let mut pages = MemPages::new(dev);
-        let (_weight, edges) = load_weight(edges, handle.ctx);
+        let (_weight, edges) = load_weight(edges, progress, handle.ctx);
         // build the cuda graph
         let graph = NNGraph(Graph { topo, nodes, edges });
-        debug!("compiling model group @{}", dev.index());
         let static_models = if use_cuda_graph {
-            let time = Instant::now();
-            let models = static_model_keys
+            static_model_keys
                 .into_iter()
                 .map(|n_tok| {
                     if let Some(b) = barrier {
@@ -83,14 +80,7 @@ impl<'ctx> ModelGroup<'ctx> {
                     let exec = ModelExec::new(graph.clone(), n_tok, handle, &mut pages, true);
                     (key, exec)
                 })
-                .collect::<BTreeMap<_, _>>();
-            debug!(
-                "group ({} models) compiled @{} in {:.02?}",
-                models.len(),
-                dev.index(),
-                time.elapsed(),
-            );
-            models
+                .collect::<BTreeMap<_, _>>()
         } else {
             dyn_cache_size += static_model_keys.into_iter().count();
             Default::default()
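
This page does not include the matching change in load.rs, but the call site above pins down its contract: load_weight now receives the Option<Arc<Progress>> and is the natural place to feed both counters. A minimal sketch of that reporting pattern, assuming the loader walks a list of blobs whose sizes are known up front; load_blobs and copy_to_device are placeholders, not the commit's actual loader.

use std::sync::{Arc, atomic::Ordering};

// Illustrative reporting pattern for a loader that uploads `blobs` one by one.
fn load_blobs(blobs: &[&[u8]], progress: Option<Arc<Progress>>) {
    if let Some(p) = &progress {
        // Publish the total exactly once; `set` errors if already set,
        // which is harmless here.
        let total: usize = blobs.iter().map(|b| b.len()).sum();
        let _ = p.weight_size.set(total);
    }
    for blob in blobs {
        copy_to_device(blob); // stands in for the real device memcpy
        if let Some(p) = &progress {
            p.weight_loaded.fetch_add(blob.len(), Ordering::Relaxed);
        }
    }
}

fn copy_to_device(_blob: &[u8]) { /* elided */ }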

llama.cu/src/exec/mod.rs

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ use tokeneer::utok;
 #[allow(non_camel_case_types)]
 type upos = u32;
 
+pub use engine::Progress;
 pub(crate) use engine::engine;
 
 pub(crate) enum Command {
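
The re-export makes Progress the one public piece of this machinery: engine stays pub(crate), and the counters inside Progress are pub(crate) too, so external code can construct and hand in a tracker but would need accessor methods (not shown in this excerpt) to read it back. A hypothetical downstream construction, assuming exec is itself re-exported under a crate root named llama_cu:

use std::sync::Arc;

// Hypothetical path; the exact crate-root re-export is not shown in this diff.
use llama_cu::exec::Progress;

fn main() {
    // `Default` is derived and public, so a tracker can be built externally
    // and passed in next to its device id when the engine is started.
    let tracker = Arc::new(Progress::default());
    let _ = tracker;
}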
