Commit 6063373

feat(llama.cu): remove operators-rs, integrate flash attn

Signed-off-by: YdrMaster <[email protected]>
1 parent 2c1d870

31 files changed (+241, -196 lines)

Cargo.lock

Lines changed: 20 additions & 56 deletions (generated file; diff not rendered)

llama.cu/Cargo.toml

Lines changed: 4 additions & 3 deletions

```diff
@@ -4,9 +4,10 @@ version = "0.0.0"
 edition.workspace = true
 
 [dependencies]
-operators = { git = "https://github.com/YdrMaster/operators-rs", rev = "88c58bd", default-features = false, features = [
-    "nvidia-gpu",
-] }
+cuda = { git = "https://github.com/YdrMaster/cuda-driver", rev = "31c8090" }
+cublas = { git = "https://github.com/YdrMaster/cuda-driver", rev = "31c8090" }
+nccl = { git = "https://github.com/YdrMaster/cuda-driver", rev = "31c8090" }
+flash-attn = { git = "https://github.com/YdrMaster/learn-flash-attn", rev = "57176f5" }
 nn = { git = "https://github.com/YdrMaster/InfiniNN", rev = "171c5b0" }
 ggus = { git = "https://github.com/InfiniTensor/gguf", rev = "23c362f" }
 tokeneer = { git = "https://github.com/InfiniTensor/tokeneer", rev = "c48f39f" }
```
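
This dependency swap is the core of the commit: the operators-rs meta-crate is replaced by the individual cuda-driver crates (cuda, cublas, nccl), all pinned to the same rev, plus a standalone flash-attn crate. The effect on the source files below is largely mechanical; a minimal illustration (paths taken from the hunks in this commit):

```rust
// Before: cuda-driver types were reached through the operators-rs re-export.
// use operators::cuda::{ContextResource, CurrentCtx, Device};

// After: the same types come straight from the pinned `cuda` crate.
use cuda::{ContextResource, CurrentCtx, Device};
```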

llama.cu/src/exec/engine.rs

Lines changed: 7 additions & 23 deletions

```diff
@@ -9,12 +9,8 @@ use crate::{
     handle::Handle,
     op::{FastEmbedding, random_sample::KVPair},
 };
+use cuda::{ContextResource, CurrentCtx, Device, Event, HostMem};
 use nn::{Distribution, LLaMA, Tensor};
-use operators::{
-    Operator,
-    attention_kv_cached::cuda::Operator as Attn,
-    cuda::{ContextResource, CurrentCtx, Device, Event, Gpu, HostMem},
-};
 use std::{
     ffi::c_int,
     iter::zip,
@@ -30,7 +26,7 @@ use std::{
 use tokeneer::utok;
 
 #[cfg(nccl)]
-use operators::nccl::{Communicator, CommunicatorGroup};
+use nccl::{Communicator, CommunicatorGroup};
 
 type Stub = SessionStub<CacheParts>;
 
@@ -222,16 +218,13 @@ impl<T: IntoIterator<Item = usize>> Worker<T> {
         } = self;
 
         dev.set_mempool_threshold(u64::MAX);
-        let gpu = Gpu::new(dev.retain_primary(), Default::default());
-        let attn = Attn::new(&gpu);
-        gpu.apply(|ctx| {
+        dev.retain_primary().apply(|ctx| {
             let mut handle = handle(ctx);
             let mut models = ModelGroup::new(
                 llama,
                 dist,
                 progress,
                 config,
-                attn,
                 &mut handle,
                 barrier.as_deref(),
             );
@@ -373,21 +366,12 @@ impl<T: IntoIterator<Item = usize>> Worker<T> {
             ..
         } = self;
 
-        dev.set_mempool_threshold(u64::MAX);
-        let gpu = Gpu::new(dev.retain_primary(), Default::default());
-        let attn = Attn::new(&gpu);
         let barrier = barrier.unwrap();
-        gpu.apply(|ctx| {
+        dev.set_mempool_threshold(u64::MAX);
+        dev.retain_primary().apply(|ctx| {
             let mut handle = Handle::with_comm(ctx, comm);
-            let mut models = ModelGroup::new(
-                llama,
-                dist,
-                progress,
-                config,
-                attn,
-                &mut handle,
-                Some(&barrier),
-            );
+            let mut models =
+                ModelGroup::new(llama, dist, progress, config, &mut handle, Some(&barrier));
 
             let stream = ctx.stream();
             loop {
```
llama.cu/src/exec/group.rs

Lines changed: 5 additions & 13 deletions

```diff
@@ -1,12 +1,9 @@
 use super::{CacheParts, Progress, model::ModelExec, upos};
 use crate::{batch::Req, handle::Handle, load::load_weight, memory::MemPages};
+use cuda::{DevByte, DevMem, Stream, VirByte};
 use nn::{
     Distribution, Graph, GraphBuilder, LLaMA, NNGraph, Tensor, TensorMeta, digit_layout::types, op,
 };
-use operators::{
-    attention_kv_cached::cuda::Operator as Attn,
-    cuda::{DevByte, DevMem, Stream, VirByte},
-};
 use std::{
     collections::BTreeMap,
     num::{NonZero, NonZeroUsize},
@@ -16,7 +13,6 @@ use tokeneer::utok;
 
 pub(crate) struct ModelGroup<'ctx> {
     internal: Internal<'ctx>,
-    attn: Attn,
     pages: MemPages,
     _weight: DevMem<'ctx>,
 }
@@ -36,7 +32,6 @@ impl<'ctx> ModelGroup<'ctx> {
 
     config: ModelGroupConfig<T>,
 
-    attn: Attn,
     handle: &mut Handle<'ctx>,
     barrier: Option<&Barrier>,
 ) -> Self {
@@ -82,7 +77,6 @@ impl<'ctx> ModelGroup<'ctx> {
         let models_with_one_dyn = Internal::new(graph, static_models, dyn_cache_size);
         Self {
             internal: models_with_one_dyn,
-            attn,
             pages,
             _weight,
         }
@@ -125,10 +119,7 @@ impl<'ctx> ModelGroup<'ctx> {
         stream: &Stream<'ctx>,
     ) -> Tensor<*const VirByte, 2> {
         let Self {
-            internal,
-            attn,
-            pages,
-            ..
+            internal, pages, ..
         } = self;
 
         let mut reqs = reqs
@@ -142,7 +133,8 @@ impl<'ctx> ModelGroup<'ctx> {
         let reqs = reqs
             .iter_mut()
             .map(|req| {
-                req.cache.update(req.pos + req.seq, pages);
+                req.cache
+                    .update((req.pos + req.seq).div_ceil(32) * 32, pages);
                 Req {
                     cache: req.cache.as_tensor(),
                     pos: req.pos,
@@ -154,7 +146,7 @@ impl<'ctx> ModelGroup<'ctx> {
         internal
             .get_mut(&key)
             .unwrap()
-            .launch(attn, handle, &reqs, stream)
+            .launch(handle, &reqs, stream)
     }
 }
```
llama.cu/src/exec/kv_cache.rs

Lines changed: 2 additions & 2 deletions

```diff
@@ -1,6 +1,6 @@
-use crate::memory::MemPages;
+use crate::memory::MemPages;
+use cuda::{VirByte, VirMem};
 use nn::Tensor;
-use operators::cuda::{VirByte, VirMem};
 
 pub(crate) struct KVCache {
     /// cache tensor backed by virtual addresses
```

llama.cu/src/exec/mod.rs

Lines changed: 1 addition & 1 deletion

```diff
@@ -12,7 +12,7 @@ use crate::{
     batch::{Session as Session_, SessionId},
     op::random_sample::KVPair,
 };
-use operators::cuda::{ContextSpore, CurrentCtx, DevMemSpore, EventSpore, Stream};
+use cuda::{ContextSpore, CurrentCtx, DevMemSpore, EventSpore, Stream};
 use std::collections::BTreeMap;
 use tokeneer::utok;
 
```

llama.cu/src/exec/model.rs

Lines changed: 2 additions & 6 deletions

```diff
@@ -6,12 +6,9 @@ use crate::{
     utils::{self, destruct},
 };
 use bytesize::ByteSize;
+use cuda::{DevByte, Stream, VirByte, VirMem};
 use log::trace;
 use nn::{NNGraph, Tensor};
-use operators::{
-    attention_kv_cached::cuda::Operator as Attn,
-    cuda::{DevByte, Stream, VirByte, VirMem},
-};
 use std::time::Instant;
 
 pub(super) struct ModelExec<'ctx> {
@@ -100,7 +97,6 @@ impl ModelExec<'_> {
 
     pub fn launch(
         &mut self,
-        attn: &Attn,
         handle: &mut Handle,
         reqs: &[Req<Tensor<*const VirByte, 2>>],
         stream: &Stream,
@@ -117,7 +113,7 @@ impl ModelExec<'_> {
                     std::process::exit(0);
                 }
             }
-            Step::Attention(box_) => handle.launch_attn(attn, box_, reqs, stream),
+            Step::Attention(box_) => handle.launch_attn(box_, reqs, stream),
            Step::Exec(exec) => handle.launch_nn_exec(exec, stream),
        }
    }
```
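After this change the attention operator is no longer threaded through `launch`; the flash-attn call is owned by `Handle` behind `launch_attn`. A stand-in mock of the narrowed dispatch (the real `Step`, `Handle`, and payload types are not shown in this diff, so placeholders are used):

```rust
// Placeholder types; only the shape of the dispatch mirrors the hunk above.
enum Step {
    Attention(u32), // stands in for the attention node `box_`
    Exec(u32),      // stands in for a generic graph op
}

struct Handle;

impl Handle {
    fn launch_attn(&mut self, _attn: u32) { /* flash-attn path, internal to Handle */ }
    fn launch_nn_exec(&mut self, _exec: u32) { /* every other graph op */ }
}

fn run(handle: &mut Handle, steps: &[Step]) {
    for step in steps {
        match step {
            // no more `attn` argument: the operator state lives inside `Handle`
            Step::Attention(a) => handle.launch_attn(*a),
            Step::Exec(e) => handle.launch_nn_exec(*e),
        }
    }
}
```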

llama.cu/src/exec/output_head.rs

Lines changed: 2 additions & 2 deletions

```diff
@@ -1,11 +1,11 @@
-use crate::{
+use crate::{
     handle::Handle,
     load::WeightLoader,
     op::{self, Operator as _},
     utils::dims,
 };
+use cuda::{CurrentCtx, DevMem, Stream, VirByte};
 use nn::{Arg, Linear, NormType, Normalization, Tensor, digit_layout::types};
-use operators::cuda::{CurrentCtx, DevMem, Stream, VirByte};
 use tokeneer::utok;
 
 pub(super) struct OutputHead<'ctx> {
```

llama.cu/src/exec/sample_manager.rs

Lines changed: 2 additions & 2 deletions

```diff
@@ -1,11 +1,11 @@
-use crate::{
+use crate::{
     SessionId,
     batch::SampleInfo,
     op::random_sample::{KV_PAIR, KVPair, LogitsModifier, RandomSample},
     utils::dims,
 };
+use cuda::{CurrentCtx, DevByte, DevMem, Stream};
 use nn::Tensor;
-use operators::cuda::{CurrentCtx, DevByte, DevMem, Stream};
 use std::{collections::BTreeMap, ptr::null};
 use tokeneer::utok;
 
```
