Commit 4da5cf5

refactor: clean up kv cache set/get apis and improve slot id calculation perf (#389)
1 parent 790e5ba commit 4da5cf5

34 files changed, +370 -342 lines changed

.github/workflows/package_test.yml

Lines changed: 1 addition & 6 deletions
@@ -1,12 +1,7 @@
 name: Package test

 on:
-  workflow_dispatch:
-
-  # Schedule the workflow to run at 08:00 (UTC) every day.
-  schedule:
-    # Minute[0,59] Hour[0,23] Day of month[1,31] Month[1,12] Day of week[0,6] (Sunday=0)
-    - cron: '0 8 * * *'
+  workflow_dispatch:

   push:
     paths:

.github/workflows/release_test.yml

Lines changed: 6 additions & 0 deletions
@@ -4,6 +4,12 @@ on:
   workflow_dispatch:

   workflow_call:
+
+  # Schedule the workflow to run at 08:00 (UTC) every day.
+  schedule:
+    # Minute[0,59] Hour[0,23] Day of month[1,31] Month[1,12] Day of week[0,6] (Sunday=0)
+    - cron: '0 8 * * *'
+
 env:
   # Tells where to store caches.
   CI_CACHE_DIR: ${{ github.workspace }}/../../ci_cache

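Note: together with the package_test.yml change above, this moves the daily 08:00 UTC cron trigger from the package-test workflow to the release-test workflow, so the scheduled nightly run now exercises the release pipeline instead.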
scalellm/llm.py

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ def __init__(
         convert_to_safetensors: bool = False,
         devices: Optional[str] = None,
         draft_devices: Optional[str] = None,
-        block_size: int = 16,
+        block_size: int = 8,
         max_cache_size: int = 0,  # 0 means that cache size is caculated by available memory
         max_memory_utilization: float = 0.9,
         enable_prefix_cache: bool = True,

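The default block_size drops from 16 to 8 here and, below, in llm_engine.py and server_args.py. The likely trade-off: a smaller block wastes fewer slots in each sequence's partially filled last block and gives prefix caching finer sharing granularity, at the cost of longer per-sequence block tables; the new power-of-2 requirement keeps the slot-id arithmetic cheap (see the sketch after src/engine/batch.cpp).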
scalellm/llm_engine.py

Lines changed: 1 addition & 1 deletion
@@ -117,7 +117,7 @@ def __init__(
         convert_to_safetensors: bool = False,
         devices: Optional[str] = None,
         draft_devices: Optional[str] = None,
-        block_size: int = 16,
+        block_size: int = 8,
         max_cache_size: int = 0,  # 0 means that cache size is caculated by available memory
         max_memory_utilization: float = 0.9,
         enable_prefix_cache: bool = True,

scalellm/serve/server_args.py

Lines changed: 2 additions & 2 deletions
@@ -47,8 +47,8 @@ def parse_args():
     parser.add_argument(
         "--block_size",
         type=int,
-        default=16,
-        help="Number of slots per kv cache block. Default is 16.",
+        default=8,
+        help="Number of slots per kv cache block, must be a power of 2. Default is 8.",
     )
     parser.add_argument(
         "--max_cache_size",

src/engine/batch.cpp

Lines changed: 2 additions & 2 deletions
@@ -203,9 +203,9 @@ ModelInput Batch::prepare_model_input(uint32_t num_decoding_tokens,
   new_token_slot_ids.insert(
       new_token_slot_ids.end(), slot_ids.begin(), slot_ids.end());

-  // add block ids for each sequence
   for (const auto& block : blocks) {
-    block_tables.push_back(block.id());
+    // put first slot id of each block into block_table
+    block_tables.push_back(block.id() * block.size());
   }
   cu_block_lens.push_back(static_cast<int32_t>(block_tables.size()));
 }

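This hunk is the heart of the slot-id perf change: block_tables now carry each block's first slot id (block.id() * block.size()) instead of the raw block id, so whoever consumes the table can resolve a token's cache slot with a single add rather than a multiply-add. A minimal sketch of the resulting lookup, assuming a power-of-2 block size; the helper name slot_id_for_token is hypothetical, not from this commit:

    #include <cstdint>
    #include <vector>

    // block_table holds the first slot id of each block of one sequence,
    // i.e. block_id * block_size, as built in Batch::prepare_model_input.
    int32_t slot_id_for_token(const std::vector<int32_t>& block_table,
                              int32_t pos,          // token position in sequence
                              int32_t block_size) { // must be a power of 2
      const int32_t block_idx = pos / block_size;     // a shift for powers of 2
      const int32_t offset = pos & (block_size - 1);  // pos % block_size
      // Before this commit the table stored block ids, so this line needed
      // block_table[block_idx] * block_size + offset.
      return block_table[block_idx] + offset;
    }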
src/engine/batch_test.cpp

Lines changed: 7 additions & 6 deletions
@@ -13,22 +13,22 @@
 namespace llm {

 template <typename T>
-bool equal(const torch::Tensor& t, const std::vector<T>& d) {
+bool equal(const torch::Tensor& t, const std::vector<T>& d, T scale = 1) {
   auto flatten_t = t.flatten();
   if (flatten_t.size(0) != d.size()) {
     return false;
   }
   for (int i = 0; i < d.size(); i++) {
-    if (flatten_t[i].item<T>() != d[i]) {
+    if (flatten_t[i].item<T>() != d[i] * scale) {
       return false;
     }
   }
   return true;
 }

 TEST(BatchTest, Basic) {
-  const uint32_t n_blocks = 20;
-  const uint32_t block_size = 4;
+  const int32_t n_blocks = 20;
+  const int32_t block_size = 4;

   BlockAllocator allocator(n_blocks, block_size);
   // reserve block 0
@@ -103,11 +103,12 @@ TEST(BatchTest, Basic) {
                                             /*seq3*/ 47};
   EXPECT_TRUE(equal(input_params.new_cache_slots, new_cache_slots));

-  const std::vector<int32_t> block_tables = {
+  const std::vector<int32_t> block_id_tables = {
       /*seq1*/ 1, 2, 3,
       /*seq2*/ 4, 5, 6, 7,
       /*seq3*/ 8, 9, 10, 11, 12};
-  EXPECT_TRUE(equal(input_params.block_tables, block_tables));
+
+  EXPECT_TRUE(equal(input_params.block_tables, block_id_tables, block_size));
   const std::vector<int32_t> cu_block_lens = {0, 3, 7, 12};
   EXPECT_TRUE(equal(input_params.cu_block_lens, cu_block_lens));

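The new scale parameter keeps the expected values readable: with block_size = 4, the block ids {1, 2, 3, ...} listed above correspond to first-slot ids {4, 8, 12, ...} in input_params.block_tables, so equal(..., block_id_tables, block_size) checks each tensor entry against block_id * block_size.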
src/engine/llm_engine.cpp

Lines changed: 5 additions & 4 deletions
@@ -314,9 +314,9 @@ bool LLMEngine::init_kv_cache(int64_t n_blocks) {
   const int32_t block_size = options_.block_size();

   // init kv cache for each worker
-  const std::vector<int64_t> kv_cache_shape = {
-      n_blocks, block_size, n_local_kv_heads_, head_dim_};
-  LOG(INFO) << "Initializing kv cache with shape: [" << kv_cache_shape << "]";
+  LOG(INFO) << "Initializing kv cache with shape: [" << n_blocks << ", "
+            << block_size << ", " << n_local_kv_heads_ << ", " << head_dim_
+            << "]";

   // initialize block manager
   BlockManager::Options options;
@@ -329,7 +329,8 @@ bool LLMEngine::init_kv_cache(int64_t n_blocks) {
   std::vector<folly::SemiFuture<bool>> futures;
   futures.reserve(workers_.size());
   for (auto& worker : workers_) {
-    futures.push_back(worker->init_kv_cache_async(kv_cache_shape));
+    futures.push_back(worker->init_kv_cache_async(
+        n_blocks, block_size, n_local_kv_heads_, head_dim_));
   }
   // wait for all futures to complete
   auto results = folly::collectAll(futures).get();

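For orientation, this is how a block budget like n_blocks relates to memory. A rough sizing sketch, illustrative arithmetic only; nothing below is code from this commit:

    #include <cstdint>

    // Bytes one cache block occupies across all layers: each block stores
    // keys and values (factor 2) for block_size tokens, with n_kv_heads
    // heads of head_dim elements each.
    int64_t bytes_per_block(int64_t block_size, int64_t n_kv_heads,
                            int64_t head_dim, int64_t n_layers,
                            int64_t dtype_bytes) {
      return 2 * block_size * n_kv_heads * head_dim * n_layers * dtype_bytes;
    }

    // Example: block_size=8, 8 kv heads, head_dim=128, 32 layers, fp16:
    // 2 * 8 * 8 * 128 * 32 * 2 = 1 MiB per block, so a 16 GiB cache
    // budget corresponds to n_blocks = 16384.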
src/engine/llm_engine.h

Lines changed: 2 additions & 2 deletions
@@ -32,8 +32,8 @@ class LLMEngine : public Engine {
   struct Options {
     DEFINE_ARG(std::vector<torch::Device>, devices);

-    // the number of slots per block, default 16, value must be multiple of 16
-    DEFINE_ARG(int32_t, block_size) = 16;
+    // the number of slots per block, default 8, value must be a power of 2
+    DEFINE_ARG(int32_t, block_size) = 8;

     // 0 means that cache size is caculated by available memory
     DEFINE_ARG(int64_t, max_cache_size) = 0;

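The option's contract changes from "multiple of 16" to "power of 2". A conventional way to validate that constraint (a hypothetical check, not part of this diff):

    #include <cstdint>

    // A positive power of two has exactly one bit set, so clearing its
    // lowest set bit with x & (x - 1) must leave zero.
    bool is_power_of_2(int32_t x) { return x > 0 && (x & (x - 1)) == 0; }

    // e.g. CHECK(is_power_of_2(options.block_size()))
    //          << "block_size must be a power of 2";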
src/engine/worker.cpp

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -64,19 +64,20 @@ bool Worker::init_model(torch::ScalarType dtype,
6464
return true;
6565
}
6666

67-
bool Worker::init_kv_cache(const std::vector<int64_t>& kv_cache_shape) {
67+
bool Worker::init_kv_cache(int64_t n_blocks,
68+
int64_t block_size,
69+
int64_t n_kv_heads,
70+
int64_t head_dim) {
6871
CHECK(model_ != nullptr) << "Model is not initialized.";
6972
CHECK(kv_caches_.empty()) << "KV caches are already initialized.";
7073

74+
const auto options = torch::dtype(dtype_).device(device_);
7175
// create a KVCache for each layer
7276
const int64_t num_layers = args_.n_layers();
7377
kv_caches_.reserve(num_layers);
7478
for (int64_t i = 0; i < num_layers; ++i) {
75-
auto key_cache =
76-
torch::empty(kv_cache_shape, torch::dtype(dtype_).device(device_));
77-
auto value_cache =
78-
torch::empty(kv_cache_shape, torch::dtype(dtype_).device(device_));
79-
kv_caches_.emplace_back(key_cache, value_cache);
79+
kv_caches_.emplace_back(
80+
n_blocks, block_size, n_kv_heads, head_dim, options);
8081
}
8182
return true;
8283
}
@@ -238,15 +239,22 @@ folly::SemiFuture<bool> Worker::init_model_async(torch::ScalarType dtype,
238239
return future;
239240
}
240241

241-
folly::SemiFuture<bool> Worker::init_kv_cache_async(
242-
const std::vector<int64_t>& kv_cache_shape) {
242+
folly::SemiFuture<bool> Worker::init_kv_cache_async(int64_t n_blocks,
243+
int64_t block_size,
244+
int64_t n_kv_heads,
245+
int64_t head_dim) {
243246
folly::Promise<bool> promise;
244247
auto future = promise.getSemiFuture();
245-
threadpool_.schedule(
246-
[this, &kv_cache_shape, promise = std::move(promise)]() mutable {
247-
const bool success = this->init_kv_cache(kv_cache_shape);
248-
promise.setValue(success);
249-
});
248+
threadpool_.schedule([this,
249+
n_blocks,
250+
block_size,
251+
n_kv_heads,
252+
head_dim,
253+
promise = std::move(promise)]() mutable {
254+
const bool success =
255+
this->init_kv_cache(n_blocks, block_size, n_kv_heads, head_dim);
256+
promise.setValue(success);
257+
});
250258
return future;
251259
}
252260

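The worker now hands KVCache the raw dimensions instead of a pre-baked shape vector, letting the cache own its tensor layout. A plausible shape of the new constructor, inferred from this call site; the member names and layout below are assumptions, not taken from the commit:

    #include <torch/torch.h>

    class KVCache {
     public:
      KVCache(int64_t n_blocks,
              int64_t block_size,
              int64_t n_kv_heads,
              int64_t head_dim,
              const torch::TensorOptions& options)
          // one [n_blocks, block_size, n_kv_heads, head_dim] tensor each
          // for keys and values, matching the shape the engine logs above
          : key_cache_(torch::empty(
                {n_blocks, block_size, n_kv_heads, head_dim}, options)),
            value_cache_(torch::empty(
                {n_blocks, block_size, n_kv_heads, head_dim}, options)) {}

     private:
      torch::Tensor key_cache_;
      torch::Tensor value_cache_;
    };

A side benefit visible in the second hunk: the old async path captured kv_cache_shape by reference in a lambda scheduled on a thread pool, which is safe only as long as the caller outlives the task; capturing four integers by value removes that lifetime question entirely.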