
Commit 4f744f3

Author: niushengxiao
Commit message: refine3
1 parent 3641b5c commit 4f744f3

File tree: 5 files changed, +114 −131 lines changed


lightllm/server/multi_level_kv_cache/cpu_cache_client.py

Lines changed: 6 additions & 7 deletions
@@ -57,8 +57,6 @@ def get_one_empty_page(self, hash_key: int, disk_offload_enable: bool) -> Option
         if cur_page.self_index == tail.self_index:
             return None
 
-        assert cur_page.is_empty() or cur_page.is_ready_recycle()
-        assert cur_page.ref_count == 0
         if cur_page.can_realloc(disk_offload_enable=disk_offload_enable):
             page_index = cur_page.self_index
             cur_page.del_self_from_list()

@@ -130,8 +128,11 @@ def update_pages_status_to_ready(
             cur_page = page_items[page_index]
             if cur_page.status < _CpuPageStatus.READY:
                 cur_page.status = _CpuPageStatus.READY
+
+            # offload everything; any prefix already persisted to disk is automatically skipped during offloading
             if disk_offload_enable:
                 offload_candidates.append(cur_page.self_index)
+
             if deref:
                 assert cur_page.ref_count > 0
                 cur_page.ref_count -= 1

@@ -202,13 +203,13 @@ def deref_pages(self, page_list: List[int]):
         for page_index in page_list:
             if page_index != -1:
                 page_item = page_items[page_index]
-                assert page_item.ref_count == 1
+                assert page_item.ref_count > 0
                 page_item.ref_count -= 1
         return
 
     def deref_one_page(self, page_index: int):
         page_item: _CpuPageStatus = self.page_items.get_item_by_index(page_index)
-        assert page_item.ref_count == 1
+        assert page_item.ref_count > 0
         page_item.ref_count -= 1
         return
 
@@ -220,7 +221,6 @@ def get_pages_to_offloading(self) -> List[List[int]]:
         if page_list is None:
             return groups
 
-        # cache constants and object references to reduce attribute lookups
         page_items = self.page_items.linked_items
         for value in page_list:
            page_index, is_group_head = self._decode_offload_value(value)

@@ -243,10 +243,9 @@ def update_pages_status_to_ready_recycle(self, page_list: List[int], deref: bool
         for page_index in page_list:
             if page_index != -1:
                 cur_page = page_items[page_index]
-                assert cur_page.is_offloading()
                 cur_page.status = _CpuPageStatus.READY_RECYCLE
                 if deref:
-                    assert cur_page.ref_count == 1
+                    assert cur_page.ref_count > 0
                     cur_page.ref_count -= 1
         return
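The assertions in deref_pages, deref_one_page and update_pages_status_to_ready_recycle are relaxed from ref_count == 1 to ref_count > 0, so dereferencing now tolerates pages that still carry additional references. A minimal self-contained sketch of the deref loop with that relaxed check (the Page dataclass here is a stand-in for the real _CpuPageStatus items, not the actual type):

from dataclasses import dataclass
from typing import List

@dataclass
class Page:
    ref_count: int

def deref_pages(page_items: List[Page], page_list: List[int]) -> None:
    # mirrors the loop above: -1 entries are placeholders and are skipped
    for page_index in page_list:
        if page_index != -1:
            page_item = page_items[page_index]
            assert page_item.ref_count > 0
            page_item.ref_count -= 1

pages = [Page(ref_count=2), Page(ref_count=1)]
deref_pages(pages, [0, -1, 1])
assert [p.ref_count for p in pages] == [1, 0]
# under the previous "ref_count == 1" assertion, dereferencing page 0 (ref_count 2) would have failed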

lightllm/server/multi_level_kv_cache/disk_cache_worker.py

Lines changed: 19 additions & 8 deletions
@@ -42,8 +42,11 @@ def __init__(
 
         assert disk_cache_storage_size > 0
         storage_size = int(disk_cache_storage_size * (1024 ** 3))
-        num_shard = 64
+        # num_shard is tied to KVCACHE_MAX_BLOCK_SIZE; with the default KVCACHE_MAX_BLOCK_SIZE of 64MB,
+        # num_shard = 32 brings the disk cache capacity utilization to about 90%; increasing num_shard further lowers it
+        num_shard = 32
         num_worker = 48
+        # when reads and writes run concurrently, 16 threads are reserved for writing and 32 for reading
         max_concurrent_write_tasks = 16
 
         cache_dir = disk_cache_dir

@@ -134,16 +137,24 @@ def _persist_pages_to_disk(self, payloads: List[_PagePayload]) -> None:
         self.cpu_cache_client.update_pages_status_to_ready_recycle(page_list=page_indexes, deref=True)
         self.cpu_cache_client.lock.release()
 
-    def blocks_exist(self, tokens: List[int], start_pos: int = 0) -> bool:
+    def query_loadable_pages(self, tokens: List[int], start_pos: int) -> int:
+        """
+        Query the longest prefix, starting at start_pos, that can be loaded from the disk cache.
+        Returns:
+            loadable_len: the length loadable starting from start_pos
+        """
         if not tokens or start_pos < 0 or start_pos >= len(tokens):
-            return False
+            return 0
 
         query_result = self.service.query(tokens)
-        block_start = start_pos // self.service._n
-        block_end = math.ceil(len(tokens) / self.service._n)
-        if block_start >= block_end:
-            return False
-        return all(query_result[block_start:block_end])
+        n = self.service._n
+        start_block = start_pos // n
+        try:
+            first_false_idx = start_block + query_result[start_block:].index(False)
+        except ValueError:
+            return len(tokens) - start_pos
+        first_missing_pos = first_false_idx * n
+        return max(0, first_missing_pos - start_pos)
 
     # load data from disk into memory
     def load_pages(self, tokens: List[int], page_indexes: List[int], start_pos: int = 0) -> bool:
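The new query_loadable_pages turns the per-block hit mask returned by service.query into a token-level prefix length: it finds the first missing block at or after the block containing start_pos and counts the positions up to it. A minimal standalone sketch of that computation, with the hit mask passed in directly and block_size standing in for self.service._n (both names are illustrative):

from typing import List

def loadable_prefix_len(hit_blocks: List[bool], num_tokens: int, start_pos: int, block_size: int) -> int:
    # hit_blocks holds one bool per block of block_size consecutive positions
    if num_tokens == 0 or start_pos < 0 or start_pos >= num_tokens:
        return 0
    start_block = start_pos // block_size
    try:
        # index of the first missing block at or after start_block
        first_false_idx = start_block + hit_blocks[start_block:].index(False)
    except ValueError:
        # every block from start_block onward is present on disk
        return num_tokens - start_pos
    first_missing_pos = first_false_idx * block_size
    return max(0, first_missing_pos - start_pos)

# 10 tokens, block_size 4, blocks [hit, hit, miss]: from start_pos=2, positions 2..7 are loadable
assert loadable_prefix_len([True, True, False], 10, 2, 4) == 6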

lightllm/server/multi_level_kv_cache/manager.py

Lines changed: 85 additions & 110 deletions
@@ -34,6 +34,7 @@ def __init__(
         logger.info(f"send_to_router sendhwm {self.send_to_router.getsockopt(zmq.SNDHWM)}")
         self.cpu_cache_client = CpuKvCacheClient(only_create_meta_data=False, init_shm_data=True)
         self.shm_req_manager = ShmReqManager()
+        # disk I/O on NVMe SSDs needs heavy concurrency to reach full performance
         self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=500)
         # bound the time spent matching cpu cache pages; once exceeded, stop matching and forward directly.
         self.cpu_cache_time_out = 0.5
@@ -60,21 +61,88 @@ def cpu_cache_hanle_loop(self):
             try:
                 current_group_req = self.recv_queue.get()
 
-                self.executor.submit(self._handle_group_req_cpu_cache_match, current_group_req, time.time())
+                self.executor.submit(self._handle_group_req_multi_cache_match, current_group_req, time.time())
             except BaseException as e:
                 logger.exception(str(e))
 
-    # blueswhen TODO: consider splitting this function to simplify the logic
-    def _handle_group_req_cpu_cache_match(self, group_req_indexes: GroupReqIndexes, start_time: float):
+    def _cpu_cache_match(self, token_hash_list: List[int]) -> List[int]:
         """
-        match cpu cache pages
+        Match the CPU cache and return the hit pages (longest prefix).
+        Returns:
+            all_pages: the hit page index list; len(all_pages) is the hit length
         """
-        # on timeout, give up matching cpu cache pages.
+        all_pages = []
+        self.cpu_cache_client.lock.acquire_sleep1ms()
+        for token_hash in token_hash_list:
+            page_index, _ = self.cpu_cache_client.query_one_page(token_hash)
+            if page_index is None:
+                break
+            all_pages.append(page_index)
+        self.cpu_cache_client.lock.release()
+        return all_pages
+
+    def _disk_cache_match(self, token_hash_list: List[int], all_pages: List[int]) -> tuple[List[int], int]:
+        """
+        Match the disk cache, load the missing pages, and append them to all_pages in place.
+        Returns:
+            (finded_page_indexes, disk_page_num): the final matched page index list (longest prefix) and the number of pages loaded from disk
+        """
+        cpu_hit_len = len(all_pages)
+        loadable_len = self.disk_cache_worker.query_loadable_pages(tokens=token_hash_list, start_pos=cpu_hit_len)
+        if loadable_len == 0:
+            return all_pages, 0
+
+        missing_hash_keys = token_hash_list[cpu_hit_len : cpu_hit_len + loadable_len]
+        self.cpu_cache_client.lock.acquire_sleep1ms()
+        allocated_pages, _ = self.cpu_cache_client.allocate_pages(
+            hash_keys=missing_hash_keys, disk_offload_enable=self.args.enable_disk_cache
+        )
+        self.cpu_cache_client.lock.release()
+
+        # collect the successfully allocated pages and append them directly to all_pages
+        new_page_indexes = []
+        for page_index in allocated_pages:
+            if page_index == -1:
+                break
+            all_pages.append(page_index)
+            new_page_indexes.append(page_index)
+
+        if not new_page_indexes:
+            return all_pages, 0
+
+        # compute the range to load from disk; it must be aligned to block boundaries
+        block_size = self.disk_cache_worker.service._n
+        start_block = cpu_hit_len // block_size
+        load_start_pos = start_block * block_size
+
+        load_tokens = token_hash_list[: cpu_hit_len + len(new_page_indexes)]
+        if not self.disk_cache_worker.load_pages(tokens=load_tokens, page_indexes=all_pages, start_pos=load_start_pos):
+            self.cpu_cache_client.lock.acquire_sleep1ms()
+            self.cpu_cache_client.recycle_pages(new_page_indexes)
+            self.cpu_cache_client.lock.release()
+            return all_pages[:cpu_hit_len], 0
+
+        self.cpu_cache_client.lock.acquire_sleep1ms()
+        self.cpu_cache_client.update_pages_status_to_ready(
+            page_list=all_pages,
+            deref=False,
+            disk_offload_enable=False,
+        )
+        if self.args.enable_disk_cache:
+            self.cpu_cache_client.mark_pages_recyclable(new_page_indexes)
+        self.cpu_cache_client.lock.release()
+        return all_pages, len(new_page_indexes)
+
+    def _handle_group_req_multi_cache_match(self, group_req_indexes: GroupReqIndexes, start_time: float):
+        """
+        match cpu cache and disk cache pages
+        """
+        # on timeout, give up matching cache pages.
         current_time = time.time()
         if current_time - start_time >= self.cpu_cache_time_out:
             self.send_to_router.send_pyobj(group_req_indexes, protocol=pickle.HIGHEST_PROTOCOL)
             logger.warning(
-                f"cpu cache match time out {current_time - start_time}s, "
+                f"cache matching time out {current_time - start_time}s, "
                 f"group_req_id: {group_req_indexes.group_req_id}"
             )
             return
@@ -96,119 +164,26 @@ def _handle_group_req_cpu_cache_match(self, group_req_indexes: GroupReqIndexes,
             if len(token_hash_list) == 0:
                 continue
 
-            req.disk_prompt_cache_len = 0
             finded_page_indexes: List[int] = []
             disk_service = (
                 self.disk_cache_worker.service
                 if (self.disk_cache_worker is not None and self.disk_cache_worker.service is not None)
                 else None
             )
-            block_capacity = disk_service._n if disk_service is not None else 1
-            if block_capacity <= 0:
-                block_capacity = 1
-
-            disk_loaded_page_indexes: List[int] = []
-            idx = 0
-            while idx < len(token_hash_list):
-                chunk_len = min(block_capacity, len(token_hash_list) - idx)
-                chunk_tokens = token_hash_list[idx : idx + chunk_len]
-                if not chunk_tokens:
-                    break
-
-                block_pages: List[int] = []
-                missing_positions: List[int] = []
-
-                self.cpu_cache_client.lock.acquire_sleep1ms()
-                for pos, token_hash_value in enumerate(chunk_tokens):
-                    page_index, ready = self.cpu_cache_client.query_one_page(token_hash_value)
-                    if page_index is not None:
-                        block_pages.append(page_index)
-                        continue
-
-                    # -1 is only a placeholder
-                    block_pages.append(-1)
-                    missing_positions.append(pos)
-                self.cpu_cache_client.lock.release()
-
-                if not missing_positions:
-                    finded_page_indexes.extend(block_pages)
-                    idx += chunk_len
-                    continue
-
-                if disk_service is None:
-                    finded_page_indexes.extend(block_pages)
-                    break
-
-                prefix_len = idx + chunk_len
-                prefix_tokens = token_hash_list[:prefix_len]
-                if not self.disk_cache_worker.blocks_exist(tokens=prefix_tokens, start_pos=idx):
-                    finded_page_indexes.extend(block_pages)
-                    break
-
-                self.cpu_cache_client.lock.acquire_sleep1ms()
-                new_page_indexes: List[int] = []
-                allocation_failed = False
-                page_items = self.cpu_cache_client.page_items.linked_items
-                for pos in missing_positions:
-                    token_hash_value = chunk_tokens[pos]
-                    page_index, ready = self.cpu_cache_client.allocate_one_page(
-                        page_items=page_items,
-                        hash_key=token_hash_value,
-                        disk_offload_enable=self.args.enable_disk_cache,
-                    )
-                    if page_index is None:
-                        allocation_failed = True
-                        break
-                    block_pages[pos] = page_index
-                    if not ready:
-                        new_page_indexes.append(page_index)
-                if allocation_failed and new_page_indexes:
-                    self.cpu_cache_client.recycle_pages(new_page_indexes)
-                self.cpu_cache_client.lock.release()
-
-                if allocation_failed:
-                    hit_pages = [p for p in block_pages if p not in new_page_indexes]
-                    finded_page_indexes.extend(hit_pages)
-                    break
-
-                pages_to_load = new_page_indexes
-                if pages_to_load:
-                    prefix_len = idx + chunk_len
-                    prefix_tokens = token_hash_list[:prefix_len]
-                    prefix_pages = finded_page_indexes + block_pages
-
-                    if not self.disk_cache_worker.load_pages(
-                        tokens=prefix_tokens, page_indexes=prefix_pages, start_pos=idx
-                    ):
-                        self.cpu_cache_client.lock.acquire_sleep1ms()
-                        self.cpu_cache_client.recycle_pages(pages_to_load)
-                        self.cpu_cache_client.lock.release()
-                        hit_pages = [p for p in block_pages if p not in pages_to_load]
-                        finded_page_indexes.extend(hit_pages)
-                        break
-
-                    self.cpu_cache_client.lock.acquire_sleep1ms()
-                    self.cpu_cache_client.update_pages_status_to_ready(
-                        page_list=block_pages,
-                        deref=False,
-                        disk_offload_enable=False,
-                    )
-                    if self.args.enable_disk_cache and pages_to_load:
-                        self.cpu_cache_client.mark_pages_recyclable(pages_to_load)
-                    self.cpu_cache_client.lock.release()
-
-                    disk_loaded_page_indexes.extend(pages_to_load)
-
-                finded_page_indexes.extend(block_pages)
-                idx += chunk_len
-
-            finded_page_indexes = [p for p in finded_page_indexes if p != -1]
+            req.disk_prompt_cache_len = 0
+
+            # match the CPU cache
+            all_pages = self._cpu_cache_match(token_hash_list)
+            if len(all_pages) == len(token_hash_list) or disk_service is None:
+                finded_page_indexes = all_pages
+            else:
+                # match the disk cache and load the hits into the cpu cache
+                finded_page_indexes, disk_page_num = self._disk_cache_match(token_hash_list, all_pages)
+                req.disk_prompt_cache_len = disk_page_num * self.args.cpu_cache_token_page_size
+
             while not self.cpu_cache_client.check_allpages_ready(finded_page_indexes):
                 time.sleep(0.01)
 
-            if disk_loaded_page_indexes:
-                req.disk_prompt_cache_len = len(disk_loaded_page_indexes) * self.args.cpu_cache_token_page_size
-
             req.cpu_cache_match_page_indexes.fill(finded_page_indexes)
 
         for req in reqs:
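One detail worth calling out in _disk_cache_match is the block-boundary alignment: even when the CPU cache already covers cpu_hit_len positions, the disk read has to start at the beginning of the block that contains position cpu_hit_len, because the disk cache is addressed in blocks of service._n positions. A small worked example of that arithmetic (the concrete numbers are illustrative, not taken from the code):

# illustrative numbers only; block_size stands for self.disk_cache_worker.service._n
block_size = 64
cpu_hit_len = 100          # prefix already matched in the CPU cache

start_block = cpu_hit_len // block_size      # 100 // 64 == 1
load_start_pos = start_block * block_size    # 1 * 64 == 64
assert (start_block, load_start_pos) == (1, 64)

# the disk read window therefore starts at position 64 even though the CPU cache
# already covers positions 0..99; the pages newly allocated in _disk_cache_match
# only correspond to positions 100 and onward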

lightllm/server/router/model_infer/mode_backend/multi_level_kv_cache.py

Lines changed: 1 addition & 1 deletion
@@ -252,10 +252,10 @@ def update_cpu_cache_task_states(self):
         page_array_list = [task.page_indexes.tolist() for task in trans_ok_tasks]
         if self.backend.is_master_in_dp:
             self.cpu_cache_client.lock.acquire_sleep1ms()
+            # update group by group, so pages of different requests do not interleave and make the disk cache hashes inconsistent
             for pages in page_array_list:
                 if not pages:
                     continue
-                # Keep per-req grouping so disk cache hashes stay aligned with req prefixes.
                 self.cpu_cache_client.update_pages_status_to_ready(
                     page_list=pages, deref=True, disk_offload_enable=self.args.enable_disk_cache
                 )
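The comment added above is the reason the ready-update stays inside the per-task loop: each call to update_pages_status_to_ready should see only one request's pages, not a flattened list that interleaves several requests. A rough, self-contained illustration with a stand-in function (the page numbers and the stub are made up for the example):

from typing import List

calls: List[List[int]] = []
page_array_list = [[3, 4, 5], [9, 10]]  # pages per finished transfer task, one task per request

def update_pages_status_to_ready(page_list: List[int]) -> None:
    # stand-in for CpuKvCacheClient.update_pages_status_to_ready; it just records each call
    calls.append(list(page_list))

# per-request grouping, as in the loop above: two calls, each covering one request's own page prefix
for pages in page_array_list:
    if not pages:
        continue
    update_pages_status_to_ready(page_list=pages)

assert calls == [[3, 4, 5], [9, 10]]
# a single flattened call, update_pages_status_to_ready(page_list=[3, 4, 5, 9, 10]),
# would mix the two requests' pages in one update, which is exactly what the comment warns against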

test/benchmark/service/benchmark_qps.py

Lines changed: 3 additions & 5 deletions
@@ -48,7 +48,6 @@ def gen_random_input_text(tokenizer, input_len) -> str:
 
 
 def gen_random_input_text_with_seed(tokenizer, input_len, seed) -> str:
-    """Generate random input text with a specific seed"""
     rng = random.Random(seed)
     random_ids = [rng.randint(0, tokenizer.vocab_size) for _ in range(input_len)]
     random_text = tokenizer.decode(random_ids)

@@ -68,15 +67,12 @@ def gen_random_data(
     output_lens = get_random_length(reqs_num, output_len, range_ratio)
     input_lens = get_random_length(reqs_num, input_len, range_ratio)
 
-    # Generate input_len2 lengths if input_len2 > 0
     if input_len2 > 0:
         input_lens2 = get_random_length(reqs_num, input_len2, range_ratio)
 
     for i in range(reqs_num):
-        # Generate first part with main random state
         input_text = gen_random_input_text(tokenizer, input_lens[i])
 
-        # Generate second part with seed2 if specified
         if input_len2 > 0 and seed2 is not None:
             input_text2 = gen_random_input_text_with_seed(tokenizer, input_lens2[i], seed2 + i)
             input_text = input_text + input_text2

@@ -339,7 +335,9 @@ def main():
     parser.add_argument("--input_num", type=int, default=2000)
     parser.add_argument("--input_qps", type=float, default=30.0)
     parser.add_argument("--input_len", type=int, default=1024)
-    parser.add_argument("--input_len2", type=int, default=0, help="Length of second part to append, 0 means disabled")
+    parser.add_argument(
+        "--input_len2", type=int, default=0, help="Length of second part to append behind input_len, 0 means disabled"
+    )
     parser.add_argument("--output_len", type=int, default=128)
     parser.add_argument("--server_api", type=str, default="lightllm")
     parser.add_argument("--dump_file", type=str, default="")
