@@ -34,6 +34,7 @@ def __init__(
         logger.info(f"send_to_router sendhwm {self.send_to_router.getsockopt(zmq.SNDHWM)}")
         self.cpu_cache_client = CpuKvCacheClient(only_create_meta_data=False, init_shm_data=True)
         self.shm_req_manager = ShmReqManager()
+        # Disk I/O on NVMe SSDs needs heavy concurrency to reach full performance
         self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=500)
         # Time budget for cpu cache page matching; once exceeded, stop matching and forward directly.
         self.cpu_cache_time_out = 0.5
@@ -60,21 +61,94 @@ def cpu_cache_hanle_loop(self):
             try:
                 current_group_req = self.recv_queue.get()
 
-                self.executor.submit(self._handle_group_req_cpu_cache_match, current_group_req, time.time())
+                self.executor.submit(self._handle_group_req_multi_cache_match, current_group_req, time.time())
             except BaseException as e:
                 logger.exception(str(e))
 
-    # blueswhen TODO: consider splitting this function to simplify the logic
-    def _handle_group_req_cpu_cache_match(self, group_req_indexes: GroupReqIndexes, start_time: float):
+    def _cpu_cache_match(self, token_hash_list: List[int]) -> List[int]:
         """
-        match cpu cache pages
+        Match the cpu cache and return the list of hit pages (longest prefix).
+        Returns:
+            all_pages: the list of hit page indexes; len(all_pages) is the hit length.
         """
-        # On timeout, give up matching cpu cache pages.
+        all_pages = []
+        self.cpu_cache_client.lock.acquire_sleep1ms()
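+        # Stop at the first miss so the matched pages form a contiguous prefix of the prompt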
+        for token_hash in token_hash_list:
+            page_index, _ = self.cpu_cache_client.query_one_page(token_hash)
+            if page_index is None:
+                break
+            all_pages.append(page_index)
+        self.cpu_cache_client.lock.release()
+        return all_pages
+
+    def _disk_cache_match(self, token_hash_list: List[int], all_pages: List[int]) -> tuple[List[int], int]:
+        """
+        Match the disk cache and load the missing pages, appending them directly to all_pages.
+        Returns:
+            (finded_page_indexes, disk_page_num): the final matched page index list (longest prefix) and the number of pages loaded from disk.
+        """
+        cpu_hit_len = len(all_pages)
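+        # Ask the disk backend how many consecutive pages beyond the cpu hit can be served from disk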
+        loadable_len = self.disk_cache_worker.query_loadable_pages(tokens=token_hash_list, start_pos=cpu_hit_len)
+        if loadable_len == 0:
+            return all_pages, 0
+
+        missing_hash_keys = token_hash_list[cpu_hit_len : cpu_hit_len + loadable_len]
+        self.cpu_cache_client.lock.acquire_sleep1ms()
+        allocated_pages, _ = self.cpu_cache_client.allocate_pages(
+            hash_keys=missing_hash_keys, disk_offload_enable=self.args.enable_disk_cache
+        )
+        self.cpu_cache_client.lock.release()
+
+        # Collect the successfully allocated pages and append them directly to all_pages; a -1 entry marks where allocation stopped
+        new_page_indexes = []
+        for page_index in allocated_pages:
+            if page_index == -1:
+                break
+            all_pages.append(page_index)
+            new_page_indexes.append(page_index)
+
+        if not new_page_indexes:
+            return all_pages, 0
+
+        # Compute the range to load from disk; it must be aligned to block boundaries
+        block_size = self.disk_cache_worker.service._n
+        start_block = cpu_hit_len // block_size
+        load_start_pos = start_block * block_size
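+        # e.g. block_size=16, cpu_hit_len=20 -> load_start_pos=16: loading restarts at the block boundary even though pages 16-19 are already cpu hits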
+
+        load_tokens = token_hash_list[: cpu_hit_len + len(new_page_indexes)]
+        if not self.disk_cache_worker.load_pages(tokens=load_tokens, page_indexes=all_pages, start_pos=load_start_pos):
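+            # Load failed: release the freshly allocated pages and fall back to the cpu-cache-only prefix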
+            self.cpu_cache_client.lock.acquire_sleep1ms()
+            self.cpu_cache_client.recycle_pages(new_page_indexes)
+            self.cpu_cache_client.lock.release()
+            return all_pages[:cpu_hit_len], 0
+
+        self.cpu_cache_client.lock.acquire_sleep1ms()
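+        # Mark every matched page ready so concurrent matchers waiting on them can proceed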
+        self.cpu_cache_client.update_pages_status_to_ready(
+            page_list=all_pages,
+            deref=False,
+            disk_offload_enable=False,
+        )
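+        # The disk still holds a copy of the newly loaded pages, so they can be made recyclable immediately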
+        if self.args.enable_disk_cache:
+            self.cpu_cache_client.mark_pages_recyclable(new_page_indexes)
+        self.cpu_cache_client.lock.release()
+        return all_pages, len(new_page_indexes)
+
+    def _handle_group_req_multi_cache_match(self, group_req_indexes: GroupReqIndexes, start_time: float):
+        """
+        Match cpu cache and disk cache pages.
+        """
+        # On timeout, give up matching cache pages.
         current_time = time.time()
         if current_time - start_time >= self.cpu_cache_time_out:
             self.send_to_router.send_pyobj(group_req_indexes, protocol=pickle.HIGHEST_PROTOCOL)
             logger.warning(
-                f"cpu cache match time out {current_time - start_time} s, "
+                f"cache matching timed out after {current_time - start_time} s, "
                 f"group_req_id: {group_req_indexes.group_req_id}"
             )
             return
@@ -96,119 +164,28 @@ def _handle_group_req_cpu_cache_match(self, group_req_indexes: GroupReqIndexes,
             if len(token_hash_list) == 0:
                 continue
 
-            req.disk_prompt_cache_len = 0
             finded_page_indexes: List[int] = []
             disk_service = (
                 self.disk_cache_worker.service
                 if (self.disk_cache_worker is not None and self.disk_cache_worker.service is not None)
                 else None
             )
-            block_capacity = disk_service._n if disk_service is not None else 1
-            if block_capacity <= 0:
-                block_capacity = 1
-
-            disk_loaded_page_indexes: List[int] = []
-            idx = 0
-            while idx < len(token_hash_list):
-                chunk_len = min(block_capacity, len(token_hash_list) - idx)
-                chunk_tokens = token_hash_list[idx : idx + chunk_len]
-                if not chunk_tokens:
-                    break
-
-                block_pages: List[int] = []
-                missing_positions: List[int] = []
-
-                self.cpu_cache_client.lock.acquire_sleep1ms()
-                for pos, token_hash_value in enumerate(chunk_tokens):
-                    page_index, ready = self.cpu_cache_client.query_one_page(token_hash_value)
-                    if page_index is not None:
-                        block_pages.append(page_index)
-                        continue
-
-                    # -1 is just a placeholder
-                    block_pages.append(-1)
-                    missing_positions.append(pos)
-                self.cpu_cache_client.lock.release()
-
-                if not missing_positions:
-                    finded_page_indexes.extend(block_pages)
-                    idx += chunk_len
-                    continue
-
-                if disk_service is None:
-                    finded_page_indexes.extend(block_pages)
-                    break
-
-                prefix_len = idx + chunk_len
-                prefix_tokens = token_hash_list[:prefix_len]
-                if not self.disk_cache_worker.blocks_exist(tokens=prefix_tokens, start_pos=idx):
-                    finded_page_indexes.extend(block_pages)
-                    break
-
-                self.cpu_cache_client.lock.acquire_sleep1ms()
-                new_page_indexes: List[int] = []
-                allocation_failed = False
-                page_items = self.cpu_cache_client.page_items.linked_items
-                for pos in missing_positions:
-                    token_hash_value = chunk_tokens[pos]
-                    page_index, ready = self.cpu_cache_client.allocate_one_page(
-                        page_items=page_items,
-                        hash_key=token_hash_value,
-                        disk_offload_enable=self.args.enable_disk_cache,
-                    )
-                    if page_index is None:
-                        allocation_failed = True
-                        break
-                    block_pages[pos] = page_index
-                    if not ready:
-                        new_page_indexes.append(page_index)
-                if allocation_failed and new_page_indexes:
-                    self.cpu_cache_client.recycle_pages(new_page_indexes)
-                self.cpu_cache_client.lock.release()
-
-                if allocation_failed:
-                    hit_pages = [p for p in block_pages if p not in new_page_indexes]
-                    finded_page_indexes.extend(hit_pages)
-                    break
-
-                pages_to_load = new_page_indexes
-                if pages_to_load:
-                    prefix_len = idx + chunk_len
-                    prefix_tokens = token_hash_list[:prefix_len]
-                    prefix_pages = finded_page_indexes + block_pages
-
-                    if not self.disk_cache_worker.load_pages(
-                        tokens=prefix_tokens, page_indexes=prefix_pages, start_pos=idx
-                    ):
-                        self.cpu_cache_client.lock.acquire_sleep1ms()
-                        self.cpu_cache_client.recycle_pages(pages_to_load)
-                        self.cpu_cache_client.lock.release()
-                        hit_pages = [p for p in block_pages if p not in pages_to_load]
-                        finded_page_indexes.extend(hit_pages)
-                        break
-
-                    self.cpu_cache_client.lock.acquire_sleep1ms()
-                    self.cpu_cache_client.update_pages_status_to_ready(
-                        page_list=block_pages,
-                        deref=False,
-                        disk_offload_enable=False,
-                    )
-                    if self.args.enable_disk_cache and pages_to_load:
-                        self.cpu_cache_client.mark_pages_recyclable(pages_to_load)
-                    self.cpu_cache_client.lock.release()
-
-                    disk_loaded_page_indexes.extend(pages_to_load)
-
-                finded_page_indexes.extend(block_pages)
-                idx += chunk_len
-
-            finded_page_indexes = [p for p in finded_page_indexes if p != -1]
+            req.disk_prompt_cache_len = 0
+
+            # Match the cpu cache first
+            all_pages = self._cpu_cache_match(token_hash_list)
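+            # A full-prefix cpu hit, or the absence of a disk backend, leaves nothing to fetch from disk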
+            if len(all_pages) == len(token_hash_list) or disk_service is None:
+                finded_page_indexes = all_pages
+            else:
+                # Match the disk cache and load the hits into the cpu cache
+                finded_page_indexes, disk_page_num = self._disk_cache_match(token_hash_list, all_pages)
+                req.disk_prompt_cache_len = disk_page_num * self.args.cpu_cache_token_page_size
+
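+            # Some matched pages may still be loading in other workers; wait until all are ready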
             while not self.cpu_cache_client.check_allpages_ready(finded_page_indexes):
                 time.sleep(0.01)
 
-            if disk_loaded_page_indexes:
-                req.disk_prompt_cache_len = len(disk_loaded_page_indexes) * self.args.cpu_cache_token_page_size
-
             req.cpu_cache_match_page_indexes.fill(finded_page_indexes)
 
         for req in reqs: