@@ -6,6 +6,10 @@
 from dataclasses import dataclass
 from pathlib import Path
 import datasets
+import os
+import time
+from contextlib import contextmanager
+from datetime import datetime, timezone
 from tqdm import tqdm

 from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, CacheEvictionConfig, AggregationMode
@@ -18,6 +22,25 @@
 from utils.longbench import dataset2maxlen, evaluate, preprocess_prompt, post_process_pred


+def _ts() -> str:
+    return datetime.now(timezone.utc).isoformat(timespec="milliseconds")
+
+
+def _log(message: str) -> None:
+    print(f"[{_ts()}] [kv-cache-eviction] {message}", flush=True)
+
+
+@contextmanager
+def _stage(name: str):
+    _log(f"START {name}")
+    start = time.perf_counter()
+    try:
+        yield
+    finally:
+        dt = time.perf_counter() - start
+        _log(f"END {name} dt={dt:.3f}s")
+
+
 def load_prompts_dataset(file_name: str) -> dict[str, list[str]]:
     TESTS_ROOT = Path(__file__).parent.parent
     file_path = TESTS_ROOT / 'data' / file_name
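A note on the `_stage` helper introduced above: because the timing sits in a `try`/`finally`, the END line with the elapsed time is logged even when the wrapped stage raises, which keeps the log useful when a download or pipeline init fails. Below is a minimal, self-contained sketch of that behavior; the local `stage` helper and the `ValueError` are illustrative stand-ins, not part of the test file.

import time
from contextlib import contextmanager

@contextmanager
def stage(name: str):
    # Same shape as _stage above: log START, time the block, always log END.
    print(f"START {name}", flush=True)
    start = time.perf_counter()
    try:
        yield
    finally:
        print(f"END {name} dt={time.perf_counter() - start:.3f}s", flush=True)

try:
    with stage("demo_stage"):
        raise ValueError("placeholder failure")
except ValueError:
    pass  # the END line was already printed by the finally block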
@@ -79,7 +102,15 @@ class CacheOptTestStruct:
 def test_cache_optimized_generation_is_similar_to_unoptimized(test_struct, apply_rotation, use_sparse_attention):
     import whowhatbench

+    if os.environ.get("HF_HUB_OFFLINE") == "1":
+        pytest.skip("HF_HUB_OFFLINE=1; cannot download/convert Hugging Face model")
+
     seqs_per_request = 32
+    _log(
+        "Test params: "
+        f"test_id={test_struct.test_id} prompt_file={test_struct.prompt_file} max_new_tokens={test_struct.max_new_tokens} "
+        f"num_kv_blocks={test_struct.num_kv_blocks} apply_rotation={apply_rotation} use_sparse_attention={use_sparse_attention}"
+    )
     scheduler_config = get_scheduler_config(test_struct.num_kv_blocks)

     generation_config = GenerationConfig()  # expecting default greedy sampling
@@ -96,32 +127,45 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(test_struct, apply
     if use_sparse_attention:
         scheduler_config_opt.sparse_attention_config.num_last_dense_tokens_in_prefill = 10

+    if scheduler_config_opt.use_cache_eviction and scheduler_config_opt.cache_eviction_config is not None:
+        cfg = scheduler_config_opt.cache_eviction_config
+        _log(
+            "Cache eviction config: "
+            f"start_size={cfg.get_start_size()} recent_size={cfg.get_recent_size()} max_cache_size={cfg.get_max_cache_size()} "
+            f"aggregation_mode={cfg.aggregation_mode} apply_rotation={cfg.apply_rotation}"
+        )
+
     model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-    model_schema = download_and_convert_model(model_id)
+    with _stage("download_and_convert_model"):
+        model_schema = download_and_convert_model(model_id)
     tokenizer = model_schema.hf_tokenizer
     models_path = model_schema.models_path
-    model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU", {}, get_default_llm_properties())
-    model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU", {}, get_default_llm_properties())
+    _log(f"Converted model path: {models_path}")
+    with _stage("init_pipelines"):
+        model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU", {}, get_default_llm_properties())
+        model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU", {}, get_default_llm_properties())

     data_dict = load_prompts_dataset(test_struct.prompt_file)

-    evaluator = whowhatbench.Evaluator(base_model=model_cb_noopt, tokenizer=tokenizer, test_data=data_dict,
-                                       generation_config=generation_config,
-                                       generation_config_base=generation_config,
-                                       max_new_tokens=test_struct.max_new_tokens, seqs_per_request=seqs_per_request)
+    with _stage("init_whowhatbench_evaluator"):
+        evaluator = whowhatbench.Evaluator(base_model=model_cb_noopt, tokenizer=tokenizer, test_data=data_dict,
+                                           generation_config=generation_config,
+                                           generation_config_base=generation_config,
+                                           max_new_tokens=test_struct.max_new_tokens, seqs_per_request=seqs_per_request)

-    _, all_metrics = evaluator.score(model_cb_opt)
+    with _stage("whowhatbench_score"):
+        _, all_metrics = evaluator.score(model_cb_opt)

     similarity_metric = float(all_metrics['similarity'][0])
     pipeline_opt_metrics = model_cb_opt.get_metrics()
     pipeline_noopt_metrics = model_cb_noopt.get_metrics()

-    print(f"Similarity: {similarity_metric}")
-    print(f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}")
-    print(f"Opt cache usage: max {pipeline_opt_metrics.max_cache_usage:.3f}, avg {pipeline_opt_metrics.avg_cache_usage:.3f}")
+    _log(f"Similarity: {similarity_metric}")
+    _log(f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}")
+    _log(f"Opt cache usage: max {pipeline_opt_metrics.max_cache_usage:.3f}, avg {pipeline_opt_metrics.avg_cache_usage:.3f}")
     max_optimization_ratio = (pipeline_noopt_metrics.max_cache_usage / pipeline_opt_metrics.max_cache_usage)
     avg_optimization_ratio = (pipeline_noopt_metrics.avg_cache_usage / pipeline_opt_metrics.avg_cache_usage)
-    print(f"Optimization ratios: max {max_optimization_ratio:.3f}x, avg {avg_optimization_ratio:.3f}x")
+    _log(f"Optimization ratios: max {max_optimization_ratio:.3f}x, avg {avg_optimization_ratio:.3f}x")

     del model_cb_opt
     del model_cb_noopt
@@ -166,14 +210,18 @@ def get_beam_search_seq_len_300() -> GenerationConfig:
 ]
 @pytest.mark.parametrize("params", scheduler_params_list)
 def test_dynamic_memory_allocation(params):
+    _log(f"test_dynamic_memory_allocation params={params[0]} generation_config={params[1]}")
     prompts, _ = get_test_dataset()
-    generate_and_compare(
-        model_schema=download_and_convert_model("facebook/opt-125m"),
-        prompts=prompts,
-        scheduler_config=params[0],
-        generation_config=params[1],
-        pipeline_type=PipelineType.CONTINUOUS_BATCHING
-    )
+    with _stage("download_and_convert_model_opt125m"):
+        model_schema = download_and_convert_model("facebook/opt-125m")
+    with _stage("generate_and_compare"):
+        generate_and_compare(
+            model_schema=model_schema,
+            prompts=prompts,
+            scheduler_config=params[0],
+            generation_config=params[1],
+            pipeline_type=PipelineType.CONTINUOUS_BATCHING
+        )


 @dataclass(frozen=True)
@@ -189,11 +237,22 @@ class LongBenchTestData:
     LongBenchTestData("trec", 3.2, 2.0, 3.3),
 ], ids=["samsum", "trec"])
 def test_optimized_generation_longbench(test_struct):
+    if os.environ.get("HF_DATASETS_OFFLINE") == "1":
+        pytest.skip("HF_DATASETS_OFFLINE=1; cannot load THUDM/LongBench")
+    if os.environ.get("HF_HUB_OFFLINE") == "1":
+        pytest.skip("HF_HUB_OFFLINE=1; cannot download/convert Hugging Face model")
+
     seqs_per_request = 32
     device = "CPU"
     num_kv_blocks = 1000 if device == "CPU" else 500
     model_id = "Qwen/Qwen2-0.5B-Instruct"
-    models_path = download_and_convert_model(model_id).models_path
+    _log(
+        "Test params: "
+        f"subset={test_struct.subset} device={device} seqs_per_request={seqs_per_request} num_kv_blocks={num_kv_blocks} model_id={model_id}"
+    )
+    with _stage("download_and_convert_model"):
+        models_path = download_and_convert_model(model_id).models_path
+    _log(f"Converted model path: {models_path}")
     scheduler_config = get_scheduler_config(num_kv_blocks)

     scheduler_config_opt = get_scheduler_config(num_kv_blocks)
@@ -202,18 +261,27 @@ def test_optimized_generation_longbench(test_struct):

     scheduler_config_opt.use_sparse_attention = True

-    model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, device, {}, get_default_llm_properties())
-    model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, device, {}, get_default_llm_properties())
+    _log(
+        "Cache eviction config (LongBench): "
+        f"start_size={LONGBENCH_CACHE_EVICTION_CONFIG.get_start_size()} recent_size={LONGBENCH_CACHE_EVICTION_CONFIG.get_recent_size()} "
+        f"max_cache_size={LONGBENCH_CACHE_EVICTION_CONFIG.get_max_cache_size()} aggregation_mode={LONGBENCH_CACHE_EVICTION_CONFIG.aggregation_mode}"
+    )
+
+    with _stage("init_pipelines"):
+        model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, device, {}, get_default_llm_properties())
+        model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, device, {}, get_default_llm_properties())

     model_name = "/".join(models_path.parts[-2:])
     subset = test_struct.subset
     max_new_tokens = dataset2maxlen[subset]
+    _log(f"model_name={model_name} max_new_tokens={max_new_tokens}")

     generation_config = GenerationConfig()  # expecting default greedy sampling
     generation_config.num_return_sequences = 1
     generation_config.max_new_tokens = max_new_tokens

-    data = datasets.load_dataset('THUDM/LongBench', subset, split='test[:32]', trust_remote_code=True)
+    with _stage("load_dataset"):
+        data = datasets.load_dataset('THUDM/LongBench', subset, split='test[:32]', trust_remote_code=True)
     with tqdm(total=len(data)) as progress_bar:
         batch = []
         answers = []
@@ -226,30 +294,35 @@ def test_optimized_generation_longbench(test_struct):
             ref_answers.append({"answers": data_sample["answers"], "all_classes": data_sample["all_classes"]})

             if len(batch) == seqs_per_request or p_idx == len(data) - 1:
-                ans_batch = model_cb_opt.generate(
-                    batch, [generation_config] * len(batch)
-                )
-                ref_ans_batch = model_cb_noopt.generate(
-                    batch, [generation_config] * len(batch)
-                )
+                _log(f"Generating batch size={len(batch)} last_index={p_idx}")
+                with _stage("opt_generate"):
+                    ans_batch = model_cb_opt.generate(
+                        batch, [generation_config] * len(batch)
+                    )
+                with _stage("ref_generate"):
+                    ref_ans_batch = model_cb_noopt.generate(
+                        batch, [generation_config] * len(batch)
+                    )
                 for i, (opt_output, ref_output) in enumerate(zip(ans_batch, ref_ans_batch), start=p_idx - len(batch) + 1):
                     answers[i]["pred"] = post_process_pred(opt_output.m_generation_ids[0], subset, model_name)
                     ref_answers[i]["pred"] = post_process_pred(ref_output.m_generation_ids[0], subset, model_name)
                 batch.clear()

-    score = evaluate(answers, subset)
-    print(f"Score: {score}")
+    with _stage("evaluate_opt"):
+        score = evaluate(answers, subset)
+    _log(f"Score: {score}")

-    ref_score = evaluate(ref_answers, subset)
-    print(f"Reference score: {ref_score}")
+    with _stage("evaluate_ref"):
+        ref_score = evaluate(ref_answers, subset)
+    _log(f"Reference score: {ref_score}")
     pipeline_opt_metrics = model_cb_opt.get_metrics()
     pipeline_noopt_metrics = model_cb_noopt.get_metrics()

-    print(f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}")
-    print(f"Opt cache usage: max {pipeline_opt_metrics.max_cache_usage:.3f}, avg {pipeline_opt_metrics.avg_cache_usage:.3f}")
+    _log(f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}")
+    _log(f"Opt cache usage: max {pipeline_opt_metrics.max_cache_usage:.3f}, avg {pipeline_opt_metrics.avg_cache_usage:.3f}")
     max_optimization_ratio = (pipeline_noopt_metrics.max_cache_usage / pipeline_opt_metrics.max_cache_usage)
     avg_optimization_ratio = (pipeline_noopt_metrics.avg_cache_usage / pipeline_opt_metrics.avg_cache_usage)
-    print(f"Optimization ratios: max {max_optimization_ratio:.3f}x, avg {avg_optimization_ratio:.3f}x")
+    _log(f"Optimization ratios: max {max_optimization_ratio:.3f}x, avg {avg_optimization_ratio:.3f}x")

     del model_cb_opt
     del model_cb_noopt
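One non-obvious detail in the batched LongBench loop above is the write-back index, `enumerate(..., start=p_idx - len(batch) + 1)`: after appending sample `p_idx`, the batch holds samples `p_idx - len(batch) + 1` through `p_idx`, so each generated output is written back to the `answers`/`ref_answers` entry it came from. A small self-contained check of that index bookkeeping, using hypothetical sizes (8 samples, batches of 3) rather than anything from the test:

# Hypothetical sizes; mirrors only the index bookkeeping of the loop above.
num_samples, seqs_per_request = 8, 3
batch, flushed = [], []
for p_idx in range(num_samples):
    batch.append(p_idx)  # stand-in for the preprocessed prompt of sample p_idx
    if len(batch) == seqs_per_request or p_idx == num_samples - 1:
        for i, sample in enumerate(batch, start=p_idx - len(batch) + 1):
            assert i == sample  # generated output for slot i maps back to answers[i]
            flushed.append(i)
        batch.clear()
assert flushed == list(range(num_samples))  # every sample written back exactly once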