@@ -232,6 +232,108 @@ class LongBenchTestData:
    avg_cache_usage_optimization_ratio: float


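+# Compares an optimized ContinuousBatchingPipeline (cache eviction + sparse attention)
+# against an unoptimized baseline on LongBench subsets, checking accuracy degradation
+# and KV-cache usage savings.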
+@pytest.mark.parametrize("test_struct", [
+    LongBenchTestData("samsum", 4, 1.6, 2.5),
+    LongBenchTestData("trec", 3.2, 2.0, 3.3),
+], ids=["samsum", "trec"])
+def test_optimized_generation_longbench_new_model(test_struct):
+    if os.environ.get("HF_DATASETS_OFFLINE") == "1":
+        pytest.skip("HF_DATASETS_OFFLINE=1; cannot load THUDM/LongBench")
+    if os.environ.get("HF_HUB_OFFLINE") == "1":
+        pytest.skip("HF_HUB_OFFLINE=1; cannot download/convert Hugging Face model")
+
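+    # Batch size per generate() call; the KV-cache block budget is larger on CPU than on other devices.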
+    seqs_per_request = 32
+    device = "CPU"
+    num_kv_blocks = 1000 if device == "CPU" else 500
+    model_id = "hf-internal-testing/tiny-random-LlamaForCausalLM"
+    _log(
+        "Test params: "
+        f"subset={test_struct.subset} device={device} seqs_per_request={seqs_per_request} num_kv_blocks={num_kv_blocks} model_id={model_id}"
+    )
+    with _stage("download_and_convert_model"):
+        models_path = download_and_convert_model(model_id).models_path
+    _log(f"Converted model path: {models_path}")
+    scheduler_config = get_scheduler_config(num_kv_blocks)
+
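+    # Optimized pipeline config: same KV-block budget, but with cache eviction
+    # (LongBench eviction settings) and sparse attention enabled.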
+    scheduler_config_opt = get_scheduler_config(num_kv_blocks)
+    scheduler_config_opt.use_cache_eviction = True
+    scheduler_config_opt.cache_eviction_config = LONGBENCH_CACHE_EVICTION_CONFIG
+
+    scheduler_config_opt.use_sparse_attention = True
+
+    _log(
+        "Cache eviction config (LongBench): "
+        f"start_size={LONGBENCH_CACHE_EVICTION_CONFIG.get_start_size()} recent_size={LONGBENCH_CACHE_EVICTION_CONFIG.get_recent_size()} "
+        f"max_cache_size={LONGBENCH_CACHE_EVICTION_CONFIG.get_max_cache_size()} aggregation_mode={LONGBENCH_CACHE_EVICTION_CONFIG.aggregation_mode}"
+    )
+
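+    # Both pipelines load the same converted model; only the scheduler config differs.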
+    with _stage("init_pipelines"):
+        model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, device, {}, get_default_llm_properties())
+        model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, device, {}, get_default_llm_properties())
+
+    model_name = "/".join(models_path.parts[-2:])
+    subset = test_struct.subset
+    max_new_tokens = dataset2maxlen[subset]
+    _log(f"model_name={model_name} max_new_tokens={max_new_tokens}")
+
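+    # Greedy decoding, one sequence per prompt; generation length is the per-subset
+    # limit from LongBench's dataset2maxlen mapping.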
+    generation_config = GenerationConfig()  # expecting default greedy sampling
+    generation_config.num_return_sequences = 1
+    generation_config.max_new_tokens = max_new_tokens
+
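+    # Only the first 32 samples of the subset's test split are used; prompts are
+    # accumulated and flushed in batches of seqs_per_request below.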
+    with _stage("load_dataset"):
+        data = datasets.load_dataset('THUDM/LongBench', subset, split='test[:32]', trust_remote_code=True)
+    with tqdm(total=len(data)) as progress_bar:
+        batch = []
+        answers = []
+        ref_answers = []
+        for p_idx, data_sample in enumerate(data):
+            prompt = preprocess_prompt(data_sample, subset, model_name)
+            progress_bar.update(1)
+            batch.append(prompt)
+            answers.append({"answers": data_sample["answers"], "all_classes": data_sample["all_classes"]})
+            ref_answers.append({"answers": data_sample["answers"], "all_classes": data_sample["all_classes"]})
+
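+            # Flush once the batch is full or the dataset is exhausted: run both
+            # pipelines on the same prompts and record post-processed predictions.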
+            if len(batch) == seqs_per_request or p_idx == len(data) - 1:
+                _log(f"Generating batch size={len(batch)} last_index={p_idx}")
+                with _stage("opt_generate"):
+                    ans_batch = model_cb_opt.generate(
+                        batch, [generation_config] * len(batch)
+                    )
+                with _stage("ref_generate"):
+                    ref_ans_batch = model_cb_noopt.generate(
+                        batch, [generation_config] * len(batch)
+                    )
+                for i, (opt_output, ref_output) in enumerate(zip(ans_batch, ref_ans_batch), start=p_idx - len(batch) + 1):
+                    answers[i]["pred"] = post_process_pred(opt_output.m_generation_ids[0], subset, model_name)
+                    ref_answers[i]["pred"] = post_process_pred(ref_output.m_generation_ids[0], subset, model_name)
+                batch.clear()
+
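+    # Score both runs with the subset-specific LongBench metric.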
+    with _stage("evaluate_opt"):
+        score = evaluate(answers, subset)
+        _log(f"Score: {score}")
+
+    with _stage("evaluate_ref"):
+        ref_score = evaluate(ref_answers, subset)
+        _log(f"Reference score: {ref_score}")
+    pipeline_opt_metrics = model_cb_opt.get_metrics()
+    pipeline_noopt_metrics = model_cb_noopt.get_metrics()
+
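+    # Ratios above 1.0 mean the optimized pipeline used proportionally less KV-cache
+    # than the unoptimized baseline.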
+    _log(f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}")
+    _log(f"Opt cache usage: max {pipeline_opt_metrics.max_cache_usage:.3f}, avg {pipeline_opt_metrics.avg_cache_usage:.3f}")
+    max_optimization_ratio = (pipeline_noopt_metrics.max_cache_usage / pipeline_opt_metrics.max_cache_usage)
+    avg_optimization_ratio = (pipeline_noopt_metrics.avg_cache_usage / pipeline_opt_metrics.avg_cache_usage)
+    _log(f"Optimization ratios: max {max_optimization_ratio:.3f}x, avg {avg_optimization_ratio:.3f}x")
+
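+    # Release both pipelines before asserting so their KV-cache memory is freed.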
+    del model_cb_opt
+    del model_cb_noopt
+    import gc
+    gc.collect()
+
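+    # The optimized run may lose at most `threshold` score points versus the baseline
+    # and must achieve at least the expected cache-usage savings.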
+    assert ref_score - score <= test_struct.threshold
+    assert max_optimization_ratio >= test_struct.max_cache_usage_optimization_ratio
+    assert avg_optimization_ratio >= test_struct.avg_cache_usage_optimization_ratio
+
+
@pytest.mark.parametrize("test_struct", [
    LongBenchTestData("samsum", 4, 1.6, 2.5),
    LongBenchTestData("trec", 3.2, 2.0, 3.3),