
Commit 6da9258

committed
Update cache eviction tests to use new model
1 parent e19c523 commit 6da9258

File tree

1 file changed: +102 -0 lines changed


tests/python_tests/test_kv_cache_eviction/test_kv_cache_eviction_1.py

Lines changed: 102 additions & 0 deletions
@@ -232,6 +232,108 @@ class LongBenchTestData:
     avg_cache_usage_optimization_ratio: float


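+# Each LongBenchTestData entry gives the LongBench subset, the allowed drop in score
+# relative to the non-optimized pipeline, and the minimum expected max/avg cache-usage
+# optimization ratios checked by the assertions at the end of the test.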
+@pytest.mark.parametrize("test_struct", [
+    LongBenchTestData("samsum", 4, 1.6, 2.5),
+    LongBenchTestData("trec", 3.2, 2.0, 3.3),
+], ids=["samsum", "trec"])
+def test_optimized_generation_longbench_new_model(test_struct):
+    if os.environ.get("HF_DATASETS_OFFLINE") == "1":
+        pytest.skip("HF_DATASETS_OFFLINE=1; cannot load THUDM/LongBench")
+    if os.environ.get("HF_HUB_OFFLINE") == "1":
+        pytest.skip("HF_HUB_OFFLINE=1; cannot download/convert Hugging Face model")
+
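+    # Test parameters: batches of 32 sequences on CPU, a fixed KV-cache block budget,
+    # and a tiny random Llama checkpoint that is downloaded and converted on the fly.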
+    seqs_per_request = 32
+    device = "CPU"
+    num_kv_blocks = 1000 if device == "CPU" else 500
+    model_id = "hf-internal-testing/tiny-random-LlamaForCausalLM"
+    _log(
+        "Test params: "
+        f"subset={test_struct.subset} device={device} seqs_per_request={seqs_per_request} num_kv_blocks={num_kv_blocks} model_id={model_id}"
+    )
+    with _stage("download_and_convert_model"):
+        models_path = download_and_convert_model(model_id).models_path
+    _log(f"Converted model path: {models_path}")
+    scheduler_config = get_scheduler_config(num_kv_blocks)
+
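+    # The optimized scheduler config differs from the reference one only by enabling
+    # cache eviction (with the LongBench eviction settings) and sparse attention.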
+    scheduler_config_opt = get_scheduler_config(num_kv_blocks)
+    scheduler_config_opt.use_cache_eviction = True
+    scheduler_config_opt.cache_eviction_config = LONGBENCH_CACHE_EVICTION_CONFIG
+
+    scheduler_config_opt.use_sparse_attention = True
+
+    _log(
+        "Cache eviction config (LongBench): "
+        f"start_size={LONGBENCH_CACHE_EVICTION_CONFIG.get_start_size()} recent_size={LONGBENCH_CACHE_EVICTION_CONFIG.get_recent_size()} "
+        f"max_cache_size={LONGBENCH_CACHE_EVICTION_CONFIG.get_max_cache_size()} aggregation_mode={LONGBENCH_CACHE_EVICTION_CONFIG.aggregation_mode}"
+    )
+
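+    # Build two pipelines over the same converted model: a reference pipeline without
+    # eviction and an optimized pipeline using the eviction/sparse-attention config.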
+    with _stage("init_pipelines"):
+        model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, device, {}, get_default_llm_properties())
+        model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, device, {}, get_default_llm_properties())
+
+    model_name = "/".join(models_path.parts[-2:])
+    subset = test_struct.subset
+    max_new_tokens = dataset2maxlen[subset]
+    _log(f"model_name={model_name} max_new_tokens={max_new_tokens}")
+
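+    # Default greedy sampling with a single returned sequence; max_new_tokens follows
+    # the per-subset LongBench limit from dataset2maxlen.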
+    generation_config = GenerationConfig()  # expecting default greedy sampling
+    generation_config.num_return_sequences = 1
+    generation_config.max_new_tokens = max_new_tokens
+
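+    # Take the first 32 LongBench samples for this subset and collect prompts plus
+    # reference answers for both pipelines.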
+    with _stage("load_dataset"):
+        data = datasets.load_dataset('THUDM/LongBench', subset, split='test[:32]', trust_remote_code=True)
+    with tqdm(total=len(data)) as progress_bar:
+        batch = []
+        answers = []
+        ref_answers = []
+        for p_idx, data_sample in enumerate(data):
+            prompt = preprocess_prompt(data_sample, subset, model_name)
+            progress_bar.update(1)
+            batch.append(prompt)
+            answers.append({"answers": data_sample["answers"], "all_classes": data_sample["all_classes"]})
+            ref_answers.append({"answers": data_sample["answers"], "all_classes": data_sample["all_classes"]})
+
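+            # Flush the batch once it reaches seqs_per_request (or at the last sample):
+            # generate with the optimized pipeline, then with the reference pipeline,
+            # and store post-processed predictions alongside the reference answers.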
+            if len(batch) == seqs_per_request or p_idx == len(data) - 1:
+                _log(f"Generating batch size={len(batch)} last_index={p_idx}")
+                with _stage("opt_generate"):
+                    ans_batch = model_cb_opt.generate(
+                        batch, [generation_config] * len(batch)
+                    )
+                with _stage("ref_generate"):
+                    ref_ans_batch = model_cb_noopt.generate(
+                        batch, [generation_config] * len(batch)
+                    )
+                for i, (opt_output, ref_output) in enumerate(zip(ans_batch, ref_ans_batch), start=p_idx - len(batch) + 1):
+                    answers[i]["pred"] = post_process_pred(opt_output.m_generation_ids[0], subset, model_name)
+                    ref_answers[i]["pred"] = post_process_pred(ref_output.m_generation_ids[0], subset, model_name)
+                batch.clear()
+
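+    # Score both prediction sets with the LongBench evaluator for this subset and
+    # collect pipeline cache-usage metrics.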
+    with _stage("evaluate_opt"):
+        score = evaluate(answers, subset)
+    _log(f"Score: {score}")
+
+    with _stage("evaluate_ref"):
+        ref_score = evaluate(ref_answers, subset)
+    _log(f"Reference score: {ref_score}")
+    pipeline_opt_metrics = model_cb_opt.get_metrics()
+    pipeline_noopt_metrics = model_cb_noopt.get_metrics()
+
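+    # Optimization ratios: how much lower the optimized pipeline's KV-cache usage is
+    # compared to the reference pipeline (higher is better).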
_log(f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}")
322+
_log(f"Opt cache usage: max {pipeline_opt_metrics.max_cache_usage:.3f}, avg {pipeline_opt_metrics.avg_cache_usage:.3f}")
323+
max_optimization_ratio = (pipeline_noopt_metrics.max_cache_usage / pipeline_opt_metrics.max_cache_usage)
324+
avg_optimization_ratio = (pipeline_noopt_metrics.avg_cache_usage / pipeline_opt_metrics.avg_cache_usage)
325+
_log(f"Optimization ratios: max {max_optimization_ratio:.3f}x, avg {avg_optimization_ratio:.3f}x")
326+
327+
+    del model_cb_opt
+    del model_cb_noopt
+    import gc
+    gc.collect()
+
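+    # The optimized pipeline may lose at most `threshold` score points versus the
+    # reference, and must meet the expected cache-usage savings.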
+    assert ref_score - score <= test_struct.threshold
+    assert max_optimization_ratio >= test_struct.max_cache_usage_optimization_ratio
+    assert avg_optimization_ratio >= test_struct.avg_cache_usage_optimization_ratio
+
+
 @pytest.mark.parametrize("test_struct", [
     LongBenchTestData("samsum", 4, 1.6, 2.5),
     LongBenchTestData("trec", 3.2, 2.0, 3.3),

0 commit comments
