
Commit 3827700

Add debug info to tests
1 parent aa84f1f commit 3827700

File tree

7 files changed: +410 -85 lines changed


tests/python_tests/test_kv_cache_eviction/test_kv_cache_eviction_1.py

Lines changed: 109 additions & 36 deletions
@@ -6,6 +6,10 @@
 from dataclasses import dataclass
 from pathlib import Path
 import datasets
+import os
+import time
+from contextlib import contextmanager
+from datetime import datetime, timezone
 from tqdm import tqdm

 from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, CacheEvictionConfig, AggregationMode
@@ -18,6 +22,25 @@
 from utils.longbench import dataset2maxlen, evaluate, preprocess_prompt, post_process_pred


+def _ts() -> str:
+    return datetime.now(timezone.utc).isoformat(timespec="milliseconds")
+
+
+def _log(message: str) -> None:
+    print(f"[{_ts()}] [kv-cache-eviction] {message}", flush=True)
+
+
+@contextmanager
+def _stage(name: str):
+    _log(f"START {name}")
+    start = time.perf_counter()
+    try:
+        yield
+    finally:
+        dt = time.perf_counter() - start
+        _log(f"END {name} dt={dt:.3f}s")
+
+
 def load_prompts_dataset(file_name : str) -> dict[str, list[str]]:
     TESTS_ROOT = Path(__file__).parent.parent
     file_path = TESTS_ROOT / 'data' / file_name
@@ -79,7 +102,15 @@ class CacheOptTestStruct:
 def test_cache_optimized_generation_is_similar_to_unoptimized(test_struct, apply_rotation, use_sparse_attention):
     import whowhatbench

+    if os.environ.get("HF_HUB_OFFLINE") == "1":
+        pytest.skip("HF_HUB_OFFLINE=1; cannot download/convert Hugging Face model")
+
     seqs_per_request = 32
+    _log(
+        "Test params: "
+        f"test_id={test_struct.test_id} prompt_file={test_struct.prompt_file} max_new_tokens={test_struct.max_new_tokens} "
+        f"num_kv_blocks={test_struct.num_kv_blocks} apply_rotation={apply_rotation} use_sparse_attention={use_sparse_attention}"
+    )
     scheduler_config = get_scheduler_config(test_struct.num_kv_blocks)

     generation_config = GenerationConfig() # expecting default greedy sampling
@@ -96,32 +127,45 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(test_struct, apply
     if use_sparse_attention:
         scheduler_config_opt.sparse_attention_config.num_last_dense_tokens_in_prefill = 10

+    if scheduler_config_opt.use_cache_eviction and scheduler_config_opt.cache_eviction_config is not None:
+        cfg = scheduler_config_opt.cache_eviction_config
+        _log(
+            "Cache eviction config: "
+            f"start_size={cfg.get_start_size()} recent_size={cfg.get_recent_size()} max_cache_size={cfg.get_max_cache_size()} "
+            f"aggregation_mode={cfg.aggregation_mode} apply_rotation={cfg.apply_rotation}"
+        )
+
     model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-    model_schema = download_and_convert_model(model_id)
+    with _stage("download_and_convert_model"):
+        model_schema = download_and_convert_model(model_id)
     tokenizer = model_schema.hf_tokenizer
     models_path = model_schema.models_path
-    model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU", {}, get_default_llm_properties())
-    model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU", {}, get_default_llm_properties())
+    _log(f"Converted model path: {models_path}")
+    with _stage("init_pipelines"):
+        model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU", {}, get_default_llm_properties())
+        model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU", {}, get_default_llm_properties())

     data_dict = load_prompts_dataset(test_struct.prompt_file)

-    evaluator = whowhatbench.Evaluator(base_model=model_cb_noopt, tokenizer=tokenizer, test_data=data_dict,
-                                       generation_config=generation_config,
-                                       generation_config_base=generation_config,
-                                       max_new_tokens=test_struct.max_new_tokens, seqs_per_request=seqs_per_request)
+    with _stage("init_whowhatbench_evaluator"):
+        evaluator = whowhatbench.Evaluator(base_model=model_cb_noopt, tokenizer=tokenizer, test_data=data_dict,
+                                           generation_config=generation_config,
+                                           generation_config_base=generation_config,
+                                           max_new_tokens=test_struct.max_new_tokens, seqs_per_request=seqs_per_request)

-    _, all_metrics = evaluator.score(model_cb_opt)
+    with _stage("whowhatbench_score"):
+        _, all_metrics = evaluator.score(model_cb_opt)

     similarity_metric = float(all_metrics['similarity'][0])
     pipeline_opt_metrics = model_cb_opt.get_metrics()
     pipeline_noopt_metrics = model_cb_noopt.get_metrics()

-    print(f"Similarity: {similarity_metric}")
-    print(f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}")
-    print(f"Opt cache usage: max {pipeline_opt_metrics.max_cache_usage:.3f}, avg {pipeline_opt_metrics.avg_cache_usage:.3f}")
+    _log(f"Similarity: {similarity_metric}")
+    _log(f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}")
+    _log(f"Opt cache usage: max {pipeline_opt_metrics.max_cache_usage:.3f}, avg {pipeline_opt_metrics.avg_cache_usage:.3f}")
     max_optimization_ratio = (pipeline_noopt_metrics.max_cache_usage / pipeline_opt_metrics.max_cache_usage)
     avg_optimization_ratio = (pipeline_noopt_metrics.avg_cache_usage / pipeline_opt_metrics.avg_cache_usage)
-    print(f"Optimization ratios: max {max_optimization_ratio:.3f}x, avg {avg_optimization_ratio:.3f}x")
+    _log(f"Optimization ratios: max {max_optimization_ratio:.3f}x, avg {avg_optimization_ratio:.3f}x")

     del model_cb_opt
     del model_cb_noopt
@@ -166,14 +210,18 @@ def get_beam_search_seq_len_300() -> GenerationConfig:
 ]
 @pytest.mark.parametrize("params", scheduler_params_list)
 def test_dynamic_memory_allocation(params):
+    _log(f"test_dynamic_memory_allocation params={params[0]} generation_config={params[1]}")
     prompts, _ = get_test_dataset()
-    generate_and_compare(
-        model_schema=download_and_convert_model("facebook/opt-125m"),
-        prompts=prompts,
-        scheduler_config=params[0],
-        generation_config=params[1],
-        pipeline_type=PipelineType.CONTINUOUS_BATCHING
-    )
+    with _stage("download_and_convert_model_opt125m"):
+        model_schema = download_and_convert_model("facebook/opt-125m")
+    with _stage("generate_and_compare"):
+        generate_and_compare(
+            model_schema=model_schema,
+            prompts=prompts,
+            scheduler_config=params[0],
+            generation_config=params[1],
+            pipeline_type=PipelineType.CONTINUOUS_BATCHING
+        )


 @dataclass(frozen=True)
@@ -189,11 +237,22 @@ class LongBenchTestData:
     LongBenchTestData("trec", 3.2, 2.0, 3.3),
 ], ids=["samsum", "trec"])
 def test_optimized_generation_longbench(test_struct):
+    if os.environ.get("HF_DATASETS_OFFLINE") == "1":
+        pytest.skip("HF_DATASETS_OFFLINE=1; cannot load THUDM/LongBench")
+    if os.environ.get("HF_HUB_OFFLINE") == "1":
+        pytest.skip("HF_HUB_OFFLINE=1; cannot download/convert Hugging Face model")
+
     seqs_per_request = 32
     device = "CPU"
     num_kv_blocks = 1000 if device == "CPU" else 500
     model_id = "Qwen/Qwen2-0.5B-Instruct"
-    models_path = download_and_convert_model(model_id).models_path
+    _log(
+        "Test params: "
+        f"subset={test_struct.subset} device={device} seqs_per_request={seqs_per_request} num_kv_blocks={num_kv_blocks} model_id={model_id}"
+    )
+    with _stage("download_and_convert_model"):
+        models_path = download_and_convert_model(model_id).models_path
+    _log(f"Converted model path: {models_path}")
     scheduler_config = get_scheduler_config(num_kv_blocks)

     scheduler_config_opt = get_scheduler_config(num_kv_blocks)
@@ -202,18 +261,27 @@ def test_optimized_generation_longbench(test_struct):

     scheduler_config_opt.use_sparse_attention = True

-    model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, device, {}, get_default_llm_properties())
-    model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, device, {}, get_default_llm_properties())
+    _log(
+        "Cache eviction config (LongBench): "
+        f"start_size={LONGBENCH_CACHE_EVICTION_CONFIG.get_start_size()} recent_size={LONGBENCH_CACHE_EVICTION_CONFIG.get_recent_size()} "
+        f"max_cache_size={LONGBENCH_CACHE_EVICTION_CONFIG.get_max_cache_size()} aggregation_mode={LONGBENCH_CACHE_EVICTION_CONFIG.aggregation_mode}"
+    )
+
+    with _stage("init_pipelines"):
+        model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, device, {}, get_default_llm_properties())
+        model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, device, {}, get_default_llm_properties())

     model_name = "/".join(models_path.parts[-2:])
     subset = test_struct.subset
     max_new_tokens = dataset2maxlen[subset]
+    _log(f"model_name={model_name} max_new_tokens={max_new_tokens}")

     generation_config = GenerationConfig() # expecting default greedy sampling
     generation_config.num_return_sequences = 1
     generation_config.max_new_tokens = max_new_tokens

-    data = datasets.load_dataset('THUDM/LongBench', subset, split='test[:32]', trust_remote_code=True)
+    with _stage("load_dataset"):
+        data = datasets.load_dataset('THUDM/LongBench', subset, split='test[:32]', trust_remote_code=True)
     with tqdm(total=len(data)) as progress_bar:
         batch = []
         answers = []
@@ -226,30 +294,35 @@
             ref_answers.append({"answers": data_sample["answers"], "all_classes": data_sample["all_classes"]})

             if len(batch) == seqs_per_request or p_idx == len(data) - 1:
-                ans_batch = model_cb_opt.generate(
-                    batch, [generation_config] * len(batch)
-                )
-                ref_ans_batch = model_cb_noopt.generate(
-                    batch, [generation_config] * len(batch)
-                )
+                _log(f"Generating batch size={len(batch)} last_index={p_idx}")
+                with _stage("opt_generate"):
+                    ans_batch = model_cb_opt.generate(
+                        batch, [generation_config] * len(batch)
+                    )
+                with _stage("ref_generate"):
+                    ref_ans_batch = model_cb_noopt.generate(
+                        batch, [generation_config] * len(batch)
+                    )
                 for i, (opt_output, ref_output) in enumerate(zip(ans_batch, ref_ans_batch), start=p_idx-len(batch)+1):
                     answers[i]["pred"] = post_process_pred(opt_output.m_generation_ids[0], subset, model_name)
                     ref_answers[i]["pred"] = post_process_pred(ref_output.m_generation_ids[0], subset, model_name)
                 batch.clear()

-    score = evaluate(answers, subset)
-    print(f"Score: {score}")
+    with _stage("evaluate_opt"):
+        score = evaluate(answers, subset)
+    _log(f"Score: {score}")

-    ref_score = evaluate(ref_answers, subset)
-    print(f"Reference score: {ref_score}")
+    with _stage("evaluate_ref"):
+        ref_score = evaluate(ref_answers, subset)
+    _log(f"Reference score: {ref_score}")
     pipeline_opt_metrics = model_cb_opt.get_metrics()
     pipeline_noopt_metrics = model_cb_noopt.get_metrics()

-    print(f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}")
-    print(f"Opt cache usage: max {pipeline_opt_metrics.max_cache_usage:.3f}, avg {pipeline_opt_metrics.avg_cache_usage:.3f}")
+    _log(f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}")
+    _log(f"Opt cache usage: max {pipeline_opt_metrics.max_cache_usage:.3f}, avg {pipeline_opt_metrics.avg_cache_usage:.3f}")
     max_optimization_ratio = (pipeline_noopt_metrics.max_cache_usage / pipeline_opt_metrics.max_cache_usage)
     avg_optimization_ratio = (pipeline_noopt_metrics.avg_cache_usage / pipeline_opt_metrics.avg_cache_usage)
-    print(f"Optimization ratios: max {max_optimization_ratio:.3f}x, avg {avg_optimization_ratio:.3f}x")
+    _log(f"Optimization ratios: max {max_optimization_ratio:.3f}x, avg {avg_optimization_ratio:.3f}x")

     del model_cb_opt
     del model_cb_noopt
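
Both test files in this commit add the same three module-private helpers: _ts() returns a UTC timestamp with millisecond precision, _log() prefixes messages with that timestamp and a [kv-cache-eviction] tag, and _stage() is a context manager that logs START/END markers around a block together with its wall-clock duration. A minimal standalone sketch of how they behave together follows; the __main__ driver and the sleep are hypothetical and only illustrate the output format the tests will now emit.

import time
from contextlib import contextmanager
from datetime import datetime, timezone


def _ts() -> str:
    # UTC timestamp such as 2025-01-01T12:00:00.000+00:00
    return datetime.now(timezone.utc).isoformat(timespec="milliseconds")


def _log(message: str) -> None:
    # flush=True keeps each line visible immediately under buffered CI output
    print(f"[{_ts()}] [kv-cache-eviction] {message}", flush=True)


@contextmanager
def _stage(name: str):
    # Log START/END markers around the wrapped block and report its duration
    _log(f"START {name}")
    start = time.perf_counter()
    try:
        yield
    finally:
        dt = time.perf_counter() - start
        _log(f"END {name} dt={dt:.3f}s")


if __name__ == "__main__":
    # Hypothetical stage; the tests wrap model conversion, pipeline init,
    # generation, and evaluation in the same way.
    with _stage("download_and_convert_model"):
        time.sleep(0.1)
    # Output shape (timestamps and duration are only approximate):
    # [<ts>] [kv-cache-eviction] START download_and_convert_model
    # [<ts>] [kv-cache-eviction] END download_and_convert_model dt=0.100s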

tests/python_tests/test_kv_cache_eviction/test_kv_cache_eviction_2.py

Lines changed: 71 additions & 14 deletions
@@ -4,6 +4,11 @@
 import datasets
 import pytest

+import os
+import time
+from contextlib import contextmanager
+from datetime import datetime, timezone
+
 from tqdm import tqdm

 from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, CacheEvictionConfig, AggregationMode, KVCrushAnchorPointMode, KVCrushConfig
@@ -14,6 +19,25 @@
 from kv_cache_eviction_utils import get_scheduler_config


+def _ts() -> str:
+    return datetime.now(timezone.utc).isoformat(timespec="milliseconds")
+
+
+def _log(message: str) -> None:
+    print(f"[{_ts()}] [kv-cache-eviction] {message}", flush=True)
+
+
+@contextmanager
+def _stage(name: str):
+    _log(f"START {name}")
+    start = time.perf_counter()
+    try:
+        yield
+    finally:
+        dt = time.perf_counter() - start
+        _log(f"END {name} dt={dt:.3f}s")
+
+
 # KVCrush test configurations
 KVCRUSH_SNAPKV_BASELINE_CONFIG = CacheEvictionConfig(
     start_size=32,
@@ -36,17 +60,35 @@
 @pytest.mark.parametrize("subset", ["samsum", "trec", "qasper"])
 def test_kvcrush_vs_snapkv_baseline_longbench(subset):
     """Test that KVCrush performs equal or better than SnapKV baseline on LongBench datasets."""
+    if os.environ.get("HF_DATASETS_OFFLINE") == "1":
+        pytest.skip("HF_DATASETS_OFFLINE=1; cannot load THUDM/LongBench")
+    if os.environ.get("HF_HUB_OFFLINE") == "1":
+        pytest.skip("HF_HUB_OFFLINE=1; cannot download/convert Hugging Face model")
+
     device = "CPU"
     seqs_per_request = 32
     num_kv_blocks = 1000 if device == "CPU" else 500
     model_id = "Qwen/Qwen2-0.5B-Instruct"
-    models_path = download_and_convert_model(model_id).models_path
+    _log(
+        "Test params: "
+        f"subset={subset} device={device} seqs_per_request={seqs_per_request} num_kv_blocks={num_kv_blocks} model_id={model_id}"
+    )
+    with _stage("download_and_convert_model"):
+        models_path = download_and_convert_model(model_id).models_path
+    _log(f"Converted model path: {models_path}")

     # Setup baseline and KVCrush configurations
     scheduler_config_baseline = get_scheduler_config(num_kv_blocks)
     scheduler_config_baseline.use_cache_eviction = True
     scheduler_config_baseline.cache_eviction_config = KVCRUSH_SNAPKV_BASELINE_CONFIG

+    _log(
+        "Baseline SnapKV config: "
+        f"start_size={KVCRUSH_SNAPKV_BASELINE_CONFIG.get_start_size()} recent_size={KVCRUSH_SNAPKV_BASELINE_CONFIG.get_recent_size()} "
+        f"max_cache_size={KVCRUSH_SNAPKV_BASELINE_CONFIG.get_max_cache_size()} snapkv_window_size={KVCRUSH_SNAPKV_BASELINE_CONFIG.snapkv_window_size} "
+        f"apply_rotation={KVCRUSH_SNAPKV_BASELINE_CONFIG.apply_rotation} kvcrush_budget={KVCRUSH_SNAPKV_BASELINE_CONFIG.kvcrush_config.budget}"
+    )
+
     scheduler_config_kvcrush = get_scheduler_config(num_kv_blocks)
     scheduler_config_kvcrush.use_cache_eviction = True
     max_cache_size, budget, anchor_mode = OPTIMAL_KVCRUSH_CONFIGS[subset]
@@ -61,18 +103,28 @@ def test_kvcrush_vs_snapkv_baseline_longbench(subset):
     )
     scheduler_config_kvcrush.cache_eviction_config = config

-    model_cb_baseline = ContinuousBatchingPipeline(models_path, scheduler_config_baseline, device, {}, get_default_llm_properties())
-    model_cb_kvcrush = ContinuousBatchingPipeline(models_path, scheduler_config_kvcrush, device, {}, get_default_llm_properties())
+    _log(
+        "KVCrush config: "
+        f"start_size={config.get_start_size()} recent_size={config.get_recent_size()} max_cache_size={config.get_max_cache_size()} "
+        f"snapkv_window_size={config.snapkv_window_size} apply_rotation={config.apply_rotation} "
+        f"kvcrush_budget={config.kvcrush_config.budget} anchor_point_mode={config.kvcrush_config.anchor_point_mode}"
+    )
+
+    with _stage("init_pipelines"):
+        model_cb_baseline = ContinuousBatchingPipeline(models_path, scheduler_config_baseline, device, {}, get_default_llm_properties())
+        model_cb_kvcrush = ContinuousBatchingPipeline(models_path, scheduler_config_kvcrush, device, {}, get_default_llm_properties())

     model_name = "/".join(models_path.parts[-2:])
     max_new_tokens = dataset2maxlen[subset]
+    _log(f"model_name={model_name} max_new_tokens={max_new_tokens}")

     generation_config = GenerationConfig()
     generation_config.num_return_sequences = 1
     generation_config.max_new_tokens = max_new_tokens
     generation_config.apply_chat_template = False

-    data = datasets.load_dataset('THUDM/LongBench', subset, split='test[:32]')
+    with _stage("load_dataset"):
+        data = datasets.load_dataset('THUDM/LongBench', subset, split='test[:32]', trust_remote_code=True)
     with tqdm(total=len(data)) as progress_bar:
         batch = []
         baseline_answers = []
@@ -85,22 +137,27 @@ def test_kvcrush_vs_snapkv_baseline_longbench(subset):
             kvcrush_answers.append({"answers": data_sample["answers"], "all_classes": data_sample["all_classes"]})

             if len(batch) == seqs_per_request or p_idx == len(data) - 1:
-                baseline_batch = model_cb_baseline.generate(
-                    batch, [generation_config] * len(batch)
-                )
-                kvcrush_batch = model_cb_kvcrush.generate(
-                    batch, [generation_config] * len(batch)
-                )
+                _log(f"Generating batch size={len(batch)} last_index={p_idx}")
+                with _stage("baseline_generate"):
+                    baseline_batch = model_cb_baseline.generate(
+                        batch, [generation_config] * len(batch)
+                    )
+                with _stage("kvcrush_generate"):
+                    kvcrush_batch = model_cb_kvcrush.generate(
+                        batch, [generation_config] * len(batch)
+                    )
                 for i, (baseline_output, kvcrush_output) in enumerate(zip(baseline_batch, kvcrush_batch), start=p_idx-len(batch)+1):
                     baseline_answers[i]["pred"] = post_process_pred(baseline_output.m_generation_ids[0], subset, model_name)
                     kvcrush_answers[i]["pred"] = post_process_pred(kvcrush_output.m_generation_ids[0], subset, model_name)
                 batch.clear()

-    baseline_score = evaluate(baseline_answers, subset)
-    kvcrush_score = evaluate(kvcrush_answers, subset)
+    with _stage("evaluate_baseline"):
+        baseline_score = evaluate(baseline_answers, subset)
+    with _stage("evaluate_kvcrush"):
+        kvcrush_score = evaluate(kvcrush_answers, subset)

-    print(f"Baseline (SnapKV) score: {baseline_score}")
-    print(f"KVCrush score: {kvcrush_score}")
+    _log(f"Baseline (SnapKV) score: {baseline_score}")
+    _log(f"KVCrush score: {kvcrush_score}")

     assert kvcrush_score >= baseline_score, f"KVCrush score ({kvcrush_score}) is worse than baseline ({baseline_score}) on {subset} dataset"

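The commit also gates these network-dependent tests on Hugging Face offline mode. A minimal, self-contained illustration of that guard pattern is sketched below; the test name is hypothetical, and in the real tests the same checks sit at the top of the existing test functions before any model download or dataset load is attempted.

import os

import pytest


def test_example_requires_hf_access():
    # Mirrors the guards added in this commit: skip early when offline mode is
    # forced, since the Hub download or LongBench load would otherwise fail mid-test.
    if os.environ.get("HF_HUB_OFFLINE") == "1":
        pytest.skip("HF_HUB_OFFLINE=1; cannot download/convert Hugging Face model")
    if os.environ.get("HF_DATASETS_OFFLINE") == "1":
        pytest.skip("HF_DATASETS_OFFLINE=1; cannot load THUDM/LongBench")
    # ...the test body runs only when neither variable is set to "1"...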