Skip to content

Commit 89bdff9

Browse files
authored
Lmcache benchmarks (#2971)
1 parent d08744c commit 89bdff9

File tree

11 files changed

+742
-3

lines changed

engines/python/setup/djl_python/chat_completions/vllm_chat_utils.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,9 +78,10 @@ def parse_chat_completions_request_vllm(
7878
default_sampling_params = rolling_batch.get_default_sampling_params()
7979
default_max_new_tokens = rolling_batch.engine.model_config.max_model_len - len(
8080
engine_prompt["prompt_token_ids"])
81+
# Use max_tokens from request if provided, otherwise use default
82+
max_tokens = chat_params.max_tokens or chat_params.max_completion_tokens or default_max_new_tokens
8183
sampling_params = chat_params.to_sampling_params(
82-
default_max_new_tokens,
83-
rolling_batch.engine.model_config.logits_processor_pattern,
84+
max_tokens, rolling_batch.engine.model_config.logits_processor_pattern,
8485
default_sampling_params)
8586
params = {
8687
"stream": chat_params.stream,

tests/integration/launch_container.sh

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,13 @@ support_nvme() {
7777
fi
7878
}
7979

80-
if [[ "$(support_nvme)" == *"true"* ]]; then
80+
# Check if DISABLE_NVME_TMP is set in docker_env file
81+
disable_nvme_tmp=false
82+
if [[ -f ${PWD}/docker_env ]] && grep -q "DISABLE_NVME_TMP=true" ${PWD}/docker_env; then
83+
disable_nvme_tmp=true
84+
fi
85+
86+
if [[ "$(support_nvme)" == *"true"* ]] && [[ "$disable_nvme_tmp" != "true" ]]; then
8187
sudo rm -rf /opt/dlami/nvme/inf_tmp || true
8288
sudo mkdir -p /opt/dlami/nvme/inf_tmp && sudo chmod 777 /opt/dlami/nvme/inf_tmp
8389
nvme="/opt/dlami/nvme/inf_tmp:/tmp"

tests/integration/llm/prepare.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -465,6 +465,124 @@
465465
"option.enable_reasoning": True,
466466
"option.reasoning_parser": "deepseek_r1",
467467
},
468+
"qwen3-8b": {
469+
"option.model_id": "Qwen/Qwen3-8B",
470+
"option.tensor_parallel_degree": 1,
471+
},
472+
"qwen3-8b-lmcache": {
473+
"option.model_id": "Qwen/Qwen3-8B",
474+
"option.tensor_parallel_degree": 1,
475+
"option.load_format": "dummy",
476+
"option.max_new_tokens": 100,
477+
"lmcache_config_file": "lmcache_qwen3_benchmark.yaml",
478+
"option.kv_transfer_config":
479+
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
480+
"load_on_devices": 0,
481+
},
482+
"qwen3-8b-baseline": {
483+
"option.model_id": "Qwen/Qwen3-8B",
484+
"option.tensor_parallel_degree": 1,
485+
"option.load_format": "dummy",
486+
"option.max_new_tokens": 100,
487+
"gpu.maxWorkers": 1,
488+
"load_on_devices": 0,
489+
},
490+
"qwen3-8b-lmcache-ebs": {
491+
"option.model_id": "Qwen/Qwen3-8B",
492+
"option.tensor_parallel_degree": 1,
493+
"option.load_format": "dummy",
494+
"option.max_new_tokens": 100,
495+
"lmcache_config_file": "lmcache_qwen3_ebs.yaml",
496+
"option.kv_transfer_config":
497+
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
498+
"load_on_devices": 0,
499+
},
500+
"qwen3-8b-lmcache-nvme": {
501+
"option.model_id": "Qwen/Qwen3-8B",
502+
"option.tensor_parallel_degree": 1,
503+
"option.load_format": "dummy",
504+
"option.max_new_tokens": 100,
505+
"lmcache_config_file": "lmcache_qwen3_nvme.yaml",
506+
"option.kv_transfer_config":
507+
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
508+
"load_on_devices": 0,
509+
},
510+
"qwen3-8b-no-cache": {
511+
"option.model_id": "Qwen/Qwen3-8B",
512+
"option.tensor_parallel_degree": 1,
513+
"option.load_format": "dummy",
514+
"option.max_new_tokens": 100,
515+
"option.enable_prefix_caching": False,
516+
"load_on_devices": 0,
517+
},
518+
"qwen3-8b-vllm-prefix-cache": {
519+
"option.model_id": "Qwen/Qwen3-8B",
520+
"option.tensor_parallel_degree": 1,
521+
"option.load_format": "dummy",
522+
"option.max_new_tokens": 100,
523+
"option.enable_prefix_caching": True,
524+
"load_on_devices": 0,
525+
},
526+
"qwen2.5-1.5b": {
527+
"option.model_id": "Qwen/Qwen2.5-1.5B",
528+
"option.tensor_parallel_degree": 1,
529+
"option.load_format": "dummy",
530+
"option.max_new_tokens": 100,
531+
},
532+
"qwen2.5-7b": {
533+
"option.model_id": "Qwen/Qwen2.5-7B",
534+
"option.tensor_parallel_degree": 1,
535+
"option.load_format": "dummy",
536+
"option.max_new_tokens": 100,
537+
},
538+
"qwen2.5-72b": {
539+
"option.model_id": "Qwen/Qwen2.5-72B",
540+
"option.tensor_parallel_degree": 4,
541+
"option.load_format": "dummy",
542+
"option.max_new_tokens": 100,
543+
},
544+
"qwen2.5-1.5b-lmcache": {
545+
"option.model_id":
546+
"Qwen/Qwen2.5-1.5B",
547+
"option.tensor_parallel_degree":
548+
1,
549+
"option.load_format":
550+
"dummy",
551+
"option.max_new_tokens":
552+
100,
553+
"lmcache_config_file":
554+
"lmcache_qwen25_1_5b.yaml",
555+
"option.kv_transfer_config":
556+
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
557+
},
558+
"qwen2.5-7b-lmcache": {
559+
"option.model_id":
560+
"Qwen/Qwen2.5-7B",
561+
"option.tensor_parallel_degree":
562+
1,
563+
"option.load_format":
564+
"dummy",
565+
"option.max_new_tokens":
566+
100,
567+
"lmcache_config_file":
568+
"lmcache_qwen25_7b.yaml",
569+
"option.kv_transfer_config":
570+
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
571+
},
572+
"qwen2.5-72b-lmcache": {
573+
"option.model_id":
574+
"Qwen/Qwen2.5-72B",
575+
"option.tensor_parallel_degree":
576+
4,
577+
"option.load_format":
578+
"dummy",
579+
"option.max_new_tokens":
580+
100,
581+
"lmcache_config_file":
582+
"lmcache_qwen25_72b.yaml",
583+
"option.kv_transfer_config":
584+
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
585+
},
468586
"tinyllama-input-len-exceeded": {
469587
"option.model_id": "s3://djl-llm/tinyllama-1.1b-chat/",
470588
"option.max_model_len": "50",

0 commit comments

Comments (0)