Commit 4302987

[Bugfix] Fix inappropriate content of model_name tag in Prometheus metrics (#3937)

5 files changed: +76 -14 lines changed

tests/metrics/test_metrics.py

Lines changed: 30 additions & 0 deletions

@@ -1,3 +1,5 @@
+from typing import List
+
 import pytest
 from prometheus_client import REGISTRY

@@ -76,6 +78,34 @@ def test_metric_counter_generation_tokens(
         f"metric: {metric_count!r}")


+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize(
+    "served_model_name",
+    [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]])
+def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
+                                   served_model_name: List[str]) -> None:
+    vllm_model = vllm_runner(model,
+                             dtype=dtype,
+                             disable_log_stats=False,
+                             gpu_memory_utilization=0.3,
+                             served_model_name=served_model_name)
+    stat_logger = vllm_model.model.llm_engine.stat_logger
+    metrics_tag_content = stat_logger.labels["model_name"]
+
+    del vllm_model
+
+    if served_model_name is None or served_model_name == []:
+        assert metrics_tag_content == model, (
+            f"Metrics tag model_name is wrong! expect: {model!r}\n"
+            f"actual: {metrics_tag_content!r}")
+    else:
+        assert metrics_tag_content == served_model_name[0], (
+            f"Metrics tag model_name is wrong! expect: "
+            f"{served_model_name[0]!r}\n"
+            f"actual: {metrics_tag_content!r}")
+
+
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [4])

vllm/config.py

Lines changed: 25 additions & 0 deletions

@@ -31,6 +31,8 @@ class ModelConfig:

     Args:
         model: Name or path of the huggingface model to use.
+            It is also used as the content of the `model_name` tag in
+            metrics output when `served_model_name` is not specified.
         tokenizer: Name or path of the huggingface tokenizer to use.
         tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
            available, and "slow" will always use the slow tokenizer.

@@ -69,6 +71,10 @@ class ModelConfig:
            to eager mode
        skip_tokenizer_init: If true, skip initialization of tokenizer and
            detokenizer.
+        served_model_name: The model name used in the metrics tag
+            `model_name`, matching the model name exposed via the APIs.
+            If multiple model names are provided, the first name is used.
+            If not specified, the model name will be the same as `model`.
     """

     def __init__(

@@ -90,6 +96,7 @@ def __init__(
         max_seq_len_to_capture: Optional[int] = None,
         max_logprobs: int = 5,
         skip_tokenizer_init: bool = False,
+        served_model_name: Optional[Union[str, List[str]]] = None,
     ) -> None:
         self.model = model
         self.tokenizer = tokenizer

@@ -117,6 +124,8 @@ def __init__(
         self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
         self.max_model_len = _get_and_verify_max_len(self.hf_text_config,
                                                      max_model_len)
+        self.served_model_name = get_served_model_name(model,
+                                                       served_model_name)
         if not self.skip_tokenizer_init:
             self._verify_tokenizer_mode()
         self._verify_quantization()

@@ -1150,6 +1159,22 @@ def _get_and_verify_max_len(
     return int(max_model_len)


+def get_served_model_name(model: str,
+                          served_model_name: Optional[Union[str, List[str]]]):
+    """
+    If the input is a non-empty list, the first model_name in
+    `served_model_name` is taken.
+    If the input is a non-empty string, it is used directly.
+    For cases where the input is either an empty string or an
+    empty list, the fallback is to use `model`.
+    """
+    if not served_model_name:
+        return model
+    if isinstance(served_model_name, list):
+        return served_model_name[0]
+    return served_model_name
+
+
 @dataclass
 class DecodingConfig:
     """Dataclass which contains the decoding strategy of the engine"""

vllm/engine/arg_utils.py

Lines changed: 18 additions & 2 deletions

@@ -1,7 +1,7 @@
 import argparse
 import dataclasses
 from dataclasses import dataclass
-from typing import Optional
+from typing import List, Optional, Union

 from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
                          EngineConfig, LoadConfig, LoRAConfig, ModelConfig,

@@ -21,6 +21,7 @@ def nullable_str(val: str):
 class EngineArgs:
     """Arguments for vLLM engine."""
     model: str
+    served_model_name: Optional[Union[List[str]]] = None
     tokenizer: Optional[str] = None
     skip_tokenizer_init: bool = False
     tokenizer_mode: str = 'auto'

@@ -489,6 +490,21 @@ def add_cli_args(
             'This should be a JSON string that will be '
             'parsed into a dictionary.')

+        parser.add_argument(
+            "--served-model-name",
+            nargs="+",
+            type=str,
+            default=None,
+            help="The model name(s) used in the API. If multiple "
+            "names are provided, the server will respond to any "
+            "of the provided names. The model name in the model "
+            "field of a response will be the first name in this "
+            "list. If not specified, the model name will be the "
+            "same as the `--model` argument. Note that this name(s) "
+            "will also be used in the `model_name` tag content of "
+            "Prometheus metrics; if multiple names are provided, the "
+            "metrics tag will take the first one.")
+
         return parser

     @classmethod

@@ -508,7 +524,7 @@ def create_engine_config(self, ) -> EngineConfig:
             self.quantization, self.quantization_param_path,
             self.enforce_eager, self.max_context_len_to_capture,
             self.max_seq_len_to_capture, self.max_logprobs,
-            self.skip_tokenizer_init)
+            self.skip_tokenizer_init, self.served_model_name)
         cache_config = CacheConfig(self.block_size,
                                    self.gpu_memory_utilization,
                                    self.swap_space, self.kv_cache_dtype,
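
A quick way to see the relocated flag being parsed into `EngineArgs` (a sketch, assuming a vLLM build with this commit; the model id and served names are illustrative):

import argparse

from vllm.engine.arg_utils import EngineArgs

# Build the vLLM CLI parser and parse the new flag.
parser = EngineArgs.add_cli_args(argparse.ArgumentParser())
args = parser.parse_args([
    "--model", "facebook/opt-125m",
    "--served-model-name", "my-served-name", "alias-1",
])

# nargs="+" yields a list; the first entry feeds the metrics tag.
print(args.served_model_name)  # ['my-served-name', 'alias-1']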

vllm/engine/llm_engine.py

Lines changed: 3 additions & 2 deletions

@@ -106,7 +106,7 @@ def __init__(
             "tensor_parallel_size=%d, disable_custom_all_reduce=%s, "
             "quantization=%s, enforce_eager=%s, kv_cache_dtype=%s, "
             "quantization_param_path=%s, device_config=%s, "
-            "decoding_config=%r, seed=%d)",
+            "decoding_config=%r, seed=%d, served_model_name=%s)",
             vllm.__version__,
             model_config.model,
             speculative_config,

@@ -129,6 +129,7 @@ def __init__(
             device_config.device,
             decoding_config,
             model_config.seed,
+            model_config.served_model_name,
         )
         # TODO(woosuk): Print more configs in debug mode.

@@ -219,7 +220,7 @@ def __init__(
         if self.log_stats:
             self.stat_logger = StatLogger(
                 local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
-                labels=dict(model_name=model_config.model),
+                labels=dict(model_name=model_config.served_model_name),
                 max_model_len=self.model_config.max_model_len)
             self.stat_logger.info("cache_config", self.cache_config)
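
With the `StatLogger` label now taken from `model_config.served_model_name`, the effect can be checked end to end roughly like this (a sketch only, assuming a GPU machine and a vLLM build with this commit; the model id and served name are illustrative):

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m",
          served_model_name=["my-served-name", "alias-1"],
          disable_log_stats=False)
llm.generate(["Hello, my name is"], SamplingParams(max_tokens=4))

# The Prometheus model_name tag now carries the first served name,
# not the raw --model path.
print(llm.llm_engine.stat_logger.labels["model_name"])  # my-served-name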

vllm/entrypoints/openai/cli_args.py

Lines changed: 0 additions & 10 deletions

@@ -56,16 +56,6 @@ def make_arg_parser():
                         default=None,
                         help="If provided, the server will require this key "
                         "to be presented in the header.")
-    parser.add_argument("--served-model-name",
-                        nargs="+",
-                        type=nullable_str,
-                        default=None,
-                        help="The model name(s) used in the API. If multiple "
-                        "names are provided, the server will respond to any "
-                        "of the provided names. The model name in the model "
-                        "field of a response will be the first name in this "
-                        "list. If not specified, the model name will be the "
-                        "same as the `--model` argument.")
     parser.add_argument(
         "--lora-modules",
         type=nullable_str,
