tests/entrypoints/openai/test_metrics.py (269 changes: 123 additions, 146 deletions)
@@ -18,25 +18,15 @@

from ...utils import RemoteOpenAIServer

MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
MODELS = {
"text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"multimodal": "HuggingFaceTB/SmolVLM-256M-Instruct",
}
PREV_MINOR_VERSION = version._prev_minor_version()


@pytest.fixture(scope="module", params=[True])
def use_v1(request):
# Module-scoped variant of run_with_both_engines
#
# Use this fixture to run a test with both v0 and v1, and
# also to conditionalize the test logic e.g.
#
# def test_metrics_exist(use_v1, server, client):
# ...
# expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS
# for metric in expected:
# assert metric in response.text
#
# @skip_v1 wouldn't work here because this is a module-level
# fixture - per-function decorators would have no effect
@pytest.fixture(scope="module", params=list(MODELS.keys()))
def model_key(request):
yield request.param
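For context on this change: because `model_key` is module-scoped and parametrized over `MODELS`, every module-scoped fixture that depends on it (such as `server` below) is built once per model key, so the whole module runs against both the text and the multimodal model. A minimal, self-contained sketch of that pattern; the names `server_stub` and `test_runs_once_per_model` are placeholders, not code from this PR:

```python
import pytest

MODELS = {
    "text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "multimodal": "HuggingFaceTB/SmolVLM-256M-Instruct",
}


@pytest.fixture(scope="module", params=list(MODELS.keys()))
def model_key(request):
    # Yields "text" on one parametrized run and "multimodal" on the other.
    yield request.param


@pytest.fixture(scope="module")
def server_stub(model_key):
    # A real fixture would start one RemoteOpenAIServer per model key; a plain
    # string keeps the sketch runnable without vLLM installed.
    yield f"server-for-{MODELS[model_key]}"


def test_runs_once_per_model(server_stub, model_key):
    # Collected twice by pytest: [text] and [multimodal].
    assert MODELS[model_key] in server_stub
```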


@@ -63,13 +53,12 @@ def default_server_args():
f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
],
)
def server(use_v1, default_server_args, request):
def server(model_key, default_server_args, request):
if request.param:
default_server_args.append(request.param)
env_dict = dict(VLLM_USE_V1="1" if use_v1 else "0")
with RemoteOpenAIServer(
MODEL_NAME, default_server_args, env_dict=env_dict
) as remote_server:

model_name = MODELS[model_key]
with RemoteOpenAIServer(model_name, default_server_args) as remote_server:
yield remote_server


@@ -80,72 +69,80 @@ async def client(server):


_PROMPT = "Hello my name is Robert and I love magic"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
_TOKENIZED_PROMPT = tokenizer(_PROMPT)["input_ids"]

_NUM_REQUESTS = 10
_NUM_PROMPT_TOKENS_PER_REQUEST = len(_TOKENIZED_PROMPT)
_NUM_GENERATION_TOKENS_PER_REQUEST = 10

# {metric_family: [(suffix, expected_value)]}
EXPECTED_VALUES = {
"vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)],
"vllm:time_per_output_token_seconds": [
("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))
],
"vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_queue_time_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_inference_time_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_prefill_time_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_decode_time_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_prompt_tokens": [
("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS),
],
"vllm:request_generation_tokens": [
("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS),
],
"vllm:request_params_n": [("_count", _NUM_REQUESTS)],
"vllm:request_params_max_tokens": [
("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS),
],
"vllm:iteration_tokens_total": [
(
"_sum",
_NUM_REQUESTS
* (_NUM_PROMPT_TOKENS_PER_REQUEST + _NUM_GENERATION_TOKENS_PER_REQUEST),
),
("_count", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
],
"vllm:prompt_tokens": [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
"vllm:generation_tokens": [
("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)
],
"vllm:request_success": [("_total", _NUM_REQUESTS)],
}
_IMAGE_URL = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"


def _get_expected_values(num_requests: int, prompt_ids: list[int], max_tokens: int):
num_prompt_tokens = len(prompt_ids)

# {metric_family: [(suffix, expected_value)]}
return {
"vllm:time_to_first_token_seconds": [("_count", num_requests)],
"vllm:time_per_output_token_seconds": [
("_count", num_requests * (max_tokens - 1))
],
"vllm:e2e_request_latency_seconds": [("_count", num_requests)],
"vllm:request_queue_time_seconds": [("_count", num_requests)],
"vllm:request_inference_time_seconds": [("_count", num_requests)],
"vllm:request_prefill_time_seconds": [("_count", num_requests)],
"vllm:request_decode_time_seconds": [("_count", num_requests)],
"vllm:request_prompt_tokens": [
("_sum", num_requests * num_prompt_tokens),
("_count", num_requests),
],
"vllm:request_generation_tokens": [
("_sum", num_requests * max_tokens),
("_count", num_requests),
],
"vllm:request_params_n": [("_count", num_requests)],
"vllm:request_params_max_tokens": [
("_sum", num_requests * max_tokens),
("_count", num_requests),
],
"vllm:iteration_tokens_total": [
(
"_sum",
num_requests * (num_prompt_tokens + max_tokens),
),
("_count", num_requests * max_tokens),
],
"vllm:prompt_tokens": [("_total", num_requests * num_prompt_tokens)],
"vllm:generation_tokens": [("_total", num_requests * num_prompt_tokens)],
"vllm:request_success": [("_total", num_requests)],
}
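As a quick sanity check on the counts this helper encodes, the snippet below works through a hypothetical case of 10 requests, a 10-token prompt, and `max_tokens=10`; the real prompt length depends on the model's tokenizer, so these numbers are illustrative only:

```python
# Hypothetical inputs; the actual prompt token count comes from the tokenizer.
num_requests, num_prompt_tokens, max_tokens = 10, 10, 10

# One time-to-first-token observation per request.
ttft_count = num_requests                                                # 10
# Time-per-output-token is observed for every generated token after the first.
tpot_count = num_requests * (max_tokens - 1)                             # 90
# Each request contributes its whole prompt plus everything it generates.
iteration_tokens_sum = num_requests * (num_prompt_tokens + max_tokens)   # 200

assert (ttft_count, tpot_count, iteration_tokens_sum) == (10, 90, 200)
```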


@pytest.mark.asyncio
async def test_metrics_counts(
server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
server: RemoteOpenAIServer,
client: openai.AsyncClient,
model_key: str,
):
for _ in range(_NUM_REQUESTS):
if model_key == "multimodal":
pytest.skip("Unnecessary test")

model_name = MODELS[model_key]
tokenizer = AutoTokenizer.from_pretrained(model_name)
prompt_ids = tokenizer.encode(_PROMPT)
num_requests = 10
max_tokens = 10

for _ in range(num_requests):
# sending a request triggers the metrics to be logged.
await client.completions.create(
model=MODEL_NAME,
prompt=_TOKENIZED_PROMPT,
max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST,
model=model_name,
prompt=prompt_ids,
max_tokens=max_tokens,
)

response = requests.get(server.url_for("metrics"))
print(response.text)
assert response.status_code == HTTPStatus.OK

# Loop over all expected metric_families
for metric_family, suffix_values_list in EXPECTED_VALUES.items():
if (use_v1 and metric_family not in EXPECTED_METRICS_V1) or (
expected_values = _get_expected_values(num_requests, prompt_ids, max_tokens)
for metric_family, suffix_values_list in expected_values.items():
if metric_family not in EXPECTED_METRICS_V1 or (
not server.show_hidden_metrics
and metric_family in HIDDEN_DEPRECATED_METRICS
):
@@ -183,62 +180,6 @@ async def test_metrics_counts(
assert found_metric, f"Did not find {metric_family} in prom endpoint"


EXPECTED_METRICS = [
"vllm:num_requests_running",
"vllm:num_requests_waiting",
"vllm:gpu_cache_usage_perc",
"vllm:time_to_first_token_seconds_sum",
"vllm:time_to_first_token_seconds_bucket",
"vllm:time_to_first_token_seconds_count",
"vllm:time_per_output_token_seconds_sum",
"vllm:time_per_output_token_seconds_bucket",
"vllm:time_per_output_token_seconds_count",
"vllm:e2e_request_latency_seconds_sum",
"vllm:e2e_request_latency_seconds_bucket",
"vllm:e2e_request_latency_seconds_count",
"vllm:request_queue_time_seconds_sum",
"vllm:request_queue_time_seconds_bucket",
"vllm:request_queue_time_seconds_count",
"vllm:request_inference_time_seconds_sum",
"vllm:request_inference_time_seconds_bucket",
"vllm:request_inference_time_seconds_count",
"vllm:request_prefill_time_seconds_sum",
"vllm:request_prefill_time_seconds_bucket",
"vllm:request_prefill_time_seconds_count",
"vllm:request_decode_time_seconds_sum",
"vllm:request_decode_time_seconds_bucket",
"vllm:request_decode_time_seconds_count",
"vllm:request_prompt_tokens_sum",
"vllm:request_prompt_tokens_bucket",
"vllm:request_prompt_tokens_count",
"vllm:request_generation_tokens_sum",
"vllm:request_generation_tokens_bucket",
"vllm:request_generation_tokens_count",
"vllm:request_params_n_sum",
"vllm:request_params_n_bucket",
"vllm:request_params_n_count",
"vllm:request_params_max_tokens_sum",
"vllm:request_params_max_tokens_bucket",
"vllm:request_params_max_tokens_count",
"vllm:iteration_tokens_total",
"vllm:num_preemptions_total",
"vllm:prompt_tokens_total",
"vllm:generation_tokens_total",
"vllm:request_success_total",
"vllm:cache_config_info",
# labels in cache_config_info
"block_size",
"cache_dtype",
"cpu_offload_gb",
"enable_prefix_caching",
"gpu_memory_utilization",
"num_cpu_blocks",
"num_gpu_blocks",
"num_gpu_blocks_override",
"sliding_window",
"swap_space_bytes",
]

EXPECTED_METRICS_V1 = [
"vllm:num_requests_running",
"vllm:num_requests_waiting",
@@ -292,6 +233,11 @@ async def test_metrics_counts(
"vllm:request_decode_time_seconds_count",
]

EXPECTED_METRICS_MM = [
"vllm:mm_cache_queries",
"vllm:mm_cache_hits",
]

HIDDEN_DEPRECATED_METRICS: list[str] = [
"vllm:gpu_cache_usage_perc",
"vllm:gpu_prefix_cache_queries",
@@ -304,28 +250,62 @@

@pytest.mark.asyncio
async def test_metrics_exist(
server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
server: RemoteOpenAIServer,
client: openai.AsyncClient,
model_key: str,
):
model_name = MODELS[model_key]

# sending a request triggers the metrics to be logged.
await client.completions.create(
model=MODEL_NAME, prompt="Hello, my name is", max_tokens=5, temperature=0.0
)
if model_key == "text":
await client.completions.create(
model=model_name,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0,
)
else:
await client.chat.completions.create(
model=model_name,
messages=[
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": _IMAGE_URL}},
{"type": "text", "text": "What's in this image?"},
],
}
],
max_tokens=5,
temperature=0.0,
)

response = requests.get(server.url_for("metrics"))
assert response.status_code == HTTPStatus.OK

for metric in EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS:
expected_metrics = EXPECTED_METRICS_V1
if model_key == "multimodal":
# NOTE: Don't use in-place assignment
expected_metrics = expected_metrics + EXPECTED_METRICS_MM

for metric in expected_metrics:
if metric in HIDDEN_DEPRECATED_METRICS and not server.show_hidden_metrics:
continue
assert metric in response.text
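The `NOTE: Don't use in-place assignment` comment above is about list aliasing: `expected_metrics` starts out as another name for the module-level `EXPECTED_METRICS_V1`, so `+=` would extend that shared list and leak the multimodal entries into later parametrized runs, while `+` builds a fresh list. A standalone illustration with placeholder values:

```python
V1_METRICS = ["vllm:num_requests_running"]
MM_METRICS = ["vllm:mm_cache_queries"]

expected = V1_METRICS
expected = expected + MM_METRICS  # new list; the module-level list is untouched
assert V1_METRICS == ["vllm:num_requests_running"]

expected = V1_METRICS
expected += MM_METRICS            # in-place extend; the shared list now carries the MM entry
assert V1_METRICS == ["vllm:num_requests_running", "vllm:mm_cache_queries"]
```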


@pytest.mark.asyncio
async def test_abort_metrics_reset(
server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
server: RemoteOpenAIServer,
client: openai.AsyncClient,
model_key: str,
):
model_name = MODELS[model_key]
tokenizer = AutoTokenizer.from_pretrained(model_name)
prompt_ids = tokenizer.encode(_PROMPT)

running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
server, use_v1
server,
)

# Expect no running requests or kvcache usage
@@ -338,8 +318,8 @@ async def test_abort_metrics_reset(
for _ in range(3):
task = asyncio.create_task(
client.completions.create(
model=MODEL_NAME,
prompt=_TOKENIZED_PROMPT,
model=model_name,
prompt=prompt_ids,
max_tokens=100, # Long generation to give time to abort
temperature=0.0,
)
@@ -351,7 +331,7 @@

# Check that we have running requests
running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
server, use_v1
server,
)

# Expect running requests and kvcache usage
@@ -371,7 +351,7 @@

# Verify running and waiting requests counts and KV cache usage are zero
running_requests_after, waiting_requests_after, kv_cache_usage_after = (
_get_running_metrics_from_api(server, use_v1)
_get_running_metrics_from_api(server)
)

assert running_requests_after == 0, (
@@ -385,7 +365,7 @@
)
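For readers unfamiliar with how this test aborts requests: it starts the completions as background tasks, cancels them client-side, and then expects the server-side gauges to drop back to zero. A minimal sketch of just the task-cancellation mechanics, with a sleep standing in for the long completion call:

```python
import asyncio


async def main() -> None:
    async def slow_request() -> str:
        # Stand-in for a long-running completion request.
        await asyncio.sleep(60)
        return "done"

    # Start the requests without awaiting them, as the test does.
    tasks = [asyncio.create_task(slow_request()) for _ in range(3)]
    await asyncio.sleep(0.1)  # give the tasks a chance to start

    # Cancelling the client-side task is what triggers the abort.
    for task in tasks:
        task.cancel()
    results = await asyncio.gather(*tasks, return_exceptions=True)
    assert all(isinstance(r, asyncio.CancelledError) for r in results)


asyncio.run(main())
```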


def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
def _get_running_metrics_from_api(server: RemoteOpenAIServer):
"""Return (running_count, waiting_count, kv_cache_usage)"""

response = requests.get(server.url_for("metrics"))
@@ -394,9 +374,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
# Verify running and waiting requests counts and KV cache usage are zero
running_requests, waiting_requests, kv_cache_usage = None, None, None

kv_cache_usage_metric = (
"vllm:kv_cache_usage_perc" if use_v1 else "vllm:gpu_cache_usage_perc"
)
kv_cache_usage_metric = "vllm:kv_cache_usage_perc"

for family in text_string_to_metric_families(response.text):
if family.name == "vllm:num_requests_running":
@@ -422,7 +400,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
return running_requests, waiting_requests, kv_cache_usage
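The helper above leans on `prometheus_client.parser.text_string_to_metric_families`, which parses the `/metrics` text exposition into metric families whose samples each carry a name, labels, and a value. A self-contained sketch of that parsing pattern; the scrape text and values are made up:

```python
from prometheus_client.parser import text_string_to_metric_families

# Hypothetical scrape output; only the shape matters here.
METRICS_TEXT = """\
# TYPE vllm:num_requests_running gauge
vllm:num_requests_running{model_name="demo"} 3.0
# TYPE vllm:kv_cache_usage_perc gauge
vllm:kv_cache_usage_perc{model_name="demo"} 0.25
"""

running = kv_cache_usage = None
for family in text_string_to_metric_families(METRICS_TEXT):
    for sample in family.samples:
        if sample.name == "vllm:num_requests_running":
            running = sample.value
        elif sample.name == "vllm:kv_cache_usage_perc":
            kv_cache_usage = sample.value

assert running == 3.0
assert kv_cache_usage == 0.25
```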


def test_metrics_exist_run_batch(use_v1: bool):
def test_metrics_exist_run_batch():
input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}""" # noqa: E501

base_url = "0.0.0.0"
@@ -452,7 +430,6 @@ def test_metrics_exist_run_batch(use_v1: bool):
"--port",
port,
],
env={"VLLM_USE_V1": "1"},
)

def is_server_up(url):