
Commit 40b6c91

RichardoMrMu, Mu Huai, hcyezhang, bbartels, and simon-mo authored
[V1] feat: add engine v1 tracing (#20372)
Signed-off-by: Mu Huai <[email protected]>
Signed-off-by: Ye Zhang <[email protected]>
Signed-off-by: RichardoMu <[email protected]>
Signed-off-by: simon-mo <[email protected]>
Signed-off-by: Aaron Pham <[email protected]>
Signed-off-by: 22quinn <[email protected]>
Co-authored-by: Mu Huai <[email protected]>
Co-authored-by: Ye Zhang <[email protected]>
Co-authored-by: Benjamin Bartels <[email protected]>
Co-authored-by: simon-mo <[email protected]>
Co-authored-by: 瑜琮 <[email protected]>
Co-authored-by: Aaron Pham <[email protected]>
Co-authored-by: 22quinn <[email protected]>
1 parent 2e6bc46 commit 40b6c91

File tree

12 files changed: +253 additions, -20 deletions


.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 1 deletion
@@ -231,7 +231,7 @@ steps:
   source_file_dependencies:
   - vllm/
   - tests/metrics
-  - tests/tracing
+  - tests/v1/tracing
   commands:
   - pytest -v -s metrics
   - "pip install \
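
For a local run outside Buildkite, the relocated suite can be driven through pytest's Python entry point. A minimal sketch, assuming the working directory is the vLLM repository root and the OpenTelemetry test dependencies are installed:

# Minimal sketch: run the relocated V1 tracing tests locally, mirroring the
# CI step above. Assumes the repo root as the current working directory.
import sys

import pytest

sys.exit(pytest.main(["-v", "-s", "tests/v1/tracing"]))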

tests/v1/tracing/test_tracing.py

Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa
# type: ignore
from __future__ import annotations

import threading
from collections.abc import Iterable
from concurrent import futures
from typing import Callable, Generator, Literal

import grpc
import pytest
from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import (
    ExportTraceServiceResponse)
from opentelemetry.proto.collector.trace.v1.trace_service_pb2_grpc import (
    TraceServiceServicer, add_TraceServiceServicer_to_server)
from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue
from opentelemetry.sdk.environment_variables import (
    OTEL_EXPORTER_OTLP_TRACES_INSECURE)

from vllm import LLM, SamplingParams
from vllm.tracing import SpanAttributes

FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"

FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value',
                    'array_value']


def decode_value(value: AnyValue):
    field_decoders: dict[FieldName, Callable] = {
        "bool_value": (lambda v: v.bool_value),
        "string_value": (lambda v: v.string_value),
        "int_value": (lambda v: v.int_value),
        "double_value": (lambda v: v.double_value),
        "array_value":
        (lambda v: [decode_value(item) for item in v.array_value.values]),
    }
    for field, decoder in field_decoders.items():
        if value.HasField(field):
            return decoder(value)
    raise ValueError(f"Couldn't decode value: {value}")


def decode_attributes(attributes: Iterable[KeyValue]):
    return {kv.key: decode_value(kv.value) for kv in attributes}


class FakeTraceService(TraceServiceServicer):

    def __init__(self):
        self.request = None
        self.evt = threading.Event()

    def Export(self, request, context):
        self.request = request
        self.evt.set()
        return ExportTraceServiceResponse()


@pytest.fixture
def trace_service() -> Generator[FakeTraceService, None, None]:
    """Fixture to set up a fake gRPC trace service"""
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
    service = FakeTraceService()
    add_TraceServiceServicer_to_server(service, server)
    server.add_insecure_port(FAKE_TRACE_SERVER_ADDRESS)
    server.start()

    yield service

    server.stop(None)


def test_traces(
    monkeypatch: pytest.MonkeyPatch,
    trace_service: FakeTraceService,
):
    with monkeypatch.context() as m:
        m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
        m.setenv("VLLM_USE_V1", "1")
        sampling_params = SamplingParams(
            temperature=0.01,
            top_p=0.1,
            max_tokens=256,
        )
        model = "facebook/opt-125m"
        llm = LLM(model=model,
                  otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
                  gpu_memory_utilization=0.3,
                  disable_log_stats=False)
        prompts = ["This is a short prompt"]
        outputs = llm.generate(prompts, sampling_params=sampling_params)
        print(f"test_traces outputs is : {outputs}")

        timeout = 10
        if not trace_service.evt.wait(timeout):
            raise TimeoutError(
                f"The fake trace service didn't receive a trace within "
                f"the {timeout} seconds timeout")

        request = trace_service.request
        assert len(request.resource_spans) == 1, (
            f"Expected 1 resource span, "
            f"but got {len(request.resource_spans)}")
        assert len(request.resource_spans[0].scope_spans) == 1, (
            f"Expected 1 scope span, "
            f"but got {len(request.resource_spans[0].scope_spans)}")
        assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
            f"Expected 1 span, "
            f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")

        attributes = decode_attributes(
            request.resource_spans[0].scope_spans[0].spans[0].attributes)
        # assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
        assert attributes.get(
            SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
                              ) == sampling_params.temperature
        assert attributes.get(
            SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
                              ) == sampling_params.max_tokens
        assert attributes.get(
            SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
        assert attributes.get(
            SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
                outputs[0].prompt_token_ids)
        completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
        assert attributes.get(
            SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens

        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) > 0
        assert attributes.get(
            SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) > 0
        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) > 0
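
Outside the test harness, the same flow can be pointed at a real OTLP gRPC collector. The sketch below is distilled from the test above; the collector address, model, and prompt are illustrative assumptions, and the only engine knob it relies on is `otlp_traces_endpoint`.

# Hedged sketch: export V1 engine spans to an OTLP gRPC collector listening
# on localhost:4317 (e.g. an OpenTelemetry Collector or Jaeger). The address
# and model are illustrative assumptions.
import os

from vllm import LLM, SamplingParams

os.environ["OTEL_EXPORTER_OTLP_TRACES_INSECURE"] = "true"  # plaintext gRPC
os.environ["VLLM_USE_V1"] = "1"  # exercise the V1 engine, as in the test

llm = LLM(model="facebook/opt-125m",
          otlp_traces_endpoint="localhost:4317")
outputs = llm.generate(["This is a short prompt"],
                       SamplingParams(temperature=0.01, max_tokens=64))
# Each finished request should surface as one span carrying the gen_ai.*
# attributes asserted in test_traces above.
print(outputs[0].outputs[0].text)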

vllm/engine/arg_utils.py

Lines changed: 0 additions & 6 deletions
@@ -1491,12 +1491,6 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
                                recommend_to_remove=False)
             return False
 
-        # No OTLP observability so far.
-        if (self.otlp_traces_endpoint or self.collect_detailed_traces):
-            _raise_or_fallback(feature_name="--otlp-traces-endpoint",
-                               recommend_to_remove=False)
-            return False
-
         # V1 supports N-gram, Medusa, and Eagle speculative decoding.
         if (self.speculative_config is not None
                 and self.speculative_config.get("method") == "draft_model"):
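
Removing this block means that configuring an OTLP endpoint (or detailed trace collection) no longer trips `_is_v1_supported_oracle`, so the engine stays on the V1 path instead of falling back. A hedged sketch of arguments that now pass the oracle; the endpoint value is a placeholder assumption:

# Sketch: engine args that previously forced a fallback away from V1 and are
# now accepted. The endpoint is an illustrative placeholder.
from vllm.engine.arg_utils import AsyncEngineArgs

engine_args = AsyncEngineArgs(
    model="facebook/opt-125m",
    otlp_traces_endpoint="localhost:4317",  # no longer rejected under V1
)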

vllm/tracing.py

Lines changed: 5 additions & 0 deletions
@@ -119,6 +119,11 @@ class SpanAttributes:
     # forward, block/sync across workers, cpu-gpu sync time and sampling time.
     GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE = (
         "gen_ai.latency.time_in_model_execute")
+    GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL = \
+        "gen_ai.latency.time_in_model_prefill"
+    GEN_AI_LATENCY_TIME_IN_MODEL_DECODE = "gen_ai.latency.time_in_model_decode"
+    GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE = \
+        "gen_ai.latency.time_in_model_inference"
 
 
 def contains_trace_headers(headers: Mapping[str, str]) -> bool:
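
These new constants are plain attribute keys; the diff does not show where the V1 engine sets them, but attaching them to a span follows the standard OpenTelemetry tracer API. A minimal sketch, assuming a reachable OTLP endpoint; the span name and timing values are also assumptions:

# Hedged sketch: recording the new latency attributes on a span.
# init_tracer and SpanAttributes come from vllm.tracing; the endpoint,
# span name, and timings below are illustrative assumptions.
from vllm.tracing import SpanAttributes, init_tracer

tracer = init_tracer("vllm.llm_engine", "localhost:4317")

with tracer.start_as_current_span("llm_request") as span:
    span.set_attribute(
        SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL, 0.012)
    span.set_attribute(
        SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_DECODE, 0.340)
    span.set_attribute(
        SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE, 0.352)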

vllm/v1/core/sched/scheduler.py

Lines changed: 1 addition & 1 deletion
@@ -969,9 +969,9 @@ def update_from_output(
                     stop_reason=request.stop_reason,
                     events=request.take_events(),
                     kv_transfer_params=kv_transfer_params,
+                    trace_headers=request.trace_headers,
                     num_cached_tokens=request.num_cached_tokens,
                 ))
-
             else:
                 # Invariant: EngineCore returns no partial prefill outputs.
                 assert not prompt_logprobs_tensors

vllm/v1/engine/__init__.py

Lines changed: 5 additions & 1 deletion
@@ -3,6 +3,7 @@
 
 import enum
 import time
+from collections.abc import Mapping
 from typing import Any, Optional, Union
 
 import msgspec
@@ -66,6 +67,8 @@ class EngineCoreRequest(
     current_wave: int = 0
     priority: int = 0
 
+    trace_headers: Optional[Mapping[str, str]] = None
+
 
 class EngineCoreEventType(enum.IntEnum):
     """The type of engine core request event."""
@@ -111,6 +114,7 @@ class EngineCoreOutput(
     events: Optional[list[EngineCoreEvent]] = None
     kv_transfer_params: Optional[dict[str, Any]] = None
 
+    trace_headers: Optional[Mapping[str, str]] = None
     # The number of tokens with prefix cache hits.
     num_cached_tokens: int = 0
 
@@ -144,7 +148,7 @@ class EngineCoreOutputs(
         omit_defaults=True,  # type: ignore[call-arg]
         gc=False):  # type: ignore[call-arg]
 
-    #NOTE(Nick): We could consider ways to make this more compact,
+    # NOTE(Nick): We could consider ways to make this more compact,
     # e.g. columnwise layout
 
     engine_index: int = 0
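
With these fields, the caller's trace headers ride along on `EngineCoreRequest` and come back on `EngineCoreOutput`. They are ordinary W3C trace-context headers; `contains_trace_headers` from `vllm.tracing` (visible in the hunk above) is the existing check for them. A short sketch; the `traceparent` value below is fabricated purely for illustration:

# Sketch: the kind of mapping that now travels as `trace_headers`.
# The traceparent value is made up for illustration only.
from vllm.tracing import contains_trace_headers

trace_headers = {
    "traceparent": "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01",
}
print(contains_trace_headers(trace_headers))  # expected: True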

vllm/v1/engine/async_llm.py

Lines changed: 8 additions & 1 deletion
@@ -26,6 +26,7 @@
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
 from vllm.tasks import SupportedTask
+from vllm.tracing import init_tracer
 from vllm.transformers_utils.config import (
     maybe_register_config_serialize_by_value)
 from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -97,6 +98,7 @@ def __init__(
 
         self.model_config = vllm_config.model_config
         self.vllm_config = vllm_config
+        self.observability_config = vllm_config.observability_config
         self.log_requests = log_requests
 
         self.log_stats = log_stats or (stat_loggers is not None)
@@ -124,6 +126,11 @@ def __init__(
         # OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
         self.output_processor = OutputProcessor(self.tokenizer,
                                                 log_stats=self.log_stats)
+        if self.observability_config.otlp_traces_endpoint is not None:
+            tracer = init_tracer(
+                "vllm.llm_engine",
+                self.observability_config.otlp_traces_endpoint)
+            self.output_processor.tracer = tracer
 
         # EngineCore (starts the engine in background process).
         self.engine_core = EngineCoreClient.make_async_mp_client(
@@ -603,7 +610,7 @@ async def get_tokenizer(
         return self.tokenizer.get_lora_tokenizer(lora_request)
 
     async def is_tracing_enabled(self) -> bool:
-        return False
+        return self.observability_config.otlp_traces_endpoint is not None
 
     async def do_log_stats(
         self,
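
On the async path the same tracer wiring applies, and `is_tracing_enabled()` now reflects the configured endpoint instead of hard-coding False. A hedged usage sketch; whether your build exposes `AsyncLLM.from_engine_args` in this form, and the model/endpoint values, are assumptions:

# Hedged sketch: observing the new is_tracing_enabled() behaviour.
# AsyncLLM.from_engine_args is assumed to exist in this form; the model and
# endpoint are placeholders.
import asyncio

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.v1.engine.async_llm import AsyncLLM


async def main() -> None:
    engine = AsyncLLM.from_engine_args(
        AsyncEngineArgs(model="facebook/opt-125m",
                        otlp_traces_endpoint="localhost:4317"))
    # True now that an OTLP endpoint is configured; the V1 engine used to
    # answer False unconditionally.
    print(await engine.is_tracing_enabled())


asyncio.run(main())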

vllm/v1/engine/llm_engine.py

Lines changed: 7 additions & 0 deletions
@@ -19,6 +19,7 @@
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
 from vllm.tasks import SupportedTask
+from vllm.tracing import init_tracer
 from vllm.transformers_utils.tokenizer_group import (
     TokenizerGroup, init_tokenizer_from_configs)
 from vllm.usage.usage_lib import UsageContext
@@ -65,6 +66,7 @@ def __init__(
             "Set VLLM_USE_V1=0 and file and issue on Github.")
 
         self.vllm_config = vllm_config
+        self.observability_config = vllm_config.observability_config
         self.model_config = vllm_config.model_config
         self.cache_config = vllm_config.cache_config
 
@@ -99,6 +101,11 @@ def __init__(
         # OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
         self.output_processor = OutputProcessor(self.tokenizer,
                                                 log_stats=self.log_stats)
+        if self.observability_config.otlp_traces_endpoint is not None:
+            tracer = init_tracer(
+                "vllm.llm_engine",
+                self.observability_config.otlp_traces_endpoint)
+            self.output_processor.tracer = tracer
 
         # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs)
         self.engine_core = EngineCoreClient.make_client(
