Skip to content

Commit 777823a

Browse files
committed
[Feat] Router: Add OpenTelemetry tracing support with W3C context propagation
Signed-off-by: Fang Han <fhan0520@gmail.com>
1 parent ab2c023 commit 777823a

File tree

14 files changed

+696
-58
lines changed

14 files changed

+696
-58
lines changed

helm/README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,14 @@ This table documents all available configuration values for the Production Stack
201201
| `routerSpec.readinessProbe.failureThreshold` | integer |`3`| Failure threshold for router's readiness probe |
202202
| `routerSpec.readinessProbe.httpGet.path` | string |`"/health"`| Endpoint that the router's readiness probe will be testing |
203203

204+
#### Router OpenTelemetry Configuration
205+
206+
| Field | Type | Default | Description |
207+
|-------|------|---------|-------------|
208+
| `routerSpec.otel.endpoint` | string | `""` | OTLP endpoint for tracing (e.g., "otel-collector:4317"). Tracing is enabled when this is set. |
209+
| `routerSpec.otel.serviceName` | string | `"vllm-router"` | Service name for OpenTelemetry traces |
210+
| `routerSpec.otel.secure` | boolean | `false` | Use secure (TLS) connection for OTLP exporter |
211+
204212
#### Router Ingress Configuration
205213

206214
| Field | Type | Default | Description |

helm/templates/deployment-router.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,15 @@ spec:
136136
- "--lmcache-controller-port"
137137
- "{{ .Values.routerSpec.lmcacheControllerPort }}"
138138
{{- end }}
139+
{{- if .Values.routerSpec.otel.endpoint }}
140+
- "--otel-endpoint"
141+
- "{{ .Values.routerSpec.otel.endpoint }}"
142+
- "--otel-service-name"
143+
- "{{ .Values.routerSpec.otel.serviceName | default "vllm-router" }}"
144+
{{- if .Values.routerSpec.otel.secure }}
145+
- "--otel-secure"
146+
{{- end }}
147+
{{- end }}
139148
{{- if .Values.routerSpec.resources }}
140149
resources:
141150
{{- if .Values.routerSpec.resources.requests }}

helm/tests/routerOtel_test.yaml

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
suite: test router OpenTelemetry configuration
2+
templates:
3+
- deployment-router.yaml
4+
tests:
5+
- it: should not include otel args when endpoint is not set
6+
set:
7+
routerSpec:
8+
enableRouter: true
9+
otel:
10+
endpoint: ""
11+
asserts:
12+
- template: deployment-router.yaml
13+
notContains:
14+
path: spec.template.spec.containers[0].args
15+
content: "--otel-endpoint"
16+
17+
- it: should include otel args when endpoint is set
18+
set:
19+
routerSpec:
20+
enableRouter: true
21+
otel:
22+
endpoint: "otel-collector:4317"
23+
serviceName: "vllm-router"
24+
secure: false
25+
asserts:
26+
- template: deployment-router.yaml
27+
contains:
28+
path: spec.template.spec.containers[0].args
29+
content: "--otel-endpoint"
30+
- template: deployment-router.yaml
31+
contains:
32+
path: spec.template.spec.containers[0].args
33+
content: "otel-collector:4317"
34+
- template: deployment-router.yaml
35+
contains:
36+
path: spec.template.spec.containers[0].args
37+
content: "--otel-service-name"
38+
- template: deployment-router.yaml
39+
contains:
40+
path: spec.template.spec.containers[0].args
41+
content: "vllm-router"
42+
- template: deployment-router.yaml
43+
notContains:
44+
path: spec.template.spec.containers[0].args
45+
content: "--otel-secure"
46+
47+
- it: should use custom service name when specified
48+
set:
49+
routerSpec:
50+
enableRouter: true
51+
otel:
52+
endpoint: "jaeger:4317"
53+
serviceName: "my-custom-router"
54+
secure: false
55+
asserts:
56+
- template: deployment-router.yaml
57+
contains:
58+
path: spec.template.spec.containers[0].args
59+
content: "my-custom-router"
60+
61+
- it: should include otel-secure flag when secure is true
62+
set:
63+
routerSpec:
64+
enableRouter: true
65+
otel:
66+
endpoint: "otel-collector:4317"
67+
serviceName: "vllm-router"
68+
secure: true
69+
asserts:
70+
- template: deployment-router.yaml
71+
contains:
72+
path: spec.template.spec.containers[0].args
73+
content: "--otel-endpoint"
74+
- template: deployment-router.yaml
75+
contains:
76+
path: spec.template.spec.containers[0].args
77+
content: "--otel-secure"

helm/values.schema.json

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -580,6 +580,26 @@
580580
"additionalProperties": {
581581
"type": "string"
582582
}
583+
},
584+
"otel": {
585+
"type": "object",
586+
"description": "OpenTelemetry tracing configuration for the router",
587+
"properties": {
588+
"endpoint": {
589+
"type": "string",
590+
"description": "OTLP endpoint for tracing (e.g., 'otel-collector:4317'). Tracing is enabled when this is set."
591+
},
592+
"serviceName": {
593+
"type": "string",
594+
"description": "Service name for OpenTelemetry traces",
595+
"default": "vllm-router"
596+
},
597+
"secure": {
598+
"type": "boolean",
599+
"description": "Use secure (TLS) connection for OTLP exporter",
600+
"default": false
601+
}
602+
}
583603
}
584604
}
585605
}

helm/values.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,16 @@ routerSpec:
377377
# -- Window size in seconds to calculate the request statistics
378378
requestStatsWindow: 60
379379

380+
# -- OpenTelemetry tracing configuration
381+
# When otel.endpoint is set, tracing is automatically enabled
382+
otel:
383+
# -- OTLP endpoint for tracing (e.g., "localhost:4317" or "otel-collector:4317")
384+
endpoint: ""
385+
# -- Service name for traces (default: "vllm-router")
386+
serviceName: "vllm-router"
387+
# -- Use secure (TLS) connection for OTLP exporter (default: false, i.e., insecure)
388+
secure: false
389+
380390
# -- deployment strategy
381391
strategy: {}
382392

src/tests/test_otel_tracing.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
import pytest
2+
from opentelemetry.trace import SpanKind
3+
4+
import vllm_router.experimental.otel.tracing as tracing_module
5+
from vllm_router.experimental.otel.tracing import (
6+
end_span,
7+
extract_context,
8+
initialize_tracing,
9+
inject_context,
10+
is_tracing_enabled,
11+
shutdown_tracing,
12+
start_span,
13+
)
14+
15+
16+
@pytest.fixture(autouse=True)
17+
def reset_tracing_state():
18+
"""Reset global tracing state before each test."""
19+
tracing_module._tracer = None
20+
tracing_module._provider = None
21+
tracing_module._tracing_enabled = False
22+
yield
23+
# Cleanup after test
24+
if tracing_module._tracing_enabled:
25+
shutdown_tracing()
26+
27+
28+
class TestTracingIntegration:
29+
def test_full_request_flow(self):
30+
"""Test a complete request tracing flow."""
31+
initialize_tracing(service_name="vllm-router", otlp_endpoint="localhost:4317")
32+
33+
# Simulate incoming request with trace context
34+
incoming_headers = {
35+
"traceparent": "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01",
36+
}
37+
incoming_context = extract_context(incoming_headers)
38+
39+
# Create parent span (router)
40+
parent_span, parent_context = start_span(
41+
name="router /v1/chat/completions",
42+
parent_context=incoming_context,
43+
kind=SpanKind.SERVER,
44+
attributes={
45+
"http.method": "POST",
46+
"vllm.model": "Qwen/Qwen2.5-7B-Instruct",
47+
},
48+
)
49+
50+
# Create child span (backend request)
51+
child_span, child_context = start_span(
52+
name="backend_request",
53+
parent_context=parent_context,
54+
kind=SpanKind.CLIENT,
55+
attributes={
56+
"http.url": "http://backend:8000/v1/chat/completions",
57+
},
58+
)
59+
60+
# Inject context into outgoing headers
61+
outgoing_headers = {}
62+
inject_context(outgoing_headers, child_context)
63+
64+
assert "traceparent" in outgoing_headers
65+
66+
# End spans in reverse order
67+
end_span(child_span, status_code=200)
68+
end_span(parent_span, status_code=200)
69+
70+
def test_tracing_disabled_flow(self):
71+
"""Test that operations handle disabled tracing gracefully."""
72+
assert is_tracing_enabled() is False
73+
74+
# These should not raise even when tracing is disabled
75+
headers = {}
76+
inject_context(headers)
77+
end_span(None)
78+
79+
80+
if __name__ == "__main__":
81+
pytest.main([__file__, "-v"])

src/vllm_router/app.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,18 @@
7979
except ImportError:
8080
semantic_cache_available = False
8181

82+
try:
83+
# OpenTelemetry tracing integration
84+
from vllm_router.experimental.otel import (
85+
initialize_tracing,
86+
is_tracing_enabled,
87+
shutdown_tracing,
88+
)
89+
90+
otel_available = True
91+
except ImportError:
92+
otel_available = False
93+
8294
logger = logging.getLogger("uvicorn")
8395

8496

@@ -121,6 +133,11 @@ async def lifespan(app: FastAPI):
121133
logger.info("Closing routing logic instances")
122134
cleanup_routing_logic()
123135

136+
# Shut down OpenTelemetry tracing if enabled
137+
if otel_available and app.state.otel_enabled:
138+
logger.info("Shutting down OpenTelemetry tracing")
139+
shutdown_tracing()
140+
124141

125142
def initialize_all(app: FastAPI, args):
126143
"""
@@ -142,6 +159,23 @@ def initialize_all(app: FastAPI, args):
142159
profile_session_sample_rate=args.sentry_profile_session_sample_rate,
143160
)
144161

162+
if otel_available and args.otel_endpoint:
163+
initialize_tracing(
164+
service_name=args.otel_service_name,
165+
otlp_endpoint=args.otel_endpoint,
166+
insecure=not args.otel_secure,
167+
)
168+
app.state.otel_enabled = is_tracing_enabled()
169+
if app.state.otel_enabled:
170+
logger.info(
171+
f"OpenTelemetry tracing enabled, exporting to {args.otel_endpoint}"
172+
)
173+
elif args.otel_endpoint and not otel_available:
174+
logger.warning(
175+
"OpenTelemetry endpoint specified but OpenTelemetry packages not installed. "
176+
"Install with: pip install opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp"
177+
)
178+
145179
if args.service_discovery == "static":
146180
initialize_service_discovery(
147181
ServiceDiscoveryType.STATIC,
@@ -292,6 +326,7 @@ def initialize_all(app: FastAPI, args):
292326
app.include_router(metrics_router)
293327
app.state.aiohttp_client_wrapper = AiohttpClientWrapper()
294328
app.state.semantic_cache_available = semantic_cache_available
329+
app.state.otel_enabled = False
295330

296331

297332
def main():
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Copyright 2024-2025 The vLLM Production Stack Authors.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""OpenTelemetry tracing module for vLLM Router."""
16+
17+
from vllm_router.experimental.otel.tracing import (
18+
end_span,
19+
extract_context,
20+
get_tracer,
21+
initialize_tracing,
22+
inject_context,
23+
is_tracing_enabled,
24+
shutdown_tracing,
25+
start_span,
26+
)
27+
28+
__all__ = [
29+
"initialize_tracing",
30+
"shutdown_tracing",
31+
"get_tracer",
32+
"is_tracing_enabled",
33+
"extract_context",
34+
"inject_context",
35+
"start_span",
36+
"end_span",
37+
]

0 commit comments

Comments
 (0)