
Commit 8eba2f0

Remove unused metrics and update comments
1 parent 468539f commit 8eba2f0

File tree: 2 files changed (+60, -117 lines)

ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
src/utils/metrics.py

ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py

Lines changed: 32 additions & 43 deletions
@@ -37,79 +37,68 @@
 sys.path.append("../../common")
 from test_util import TestResultCollector, UserData, callback, create_vllm_request

-_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost")

-PROMPTS = [
-    "The most dangerous animal is",
-    "The capital of France is",
-    "The future of AI is",
-]
-SAMPLING_PARAMETERS = {"temperature": "0", "top_p": "1"}
-
-
-def get_metrics():
-    """
-    Store vllm metrics in a dictionary.
-    """
-    r = requests.get(f"http://{_tritonserver_ipaddr}:8002/metrics")
-    r.raise_for_status()
-
-    # Regular expression to match the pattern
-    pattern = r"^(vllm:.*){.*} (\d+)$"
-    vllm_dict = {}
+class VLLMTritonMetricsTest(TestResultCollector):
+    def setUp(self):
+        self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
+        self.tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost")
+        self.vllm_model_name = "vllm_opt"
+        self.prompts = [
+            "The most dangerous animal is",
+            "The capital of France is",
+            "The future of AI is",
+        ]
+        self.sampling_parameters = {"temperature": "0", "top_p": "1"}

-    # Find all matches in the text
-    matches = re.findall(pattern, r.text, re.MULTILINE)
+    def get_metrics(self):
+        """
+        Store vllm metrics in a dictionary.
+        """
+        r = requests.get(f"http://{self.tritonserver_ipaddr}:8002/metrics")
+        r.raise_for_status()

-    for match in matches:
-        key, value = match
-        vllm_dict[key] = int(value)
+        # Regular expression to match the pattern
+        pattern = r"^(vllm:.*){.*} (\d+)$"
+        vllm_dict = {}

-    return vllm_dict
+        # Find all matches in the text
+        matches = re.findall(pattern, r.text, re.MULTILINE)

+        for match in matches:
+            key, value = match
+            vllm_dict[key] = int(value)

-class VLLMTritonMetricsTest(TestResultCollector):
-    def setUp(self):
-        self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
-        self.vllm_model_name = "vllm_opt"
+        return vllm_dict

     def test_vllm_metrics(self):
         # Supported vLLM metrics
         expected_metrics_dict = {
-            "vllm:num_requests_running": 0,
-            "vllm:num_requests_waiting": 0,
-            "vllm:num_requests_swapped": 0,
-            "vllm:gpu_cache_usage_perc": 0,
-            "vllm:cpu_cache_usage_perc": 0,
-            "vllm:num_preemptions_total": 0,
             "vllm:prompt_tokens_total": 0,
             "vllm:generation_tokens_total": 0,
         }

         # Test vLLM metrics
         self._test_vllm_model(
-            prompts=PROMPTS,
-            sampling_parameters=SAMPLING_PARAMETERS,
+            prompts=self.prompts,
+            sampling_parameters=self.sampling_parameters,
             stream=False,
             send_parameters_as_tensor=True,
             model_name=self.vllm_model_name,
         )
         expected_metrics_dict["vllm:prompt_tokens_total"] = 18
         expected_metrics_dict["vllm:generation_tokens_total"] = 48
-        print(get_metrics())
-        print(expected_metrics_dict)
-        self.assertEqual(get_metrics(), expected_metrics_dict)
+        self.assertEqual(self.get_metrics(), expected_metrics_dict)

         self._test_vllm_model(
-            prompts=PROMPTS,
-            sampling_parameters=SAMPLING_PARAMETERS,
+            prompts=self.prompts,
+            sampling_parameters=self.sampling_parameters,
             stream=False,
             send_parameters_as_tensor=False,
             model_name=self.vllm_model_name,
         )
         expected_metrics_dict["vllm:prompt_tokens_total"] = 36
         expected_metrics_dict["vllm:generation_tokens_total"] = 96
-        self.assertEqual(get_metrics(), expected_metrics_dict)
+        self.assertEqual(self.get_metrics(), expected_metrics_dict)

     def _test_vllm_model(
         self,
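
A minimal, runnable sketch of what the refactored get_metrics() extracts, assuming the /metrics endpoint serves the Prometheus text exposition format; the sample payload and its model/version labels are fabricated for illustration, while the pattern and the dictionary-building loop mirror the test above.

import re

# Fabricated scrape output; real label sets may differ.
sample = """\
# HELP vllm:prompt_tokens_total Number of prefill tokens processed.
# TYPE vllm:prompt_tokens_total counter
vllm:prompt_tokens_total{model="vllm_opt",version="1"} 18
vllm:generation_tokens_total{model="vllm_opt",version="1"} 48
"""

# Same pattern as the test: a vllm-prefixed name, one label block, an integer value.
pattern = r"^(vllm:.*){.*} (\d+)$"
matches = re.findall(pattern, sample, re.MULTILINE)
vllm_dict = {key: int(value) for key, value in matches}

print(vllm_dict)
# {'vllm:prompt_tokens_total': 18, 'vllm:generation_tokens_total': 48}

Note that (\d+)$ only matches integer-formatted samples; a value exported as 18.0 would be skipped by this pattern.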

src/utils/metrics.py

Lines changed: 28 additions & 74 deletions
@@ -34,41 +34,8 @@

 class TritonMetrics:
     def __init__(self, labels):
-        # System stats
-        # Scheduler State
-        self.gauge_scheduler_running_family = pb_utils.MetricFamily(
-            name="vllm:num_requests_running",
-            description="Number of requests currently running on GPU.",
-            kind=pb_utils.MetricFamily.GAUGE,
-        )
-        self.gauge_scheduler_waiting_family = pb_utils.MetricFamily(
-            name="vllm:num_requests_waiting",
-            description="Number of requests waiting to be processed.",
-            kind=pb_utils.MetricFamily.GAUGE,
-        )
-        self.gauge_scheduler_swapped_family = pb_utils.MetricFamily(
-            name="vllm:num_requests_swapped",
-            description="Number of requests swapped to CPU.",
-            kind=pb_utils.MetricFamily.GAUGE,
-        )
-        # KV Cache Usage in %
-        self.gauge_gpu_cache_usage_family = pb_utils.MetricFamily(
-            name="vllm:gpu_cache_usage_perc",
-            description="GPU KV-cache usage. 1 means 100 percent usage.",
-            kind=pb_utils.MetricFamily.GAUGE,
-        )
-        self.gauge_cpu_cache_usage_family = pb_utils.MetricFamily(
-            name="vllm:cpu_cache_usage_perc",
-            description="CPU KV-cache usage. 1 means 100 percent usage.",
-            kind=pb_utils.MetricFamily.GAUGE,
-        )
-
+        # Initialize metric families
         # Iteration stats
-        self.counter_num_preemption_family = pb_utils.MetricFamily(
-            name="vllm:num_preemptions_total",
-            description="Cumulative number of preemption from the engine.",
-            kind=pb_utils.MetricFamily.COUNTER,
-        )
         self.counter_prompt_tokens_family = pb_utils.MetricFamily(
             name="vllm:prompt_tokens_total",
             description="Number of prefill tokens processed.",
@@ -80,29 +47,8 @@ def __init__(self, labels):
             kind=pb_utils.MetricFamily.COUNTER,
         )

-        # System stats
-        # Scheduler State
-        self.gauge_scheduler_running = self.gauge_scheduler_running_family.Metric(
-            labels=labels
-        )
-        self.gauge_scheduler_waiting = self.gauge_scheduler_waiting_family.Metric(
-            labels=labels
-        )
-        self.gauge_scheduler_swapped = self.gauge_scheduler_swapped_family.Metric(
-            labels=labels
-        )
-        # KV Cache Usage in %
-        self.gauge_gpu_cache_usage = self.gauge_gpu_cache_usage_family.Metric(
-            labels=labels
-        )
-        self.gauge_cpu_cache_usage = self.gauge_cpu_cache_usage_family.Metric(
-            labels=labels
-        )
-
+        # Initialize metrics
         # Iteration stats
-        self.counter_num_preemption = self.counter_num_preemption_family.Metric(
-            labels=labels
-        )
         self.counter_prompt_tokens = self.counter_prompt_tokens_family.Metric(
             labels=labels
         )
@@ -124,30 +70,38 @@ def info(self, type: str, obj: SupportsMetricsInfo) -> None:
         raise NotImplementedError

     def _log_gauge(self, gauge, data: Union[int, float]) -> None:
-        # Convenience function for logging to gauge.
+        """Convenience function for logging to gauge.
+
+        Args:
+            gauge: A gauge metric instance.
+            data: An int or float to set the gauge metric.
+
+        Returns:
+            None
+        """
         gauge.set(data)

     def _log_counter(self, counter, data: Union[int, float]) -> None:
-        # Convenience function for logging to counter.
-        counter.increment(data)
+        """Convenience function for logging to counter.
+
+        Args:
+            counter: A counter metric instance.
+            data: An int or float to increment the count metric.

-    def _log_histogram(self, histogram, data: Union[List[int], List[float]]) -> None:
-        # Convenience function for logging list to histogram.
-        for datum in data:
-            histogram.observe(datum)
+        Returns:
+            None
+        """
+        counter.increment(data)

     def log(self, stats: VllmStats) -> None:
-        # System state data
-        self._log_gauge(self.metrics.gauge_scheduler_running, stats.num_running_sys)
-        self._log_gauge(self.metrics.gauge_scheduler_waiting, stats.num_waiting_sys)
-        self._log_gauge(self.metrics.gauge_scheduler_swapped, stats.num_swapped_sys)
-        self._log_gauge(self.metrics.gauge_gpu_cache_usage, stats.gpu_cache_usage_sys)
-        self._log_gauge(self.metrics.gauge_cpu_cache_usage, stats.cpu_cache_usage_sys)
-
-        # Iteration level data
-        self._log_counter(
-            self.metrics.counter_num_preemption, stats.num_preemption_iter
-        )
+        """Logs tracked stats to triton metrics server every iteration.
+
+        Args:
+            stats: Created by LLMEngine for use by VllmStatLogger.
+
+        Returns:
+            None
+        """
         self._log_counter(
             self.metrics.counter_prompt_tokens, stats.num_prompt_tokens_iter
         )
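
To see the surviving logging path end to end, here is a self-contained sketch: pb_utils metric objects and vLLM's Stats are replaced with stand-ins, and the field num_generation_tokens_iter (cut off in the hunk above) is an assumed name mirroring num_prompt_tokens_iter.

from dataclasses import dataclass
from typing import Union


class FakeCounter:
    # Stand-in for a pb_utils counter Metric; only increment() is needed here.
    def __init__(self) -> None:
        self.value = 0

    def increment(self, data: Union[int, float]) -> None:
        self.value += data


@dataclass
class FakeStats:
    # Assumed field names, mirroring the stats consumed by log() above.
    num_prompt_tokens_iter: int
    num_generation_tokens_iter: int


def log(prompt: FakeCounter, generation: FakeCounter, stats: FakeStats) -> None:
    # Mirrors VllmStatLogger.log() after this commit: the gauges and the
    # preemption counter are gone, leaving only the two token counters.
    prompt.increment(stats.num_prompt_tokens_iter)
    generation.increment(stats.num_generation_tokens_iter)


prompt_tokens, generation_tokens = FakeCounter(), FakeCounter()
for _ in range(2):  # two rounds of requests, as in test_vllm_metrics
    log(prompt_tokens, generation_tokens, FakeStats(18, 48))
print(prompt_tokens.value, generation_tokens.value)  # 36 96

Counters accumulate rather than reset, which is why test_vllm_metrics expects the totals to double (18 to 36 prompt tokens, 48 to 96 generation tokens) after its second round of requests.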
