
Commit fb056b1

test: Refactor cpu metrics tests to make L0_metrics more stable (#7476)
1 parent: ceec296

5 files changed: +261 -91 lines changed

qa/L0_metrics/cpu_metrics_test.py

Lines changed: 187 additions & 0 deletions
@@ -0,0 +1,187 @@
#!/usr/bin/python
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import os
import re
import threading
import time
import unittest
from collections import defaultdict

import numpy as np
import requests
import tritonclient.http as httpclient

_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost")
CPU_UTILIZATION = "nv_cpu_utilization"
CPU_USED_MEMORY = "nv_cpu_memory_used_bytes"
CPU_TOTAL_MEMORY = "nv_cpu_memory_total_bytes"


def get_metrics():
    utilization_pattern = re.compile(rf"{CPU_UTILIZATION} (\d+\.?\d*)")
    used_bytes_pattern = re.compile(rf"{CPU_USED_MEMORY} (\d+)")
    total_bytes_pattern = re.compile(rf"{CPU_TOTAL_MEMORY} (\d+)")

    r = requests.get(f"http://{_tritonserver_ipaddr}:8002/metrics")
    r.raise_for_status()

    utilization_match = utilization_pattern.search(r.text)
    utilization_value = float(utilization_match.group(1))

    used_bytes_match = used_bytes_pattern.search(r.text)
    used_bytes_value = int(used_bytes_match.group(1))

    total_bytes_match = total_bytes_pattern.search(r.text)
    total_bytes_value = int(total_bytes_match.group(1))

    return utilization_value, used_bytes_value, total_bytes_value


class TestCpuMetrics(unittest.TestCase):
    def setUp(self):
        self.inference_completed = threading.Event()

        shape = [1, 16]
        self.model_name = "libtorch_float32_float32_float32"
        input0_data = np.random.rand(*shape).astype(np.float32)
        input1_data = np.random.rand(*shape).astype(np.float32)

        self.inputs = [
            httpclient.InferInput(
                "INPUT0", input0_data.shape, "FP32"
            ).set_data_from_numpy(input0_data),
            httpclient.InferInput(
                "INPUT1", input1_data.shape, "FP32"
            ).set_data_from_numpy(input1_data),
        ]

    def _validate_metric_variance(self, observed_metrics: dict):
        dupe_value_tolerance = 5
        for metric in [CPU_UTILIZATION, CPU_USED_MEMORY]:
            observed_values = observed_metrics[metric]
            observed_count = len(observed_values)
            print(
                f"Observed {metric} count: {observed_count}, values: {observed_values}"
            )

            # Must have at least 1 more than the duplicate tolerance
            self.assertGreater(
                observed_count,
                dupe_value_tolerance,
                f"Found too many sequential duplicate values for {metric}. Double check the server-side --metrics-interval and observation interval in this test, or consider tuning the duplicate tolerance.",
            )

            # Don't allow observed metric values to be repeated sequentially
            # more than a certain tolerance. The expectation is that these metrics
            # will vary while the server is processing requests in the background,
            # provided the server was configured with a small metrics update interval.
            sequential_dupes = 0
            max_sequential_dupes = 0
            prev_value = observed_values[0]
            for value in observed_values[1:]:
                if value == prev_value:
                    sequential_dupes += 1
                else:
                    # If unique value found, reset counter
                    sequential_dupes = 0

                # For future observability on dupe frequency to tune the tolerance
                if sequential_dupes > max_sequential_dupes:
                    max_sequential_dupes = sequential_dupes

                self.assertLess(sequential_dupes, dupe_value_tolerance)
                prev_value = value

            print(
                f"Max sequential duplicate values found for {metric}: {max_sequential_dupes}"
            )

    def _collect_metrics(self, observed_metrics, interval_secs=1):
        """
        Collects metrics at provided 'interval_secs' and stores them in the
        provided 'observed_metrics' dictionary for postprocessing.
        """
        # Give the test and server some time to begin processing requests
        # before beginning observation loop.
        time.sleep(1)

        while not self.inference_completed.is_set():
            util_value, used_memory_value, _ = get_metrics()
            observed_metrics[CPU_UTILIZATION].append(util_value)
            observed_metrics[CPU_USED_MEMORY].append(used_memory_value)
            time.sleep(interval_secs)

    def test_cpu_metrics_during_inference(self):
        with httpclient.InferenceServerClient(
            url=f"{_tritonserver_ipaddr}:8000", concurrency=10
        ) as client:
            # Start a thread to collect metrics asynchronously while inferences are
            # executing, store them in a dictionary for postprocessing validation.
            observed_metrics = defaultdict(list)
            metrics_thread = threading.Thread(
                target=self._collect_metrics, args=(observed_metrics,)
            )
            metrics_thread.start()

            # Fire off many asynchronous inference requests to keep server
            # busy while monitoring the CPU metrics. Ideal target is about
            # 20-30 seconds of inference to get a good number of metric samples.
            async_requests = []
            for _ in range(2000):
                async_requests.append(
                    client.async_infer(
                        model_name=self.model_name,
                        inputs=self.inputs,
                    )
                )

            # Wait for all inference requests to complete
            for async_request in async_requests:
                async_request.get_result()

            # Set the event to indicate that inference is completed
            self.inference_completed.set()

            # Wait for the metrics thread to complete
            metrics_thread.join()

            self._validate_metric_variance(observed_metrics)

    def test_cpu_metrics_ranges(self):
        # Test some simple sanity checks on the expected ranges of values
        # for the CPU related metrics.
        utilization, used_memory, total_memory = get_metrics()
        self.assertTrue(0 <= utilization <= 1.0)
        self.assertTrue(0 <= used_memory <= total_memory)
        # NOTE: Can be improved in future to compare upper bound against psutil
        # system memory if we introduce the dependency into the test/container.
        self.assertGreater(total_memory, 0)


if __name__ == "__main__":
    unittest.main()
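For reference, get_metrics() above assumes Triton's Prometheus text exposition format, where each unlabeled metric name is followed by a single numeric value on the same line. A minimal sketch of that parsing contract, using an illustrative payload rather than real server output:

import re

# Illustrative /metrics payload; real values depend on the host.
SAMPLE = (
    "nv_cpu_utilization 0.42\n"
    "nv_cpu_memory_used_bytes 5368709120\n"
    "nv_cpu_memory_total_bytes 17179869184\n"
)

utilization = float(re.search(r"nv_cpu_utilization (\d+\.?\d*)", SAMPLE).group(1))
used_bytes = int(re.search(r"nv_cpu_memory_used_bytes (\d+)", SAMPLE).group(1))
total_bytes = int(re.search(r"nv_cpu_memory_total_bytes (\d+)", SAMPLE).group(1))

assert utilization == 0.42
assert 0 <= used_bytes <= total_bytes

Labeled metrics (those with {...} label sets) would need a richer pattern; these patterns only target the unlabeled CPU metrics.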

qa/L0_metrics/metrics_config_test.py

Lines changed: 3 additions & 1 deletion
@@ -35,6 +35,8 @@
 import requests
 import test_util as tu
 
+_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost")
+
 INF_COUNTER_PATTERNS = [
     "nv_inference_request_duration",
     "nv_inference_queue_duration",
@@ -64,7 +66,7 @@
 
 class MetricsConfigTest(tu.TestResultCollector):
     def _get_metrics(self):
-        metrics_url = "http://localhost:8002/metrics"
+        metrics_url = f"http://{_tritonserver_ipaddr}:8002/metrics"
         r = requests.get(metrics_url)
         r.raise_for_status()
         return r.text
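The host override added here (and in the files below) is read once at module import, so TRITONSERVER_IPADDR must be set in the environment before the test process starts. A minimal sketch of the pattern, with a hypothetical address:

import os

# Hypothetical remote host; any reachable Triton server address works here.
os.environ["TRITONSERVER_IPADDR"] = "10.0.0.5"

# Mirrors the module-level lookup the tests perform.
_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost")
metrics_url = f"http://{_tritonserver_ipaddr}:8002/metrics"
print(metrics_url)  # http://10.0.0.5:8002/metrics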

qa/L0_metrics/metrics_queue_size_test.py

Lines changed: 5 additions & 2 deletions
@@ -25,6 +25,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import os
 import sys
 
 sys.path.append("../common")
@@ -40,6 +41,8 @@
 import tritonclient.http
 from tritonclient.utils import triton_to_np_dtype
 
+_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost")
+
 QUEUE_METRIC_TEMPLATE = (
     'nv_inference_pending_request_count{{model="{model_name}",version="1"}}'
 )
@@ -50,8 +53,8 @@
 class MetricsPendingRequestCountTest(tu.TestResultCollector):
     def setUp(self):
         self.metrics = None
-        self.metrics_url = "http://localhost:8002/metrics"
-        self.server_url = "localhost:8000"
+        self.metrics_url = f"http://{_tritonserver_ipaddr}:8002/metrics"
+        self.server_url = f"{_tritonserver_ipaddr}:8000"
 
         # Used to verify model config is set to expected values
         self.max_batch_size = 4
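Note the doubled braces in QUEUE_METRIC_TEMPLATE: they let str.format() substitute only the model name while emitting literal braces for the Prometheus label set. A quick sketch with a hypothetical model name:

QUEUE_METRIC_TEMPLATE = (
    'nv_inference_pending_request_count{{model="{model_name}",version="1"}}'
)

# "my_model" is a hypothetical model name used only for illustration.
line = QUEUE_METRIC_TEMPLATE.format(model_name="my_model")
assert line == 'nv_inference_pending_request_count{model="my_model",version="1"}'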

qa/L0_metrics/pinned_memory_metrics_test.py

Lines changed: 6 additions & 3 deletions
@@ -36,6 +36,7 @@
 import tritonclient.http as httpclient
 from tritonclient.utils import *
 
+_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost")
 # Triton server reserves 256 MB for pinned memory by default.
 DEFAULT_TOTAL_PINNED_MEMORY_SIZE = 2**28  # bytes, Equivalent to 256 MB
 TOTAL_PINNED_MEMORY_SIZE = int(
@@ -51,7 +52,7 @@ def get_metrics():
     total_bytes_pattern = re.compile(r"pool_total_bytes (\d+)")
     used_bytes_pattern = re.compile(r"pool_used_bytes (\d+)")
 
-    r = requests.get("http://localhost:8002/metrics")
+    r = requests.get(f"http://{_tritonserver_ipaddr}:8002/metrics")
     r.raise_for_status()
 
     total_bytes_match = total_bytes_pattern.search(r.text)
@@ -103,7 +104,7 @@ def _collect_metrics(self):
 
     def test_pinned_memory_metrics_asynchronous_requests(self):
         with httpclient.InferenceServerClient(
-            url="localhost:8000", concurrency=10
+            url=f"{_tritonserver_ipaddr}:8000", concurrency=10
         ) as client:
             if not client.is_model_ready(self.model_name):
                 client.load_model(self.model_name)
@@ -142,7 +143,9 @@ def test_pinned_memory_metrics_asynchronous_requests(self):
         self._assert_pinned_memory_utilization()
 
     def test_pinned_memory_metrics_synchronous_requests(self):
-        with httpclient.InferenceServerClient(url="localhost:8000") as client:
+        with httpclient.InferenceServerClient(
+            url=f"{_tritonserver_ipaddr}:8000"
+        ) as client:
             if not client.is_model_ready(self.model_name):
                 client.load_model(self.model_name)
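On the constant near the top of this file: 2**28 bytes is 256 * 2**20, i.e. exactly 256 MiB, matching the default pinned memory pool size noted in the comment. A one-line sanity check:

assert 2**28 == 256 * 2**20 == 268_435_456  # 256 MiB expressed in bytes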