
Commit 7080291

tgerdesnv authored and mc-nv committed
use gpu metrics from PA (#520)
* support and use new CLI options
* Updates to use new gpu metrics
* working unit tests
* Update FreeMemory support
* Review feedback
* fix tests
* add fixme
1 parent c157d15 commit 7080291

6 files changed (+181 lines, -105 lines)

model_analyzer/constants.py

Lines changed: 1 addition & 0 deletions
@@ -49,6 +49,7 @@
 INTERVAL_SLEEP_TIME = 1
 PERF_ANALYZER_MEASUREMENT_WINDOW = 5000
 PERF_ANALYZER_MINIMUM_REQUEST_COUNT = 50
+SECONDS_TO_MILLISECONDS_MULTIPLIER = 1000
 
 # Triton Server
 SERVER_OUTPUT_TIMEOUT_SECS = 5
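
The new constant is consumed in perf_config.py (below) to convert model_analyzer's monitoring interval, configured in seconds, into the milliseconds expected by perf_analyzer's --metrics-interval option. A minimal illustration, using an example 1-second interval rather than the project default:

SECONDS_TO_MILLISECONDS_MULTIPLIER = 1000

monitoring_interval = 1.0  # seconds; example value only
metrics_interval = monitoring_interval * SECONDS_TO_MILLISECONDS_MULTIPLIER
assert metrics_interval == 1000  # passed to perf_analyzer as --metrics-interval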

model_analyzer/perf_analyzer/perf_analyzer.py

Lines changed: 64 additions & 29 deletions
@@ -38,7 +38,7 @@
 from model_analyzer.record.types.gpu_utilization import GPUUtilization
 from model_analyzer.record.types.gpu_power_usage import GPUPowerUsage
 from model_analyzer.record.types.gpu_used_memory import GPUUsedMemory
-from model_analyzer.record.types.gpu_total_memory import GPUTotalMemory
+from model_analyzer.record.types.gpu_free_memory import GPUFreeMemory
 
 from model_analyzer.constants import \
     INTERVAL_SLEEP_TIME, LOGGER_NAME, MEASUREMENT_REQUEST_COUNT_STEP, \
@@ -86,10 +86,10 @@ class PerfAnalyzer:
     ]
 
     gpu_metric_table = [
-        ["gpu_utilization", "Avg GPU Utilizations", GPUUtilization],
-        ["gpu_power_usage", "Avg GPU Power Usages", GPUPowerUsage],
-        ["gpu_used_memory", "Max GPU Memory Usages", GPUUsedMemory],
-        ["gpu_total_memory", "Total GPU Memory Usages", GPUTotalMemory]
+        ["gpu_utilization", "Avg GPU Utilization", GPUUtilization, "0.01"],
+        ["gpu_power_usage", "Avg GPU Power Usage", GPUPowerUsage, "1"],
+        ["gpu_used_memory", "Max GPU Memory Usage", GPUUsedMemory, "1000000"],
+        ["gpu_free_memory", "Total GPU Memory", GPUFreeMemory, "1000000"]
     ]
     #yapf: enable
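
The table gains a fourth column: a reduction factor that each raw value reported by perf_analyzer is divided by before a record object is built (see _extract_gpu_records_from_row further down). The chosen factors suggest perf_analyzer reports utilization as a 0-1 fraction and memory sizes in bytes; a small hedged illustration with made-up raw values:

# Illustrative only: the raw values are invented, the divisions mirror the table above.
raw_utilization = 0.75           # "Avg GPU Utilization" as reported by perf_analyzer
raw_used_memory = 1500000000.0   # "Max GPU Memory Usage", presumably bytes

gpu_utilization = raw_utilization / float("0.01")     # ~75.0, i.e. a percent scale
gpu_used_memory = raw_used_memory / float("1000000")  # 1500.0 (MB, assuming bytes in)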

@@ -133,6 +133,7 @@ def __init__(self, path, config, max_retries, timeout, max_cpu_util):
         self._timeout = timeout
         self._output = ""
         self._perf_records = {}
+        self._gpu_records = []
         self._max_cpu_util = max_cpu_util
 
     def run(self, metrics, env=None):
@@ -183,11 +184,11 @@ def run(self, metrics, env=None):
 
         return self.PA_SUCCESS
 
-    def get_records(self):
+    def get_perf_records(self):
         """
         Returns
         -------
-        The records from the last perf_analyzer run
+        The perf records from the last perf_analyzer run
         """
 
         if self._perf_records:
@@ -196,6 +197,15 @@ def get_records(self):
             "Attempted to get perf_analyzer results"
             "without calling run first.")
 
+    def get_gpu_records(self):
+        """
+        Returns
+        -------
+        The gpu records from the last perf_analyzer run
+        """
+
+        return self._gpu_records
+
     def output(self):
         """
         Returns
@@ -331,7 +341,16 @@ def _get_process_output(self):
         self._cmd_log.seek(0)
         tmp_output = self._cmd_log.read()
         self._cmd_log.close()
-        return tmp_output.decode('utf-8')
+
+        # PA has occasionally output non-UTF-8 bytes which would cause MA
+        # to assert. In that case, just ignore the result instead of asserting
+        result = ""
+        try:
+            result = tmp_output.decode('utf-8')
+        except:
+            pass
+
+        return result
 
     def _auto_adjust_parameters(self, process):
         """
@@ -419,28 +438,17 @@ def _parse_outputs(self, metrics):
 
             for row in csv_reader:
                 self._perf_records[perf_config[
-                    'model-name']] = self._extract_metrics_from_row(
+                    'model-name']] = self._extract_perf_records_from_row(
                         metrics, row)
+                self._gpu_records = self._extract_gpu_records_from_row(
+                    metrics, row)
 
         for perf_config in [
                 mrc.perf_config() for mrc in self._config.model_run_configs()
         ]:
             os.remove(perf_config['latency-report-file'])
 
-    def _extract_metrics_from_row(self, requested_metrics: List[Record],
-                                  row_metrics: Dict[str, str]) -> List[Record]:
-        """
-        Extracts the requested metrics from the CSV's row and creates a list of Records
-        """
-        perf_records = self._create_records_from_perf_metrics(
-            requested_metrics, row_metrics)
-
-        gpu_records = self._create_records_from_gpu_metrics(
-            requested_metrics, row_metrics)
-
-        return perf_records + gpu_records
-
-    def _create_records_from_perf_metrics(
+    def _extract_perf_records_from_row(
             self, requested_metrics: List[Record],
             row_metrics: Dict[str, str]) -> List[Record]:
         perf_records: List[Record] = []
@@ -459,7 +467,7 @@ def _create_records_from_perf_metrics(
 
         return perf_records
 
-    def _create_records_from_gpu_metrics(
+    def _extract_gpu_records_from_row(
            self, requested_metrics: List[Record],
            row_metrics: Dict[str, str]) -> List[Record]:
         # GPU metrics have the following format: UUID0:value0;UUID1:value1;...
@@ -484,14 +492,41 @@
             for gpu_metric_string_tuple in gpu_metric_string_tuples:
                 gpu_metric_tuple = gpu_metric_string_tuple.split(':')
 
-                gpu_records.append(gpu_metric[PerfAnalyzer.RECORD_CLASS](
-                    value=float(
-                        gpu_metric_tuple[PerfAnalyzer.GPU_METRIC_VALUE]),
-                    device_uuid=gpu_metric_tuple[
-                        PerfAnalyzer.GPU_METRIC_UUID]))  # type: ignore
+                uuid = gpu_metric_tuple[PerfAnalyzer.GPU_METRIC_UUID]
+                tmp_value = float(
+                    gpu_metric_tuple[PerfAnalyzer.GPU_METRIC_VALUE])
+                reduction_factor = float(
+                    str(gpu_metric[PerfAnalyzer.REDUCTION_FACTOR]))
+                value = tmp_value / reduction_factor
+
+                record = gpu_metric[PerfAnalyzer.RECORD_CLASS](
+                    value=value, device_uuid=uuid)  # type: ignore
 
+                gpu_records.append(record)
+
+        self._cleanup_gpu_records(gpu_records)
         return gpu_records
 
+    def _cleanup_gpu_records(self, gpu_records):
+        # Recalculate GPUFreeMemory by removing the value of the associated GPUUsedMemory
+        # Remove any GPUFreeMemory records that don't have a matching GPUUsedMemory
+        indexes_to_remove = []
+        for i, record in enumerate(gpu_records):
+            if type(record) == GPUFreeMemory:
+                # Find matching UUID UsedMemory
+                found = False
+                for other_record in gpu_records:
+                    if type(other_record
+                           ) == GPUUsedMemory and record.device_uuid(
+                           ) == other_record.device_uuid():
+                        found = True
+                        record._value = record.value() - other_record.value()
+                        break
+                if not found:
+                    indexes_to_remove.append(i)
+        for i in reversed(indexes_to_remove):
+            del gpu_records[i]
+
     def _is_metric_requested_and_in_row(self, metric: List[object],
                                         requested_metrics: List[Record],
                                         row_metrics: Dict[str, str]) -> bool:
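
As the context above notes, perf_analyzer emits each GPU metric column as a UUID0:value0;UUID1:value1;... string, and the new _cleanup_gpu_records step turns PA's "Total GPU Memory" readings into free-memory records by subtracting the matching used-memory value per UUID. A standalone, hedged sketch of that flow using plain dicts instead of the Record classes (cell contents and UUIDs are invented):

def parse_gpu_cell(cell, reduction_factor):
    # Parse one 'UUID0:value0;UUID1:value1' CSV cell into {uuid: scaled value}.
    values = {}
    for pair in cell.split(';'):
        uuid, value = pair.split(':')
        values[uuid] = float(value) / reduction_factor
    return values

used = parse_gpu_cell("GPU-abc:1500000000", 1000000.0)    # used memory, bytes -> MB
total = parse_gpu_cell("GPU-abc:16000000000", 1000000.0)  # total memory, bytes -> MB

# GPUFreeMemory is derived: keep total - used where the UUIDs match,
# and drop totals that have no matching used-memory reading.
free = {uuid: total[uuid] - used[uuid] for uuid in total if uuid in used}
print(free)  # {'GPU-abc': 14500.0}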

model_analyzer/perf_analyzer/perf_config.py

Lines changed: 13 additions & 2 deletions
@@ -15,6 +15,7 @@
 from model_analyzer.model_analyzer_exceptions \
     import TritonModelAnalyzerException
 from model_analyzer.config.input.config_defaults import DEFAULT_MEASUREMENT_MODE
+from model_analyzer.constants import SECONDS_TO_MILLISECONDS_MULTIPLIER
 
 
 class PerfAnalyzerConfig:
@@ -39,7 +40,8 @@ class PerfAnalyzerConfig:
         'ssl-https-verify-host', 'ssl-https-ca-certificates-file',
         'ssl-https-client-certificate-type',
         'ssl-https-client-certificate-file', 'ssl-https-private-key-type',
-        'ssl-https-private-key-file'
+        'ssl-https-private-key-file', 'collect-metrics', 'metrics-url',
+        'metrics-interval'
     ]
 
     input_to_options = [
@@ -52,7 +54,8 @@ class PerfAnalyzerConfig:
     additive_args = ['input-data', 'shape']
 
     boolean_args = [
-        'streaming', 'async', 'sync', 'binary-search', 'ssl-grpc-use-ssl'
+        'streaming', 'async', 'sync', 'binary-search', 'ssl-grpc-use-ssl',
+        'collect-metrics'
     ]
 
     def __init__(self):
@@ -166,6 +169,14 @@ def update_config_from_profile_config(self, model_name, profile_config):
             'protocol': profile_config.client_protocol,
             'url': url
         })
+
+        metrics_interval = profile_config.monitoring_interval * SECONDS_TO_MILLISECONDS_MULTIPLIER
+        params.update({
+            'collect-metrics': 'True',
+            'metrics-url': profile_config.triton_metrics_url,
+            'metrics-interval': metrics_interval
+        })
+
         self.update_config(params)
 
     @classmethod
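
Because 'collect-metrics' is registered in boolean_args, it is rendered as a bare flag while the other two new params carry values. Roughly, they end up on the perf_analyzer command line as in the hedged sketch below (the flag rendering is paraphrased from this class's argument handling, and the URL and interval are example values, not repository defaults):

# Hedged sketch of how the new params surface as CLI flags; values are examples.
params = {
    'collect-metrics': 'True',                        # boolean arg -> bare flag
    'metrics-url': 'http://localhost:8002/metrics',   # profile_config.triton_metrics_url
    'metrics-interval': 1 * 1000,                     # monitoring_interval (s) -> ms
}

flags = []
for key, value in params.items():
    flags.append(f'--{key}' if key == 'collect-metrics' else f'--{key}={value}')
print(' '.join(flags))
# --collect-metrics --metrics-url=http://localhost:8002/metrics --metrics-interval=1000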

model_analyzer/record/metrics_manager.py

Lines changed: 23 additions & 12 deletions
@@ -142,8 +142,7 @@ def _categorize_metrics(metric_tags, collect_cpu_metrics=False):
         gpu_metrics, perf_metrics, cpu_metrics = [], [], []
         # Separates metrics and objectives into related lists
         for metric in MetricsManager.get_metric_types(metric_tags):
-            if metric in DCGMMonitor.model_analyzer_to_dcgm_field or metric in RemoteMonitor.gpu_metrics.values(
-            ):
+            if metric in PerfAnalyzer.get_gpu_metrics():
                 gpu_metrics.append(metric)
             elif metric in PerfAnalyzer.get_perf_metrics():
                 perf_metrics.append(metric)
@@ -223,17 +222,16 @@ def profile_models(self, run_config):
 
         self._start_monitors(cpu_only=cpu_only)
 
-        perf_analyzer_metrics = self._run_perf_analyzer(run_config,
-                                                        perf_output_writer)
+        perf_analyzer_metrics, model_gpu_metrics = self._run_perf_analyzer(
+            run_config, perf_output_writer)
 
         if not perf_analyzer_metrics:
             self._stop_monitors(cpu_only=cpu_only)
             self._destroy_monitors(cpu_only=cpu_only)
             return None
 
         # Get metrics for model inference and combine metrics that do not have GPU UUID
-        model_gpu_metrics = {}
-        if not cpu_only:
+        if not cpu_only and not model_gpu_metrics:
             model_gpu_metrics = self._get_gpu_inference_metrics()
         model_cpu_metrics = self._get_cpu_inference_metrics()
 
@@ -369,6 +367,7 @@ def _start_monitors(self, cpu_only=False):
         Start any metrics monitors
         """
 
+        self._gpu_monitor = None
         if not cpu_only:
             try:
                 if self._config.use_local_gpu_monitor:
@@ -446,7 +445,8 @@ def _run_perf_analyzer(self, run_config, perf_output_writer):
             timeout=self._config.perf_analyzer_timeout,
             max_cpu_util=self._config.perf_analyzer_cpu_util)
 
-        status = perf_analyzer.run(self._perf_metrics, env=perf_analyzer_env)
+        metrics_to_gather = self._perf_metrics + self._gpu_metrics
+        status = perf_analyzer.run(metrics_to_gather, env=perf_analyzer_env)
 
         if perf_output_writer:
             perf_output_writer.write(
@@ -459,16 +459,23 @@ def _run_perf_analyzer(self, run_config, perf_output_writer):
 
         # PerfAnalyzer run was not succesful
         if status == 1:
-            return None
+            return (None, None)
+
+        perf_records = perf_analyzer.get_perf_records()
+        gpu_records = perf_analyzer.get_gpu_records()
 
-        per_model_perf_records = perf_analyzer.get_records()
+        aggregated_perf_records = self._aggregate_perf_records(perf_records)
+        aggregated_gpu_records = self._aggregate_gpu_records(gpu_records)
 
-        for (model, perf_records) in per_model_perf_records.items():
+        return aggregated_perf_records, aggregated_gpu_records
+
+    def _aggregate_perf_records(self, perf_records):
+        per_model_perf_records = {}
+        for (model, records) in perf_records.items():
             perf_record_aggregator = RecordAggregator()
-            perf_record_aggregator.insert_all(perf_records)
+            perf_record_aggregator.insert_all(records)
 
             per_model_perf_records[model] = perf_record_aggregator.aggregate()
-
         return per_model_perf_records
 
     def _get_gpu_inference_metrics(self):
@@ -485,6 +492,10 @@ def _get_gpu_inference_metrics(self):
         # Stop and destroy DCGM monitor
         gpu_records = self._gpu_monitor.stop_recording_metrics()
 
+        gpu_metrics = self._aggregate_gpu_records(gpu_records)
+        return gpu_metrics
+
+    def _aggregate_gpu_records(self, gpu_records):
         # Insert all records into aggregator and get aggregated DCGM records
         gpu_record_aggregator = RecordAggregator()
         gpu_record_aggregator.insert_all(gpu_records)
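
Both collection paths now end in the same aggregation helpers: the DCGM monitor path via _get_gpu_inference_metrics and the new perf_analyzer path via _run_perf_analyzer. A hedged sketch of that shared step; the RecordAggregator import path and the shape of aggregate()'s return value are assumptions based on the context lines above, not verified against the repository:

# Hedged sketch of the shared aggregation step (RecordAggregator import path assumed).
from model_analyzer.record.record_aggregator import RecordAggregator
from model_analyzer.record.types.gpu_used_memory import GPUUsedMemory

gpu_records = [
    GPUUsedMemory(value=1500.0, device_uuid='GPU-abc'),  # illustrative values
    GPUUsedMemory(value=1600.0, device_uuid='GPU-abc'),
]

gpu_record_aggregator = RecordAggregator()
gpu_record_aggregator.insert_all(gpu_records)
gpu_metrics = gpu_record_aggregator.aggregate()  # aggregated per record type; exact shape not shown in this diff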

tests/common/test_utils.py

Lines changed: 7 additions & 2 deletions
@@ -25,11 +25,12 @@
 from model_analyzer.record.metrics_manager import MetricsManager
 from model_analyzer.perf_analyzer.perf_config import PerfAnalyzerConfig
 from model_analyzer.state.analyzer_state_manager import AnalyzerStateManager
-
+from model_analyzer.constants import SECONDS_TO_MILLISECONDS_MULTIPLIER
 from model_analyzer.config.input.config_defaults import \
     DEFAULT_BATCH_SIZES, DEFAULT_TRITON_LAUNCH_MODE, DEFAULT_CLIENT_PROTOCOL, \
     DEFAULT_MEASUREMENT_MODE, DEFAULT_TRITON_GRPC_ENDPOINT, DEFAULT_TRITON_HTTP_ENDPOINT, \
-    DEFAULT_TRITON_INSTALL_PATH, DEFAULT_OUTPUT_MODEL_REPOSITORY
+    DEFAULT_TRITON_INSTALL_PATH, DEFAULT_OUTPUT_MODEL_REPOSITORY, DEFAULT_TRITON_METRICS_URL, \
+    DEFAULT_MONITORING_INTERVAL
 
 import os
 
@@ -221,6 +222,10 @@ def construct_perf_analyzer_config(model_name='my-model',
         pa_config._args['triton-server-directory'] = DEFAULT_TRITON_INSTALL_PATH
         pa_config._args['model-repository'] = DEFAULT_OUTPUT_MODEL_REPOSITORY
     else:
+        pa_config._args['collect-metrics'] = 'True'
+        pa_config._args['metrics-url'] = DEFAULT_TRITON_METRICS_URL
+        pa_config._args[
+            'metrics-interval'] = SECONDS_TO_MILLISECONDS_MULTIPLIER * DEFAULT_MONITORING_INTERVAL
         pa_config._options['-i'] = client_protocol
         if client_protocol == 'http':
             pa_config._options['-u'] = DEFAULT_TRITON_HTTP_ENDPOINT
