Skip to content

Commit 25b27d6

Browse files
nv-brafmc-nv
authored and committed
Capture GPU metrics from PA (#519)
* Can parse GPU metrics provided by PA. Unit testing added * Added type checking + checking for new PA cases * Making uuid a string * Combining find and create methods * Fixing type checking errors * Changed return to continue
1 parent 24e2e78 commit 25b27d6

File tree

4 files changed

+296
-33
lines changed

4 files changed

+296
-33
lines changed

model_analyzer/perf_analyzer/perf_analyzer.py

Lines changed: 101 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
from typing import Any, Dict, List, Union, Tuple
1516
from model_analyzer.model_analyzer_exceptions \
1617
import TritonModelAnalyzerException
1718
from model_analyzer.record.types.perf_latency_avg import PerfLatencyAvg
@@ -33,6 +34,12 @@
3334
from model_analyzer.record.types.perf_server_compute_output \
3435
import PerfServerComputeOutput
3536

37+
from model_analyzer.record.record import Record
38+
from model_analyzer.record.types.gpu_utilization import GPUUtilization
39+
from model_analyzer.record.types.gpu_power_usage import GPUPowerUsage
40+
from model_analyzer.record.types.gpu_used_memory import GPUUsedMemory
41+
from model_analyzer.record.types.gpu_total_memory import GPUTotalMemory
42+
3643
from model_analyzer.constants import \
3744
INTERVAL_SLEEP_TIME, LOGGER_NAME, MEASUREMENT_REQUEST_COUNT_STEP, \
3845
MEASUREMENT_WINDOW_STEP, PERF_ANALYZER_MEASUREMENT_WINDOW, \
@@ -56,23 +63,33 @@ class PerfAnalyzer:
5663
with perf_analyzer.
5764
"""
5865

# Indices into the "UUID:value" pairs parsed out of PA's GPU metric strings
# (see _create_records_from_gpu_metrics).
GPU_METRIC_UUID = 0
GPU_METRIC_VALUE = 1

#yapf: disable
# Outcome codes for a perf_analyzer run (names suggest success/failure/retry).
PA_SUCCESS, PA_FAIL, PA_RETRY = 0, 1, 2

# Column indices into the metric tables below:
#   METRIC_TAG       - tag matched against requested_metric.tag
#   CSV_STRING       - key looked up in the parsed CSV row (row_metrics)
#   RECORD_CLASS     - Record subclass that wraps the parsed value
#   REDUCTION_FACTOR - divisor applied to the raw value (perf table only)
METRIC_TAG, CSV_STRING, RECORD_CLASS, REDUCTION_FACTOR = 0, 1, 2, 3
# NOTE: reduction factors are stored as strings (presumably to keep the row
# type uniform for type checking) and are converted with float() at the
# point of use in _create_records_from_perf_metrics.
perf_metric_table = [
    ["perf_latency_avg", "Avg latency", PerfLatencyAvg, "1000"],
    ["perf_latency_p90", "p90 latency", PerfLatencyP90, "1000"],
    ["perf_latency_p95", "p95 latency", PerfLatencyP95, "1000"],
    ["perf_latency_p99", "p99 latency", PerfLatencyP99, "1000"],
    ["perf_throughput", "Inferences/Second", PerfThroughput, "1"],
    ["perf_client_send_recv", "request/response", PerfClientSendRecv, "1000"],
    ["perf_client_send_recv", "send/recv", PerfClientSendRecv, "1000"],
    ["perf_client_response_wait", "response wait", PerfClientResponseWait, "1000"],
    ["perf_server_queue", "Server Queue", PerfServerQueue, "1000"],
    ["perf_server_compute_infer", "Server Compute Infer", PerfServerComputeInfer, "1000"],
    ["perf_server_compute_input", "Server Compute Input", PerfServerComputeInput, "1000"],
    ["perf_server_compute_output", "Server Compute Output", PerfServerComputeOutput, "1000"]
]

# GPU metric rows have no reduction factor; their CSV cells hold
# "UUID:value;..." strings rather than a single number.
gpu_metric_table = [
    ["gpu_utilization", "Avg GPU Utilizations", GPUUtilization],
    ["gpu_power_usage", "Avg GPU Power Usages", GPUPowerUsage],
    ["gpu_used_memory", "Max GPU Memory Usages", GPUUsedMemory],
    ["gpu_total_memory", "Total GPU Memory Usages", GPUTotalMemory]
]
#yapf: enable
7895

@@ -84,6 +101,14 @@ def get_perf_metrics():
84101
]
85102
return perf_metrics
86103

104+
@staticmethod
105+
def get_gpu_metrics():
106+
gpu_metrics = [
107+
gpu_metric[PerfAnalyzer.RECORD_CLASS]
108+
for gpu_metric in PerfAnalyzer.gpu_metric_table
109+
]
110+
return gpu_metrics
111+
87112
def __init__(self, path, config, max_retries, timeout, max_cpu_util):
88113
"""
89114
Parameters
@@ -402,26 +427,75 @@ def _parse_outputs(self, metrics):
402427
]:
403428
os.remove(perf_config['latency-report-file'])
404429

405-
def _extract_metrics_from_row(self, requested_metrics, row_metrics):
430+
def _extract_metrics_from_row(self, requested_metrics: List[Record],
431+
row_metrics: Dict[str, str]) -> List[Record]:
406432
"""
407433
Extracts the requested metrics from the CSV's row and creates a list of Records
408434
"""
409-
perf_records = []
410-
for perf_metric in PerfAnalyzer.perf_metric_table:
411-
if self._is_perf_metric_requested_and_in_row(
412-
perf_metric, requested_metrics, row_metrics):
413-
value = float(row_metrics[perf_metric[PerfAnalyzer.CSV_STRING]]
414-
) / perf_metric[PerfAnalyzer.REDUCTION_FACTOR]
435+
perf_records = self._create_records_from_perf_metrics(
436+
requested_metrics, row_metrics)
437+
438+
gpu_records = self._create_records_from_gpu_metrics(
439+
requested_metrics, row_metrics)
415440

416-
perf_records.append(
417-
perf_metric[PerfAnalyzer.RECORD_CLASS](value))
441+
return perf_records + gpu_records
442+
443+
def _create_records_from_perf_metrics(
444+
self, requested_metrics: List[Record],
445+
row_metrics: Dict[str, str]) -> List[Record]:
446+
perf_records: List[Record] = []
447+
for perf_metric in PerfAnalyzer.perf_metric_table:
448+
if self._is_metric_requested_and_in_row(perf_metric,
449+
requested_metrics,
450+
row_metrics):
451+
value = float(row_metrics[str(
452+
perf_metric[PerfAnalyzer.CSV_STRING])])
453+
reduction_factor = float(
454+
str(perf_metric[PerfAnalyzer.REDUCTION_FACTOR]))
455+
perf_value = value / reduction_factor
456+
457+
perf_records.append(perf_metric[PerfAnalyzer.RECORD_CLASS](
458+
perf_value)) # type: ignore
418459

419460
return perf_records
420461

421-
def _is_perf_metric_requested_and_in_row(self, perf_metric,
422-
requested_metrics, row_metrics):
423-
tag_match = any(
424-
perf_metric[PerfAnalyzer.METRIC_TAG] in requested_metric.tag
425-
for requested_metric in requested_metrics)
462+
def _create_records_from_gpu_metrics(
463+
self, requested_metrics: List[Record],
464+
row_metrics: Dict[str, str]) -> List[Record]:
465+
# GPU metrics have the following format: UUID0:value0;UUID1:value1;...
466+
gpu_records: List[Record] = []
467+
for gpu_metric in PerfAnalyzer.gpu_metric_table:
468+
if self._is_metric_requested_and_in_row(gpu_metric,
469+
requested_metrics,
470+
row_metrics):
471+
gpu_metric_string = row_metrics[str(
472+
gpu_metric[PerfAnalyzer.CSV_STRING])]
473+
474+
# Covers the case where PA didn't provide data
475+
if not gpu_metric_string:
476+
continue
477+
478+
# Needed because PA might terminate substring with a ;
479+
if gpu_metric_string and gpu_metric_string[-1] == ';':
480+
gpu_metric_string = gpu_metric_string[:-1]
481+
482+
gpu_metric_string_tuples = gpu_metric_string.split(';')
483+
484+
for gpu_metric_string_tuple in gpu_metric_string_tuples:
485+
gpu_metric_tuple = gpu_metric_string_tuple.split(':')
486+
487+
gpu_records.append(gpu_metric[PerfAnalyzer.RECORD_CLASS](
488+
value=float(
489+
gpu_metric_tuple[PerfAnalyzer.GPU_METRIC_VALUE]),
490+
device_uuid=gpu_metric_tuple[
491+
PerfAnalyzer.GPU_METRIC_UUID])) # type: ignore
492+
493+
return gpu_records
494+
495+
def _is_metric_requested_and_in_row(self, metric: List[object],
496+
requested_metrics: List[Record],
497+
row_metrics: Dict[str, str]) -> bool:
498+
tag_match = any(metric[PerfAnalyzer.METRIC_TAG] in requested_metric.tag
499+
for requested_metric in requested_metrics)
426500

427-
return tag_match and perf_metric[PerfAnalyzer.CSV_STRING] in row_metrics
501+
return tag_match and metric[PerfAnalyzer.CSV_STRING] in row_metrics
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from functools import total_ordering
16+
from model_analyzer.record.gpu_record import GPURecord
17+
18+
19+
@total_ordering
class GPUTotalMemory(GPURecord):
    """
    Record type holding the total memory of a GPU device.
    """

    tag = "gpu_total_memory"

    def __init__(self, value, device_uuid=None, timestamp=0):
        """
        Parameters
        ----------
        value : float
            The value of the GPU metric
        device_uuid : str
            The GPU device uuid this metric is associated with.
        timestamp : int
            The timestamp for the record in nanoseconds
        """

        super().__init__(value, device_uuid, timestamp)

    @staticmethod
    def header(aggregation_tag=False):
        """
        Parameters
        ----------
        aggregation_tag : bool
            Optional tag displayed as part of the header, indicating the
            record was aggregated (max, min, average, etc.).

        Returns
        -------
        str
            The full name of the metric.
        """

        prefix = "Max " if aggregation_tag else ""
        return prefix + "GPU Memory Available (MB)"

    def __eq__(self, other):
        """Two records are equal when their values are equal."""

        return self.value() == other.value()

    def __lt__(self, other):
        """Order records by value; total_ordering derives the rest."""

        return self.value() < other.value()

    def __add__(self, other):
        """Produce a new record holding the sum of both values."""

        return GPUTotalMemory(value=(self.value() + other.value()),
                              device_uuid=None)

    def __sub__(self, other):
        """Produce a new record holding the difference of both values."""

        return GPUTotalMemory(value=(self.value() - other.value()),
                              device_uuid=None)

0 commit comments

Comments
 (0)