3838from model_analyzer .record .types .gpu_utilization import GPUUtilization
3939from model_analyzer .record .types .gpu_power_usage import GPUPowerUsage
4040from model_analyzer .record .types .gpu_used_memory import GPUUsedMemory
41- from model_analyzer .record .types .gpu_total_memory import GPUTotalMemory
41+ from model_analyzer .record .types .gpu_free_memory import GPUFreeMemory
4242
4343from model_analyzer .constants import \
4444 INTERVAL_SLEEP_TIME , LOGGER_NAME , MEASUREMENT_REQUEST_COUNT_STEP , \
@@ -86,10 +86,10 @@ class PerfAnalyzer:
8686 ]
8787
8888 gpu_metric_table = [
89- ["gpu_utilization" , "Avg GPU Utilizations " , GPUUtilization ],
90- ["gpu_power_usage" , "Avg GPU Power Usages " , GPUPowerUsage ],
91- ["gpu_used_memory" , "Max GPU Memory Usages " , GPUUsedMemory ],
92- ["gpu_total_memory " , "Total GPU Memory Usages " , GPUTotalMemory ]
89+ ["gpu_utilization" , "Avg GPU Utilization " , GPUUtilization , "0.01" ],
90+ ["gpu_power_usage" , "Avg GPU Power Usage " , GPUPowerUsage , "1" ],
91+ ["gpu_used_memory" , "Max GPU Memory Usage " , GPUUsedMemory , "1000000" ],
92+ ["gpu_free_memory " , "Total GPU Memory" , GPUFreeMemory , "1000000" ]
9393 ]
9494 #yapf: enable
9595
@@ -133,6 +133,7 @@ def __init__(self, path, config, max_retries, timeout, max_cpu_util):
133133 self ._timeout = timeout
134134 self ._output = ""
135135 self ._perf_records = {}
136+ self ._gpu_records = []
136137 self ._max_cpu_util = max_cpu_util
137138
138139 def run (self , metrics , env = None ):
@@ -183,11 +184,11 @@ def run(self, metrics, env=None):
183184
184185 return self .PA_SUCCESS
185186
186- def get_records (self ):
187+ def get_perf_records (self ):
187188 """
188189 Returns
189190 -------
190- The records from the last perf_analyzer run
191+ The perf records from the last perf_analyzer run
191192 """
192193
193194 if self ._perf_records :
@@ -196,6 +197,15 @@ def get_records(self):
196197 "Attempted to get perf_analyzer results"
197198 "without calling run first." )
198199
def get_gpu_records(self):
    """
    Provide the GPU records stored on this object.

    Returns
    -------
    The gpu records from the last perf_analyzer run. This is the
    object held in self._gpu_records itself, not a copy, so changes
    made by the caller are visible through this object as well.
    """

    gpu_records = self._gpu_records
    return gpu_records
208+
199209 def output (self ):
200210 """
201211 Returns
@@ -331,7 +341,16 @@ def _get_process_output(self):
331341 self ._cmd_log .seek (0 )
332342 tmp_output = self ._cmd_log .read ()
333343 self ._cmd_log .close ()
334- return tmp_output .decode ('utf-8' )
344+
345+ # PA has occasionally output non-UTF-8 bytes which would cause MA
346+ # to assert. In that case, just ignore the result instead of asserting
347+ result = ""
348+ try :
349+ result = tmp_output .decode ('utf-8' )
350+ except :
351+ pass
352+
353+ return result
335354
336355 def _auto_adjust_parameters (self , process ):
337356 """
@@ -419,28 +438,17 @@ def _parse_outputs(self, metrics):
419438
420439 for row in csv_reader :
421440 self ._perf_records [perf_config [
422- 'model-name' ]] = self ._extract_metrics_from_row (
441+ 'model-name' ]] = self ._extract_perf_records_from_row (
423442 metrics , row )
443+ self ._gpu_records = self ._extract_gpu_records_from_row (
444+ metrics , row )
424445
425446 for perf_config in [
426447 mrc .perf_config () for mrc in self ._config .model_run_configs ()
427448 ]:
428449 os .remove (perf_config ['latency-report-file' ])
429450
430- def _extract_metrics_from_row (self , requested_metrics : List [Record ],
431- row_metrics : Dict [str , str ]) -> List [Record ]:
432- """
433- Extracts the requested metrics from the CSV's row and creates a list of Records
434- """
435- perf_records = self ._create_records_from_perf_metrics (
436- requested_metrics , row_metrics )
437-
438- gpu_records = self ._create_records_from_gpu_metrics (
439- requested_metrics , row_metrics )
440-
441- return perf_records + gpu_records
442-
443- def _create_records_from_perf_metrics (
451+ def _extract_perf_records_from_row (
444452 self , requested_metrics : List [Record ],
445453 row_metrics : Dict [str , str ]) -> List [Record ]:
446454 perf_records : List [Record ] = []
@@ -459,7 +467,7 @@ def _create_records_from_perf_metrics(
459467
460468 return perf_records
461469
462- def _create_records_from_gpu_metrics (
470+ def _extract_gpu_records_from_row (
463471 self , requested_metrics : List [Record ],
464472 row_metrics : Dict [str , str ]) -> List [Record ]:
465473 # GPU metrics have the following format: UUID0:value0;UUID1:value1;...
@@ -484,14 +492,41 @@ def _create_records_from_gpu_metrics(
484492 for gpu_metric_string_tuple in gpu_metric_string_tuples :
485493 gpu_metric_tuple = gpu_metric_string_tuple .split (':' )
486494
487- gpu_records .append (gpu_metric [PerfAnalyzer .RECORD_CLASS ](
488- value = float (
489- gpu_metric_tuple [PerfAnalyzer .GPU_METRIC_VALUE ]),
490- device_uuid = gpu_metric_tuple [
491- PerfAnalyzer .GPU_METRIC_UUID ])) # type: ignore
495+ uuid = gpu_metric_tuple [PerfAnalyzer .GPU_METRIC_UUID ]
496+ tmp_value = float (
497+ gpu_metric_tuple [PerfAnalyzer .GPU_METRIC_VALUE ])
498+ reduction_factor = float (
499+ str (gpu_metric [PerfAnalyzer .REDUCTION_FACTOR ]))
500+ value = tmp_value / reduction_factor
501+
502+ record = gpu_metric [PerfAnalyzer .RECORD_CLASS ](
503+ value = value , device_uuid = uuid ) # type: ignore
492504
505+ gpu_records .append (record )
506+
507+ self ._cleanup_gpu_records (gpu_records )
493508 return gpu_records
494509
510+ def _cleanup_gpu_records (self , gpu_records ):
511+ # Recalculate GPUFreeMemory by removing the value of the associated GPUUsedMemory
512+ # Remove any GPUFreeMemory records that don't have a matching GPUUsedMemory
513+ indexes_to_remove = []
514+ for i , record in enumerate (gpu_records ):
515+ if type (record ) == GPUFreeMemory :
516+ # Find matching UUID UsedMemory
517+ found = False
518+ for other_record in gpu_records :
519+ if type (other_record
520+ ) == GPUUsedMemory and record .device_uuid (
521+ ) == other_record .device_uuid ():
522+ found = True
523+ record ._value = record .value () - other_record .value ()
524+ break
525+ if not found :
526+ indexes_to_remove .append (i )
527+ for i in reversed (indexes_to_remove ):
528+ del gpu_records [i ]
529+
495530 def _is_metric_requested_and_in_row (self , metric : List [object ],
496531 requested_metrics : List [Record ],
497532 row_metrics : Dict [str , str ]) -> bool :
0 commit comments