@@ -44,7 +44,7 @@ class MetricsManager:
4444 "gpu_power_usage"
4545 ]
4646
47- def __init__ (self , config , client , server , result_manager ):
47+ def __init__ (self , config , client , server , result_manager , state_manager ):
4848 """
4949 Parameters
5050 ----------
@@ -58,19 +58,45 @@ def __init__(self, config, client, server, result_manager):
5858 result_manager : ResultManager
5959 instance that manages the result tables and
6060 adding results
61+ state_manager: AnalyzerStateManager
62+ manages the analyzer state
6163 """
6264
6365 self ._config = config
6466 self ._client = client
6567 self ._server = server
6668 self ._result_manager = result_manager
69+ self ._state_manager = state_manager
6770
6871 self ._dcgm_metrics , self ._perf_metrics , self ._cpu_metrics = \
6972 MetricsManager .categorize_metrics ()
7073 self ._gpus = GPUDeviceFactory .verify_requested_gpus (self ._config .gpus )
74+ self ._init_state ()
7175
72- self ._dcgm_monitor = None
73- self ._cpu_monitor = None
def _init_state(self):
    """
    Sets MetricsManager object managed
    state variables in AnalyzerState

    Loads the per-GPU info dict previously stored by the state
    manager (if any), adds an entry for every GPU in self._gpus
    that is not already present, and writes the dict back.

    Each entry maps a GPU (as it appears in self._gpus) to a dict
    with the keys 'name' and 'total_memory'.
    """

    # Read and write the SAME state variable. Reading a different key
    # than the one written means the saved info is never found again,
    # so every resumed run would re-query all devices.
    state_key = 'MetricsManager.gpus'

    gpu_info = self._state_manager.get_state_variable(state_key)

    # A fresh run discards whatever an earlier run recorded.
    if self._state_manager.starting_fresh_run() or gpu_info is None:
        gpu_info = {}

    for index, gpu in enumerate(self._gpus):
        if gpu not in gpu_info:
            # Devices are only queried for GPUs that are missing, so a
            # fully cached state never touches the CUDA driver here.
            # NOTE(review): indexing list_devices() by position assumes
            # self._gpus is ordered like numba's device enumeration --
            # confirm against GPUDeviceFactory.verify_requested_gpus.
            device = numba.cuda.list_devices()[index]
            device_info = {'name': device.name}
            with device:
                # Stored exactly as reported by the CUDA context
                # (bytes); no unit conversion is applied here.
                device_info['total_memory'] = numba.cuda.current_context(
                ).get_memory_info().total
            gpu_info[gpu] = device_info

    self._state_manager.set_state_variable(state_key, gpu_info)
74100
75101 @classmethod
76102 def categorize_metrics (cls ):
@@ -86,7 +112,7 @@ def categorize_metrics(cls):
86112
87113 dcgm_metrics , perf_metrics , cpu_metrics = [], [], []
88114 # Separates metrics and objectives into related lists
89- for metric in MetricsManager .get_metric_types (cls .metric_tags ):
115+ for metric in MetricsManager .get_metric_types (tags = cls .metric_tags ):
90116 if metric in DCGMMonitor .model_analyzer_to_dcgm_field :
91117 dcgm_metrics .append (metric )
92118 elif metric in PerfAnalyzer .perf_metrics :
@@ -147,7 +173,7 @@ def profile_model(self, run_config, perf_output_writer=None):
147173 else :
148174 perf_analyzer_metrics = perf_analyzer_metrics_or_status
149175
150- # Get metrics for model inference and combine metrics that do not have GPU ID
176+ # Get metrics for model inference and combine metrics that do not have GPU UUID
151177 model_gpu_metrics = {}
152178 if not cpu_only :
153179 model_gpu_metrics = self ._get_gpu_inference_metrics ()
@@ -273,12 +299,13 @@ def _get_gpu_inference_metrics(self):
273299
274300 records_groupby_gpu = {}
275301 records_groupby_gpu = dcgm_record_aggregator .groupby (
276- self ._dcgm_metrics , lambda record : record .device ().device_id ())
302+ self ._dcgm_metrics , lambda record : str (
303+ record .device ().device_uuid (), encoding = 'ascii' ))
277304
278305 gpu_metrics = defaultdict (list )
279306 for _ , metric in records_groupby_gpu .items ():
280- for gpu_id , metric_value in metric .items ():
281- gpu_metrics [gpu_id ].append (metric_value )
307+ for gpu_uuid , metric_value in metric .items ():
308+ gpu_metrics [gpu_uuid ].append (metric_value )
282309
283310 return gpu_metrics
284311
0 commit comments