1212# See the License for the specific language governing permissions and
1313# limitations under the License.
1414
15+ from typing import Optional , Tuple , Dict , List
16+
1517from .record_aggregator import RecordAggregator
16- from .record import RecordType
17- from model_analyzer .constants import LOGGER_NAME
18+ from .record import RecordType , Record
19+ from model_analyzer .constants import LOGGER_NAME , PA_ERROR_LOG_FILENAME
1820from model_analyzer .model_analyzer_exceptions \
1921 import TritonModelAnalyzerException
2022from model_analyzer .monitor .cpu_monitor import CPUMonitor
2123from model_analyzer .monitor .dcgm .dcgm_monitor import DCGMMonitor
2224from model_analyzer .monitor .remote_monitor import RemoteMonitor
2325from model_analyzer .output .file_writer import FileWriter
2426from model_analyzer .perf_analyzer .perf_analyzer import PerfAnalyzer
27+ from model_analyzer .config .run .run_config import RunConfig
2528from model_analyzer .result .run_config_measurement import RunConfigMeasurement
2629from model_analyzer .config .generate .base_model_config_generator import BaseModelConfigGenerator
2730
@@ -90,6 +93,7 @@ def __init__(self, config, client, server, gpus, result_manager,
9093 self ._loaded_models = None
9194
9295 self ._cpu_warning_printed = False
96+ self ._encountered_perf_analyzer_error = False
9397
9498 self ._gpu_metrics , self ._perf_metrics , self ._cpu_metrics = self ._categorize_metrics (
9599 self .metrics , self ._config .collect_cpu_metrics )
@@ -100,6 +104,9 @@ def start_new_model(self):
100104 """ Indicate that profiling of a new model is starting """
101105 self ._first_config_variant = {}
102106
def encountered_perf_analyzer_error(self) -> bool:
    """
    Reports the perf_analyzer error flag kept by this object

    Returns
    -------
    bool
        The current value of the internal
        _encountered_perf_analyzer_error flag. The flag starts out
        False and is set to True by
        _handle_unsuccessful_perf_analyzer_run.
    """
    error_seen = self._encountered_perf_analyzer_error
    return error_seen
109+
103110 def _init_state (self ):
104111 """
105112 Sets MetricsManager object managed
@@ -165,7 +172,8 @@ def profile_server(self):
165172 self ._result_manager .add_server_data (data = server_gpu_metrics )
166173 self ._destroy_monitors (cpu_only = cpu_only )
167174
168- def execute_run_config (self , run_config ):
175+ def execute_run_config (
176+ self , run_config : RunConfig ) -> Optional [RunConfigMeasurement ]:
169177 """
170178 Executes the RunConfig. Returns obtained measurement. Also sends
171179 measurement to the result manager
@@ -188,15 +196,16 @@ def execute_run_config(self, run_config):
188196 if not self ._load_model_variants (run_config ):
189197 self ._server .stop ()
190198 self ._loaded_models = None
191- return
199+ return None
192200
193201 self ._loaded_models = current_model_variants
194202
195203 measurement = self .profile_models (run_config )
196204
197205 return measurement
198206
199- def profile_models (self , run_config ):
207+ def profile_models (self ,
208+ run_config : RunConfig ) -> Optional [RunConfigMeasurement ]:
200209 """
201210 Runs monitors while running perf_analyzer with a specific set of
202211 arguments. This will profile model inferencing.
@@ -266,7 +275,7 @@ def profile_models(self, run_config):
def finalize(self):
    """
    Stops the server object held by this manager
    """
    server = self._server
    server.stop()
268277
269- def _create_model_variants (self , run_config ) :
278+ def _create_model_variants (self , run_config : RunConfig ) -> None :
270279 """
271280 Creates and fills all model variant directories
272281 """
@@ -441,7 +450,9 @@ def _destroy_monitors(self, cpu_only=False):
441450 self ._gpu_monitor = None
442451 self ._cpu_monitor = None
443452
444- def _run_perf_analyzer (self , run_config , perf_output_writer ):
453+ def _run_perf_analyzer (
454+ self , run_config : RunConfig , perf_output_writer : Optional [FileWriter ]
455+ ) -> Tuple [Optional [Dict ], Optional [Dict [int , List [Record ]]]]:
445456 """
446457 Runs perf_analyzer and returns the aggregated metrics
447458
@@ -450,7 +461,7 @@ def _run_perf_analyzer(self, run_config, perf_output_writer):
450461 run_config : RunConfig
451462 The RunConfig to execute on perf analyzer
452463
453- perf_output_writer : OutputWriter
464+ perf_output_writer : FileWriter
454465 Writer that writes the output from perf_analyzer to the output
455466 stream/file. If None, the output is not written
456467
@@ -476,17 +487,10 @@ def _run_perf_analyzer(self, run_config, perf_output_writer):
476487 metrics_to_gather = self ._perf_metrics + self ._gpu_metrics
477488 status = perf_analyzer .run (metrics_to_gather , env = perf_analyzer_env )
478489
479- if perf_output_writer :
480- perf_output_writer .write (
481- '============== Perf Analyzer Launched ==============\n '
482- f'Command: { perf_analyzer .get_cmd ()} \n \n ' ,
483- append = True )
484- if perf_analyzer .output ():
485- perf_output_writer .write (perf_analyzer .output () + '\n ' ,
486- append = True )
490+ self ._write_perf_analyzer_output (perf_output_writer , perf_analyzer )
487491
488- # PerfAnalyzer run was not successful
489492 if status == 1 :
493+ self ._handle_unsuccessful_perf_analyzer_run (perf_analyzer )
490494 return (None , None )
491495
492496 perf_records = perf_analyzer .get_perf_records ()
@@ -497,6 +501,41 @@ def _run_perf_analyzer(self, run_config, perf_output_writer):
497501
498502 return aggregated_perf_records , aggregated_gpu_records
499503
504+ def _write_perf_analyzer_output (self ,
505+ perf_output_writer : Optional [FileWriter ],
506+ perf_analyzer : PerfAnalyzer ) -> None :
507+ if perf_output_writer :
508+ perf_output_writer .write (
509+ '============== Perf Analyzer Launched ==============\n '
510+ f'Command: { perf_analyzer .get_cmd ()} \n \n ' ,
511+ append = True )
512+ if perf_analyzer .output ():
513+ perf_output_writer .write (perf_analyzer .output () + '\n ' ,
514+ append = True )
515+
def _handle_unsuccessful_perf_analyzer_run(
        self, perf_analyzer: PerfAnalyzer) -> None:
    """
    Records an unsuccessful perf_analyzer run in the perf_analyzer
    error log file inside the configured export path

    The first time this is called on this object, a file already
    present at the error log path is removed before anything is
    written. Every call then writes (with append=True) the command
    that was run, followed by perf_analyzer's output, or by a fixed
    explanation when perf_analyzer produced no output.

    Parameters
    ----------
    perf_analyzer : PerfAnalyzer
        The perf_analyzer instance whose run was unsuccessful
    """
    # os.path.join avoids a hard-coded separator and does not turn an
    # empty export path into a path at the filesystem root
    output_file = os.path.join(self._config.export_path,
                               PA_ERROR_LOG_FILENAME)

    if not self._encountered_perf_analyzer_error:
        self._encountered_perf_analyzer_error = True
        # Remove directly instead of checking for existence first, so
        # there is no window between the check and the removal
        try:
            os.remove(output_file)
        except FileNotFoundError:
            pass

    perf_error_log = FileWriter(output_file)
    perf_error_log.write('Command: \n' + perf_analyzer.get_cmd() + '\n\n',
                         append=True)

    if perf_analyzer.output():
        perf_error_log.write('Error: \n' + perf_analyzer.output() + '\n',
                             append=True)
    else:
        perf_error_log.write(
            'Error: ' +
            'perf_analyzer did not produce any output. It was likely terminated with a SIGABRT.'
            + '\n\n',
            append=True)
538+
500539 def _aggregate_perf_records (self , perf_records ):
501540 per_model_perf_records = {}
502541 for (model , records ) in perf_records .items ():
0 commit comments