Commit 66bb4f3

Capture PA errors in a log file (#689)
* Capturing PA errors to log and printing warning message
* Refactoring and general cleanup
* Removing unneeded line
* Adding full path to PA warning message
* Choose btw. perf and export paths
* Removing existing PA error log file
* Going back to using export_path
* Add check for DEVNULL
1 parent 7777706 commit 66bb4f3
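
In outline, the diffs below make Model Analyzer append the command and output of every failed perf_analyzer run to perf_analyzer_error.log under the export path, clear any stale log the first time an error occurs in a session, and print a single warning pointing at that file once profiling finishes. A minimal, self-contained sketch of that flow, with simplified stand-ins for the real config and FileWriter objects (the record_failure helper and the example command are illustrative, not Model Analyzer APIs):

import os
from types import SimpleNamespace

PA_ERROR_LOG_FILENAME = "perf_analyzer_error.log"

# Simplified stand-ins for the state and config held by MetricsManager.
state = SimpleNamespace(encountered_perf_analyzer_error=False)
config = SimpleNamespace(export_path=".")


def record_failure(cmd, output):
    """Append a failed perf_analyzer command and its output to the error log."""
    log_path = os.path.join(config.export_path, PA_ERROR_LOG_FILENAME)

    # On the first error of a profiling session, remove any log left over
    # from a previous run so the file only reflects this session.
    if not state.encountered_perf_analyzer_error:
        state.encountered_perf_analyzer_error = True
        if os.path.exists(log_path):
            os.remove(log_path)

    with open(log_path, "a") as f:
        f.write("Command: \n" + cmd + "\n\n")
        if output:
            f.write("Error: \n" + output + "\n")
        else:
            f.write("Error: perf_analyzer did not produce any output. "
                    "It was likely terminated with a SIGABRT.\n\n")


# Invented example failure; after profiling, a single warning points at the log.
record_failure("perf_analyzer -m my_model --concurrency-range 1",
               "error: no available GPUs")

if state.encountered_perf_analyzer_error:
    print("Perf Analyzer encountered an error when profiling one or more configurations. "
          f"See {config.export_path}/{PA_ERROR_LOG_FILENAME} for further details.")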

File tree

4 files changed: 66 additions & 19 deletions

model_analyzer/analyzer.py

Lines changed: 5 additions & 1 deletion
@@ -14,7 +14,7 @@
 
 from typing import List, Union, Optional
 import sys
-from model_analyzer.constants import LOGGER_NAME
+from model_analyzer.constants import LOGGER_NAME, PA_ERROR_LOG_FILENAME
 from .model_manager import ModelManager
 from .result.result_manager import ResultManager
 from .result.result_table_manager import ResultTableManager
@@ -134,6 +134,10 @@ def profile(self, client: TritonClient, gpus: List[GPUDevice], mode: str,
                 self._get_report_command_help_string(
                     model.model_name()))
 
+        if self._metrics_manager.encountered_perf_analyzer_error():
+            logger.warning(f"Perf Analyzer encountered an error when profiling one or more configurations. " \
+                           f"See {self._config.export_path}/{PA_ERROR_LOG_FILENAME} for further details.\n")
+
     def report(self, mode: str) -> None:
         """
         Subcommand: REPORT

model_analyzer/constants.py

Lines changed: 3 additions & 0 deletions
@@ -57,5 +57,8 @@
 # Logging
 LOGGER_NAME = "model_analyzer_logger"
 
+# PA Error Log Filename
+PA_ERROR_LOG_FILENAME = "perf_analyzer_error.log"
+
 # Constraints
 GLOBAL_CONSTRAINTS_KEY = "__default__"

model_analyzer/record/metrics_manager.py

Lines changed: 56 additions & 17 deletions
@@ -12,16 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import Optional, Tuple, Dict, List
+
 from .record_aggregator import RecordAggregator
-from .record import RecordType
-from model_analyzer.constants import LOGGER_NAME
+from .record import RecordType, Record
+from model_analyzer.constants import LOGGER_NAME, PA_ERROR_LOG_FILENAME
 from model_analyzer.model_analyzer_exceptions \
     import TritonModelAnalyzerException
 from model_analyzer.monitor.cpu_monitor import CPUMonitor
 from model_analyzer.monitor.dcgm.dcgm_monitor import DCGMMonitor
 from model_analyzer.monitor.remote_monitor import RemoteMonitor
 from model_analyzer.output.file_writer import FileWriter
 from model_analyzer.perf_analyzer.perf_analyzer import PerfAnalyzer
+from model_analyzer.config.run.run_config import RunConfig
 from model_analyzer.result.run_config_measurement import RunConfigMeasurement
 from model_analyzer.config.generate.base_model_config_generator import BaseModelConfigGenerator
@@ -90,6 +93,7 @@ def __init__(self, config, client, server, gpus, result_manager,
         self._loaded_models = None
 
         self._cpu_warning_printed = False
+        self._encountered_perf_analyzer_error = False
 
         self._gpu_metrics, self._perf_metrics, self._cpu_metrics = self._categorize_metrics(
             self.metrics, self._config.collect_cpu_metrics)
@@ -100,6 +104,9 @@ def start_new_model(self):
         """ Indicate that profiling of a new model is starting """
         self._first_config_variant = {}
 
+    def encountered_perf_analyzer_error(self) -> bool:
+        return self._encountered_perf_analyzer_error
+
     def _init_state(self):
         """
         Sets MetricsManager object managed
@@ -165,7 +172,8 @@ def profile_server(self):
         self._result_manager.add_server_data(data=server_gpu_metrics)
         self._destroy_monitors(cpu_only=cpu_only)
 
-    def execute_run_config(self, run_config):
+    def execute_run_config(
+            self, run_config: RunConfig) -> Optional[RunConfigMeasurement]:
         """
         Executes the RunConfig. Returns obtained measurement. Also sends
         measurement to the result manager
@@ -188,15 +196,16 @@ def execute_run_config(self, run_config):
         if not self._load_model_variants(run_config):
             self._server.stop()
             self._loaded_models = None
-            return
+            return None
 
         self._loaded_models = current_model_variants
 
         measurement = self.profile_models(run_config)
 
         return measurement
 
-    def profile_models(self, run_config):
+    def profile_models(self,
+                       run_config: RunConfig) -> Optional[RunConfigMeasurement]:
         """
         Runs monitors while running perf_analyzer with a specific set of
         arguments. This will profile model inferencing.
@@ -266,7 +275,7 @@ def profile_models(self, run_config):
     def finalize(self):
         self._server.stop()
 
-    def _create_model_variants(self, run_config):
+    def _create_model_variants(self, run_config: RunConfig) -> None:
         """
         Creates and fills all model variant directories
         """
@@ -441,7 +450,9 @@ def _destroy_monitors(self, cpu_only=False):
         self._gpu_monitor = None
         self._cpu_monitor = None
 
-    def _run_perf_analyzer(self, run_config, perf_output_writer):
+    def _run_perf_analyzer(
+            self, run_config: RunConfig, perf_output_writer: Optional[FileWriter]
+    ) -> Tuple[Optional[Dict], Optional[Dict[int, List[Record]]]]:
         """
         Runs perf_analyzer and returns the aggregated metrics
 
@@ -450,7 +461,7 @@ def _run_perf_analyzer(self, run_config, perf_output_writer):
         run_config : RunConfig
             The RunConfig to execute on perf analyzer
 
-        perf_output_writer : OutputWriter
+        perf_output_writer : FileWriter
             Writer that writes the output from perf_analyzer to the output
             stream/file. If None, the output is not written
 
@@ -476,17 +487,10 @@ def _run_perf_analyzer(self, run_config, perf_output_writer):
         metrics_to_gather = self._perf_metrics + self._gpu_metrics
         status = perf_analyzer.run(metrics_to_gather, env=perf_analyzer_env)
 
-        if perf_output_writer:
-            perf_output_writer.write(
-                '============== Perf Analyzer Launched ==============\n'
-                f'Command: {perf_analyzer.get_cmd()}\n\n',
-                append=True)
-            if perf_analyzer.output():
-                perf_output_writer.write(perf_analyzer.output() + '\n',
-                                         append=True)
+        self._write_perf_analyzer_output(perf_output_writer, perf_analyzer)
 
-        # PerfAnalyzer run was not successful
         if status == 1:
+            self._handle_unsuccessful_perf_analyzer_run(perf_analyzer)
             return (None, None)
 
         perf_records = perf_analyzer.get_perf_records()
@@ -497,6 +501,41 @@ def _run_perf_analyzer(self, run_config, perf_output_writer):
 
         return aggregated_perf_records, aggregated_gpu_records
 
+    def _write_perf_analyzer_output(self,
+                                    perf_output_writer: Optional[FileWriter],
+                                    perf_analyzer: PerfAnalyzer) -> None:
+        if perf_output_writer:
+            perf_output_writer.write(
+                '============== Perf Analyzer Launched ==============\n'
+                f'Command: {perf_analyzer.get_cmd()}\n\n',
+                append=True)
+            if perf_analyzer.output():
+                perf_output_writer.write(perf_analyzer.output() + '\n',
+                                         append=True)
+
+    def _handle_unsuccessful_perf_analyzer_run(
+            self, perf_analyzer: PerfAnalyzer) -> None:
+        output_file = f"{self._config.export_path}/{PA_ERROR_LOG_FILENAME}"
+
+        if not self._encountered_perf_analyzer_error:
+            self._encountered_perf_analyzer_error = True
+            if os.path.exists(output_file):
+                os.remove(output_file)
+
+        perf_error_log = FileWriter(output_file)
+        perf_error_log.write('Command: \n' + perf_analyzer.get_cmd() + '\n\n',
+                             append=True)
+
+        if perf_analyzer.output():
+            perf_error_log.write('Error: \n' + perf_analyzer.output() + '\n',
+                                 append=True)
+        else:
+            perf_error_log.write(
+                'Error: ' +
+                'perf_analyzer did not produce any output. It was likely terminated with a SIGABRT.'
+                + '\n\n',
+                append=True)
+
     def _aggregate_perf_records(self, perf_records):
         per_model_perf_records = {}
         for (model, records) in perf_records.items():
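
Pieced together from the writes in _handle_unsuccessful_perf_analyzer_run above, each failed run appends a Command block followed by an Error block to the log; when perf_analyzer produced no output, the SIGABRT note is written instead. A sketch of the resulting entry for the no-output case (the command placeholder is invented, not taken from a real run):

# Expected shape of one perf_analyzer_error.log entry when perf_analyzer
# produced no output; "<perf_analyzer command line>" is a placeholder.
expected_entry = (
    "Command: \n"
    "<perf_analyzer command line>\n"
    "\n"
    "Error: perf_analyzer did not produce any output. "
    "It was likely terminated with a SIGABRT.\n"
    "\n"
)
print(expected_entry)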

model_analyzer/triton/client/client.py

Lines changed: 2 additions & 1 deletion
@@ -16,6 +16,7 @@
 from model_analyzer.model_analyzer_exceptions \
     import TritonModelAnalyzerException
 
+from subprocess import DEVNULL
 import time
 import logging
 
@@ -187,7 +188,7 @@ def is_server_ready(self):
         return self._client.is_server_ready()
 
     def _check_for_triton_log_errors(self, log_file):
-        if not log_file:
+        if not log_file or log_file == DEVNULL:
            return
 
        log_file.seek(0)
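
The DEVNULL comparison above guards the case where the Triton server's log output is routed to subprocess.DEVNULL: the stored log_file is then the integer sentinel DEVNULL rather than a file object, it is truthy, and the subsequent seek(0) call would raise. A minimal sketch of the guard in isolation (check_log is an illustrative helper, not the Model Analyzer API):

import io
from subprocess import DEVNULL


def check_log(log_file):
    """Illustrative guard: only read log_file when it is a real file object."""
    # subprocess.DEVNULL is an int sentinel, not a file object, so calling
    # seek()/read() on it would raise AttributeError without this check.
    if not log_file or log_file == DEVNULL:
        return None
    log_file.seek(0)
    return log_file.read()


print(check_log(None))                       # None: no log file configured
print(check_log(DEVNULL))                    # None: server output sent to /dev/null
print(check_log(io.StringIO("E0101 oops")))  # "E0101 oops": a real (in-memory) log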
