
Commit 7777706

Detect when tritonserver fails to launch (#688)
* Initial changes. Unit tests failing
* Unit tests passing
* Needed to convert bytes to string depending on if output path was specified
* Adding unit test
* Fixing codeQL issues
* Second attempt at fixing codeQL error
* Updates based on review comments
1 parent ca96f10 commit 7777706


12 files changed: +94 −15 lines
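In short: the Triton server's stdout/stderr are now always captured in a readable file (the user-specified log, or a temporary file when no log path is given), and the client scans that file for a known failure marker on every failed readiness probe. Below is a minimal standalone sketch of that pattern, stdlib only; probe_ready and SERVER_ERROR_MARKER are hypothetical names, not part of the repo's API.

import time

# Hypothetical marker; the commit itself scans for "Unexpected argument:".
SERVER_ERROR_MARKER = "Unexpected argument:"

def wait_for_ready(probe_ready, log_file, num_retries, sleep_time=1):
    """Retry a readiness probe, scanning the server log for launch errors.

    probe_ready: zero-argument callable returning True once the server is up
    log_file:    readable file object capturing the server's output, or None
    """
    for _ in range(num_retries):
        if probe_ready():
            return
        if log_file is not None:
            log_file.seek(0)
            output = log_file.read()
            if isinstance(output, bytes):
                # Temp files are opened in binary mode, so decode first
                output = output.decode('utf-8')
            if SERVER_ERROR_MARKER in output:
                start = output.find(SERVER_ERROR_MARKER)
                raise RuntimeError(
                    f'Server did not launch successfully\n\n{output[start:]}')
        time.sleep(sleep_time)
    raise RuntimeError('Server never became ready')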


model_analyzer/analyzer.py

Lines changed: 3 additions & 1 deletion
@@ -205,7 +205,9 @@ def _get_server_only_metrics(self, client, gpus):
 
         logger.info('Profiling server only metrics...')
         self._server.start()
-        client.wait_for_server_ready(self._config.client_max_retries)
+        client.wait_for_server_ready(
+            num_retries=self._config.client_max_retries,
+            log_file=self._server.log_file())
         self._metrics_manager.profile_server()
         self._server.stop()

model_analyzer/record/metrics_manager.py

Lines changed: 9 additions & 5 deletions
@@ -103,7 +103,7 @@ def start_new_model(self):
     def _init_state(self):
         """
         Sets MetricsManager object managed
-        state variables in AnalyerState
+        state variables in AnalyzerState
         """
 
         gpu_info = self._state_manager.get_state_variable(
@@ -361,7 +361,9 @@ def _do_load_model_variant(self, variant_config):
         """
         Loads a model variant in the client
         """
-        self._client.wait_for_server_ready(self._config.client_max_retries)
+        self._client.wait_for_server_ready(
+            num_retries=self._config.client_max_retries,
+            log_file=self._server.log_file())
 
         variant_name = variant_config.get_field('name')
         if self._client.load_model(model_name=variant_name) == -1:
@@ -483,7 +485,7 @@ def _run_perf_analyzer(self, run_config, perf_output_writer):
             perf_output_writer.write(perf_analyzer.output() + '\n',
                                      append=True)
 
-        # PerfAnalyzer run was not succesful
+        # PerfAnalyzer run was not successful
         if status == 1:
             return (None, None)
 
@@ -539,7 +541,7 @@ def _aggregate_gpu_records(self, gpu_records):
     def _get_cpu_inference_metrics(self):
         """
         Stops any monitors that just need the records to be aggregated
-        like the CPU mmetrics
+        like the CPU metrics
         """
 
         cpu_records = self._cpu_monitor.stop_recording_metrics()
@@ -558,7 +560,9 @@ def _check_triton_and_model_analyzer_gpus(self):
         """
 
         if self._config.triton_launch_mode != 'remote' and self._config.triton_launch_mode != 'c_api':
-            self._client.wait_for_server_ready(self._config.client_max_retries)
+            self._client.wait_for_server_ready(
+                num_retries=self._config.client_max_retries,
+                log_file=self._server.log_file())
 
         model_analyzer_gpus = [gpu.device_uuid() for gpu in self._gpus]
         triton_gpus = self._get_triton_metrics_gpus()

model_analyzer/triton/client/client.py

Lines changed: 30 additions & 2 deletions
@@ -28,14 +28,23 @@ class TritonClient:
     TritonClientFactory
     """
 
-    def wait_for_server_ready(self, num_retries, sleep_time=1):
+    def wait_for_server_ready(
+        self,
+        num_retries,
+        sleep_time=1,
+        log_file=None,
+    ):
         """
         Parameters
         ----------
         num_retries : int
             number of times to send a ready status
             request to the server before raising
             an exception
+        sleep_time: int
+            amount of time in seconds to sleep between retries
+        log_file: TextIOWrapper
+            file that contains the server's output log
         Raises
         ------
         TritonModelAnalyzerException
@@ -50,9 +59,11 @@ def wait_for_server_ready(self, num_retries, sleep_time=1):
                     time.sleep(sleep_time)
                     return
                 else:
+                    self._check_for_triton_log_errors(log_file)
                     time.sleep(sleep_time)
                     retries -= 1
             except Exception as e:
+                self._check_for_triton_log_errors(log_file)
                 time.sleep(sleep_time)
                 retries -= 1
         if retries == 0:
@@ -162,7 +173,7 @@ def get_model_config(self, model_name, num_retries):
         Returns
         -------
         dict or None
-            A dictionary containg the model config.
+            A dictionary containing the model config.
         """
 
         self.wait_for_model_ready(model_name, num_retries)
@@ -174,3 +185,20 @@ def is_server_ready(self):
         Returns true if the server is ready. Else False
         """
         return self._client.is_server_ready()
+
+    def _check_for_triton_log_errors(self, log_file):
+        if not log_file:
+            return
+
+        log_file.seek(0)
+        log_output = log_file.read()
+
+        if not type(log_output) == str:
+            log_output = log_output.decode('utf-8')
+
+        if log_output:
+            if "Unexpected argument:" in log_output:
+                error_start = log_output.find("Unexpected argument:")
+                raise TritonModelAnalyzerException(
+                    f'Error: TritonServer did not launch successfully\n\n{log_output[error_start:]}'
+                )
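To see the new detection path in isolation (a sketch using only the stdlib, not the TritonClient API, which needs a live server): write a failing server's output into a temp file and apply the same scan _check_for_triton_log_errors performs on each failed retry.

import tempfile

# Simulate the log a tritonserver launched with a bad CLI flag would produce.
log = tempfile.NamedTemporaryFile()
log.write(b'Unexpected argument: --not-a-real-flag\n')
log.flush()

log.seek(0)
output = log.read().decode('utf-8')  # NamedTemporaryFile reads back bytes
if 'Unexpected argument:' in output:
    start = output.find('Unexpected argument:')
    print(f'Error: TritonServer did not launch successfully\n\n{output[start:]}')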

model_analyzer/triton/client/grpc_client.py

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ def get_model_config(self, model_name, num_retries):
         Returns
         -------
         dict
-            A dictionary containg the model config.
+            A dictionary containing the model config.
         """
 
         self.wait_for_model_ready(model_name, num_retries)

model_analyzer/triton/model/model_config.py

Lines changed: 2 additions & 1 deletion
@@ -138,7 +138,8 @@ def _get_default_config_from_server(config, client, gpus, model_name,
             config, gpus, use_model_repository=True)
 
         server.start()
-        client.wait_for_server_ready(config.client_max_retries)
+        client.wait_for_server_ready(num_retries=config.client_max_retries,
+                                     log_file=server.log_file())
 
         if (client.load_model(model_name) == -1):
             server.stop()

model_analyzer/triton/server/server.py

Lines changed: 7 additions & 0 deletions
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from abc import ABC, abstractmethod
+from io import TextIOWrapper
 
 
 class TritonServer(ABC):
@@ -38,6 +39,12 @@ def stop(self):
         Stops and cleans up after the server
         """
 
+    @abstractmethod
+    def log_file(self) -> TextIOWrapper:
+        """
+        Returns the server's log file
+        """
+
     @abstractmethod
     def cpu_stats(self):
         """

model_analyzer/triton/server/server_docker.py

Lines changed: 5 additions & 0 deletions
@@ -21,6 +21,8 @@
 from model_analyzer.model_analyzer_exceptions \
     import TritonModelAnalyzerException
 
+from io import TextIOWrapper
+
 LOCAL_HTTP_PORT = 8000
 LOCAL_GRPC_PORT = 8001
 LOCAL_METRICS_PORT = 8002
@@ -205,3 +207,6 @@ def cpu_stats(self):
         # Divide by 1.0e6 to convert from kilobytes to MB
         return float(used_mem_bytes.decode("utf-8")) // 1.0e3, float(
             available_mem_bytes.decode("utf-8")) // 1.0e3
+
+    def log_file(self) -> TextIOWrapper:
+        return self._log_file

model_analyzer/triton/server/server_local.py

Lines changed: 13 additions & 1 deletion
@@ -20,7 +20,9 @@
 from subprocess import Popen, DEVNULL, STDOUT, TimeoutExpired
 import psutil
 import logging
+import tempfile
 import os
+from io import TextIOWrapper
 
 logger = logging.getLogger(LOGGER_NAME)
 
@@ -50,6 +52,8 @@ def __init__(self, path, config, gpus, log_path):
         self._server_path = path
         self._gpus = gpus
         self._log_path = log_path
+        self._log_file = DEVNULL
+        self._is_first_time_starting_server = True
 
         assert self._server_config['model-repository'], \
             "Triton Server requires --model-repository argument to be set."
@@ -82,11 +86,16 @@ def start(self, env=None):
 
         if self._log_path:
             try:
+                if self._is_first_time_starting_server:
+                    if os.path.exists(self._log_path):
+                        os.remove(self._log_path)
                 self._log_file = open(self._log_path, 'a+')
             except OSError as e:
                 raise TritonModelAnalyzerException(e)
         else:
-            self._log_file = DEVNULL
+            self._log_file = tempfile.NamedTemporaryFile()
+
+        self._is_first_time_starting_server = False
 
         # Construct Popen command
         try:
@@ -135,3 +144,6 @@ def cpu_stats(self):
             1.0e6), (system_memory_info.available // 1.0e6)
         else:
             return 0.0, 0.0
+
+    def log_file(self) -> TextIOWrapper:
+        return self._log_file
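The key change here: DEVNULL discards the server's output and cannot be read back, so when no log path is set the output now goes to a NamedTemporaryFile, which supports seek() and read(). A small sketch of the capture pattern ('sh -c' stands in for tritonserver):

import subprocess
import tempfile

# Capture a child process's output in a temp file instead of discarding it
log = tempfile.NamedTemporaryFile()
proc = subprocess.Popen(['sh', '-c', 'echo "Unexpected argument: --oops" >&2'],
                        stdout=log, stderr=subprocess.STDOUT)
proc.wait()

log.seek(0)
print(log.read().decode('utf-8'))  # output is recoverable for error scanning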

tests/mocks/mock_server.py

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ class MockServerMethods(MockBase):
     """
 
     @abstractmethod
-    def assert_server_process_start_called_with(self, **args):
+    def assert_server_process_start_called_with(self, *args, **kwargs):
         """
         Asserts that the tritonserver process was started with
         the supplied arguments

tests/mocks/mock_server_local.py

Lines changed: 5 additions & 2 deletions
@@ -69,7 +69,10 @@ def _fill_patchers(self):
         self._patchers.append(self.patcher_pipe)
         self._patchers.append(self.patcher_psutil)
 
-    def assert_server_process_start_called_with(self, cmd, gpus):
+    def assert_server_process_start_called_with(self,
+                                                cmd,
+                                                gpus,
+                                                stdout=MagicMock()):
         """
         Asserts that Popen was called
         with the cmd provided.
@@ -80,7 +83,7 @@ def assert_server_process_start_called_with(self, cmd, gpus):
                          [gpu.device_uuid() for gpu in gpus])
 
         self.popen_mock.assert_called_once_with(cmd,
-                                                stdout=self.pipe_mock,
+                                                stdout=stdout,
                                                 stderr=self.stdout_mock,
                                                 start_new_session=True,
                                                 universal_newlines=True,
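The mock assertion now takes the expected stdout object instead of hard-coding the pipe mock. A simplified sketch of the same idea with plain unittest.mock (launch is a hypothetical stand-in for the code under test, and ANY plays the role of the permissive default):

import subprocess
from unittest.mock import patch, ANY

def launch(log_file):
    # Hypothetical stand-in for TritonServerLocal.start()
    return subprocess.Popen(['tritonserver'], stdout=log_file,
                            stderr=subprocess.STDOUT)

with patch('subprocess.Popen') as popen_mock:
    launch(log_file=None)
    # ANY matches whatever stdout object was actually passed, mirroring
    # how the new stdout parameter relaxes the original strict assertion
    popen_mock.assert_called_once_with(['tritonserver'], stdout=ANY,
                                       stderr=subprocess.STDOUT)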
