Commit 3c4193a

Detect and handle invalid metrics url (#715)
* Raise exception if no gpu records returned
* Skipping check if remote mode
* Skipping check if c_api mode
* Moved checking into remote monitor
* Fixing typo
* Attempt at codeQL fix
* Ignoring for C_API
* Fixing bad indexes
* Fixing ports in ssl_https
* Revert http port change
* Fix typo
* Fixing metrics port
1 parent 1b7a746 commit 3c4193a

File tree

6 files changed: +46 −9 lines changed

model_analyzer/monitor/cpu_monitor.py

Lines changed: 3 additions & 0 deletions
@@ -41,6 +41,9 @@ def __init__(self, server, frequency, metrics):
         self._cpu_memory_records = []
         self._server = server

+    def is_monitoring_connected(self) -> bool:
+        return True
+
     def _monitoring_iteration(self):
         """
         Get memory info of process and

model_analyzer/monitor/dcgm/dcgm_monitor.py

Lines changed: 3 additions & 0 deletions
@@ -89,6 +89,9 @@ def __init__(self, gpus, frequency, metrics, dcgmPath=None):
             dcgm_handle, self.group_id, self.dcgm_field_group_id.value,
             structs.DCGM_OPERATION_MODE_MANUAL, frequency, 3600, 0, 0)

+    def is_monitoring_connected(self) -> bool:
+        return True
+
     def _monitoring_iteration(self):
         self.group_watcher.GetMore()

model_analyzer/monitor/monitor.py

Lines changed: 15 additions & 1 deletion
@@ -87,6 +87,20 @@ def _collect_records(self):

         pass

+    @abstractmethod
+    def is_monitoring_connected(self) -> bool:
+        """
+        This method is called to determine if we can connect to the
+        monitor
+
+        Returns
+        -------
+        bool
+            True if connection to the monitor was successful
+        """
+
+        pass
+
     def start_recording_metrics(self):
         """
         Start recording the metrics.

@@ -97,7 +111,7 @@ def start_recording_metrics(self):

     def stop_recording_metrics(self):
         """
-        Stop recording metrics. This will stop monitring all the metrics.
+        Stop recording metrics. This will stop monitoring all the metrics.

         Returns
         ------
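
Every Monitor subclass must now provide this hook. The local monitors in this commit (CPUMonitor and DCGMMonitor above) simply return True because there is no remote endpoint to lose, while RemoteMonitor below performs a real check. A minimal hypothetical subclass showing just the new method — the class name is illustrative, the import path assumes the base class in model_analyzer/monitor/monitor.py, and a real subclass would also implement the other monitoring hooks:

from model_analyzer.monitor.monitor import Monitor


class AlwaysConnectedMonitor(Monitor):
    """Hypothetical local monitor: nothing remote to reach, so the check trivially passes."""

    def is_monitoring_connected(self) -> bool:
        # Mirrors the CPUMonitor and DCGMMonitor implementations added in this commit.
        return True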

model_analyzer/monitor/remote_monitor.py

Lines changed: 10 additions & 2 deletions
@@ -52,10 +52,18 @@ def __init__(self, metrics_url, frequency, metrics):
                 f"GPU monitoring does not currently support the following metrics: {unsupported_metrics}]"
             )

+    def is_monitoring_connected(self) -> bool:
+        try:
+            status_code = requests.get(self._metrics_url).status_code
+        except Exception as ex:
+            return False
+
+        return status_code == requests.codes["okay"]
+
     def _monitoring_iteration(self):
         """
         When this function runs, it requests all the metrics
-        that triton has collected aand organizes them into
+        that triton has collected and organizes them into
         the dict. This function should run as fast
         as possible
         """

@@ -66,7 +74,7 @@ def _monitoring_iteration(self):
     def _collect_records(self):
         """
         This function will organize the metrics responses
-        and creat Records out of them
+        and create Records out of them
         """

         records = []
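
The check above boils down to one HTTP GET against Triton's Prometheus-style metrics endpoint. A standalone sketch of the same probe, handy for verifying a suspect URL by hand; the function name, the explicit timeout, and the example port are assumptions for illustration, not part of this commit:

import requests


def metrics_endpoint_reachable(metrics_url: str, timeout: float = 2.0) -> bool:
    """Return True if a GET against the Triton metrics URL answers with HTTP 200."""
    try:
        # The commit's version uses requests' default (no) timeout; one is added here
        # so a hand-run probe fails quickly when the host is unreachable.
        response = requests.get(metrics_url, timeout=timeout)
    except requests.RequestException:
        return False
    return response.status_code == requests.codes["okay"]


# Example: probe Triton's metrics endpoint (8002 is assumed here as the default port).
# print(metrics_endpoint_reachable("http://localhost:8002/metrics"))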

model_analyzer/record/metrics_manager.py

Lines changed: 8 additions & 1 deletion
@@ -416,10 +416,18 @@ def _start_monitors(self, cpu_only=False):
                     self._gpu_monitor = RemoteMonitor(
                         self._config.triton_metrics_url,
                         self._config.monitoring_interval, self._gpu_metrics)
+
                 self._gpu_monitor.start_recording_metrics()
             except TritonModelAnalyzerException:
                 self._destroy_monitors()
                 raise
+            finally:
+                if not self._gpu_monitor.is_monitoring_connected(
+                ) and self._config.triton_launch_mode != 'c_api':
+                    raise TritonModelAnalyzerException(
+                        f'Failed to connect to Tritonserver\'s GPU metrics monitor. ' \
+                        f'Please check that the `triton_metrics_url` value is set correctly: {self._config.triton_metrics_url}.'
+                    )

         self._cpu_monitor = CPUMonitor(self._server,
                                        self._config.monitoring_interval,

@@ -558,7 +566,6 @@ def _get_gpu_inference_metrics(self):

         # Stop and destroy DCGM monitor
         gpu_records = self._gpu_monitor.stop_recording_metrics()
-
         gpu_metrics = self._aggregate_gpu_records(gpu_records)
         return gpu_metrics
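
With this guard, a misconfigured `triton_metrics_url` now fails fast with an actionable error instead of silently producing no GPU records; the check is skipped only when Triton is launched in `c_api` mode. A condensed, hypothetical restatement of the guard for illustration, with a plain RuntimeError standing in for TritonModelAnalyzerException:

import requests


def ensure_gpu_metrics_reachable(metrics_url: str, triton_launch_mode: str) -> None:
    """Raise if Triton's GPU metrics endpoint cannot be reached, mirroring the check added above."""
    if triton_launch_mode == 'c_api':
        # The commit exempts the c_api launch mode from this check.
        return
    try:
        connected = requests.get(metrics_url).status_code == requests.codes["okay"]
    except Exception:
        connected = False
    if not connected:
        raise RuntimeError(
            f"Failed to connect to Tritonserver's GPU metrics monitor. "
            f"Please check that the `triton_metrics_url` value is set correctly: {metrics_url}.")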

qa/L0_ssl_https/test.sh

Lines changed: 7 additions & 5 deletions
@@ -25,6 +25,8 @@ TRITON_LAUNCH_MODE=${TRITON_LAUNCH_MODE:="remote"}
 CLIENT_PROTOCOL="http"
 PORTS=(`find_available_ports 2`)
 HTTP_PORT="8000"
+GRPC_PORT="${PORTS[0]}"
+METRICS_PORT="${PORTS[1]}"
 GPUS=(`get_all_gpus_uuids`)
 OUTPUT_MODEL_REPOSITORY=${OUTPUT_MODEL_REPOSITORY:=`get_output_directory`}
 WORKING_CONFIG_FILE="working_config.yml"

@@ -64,7 +66,7 @@ cp client.crt client2.crt && sed -i "s/\b\(.\)/\u\1/g" client2.crt

 # For remote launch, set server args and start server
 SERVER=`which tritonserver`
-SERVER_ARGS="--model-repository=$MODEL_REPOSITORY --model-control-mode explicit --http-port ${HTTP_PORT} --grpc-port ${PORTS[0]} --metrics-port ${PORTS[1]}"
+SERVER_ARGS="--model-repository=$MODEL_REPOSITORY --model-control-mode explicit --http-port ${HTTP_PORT} --grpc-port ${GRPC_PORT} --metrics-port ${METRICS_PORT}"
 SERVER_HTTP_PORT=${HTTP_PORT}
 SERVER_LOG="$LOGS_DIR/server.log"

@@ -87,8 +89,8 @@ create_result_paths -test-name $TEST_NAME

 MODEL_ANALYZER_ARGS="-m $MODEL_REPOSITORY -f $WORKING_CONFIG_FILE -e $EXPORT_PATH --checkpoint-directory $CHECKPOINT_DIRECTORY"
 MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --client-protocol=$CLIENT_PROTOCOL --triton-launch-mode=$TRITON_LAUNCH_MODE"
-MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --triton-http-endpoint https://localhost:443 --triton-grpc-endpoint localhost:${PORTS[1]}"
-MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --triton-metrics-url https://localhost:${PORTS[2]}/metrics"
+MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --triton-http-endpoint https://localhost:443 --triton-grpc-endpoint localhost:${GRPC_PORT}"
+MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --triton-metrics-url http://localhost:${METRICS_PORT}/metrics"
 MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --output-model-repository-path $OUTPUT_MODEL_REPOSITORY --override-output-model-repository"
 MODEL_ANALYZER_SUBCOMMAND="profile"
 run_analyzer

@@ -104,8 +106,8 @@ create_result_paths -test-name $TEST_NAME

 MODEL_ANALYZER_ARGS="-m $MODEL_REPOSITORY -f $BROKEN_CONFIG_FILE -e $EXPORT_PATH --checkpoint-directory $CHECKPOINT_DIRECTORY"
 MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --client-protocol=$CLIENT_PROTOCOL --triton-launch-mode=$TRITON_LAUNCH_MODE"
-MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --triton-http-endpoint https://localhost:443 --triton-grpc-endpoint localhost:${PORTS[1]}"
-MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --triton-metrics-url https://localhost:${PORTS[2]}/metrics"
+MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --triton-http-endpoint https://localhost:443 --triton-grpc-endpoint localhost:${GRPC_PORT}"
+MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --triton-metrics-url http://localhost:${METRICS_PORT}/metrics"
 MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --output-model-repository-path $OUTPUT_MODEL_REPOSITORY --override-output-model-repository"
 MODEL_ANALYZER_SUBCOMMAND="profile"
 run_analyzer
