Commit 3c4193a

Detect and handle invalid metrics url (#715)
* Raise exception if no gpu records returned
* Skipping check if remote mode
* Skipping check if c_api mode
* Moved checking into remote monitor
* Fixing typo
* Attempt at codeQL fix
* Ignoring for C_API
* Fixing bad indexes
* Fixing ports in ssl_https
* Revert http port change
* Fix typo
* Fixing metrics port
1 parent 1b7a746 commit 3c4193a

File tree

6 files changed: +46 −9 lines changed

model_analyzer/monitor/cpu_monitor.py

Lines changed: 3 additions & 0 deletions
@@ -41,6 +41,9 @@ def __init__(self, server, frequency, metrics):
         self._cpu_memory_records = []
         self._server = server

+    def is_monitoring_connected(self) -> bool:
+        return True
+
     def _monitoring_iteration(self):
         """
         Get memory info of process and

model_analyzer/monitor/dcgm/dcgm_monitor.py

Lines changed: 3 additions & 0 deletions
@@ -89,6 +89,9 @@ def __init__(self, gpus, frequency, metrics, dcgmPath=None):
             dcgm_handle, self.group_id, self.dcgm_field_group_id.value,
             structs.DCGM_OPERATION_MODE_MANUAL, frequency, 3600, 0, 0)

+    def is_monitoring_connected(self) -> bool:
+        return True
+
     def _monitoring_iteration(self):
         self.group_watcher.GetMore()

model_analyzer/monitor/monitor.py

Lines changed: 15 additions & 1 deletion
@@ -87,6 +87,20 @@ def _collect_records(self):

         pass

+    @abstractmethod
+    def is_monitoring_connected(self) -> bool:
+        """
+        This method is called to determine if we can connect to the
+        monitor
+
+        Returns
+        -------
+        bool
+            True if connection to the monitor was successful
+        """
+
+        pass
+
     def start_recording_metrics(self):
         """
         Start recording the metrics.

@@ -97,7 +111,7 @@ def start_recording_metrics(self):

     def stop_recording_metrics(self):
         """
-        Stop recording metrics. This will stop monitring all the metrics.
+        Stop recording metrics. This will stop monitoring all the metrics.

         Returns
         ------
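
Every Monitor subclass must now provide this hook. The local monitors in this commit (CPUMonitor and DCGMMonitor above) simply return True because there is no remote endpoint to lose, while RemoteMonitor below performs a real check. A minimal hypothetical subclass showing just the new method — the class name is illustrative, the import path assumes the base class in model_analyzer/monitor/monitor.py, and a real subclass would also implement the other monitoring hooks:

from model_analyzer.monitor.monitor import Monitor


class AlwaysConnectedMonitor(Monitor):
    """Hypothetical local monitor: nothing remote to reach, so the check trivially passes."""

    def is_monitoring_connected(self) -> bool:
        # Mirrors the CPUMonitor and DCGMMonitor implementations added in this commit.
        return True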

model_analyzer/monitor/remote_monitor.py

Lines changed: 10 additions & 2 deletions
@@ -52,10 +52,18 @@ def __init__(self, metrics_url, frequency, metrics):
                 f"GPU monitoring does not currently support the following metrics: {unsupported_metrics}]"
             )

+    def is_monitoring_connected(self) -> bool:
+        try:
+            status_code = requests.get(self._metrics_url).status_code
+        except Exception as ex:
+            return False
+
+        return status_code == requests.codes["okay"]
+
     def _monitoring_iteration(self):
         """
         When this function runs, it requests all the metrics
-        that triton has collected aand organizes them into
+        that triton has collected and organizes them into
         the dict. This function should run as fast
         as possible
         """

@@ -66,7 +74,7 @@ def _monitoring_iteration(self):
     def _collect_records(self):
         """
         This function will organize the metrics responses
-        and creat Records out of them
+        and create Records out of them
         """

         records = []
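
The check above boils down to one HTTP GET against Triton's Prometheus-style metrics endpoint. A standalone sketch of the same probe, handy for verifying a suspect URL by hand; the function name, the explicit timeout, and the example port are assumptions for illustration, not part of this commit:

import requests


def metrics_endpoint_reachable(metrics_url: str, timeout: float = 2.0) -> bool:
    """Return True if a GET against the Triton metrics URL answers with HTTP 200."""
    try:
        # The commit's version uses requests' default (no) timeout; one is added here
        # so a hand-run probe fails quickly when the host is unreachable.
        response = requests.get(metrics_url, timeout=timeout)
    except requests.RequestException:
        return False
    return response.status_code == requests.codes["okay"]


# Example: probe Triton's metrics endpoint (8002 is assumed here as the default port).
# print(metrics_endpoint_reachable("http://localhost:8002/metrics"))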

model_analyzer/record/metrics_manager.py

Lines changed: 8 additions & 1 deletion
@@ -416,10 +416,18 @@ def _start_monitors(self, cpu_only=False):
                     self._gpu_monitor = RemoteMonitor(
                         self._config.triton_metrics_url,
                         self._config.monitoring_interval, self._gpu_metrics)
+
                 self._gpu_monitor.start_recording_metrics()
             except TritonModelAnalyzerException:
                 self._destroy_monitors()
                 raise
+            finally:
+                if not self._gpu_monitor.is_monitoring_connected(
+                ) and self._config.triton_launch_mode != 'c_api':
+                    raise TritonModelAnalyzerException(
+                        f'Failed to connect to Tritonserver\'s GPU metrics monitor. ' \
+                        f'Please check that the `triton_metrics_url` value is set correctly: {self._config.triton_metrics_url}.'
+                    )

         self._cpu_monitor = CPUMonitor(self._server,
                                        self._config.monitoring_interval,

@@ -558,7 +566,6 @@ def _get_gpu_inference_metrics(self):

         # Stop and destroy DCGM monitor
         gpu_records = self._gpu_monitor.stop_recording_metrics()
-
         gpu_metrics = self._aggregate_gpu_records(gpu_records)
         return gpu_metrics
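
With this guard, a misconfigured `triton_metrics_url` now fails fast with an actionable error instead of silently producing no GPU records; the check is skipped only when Triton is launched in `c_api` mode. A condensed, hypothetical restatement of the guard for illustration, with a plain RuntimeError standing in for TritonModelAnalyzerException:

import requests


def ensure_gpu_metrics_reachable(metrics_url: str, triton_launch_mode: str) -> None:
    """Raise if Triton's GPU metrics endpoint cannot be reached, mirroring the check added above."""
    if triton_launch_mode == 'c_api':
        # The commit exempts the c_api launch mode from this check.
        return
    try:
        connected = requests.get(metrics_url).status_code == requests.codes["okay"]
    except Exception:
        connected = False
    if not connected:
        raise RuntimeError(
            f"Failed to connect to Tritonserver's GPU metrics monitor. "
            f"Please check that the `triton_metrics_url` value is set correctly: {metrics_url}.")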

qa/L0_ssl_https/test.sh

Lines changed: 7 additions & 5 deletions
@@ -25,6 +25,8 @@ TRITON_LAUNCH_MODE=${TRITON_LAUNCH_MODE:="remote"}
 CLIENT_PROTOCOL="http"
 PORTS=(`find_available_ports 2`)
 HTTP_PORT="8000"
+GRPC_PORT="${PORTS[0]}"
+METRICS_PORT="${PORTS[1]}"
 GPUS=(`get_all_gpus_uuids`)
 OUTPUT_MODEL_REPOSITORY=${OUTPUT_MODEL_REPOSITORY:=`get_output_directory`}
 WORKING_CONFIG_FILE="working_config.yml"

@@ -64,7 +66,7 @@ cp client.crt client2.crt && sed -i "s/\b\(.\)/\u\1/g" client2.crt

 # For remote launch, set server args and start server
 SERVER=`which tritonserver`
-SERVER_ARGS="--model-repository=$MODEL_REPOSITORY --model-control-mode explicit --http-port ${HTTP_PORT} --grpc-port ${PORTS[0]} --metrics-port ${PORTS[1]}"
+SERVER_ARGS="--model-repository=$MODEL_REPOSITORY --model-control-mode explicit --http-port ${HTTP_PORT} --grpc-port ${GRPC_PORT} --metrics-port ${METRICS_PORT}"
 SERVER_HTTP_PORT=${HTTP_PORT}
 SERVER_LOG="$LOGS_DIR/server.log"

@@ -87,8 +89,8 @@ create_result_paths -test-name $TEST_NAME

 MODEL_ANALYZER_ARGS="-m $MODEL_REPOSITORY -f $WORKING_CONFIG_FILE -e $EXPORT_PATH --checkpoint-directory $CHECKPOINT_DIRECTORY"
 MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --client-protocol=$CLIENT_PROTOCOL --triton-launch-mode=$TRITON_LAUNCH_MODE"
-MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --triton-http-endpoint https://localhost:443 --triton-grpc-endpoint localhost:${PORTS[1]}"
-MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --triton-metrics-url https://localhost:${PORTS[2]}/metrics"
+MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --triton-http-endpoint https://localhost:443 --triton-grpc-endpoint localhost:${GRPC_PORT}"
+MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --triton-metrics-url http://localhost:${METRICS_PORT}/metrics"
 MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --output-model-repository-path $OUTPUT_MODEL_REPOSITORY --override-output-model-repository"
 MODEL_ANALYZER_SUBCOMMAND="profile"
 run_analyzer

@@ -104,8 +106,8 @@ create_result_paths -test-name $TEST_NAME

 MODEL_ANALYZER_ARGS="-m $MODEL_REPOSITORY -f $BROKEN_CONFIG_FILE -e $EXPORT_PATH --checkpoint-directory $CHECKPOINT_DIRECTORY"
 MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --client-protocol=$CLIENT_PROTOCOL --triton-launch-mode=$TRITON_LAUNCH_MODE"
-MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --triton-http-endpoint https://localhost:443 --triton-grpc-endpoint localhost:${PORTS[1]}"
-MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --triton-metrics-url https://localhost:${PORTS[2]}/metrics"
+MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --triton-http-endpoint https://localhost:443 --triton-grpc-endpoint localhost:${GRPC_PORT}"
+MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --triton-metrics-url http://localhost:${METRICS_PORT}/metrics"
 MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --output-model-repository-path $OUTPUT_MODEL_REPOSITORY --override-output-model-repository"
 MODEL_ANALYZER_SUBCOMMAND="profile"
 run_analyzer
