
Commit 70a0eee

test: Tests for Metrics API enhancement to include error counters (#7423)
1 parent 3dbf09e commit 70a0eee

File tree

4 files changed, +144 -3 lines changed


docs/user_guide/metrics.md

Lines changed: 19 additions & 0 deletions
@@ -100,6 +100,25 @@ Count*. The count metrics are illustrated by the following examples:
 | |Execution Count |`nv_inference_exec_count` |Number of inference batch executions (see [Inference Request Metrics](#inference-request-metrics), does not include cached requests)|Per model|Per request|
 | |Pending Request Count |`nv_inference_pending_request_count` |Number of inference requests awaiting execution by a backend. This number is incremented when a request is enqueued to the server (`TRITONSERVER_ServerInferAsync`) and is decremented when a backend is about to start executing the request. More details can be found below. |Per model|Per request|
 
+#### Failure Count Categories
+
+| Failed Request Reason |Description |
+|------------|------------|
+| REJECTED | Number of inference failures due to request timeout in the scheduler. |
+| CANCELED | Number of inference failures due to request cancellation in the core. |
+| BACKEND | Number of inference failures during execution of requests in the backend/model. |
+| OTHER | Number of inference failures due to other uncategorized reasons in the core. |
+
+> **Note**
+>
+> Ensemble failure metrics will reflect the failure counts of their composing models as well as the parent model, but currently do not capture the same granularity for the "reason" label and will default to the "OTHER" reason.
+>
+> For example, if EnsembleA contains ModelA, and ModelA experiences a failed request due to a queue/backlog timeout in the scheduler, ModelA will have a failed request metric reflecting `reason=REJECTED` and `count=1`.
+> Additionally, EnsembleA will have a failed request metric reflecting `reason=OTHER` and `count=2`.
+> The `count=2` reflects 1 from the internally failed request captured by ModelA, as well as 1 from the failed top-level request sent to EnsembleA by the user/client.
+> The `reason=OTHER` reflects the fact that the ensemble doesn't currently capture the specific reason why
+> ModelA's request failed at this time.
+
 #### Pending Request Count (Queue Size) Per-Model
 
 The *Pending Request Count* reflects the number of requests that have been
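As a rough illustration of how a client might consume the new failure counters documented above, here is a minimal sketch (not part of the commit) that scrapes the metrics endpoint used throughout these tests and tallies `nv_inference_request_failure` samples by model and reason. The endpoint URL and label names come from the diff; everything else is an assumption:

```python
import re
from collections import Counter

import requests

# Default Triton metrics endpoint, matching the URL used in the tests below.
METRICS_URL = "http://localhost:8002/metrics"


def failure_counts(metrics_text):
    """Tally nv_inference_request_failure samples by (model, reason)."""
    pattern = (
        r'nv_inference_request_failure'
        r'\{model="([^"]+)",reason="([^"]+)",version="[^"]+"\} (\d+)'
    )
    counts = Counter()
    for model, reason, value in re.findall(pattern, metrics_text):
        counts[(model, reason)] += int(value)
    return counts


if __name__ == "__main__":
    text = requests.get(METRICS_URL).text
    for (model, reason), count in sorted(failure_counts(text).items()):
        print(model, reason, count)  # e.g. wrong_model BACKEND 1
```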

qa/L0_backend_python/lifecycle/lifecycle_test.py

Lines changed: 34 additions & 1 deletion
@@ -27,8 +27,11 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import os
+import re
 import sys
 
+import requests
+
 sys.path.append("../../common")
 
 import queue
@@ -63,6 +66,29 @@ class LifecycleTest(unittest.TestCase):
     def setUp(self):
         self._shm_leak_detector = shm_util.ShmLeakDetector()
 
+    def _get_metrics(self):
+        metrics_url = "http://localhost:8002/metrics"
+        r = requests.get(metrics_url)
+        r.raise_for_status()
+        return r.text
+
+    def _metrics_before_test(self, model, reason):
+        pattern = rf'nv_inference_request_failure\{{model="{model}",reason="{reason}",version="1"\}} (\d+)'
+        metrics = self._get_metrics()
+        match = re.search(pattern, metrics)
+        if match:
+            return int(match.group(1))
+        else:
+            raise Exception(f"Failure metrics for model='{model}' not found")
+
+    def _assert_metrics(
+        self, model_name, reason, expected_count_increase, initial_count
+    ):
+        metrics = self._get_metrics()
+        # Add initial count + expected count for the test
+        expected_metric = f'nv_inference_request_failure{{model="{model_name}",reason="{reason}",version="1"}} {expected_count_increase + initial_count}'
+        self.assertIn(expected_metric, metrics)
+
     def test_error_code(self):
         model_name = "error_code"
         shape = [1, 1]
@@ -181,7 +207,7 @@ def test_batch_error(self):
     def test_infer_pymodel_error(self):
         model_name = "wrong_model"
         shape = [2, 2]
-
+        initial_metrics_value = self._metrics_before_test(model_name, "BACKEND")
         with self._shm_leak_detector.Probe() as shm_probe:
             with httpclient.InferenceServerClient(
                 f"{_tritonserver_ipaddr}:8000"
@@ -207,6 +233,13 @@ def test_infer_pymodel_error(self):
                 self.assertTrue(
                     False, "Wrong exception raised or did not raise an exception"
                 )
+        expected_count_increase = 1
+        self._assert_metrics(
+            model_name,
+            "BACKEND",
+            expected_count_increase,
+            initial_metrics_value,
+        )
 
 
 if __name__ == "__main__":
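To make the helper's parsing concrete, here is a minimal standalone check (not part of the commit) of the regex used by `_metrics_before_test`, run against an illustrative Prometheus text-format line; the sample value `3` is made up, not real server output:

```python
import re

# Illustrative sample line in Prometheus text format; a real line would be
# scraped from http://localhost:8002/metrics.
sample = 'nv_inference_request_failure{model="wrong_model",reason="BACKEND",version="1"} 3'
pattern = r'nv_inference_request_failure\{model="wrong_model",reason="BACKEND",version="1"\} (\d+)'

match = re.search(pattern, sample)
assert match is not None and int(match.group(1)) == 3
```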

qa/L0_model_queue/model_queue_test.py

Lines changed: 49 additions & 1 deletion
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -30,6 +30,7 @@
 
 sys.path.append("../common")
 
+import re
 import threading
 import time
 import unittest
@@ -38,6 +39,7 @@
 
 import infer_util as iu
 import numpy as np
+import requests
 import test_util as tu
 from tritonclientutils import InferenceServerException
 
@@ -69,6 +71,29 @@ def check_deferred_exception(self):
         _deferred_exceptions.pop(0)
         raise first_exception
 
+    def _get_metrics(self):
+        metrics_url = "http://localhost:8002/metrics"
+        r = requests.get(metrics_url)
+        r.raise_for_status()
+        return r.text
+
+    def _metrics_before_test(self, model, reason):
+        pattern = rf'nv_inference_request_failure\{{model="{model}",reason="{reason}",version="1"\}} (\d+)'
+        metrics = self._get_metrics()
+        match = re.search(pattern, metrics)
+        if match:
+            return int(match.group(1))
+        else:
+            raise Exception(f"Failure metrics for model='{model}' not found")
+
+    def _assert_metrics(
+        self, model_name, reason, expected_count_increase, initial_count
+    ):
+        metrics = self._get_metrics()
+        # Add initial count + expected count for the test
+        expected_metric = f'nv_inference_request_failure{{model="{model_name}",reason="{reason}",version="1"}} {expected_count_increase + initial_count}'
+        self.assertIn(expected_metric, metrics)
+
     def check_response(
         self,
         bs,
@@ -235,6 +260,12 @@ def test_policy_reject(self):
         # requests are sent after 'default_timeout_microseconds'.
         # Expect the first request is timed-out and rejected, which makes the
         # second and third request be batched together and executed.
+        initial_metrics_value_ensemble = self._metrics_before_test(
+            "ensemble_zero_1_float32", "OTHER"
+        )
+        initial_metrics_value_custom = self._metrics_before_test(
+            "custom_zero_1_float32", "REJECTED"
+        )
         dtype = np.float32
         shapes = ([16],)
         for trial in self.trials_:
@@ -283,6 +314,23 @@ def test_policy_reject(self):
                 self.check_deferred_exception()
             except InferenceServerException as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
+        expected_count_increase = 4
+        # NOTE: Ensemble failure metrics will reflect the failure counts
+        # of their composing models as well as the parent model, but currently do not capture the same granularity
+        # for the "reason" label and will default to the "OTHER" reason.
+        self._assert_metrics(
+            "ensemble_zero_1_float32",
+            "OTHER",
+            expected_count_increase,
+            initial_metrics_value_ensemble,
+        )
+        expected_count_increase = 4
+        self._assert_metrics(
+            "custom_zero_1_float32",
+            "REJECTED",
+            expected_count_increase,
+            initial_metrics_value_custom,
+        )
 
     def test_timeout_override(self):
         # Send requests with batch sizes 1, 1, 3 where the first request

qa/L0_request_cancellation/scheduler_test.py

Lines changed: 42 additions & 1 deletion
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -27,10 +27,12 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import concurrent.futures
+import re
 import time
 import unittest
 
 import numpy as np
+import requests
 import tritonclient.grpc as grpcclient
 from tritonclient.utils import InferenceServerException
 
@@ -84,6 +86,29 @@ def _assert_streaming_response_is_cancelled(self, response):
             cancelled_count += 1
         self.assertEqual(cancelled_count, 1)
 
+    def _get_metrics(self):
+        metrics_url = "http://localhost:8002/metrics"
+        r = requests.get(metrics_url)
+        r.raise_for_status()
+        return r.text
+
+    def _metrics_before_test(self, model, reason):
+        pattern = rf'nv_inference_request_failure\{{model="{model}",reason="{reason}",version="1"\}} (\d+)'
+        metrics = self._get_metrics()
+        match = re.search(pattern, metrics)
+        if match:
+            return int(match.group(1))
+        else:
+            raise Exception(f"Failure metrics for model='{model}' not found")
+
+    def _assert_metrics(
+        self, model_name, reason, expected_count_increase, initial_count
+    ):
+        metrics = self._get_metrics()
+        # Add initial count + expected count for the test
+        expected_metric = f'nv_inference_request_failure{{model="{model_name}",reason="{reason}",version="1"}} {expected_count_increase + initial_count}'
+        self.assertIn(expected_metric, metrics)
+
     # Test queued requests on dynamic batch scheduler can be cancelled
     def test_dynamic_batch_scheduler_request_cancellation(self):
         model_name = "dynamic_batch"
@@ -114,6 +139,7 @@ def test_dynamic_batch_scheduler_request_cancellation(self):
     # Test backlogged requests on sequence batch scheduler can be cancelled
     def test_sequence_batch_scheduler_backlog_request_cancellation(self):
         model_name = "sequence_direct"
+        initial_metrics_value = self._metrics_before_test(model_name, "CANCELED")
         with concurrent.futures.ThreadPoolExecutor() as pool:
             # Saturate the single sequence slot
             saturate_thread = pool.submit(
@@ -149,11 +175,26 @@ def test_sequence_batch_scheduler_backlog_request_cancellation(self):
             self._assert_response_is_cancelled(backlog_requests[1]["response"])
             # Join saturating thread
             saturate_thread.result()
+        expected_count_increase = 2
+        self._assert_metrics(
+            model_name,
+            "CANCELED",
+            expected_count_increase,
+            initial_metrics_value,
+        )
 
     # Test queued requests on direct sequence batch scheduler can be cancelled
     def test_direct_sequence_batch_scheduler_request_cancellation(self):
         model_name = "sequence_direct"
+        initial_metrics_value = self._metrics_before_test(model_name, "CANCELED")
         self._test_sequence_batch_scheduler_queued_request_cancellation(model_name)
+        expected_count_increase = 2
+        self._assert_metrics(
+            model_name,
+            "CANCELED",
+            expected_count_increase,
+            initial_metrics_value,
+        )
 
     # Test queued requests on oldest sequence batch scheduler can be cancelled
     def test_oldest_sequence_batch_scheduler_request_cancellation(self):
