Skip to content

Commit c4ae533

Browse files
authored
Stop MA if failures are detected on initial profiles (#697)
* Detecting if no measurements are returned at the beginning of a profile
* Removing timeout PA test
* Adding missing newlines
* Fixes based on review comments
1 parent b30e061 commit c4ae533

File tree

6 files changed

+62
-28
lines changed

6 files changed

+62
-28
lines changed

model_analyzer/cli/cli.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@
2222

2323
class CLI:
2424
"""
25-
CLI class to parse the commandline arguments
25+
CLI class to parse the command line arguments
2626
"""
2727

2828
def __init__(self):
@@ -49,7 +49,7 @@ def _add_global_options(self):
4949
'-v',
5050
'--verbose',
5151
action='store_true',
52-
help='Show detailed logs, messags and status.')
52+
help='Show detailed logs, messages and status.')
5353
self._parser.add_argument('-m',
5454
'--mode',
5555
type=str,

model_analyzer/constants.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -62,3 +62,6 @@
6262

6363
# Constraints
6464
GLOBAL_CONSTRAINTS_KEY = "__default__"
65+
66+
# Measurement constants
67+
INVALID_MEASUREMENT_THRESHOLD = 2

model_analyzer/model_manager.py

Lines changed: 28 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -12,9 +12,9 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
from typing import List
15+
from typing import List, Optional
1616

17-
from model_analyzer.constants import LOGGER_NAME
17+
from model_analyzer.constants import LOGGER_NAME, INVALID_MEASUREMENT_THRESHOLD
1818
from model_analyzer.config.generate.run_config_generator_factory import RunConfigGeneratorFactory
1919
from .model_analyzer_exceptions import TritonModelAnalyzerException
2020
from model_analyzer.config.generate.model_variant_name_manager import ModelVariantNameManager
@@ -29,6 +29,7 @@
2929
from model_analyzer.state.analyzer_state_manager import AnalyzerStateManager
3030
from model_analyzer.triton.model.model_config import ModelConfig
3131
from model_analyzer.config.input.objects.config_model_profile_spec import ConfigModelProfileSpec
32+
from model_analyzer.result.run_config_measurement import RunConfigMeasurement
3233

3334
import logging
3435

@@ -79,6 +80,9 @@ def __init__(self, config: ConfigCommandProfile, gpus: List[GPUDevice],
7980
if state_manager.starting_fresh_run():
8081
self._init_state()
8182

83+
self._failed_measurement_attempts = 0
84+
self._received_measurement_values_from_pa = False
85+
8286
self._model_variant_name_manager = ModelVariantNameManager.from_dict(
8387
self._state_manager.get_state_variable(
8488
'ModelManager.model_variant_name_manager'))
@@ -121,6 +125,9 @@ def run_models(self, models: List[ConfigModelProfileSpec]) -> None:
121125
if run_config.is_legal_combination():
122126
measurement = self._metrics_manager.execute_run_config(
123127
run_config)
128+
129+
self._check_for_valid_measurement(measurement)
130+
self._stop_ma_if_no_valid_measurement_threshold_reached()
124131
else:
125132
logger.info("Skipping illegal run configuration")
126133
measurement = None
@@ -188,9 +195,27 @@ def _check_for_ensemble_model_incompatibility(
188195
def _init_state(self):
189196
"""
190197
Sets ModelManager object managed
191-
state variables in AnalyerState
198+
state variables in AnalyzerState
192199
"""
193200

194201
self._state_manager.set_state_variable(
195202
'ModelManager.model_variant_name_manager',
196203
self._state_manager.default_encode(ModelVariantNameManager()))
204+
205+
def _check_for_valid_measurement(
206+
self, measurement: Optional[RunConfigMeasurement]) -> None:
207+
if measurement:
208+
self._received_measurement_values_from_pa = True
209+
else:
210+
self._failed_measurement_attempts += 1
211+
212+
def _stop_ma_if_no_valid_measurement_threshold_reached(self) -> None:
213+
if self._received_measurement_values_from_pa:
214+
return
215+
216+
if self._failed_measurement_attempts >= INVALID_MEASUREMENT_THRESHOLD:
217+
raise TritonModelAnalyzerException(
218+
f'The first {INVALID_MEASUREMENT_THRESHOLD} attempts to acquire measurements ' \
219+
'have failed. Please examine the Tritonserver/PA error logs ' \
220+
'to determine what has gone wrong.'
221+
)

qa/L0_perf_analyzer/test.sh

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -63,8 +63,6 @@ for CONFIG_FILE in ${LIST_OF_CONFIG_FILES[@]}; do
6363
TEST_NAME="count_window"
6464
elif [[ "$CONFIG_FILE" == "config-additive-args-count-no-adjust.yml" ]]; then
6565
TEST_NAME="count_window"
66-
elif [[ "$CONFIG_FILE" == "config-perf-output-timeout.yml" ]]; then
67-
TEST_NAME="perf_output_timeout"
6866
fi
6967

7068
echo $TEST_NAME

qa/L0_perf_analyzer/test_config_generator.py

Lines changed: 0 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -87,20 +87,6 @@ def generate_additive_args(self):
8787
with open('./config-additive-args-count-no-adjust.yml', 'w') as f:
8888
yaml.dump(model_config, f)
8989

90-
def generate_perf_output_timeout(self):
91-
model_config = {
92-
'profile_models': ['vgg19_libtorch'],
93-
'skip_summary_reports': True, # Don't fail analyzing no results
94-
'perf_output': True,
95-
'perf_analyzer_timeout': 2,
96-
'perf_analyzer_flags': {
97-
'measurement-mode': 'time_windows',
98-
'measurement-interval': 2,
99-
}
100-
}
101-
with open('./config-perf-output-timeout.yml', 'w') as f:
102-
yaml.dump(model_config, f)
103-
10490

10591
if __name__ == '__main__':
10692
TestConfigGenerator()

tests/test_model_manager.py

Lines changed: 29 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -691,7 +691,7 @@ def test_default_config_always_run_automatic_search(self):
691691

692692
def test_throughput_early_exit_minimum_runs(self):
693693
"""
694-
Test that there is an early backoff when sweeping concurrency
694+
Test that there is an early back off when sweeping concurrency
695695
696696
The behavior is that MA will try at least 4 concurrencies. If
697697
at that point none of the last 3 attempts have had satisfactory
@@ -738,7 +738,7 @@ def test_throughput_early_exit_minimum_runs(self):
738738

739739
def test_no_early_exit_if_not_auto_search(self):
740740
"""
741-
Test that there is NOT an early backoff when sweeping concurrency if not in auto sweep mode
741+
Test that there is NOT an early back off when sweeping concurrency if not in auto sweep mode
742742
743743
This test hardcodes the 'throughput' to 1, so for all model
744744
configs the gain will be invalid. However, it should still sweep
@@ -780,7 +780,7 @@ def test_no_early_exit_if_not_auto_search(self):
780780

781781
def test_throughput_early_exit(self):
782782
"""
783-
Test that there is an early backoff when sweeping concurrency
783+
Test that there is an early back off when sweeping concurrency
784784
785785
The behavior is that MA stop if it had 4 concurrencies in a row
786786
without any valid gain amongst any of them
@@ -822,9 +822,10 @@ def test_throughput_early_exit(self):
822822
]
823823
self._test_model_manager(yaml_str, expected_ranges)
824824

825+
@patch('model_analyzer.model_manager.INVALID_MEASUREMENT_THRESHOLD', 999)
825826
def test_bad_result_early_PA_exit(self):
826827
"""
827-
Test that there is an early backoff for bad result (out of memory)
828+
Test that there is an early back off for bad result (out of memory)
828829
829830
If no measurements are returned in an attempt, no further concurrencies
830831
should be tried.
@@ -863,9 +864,30 @@ def test_bad_result_early_PA_exit(self):
863864
mock_method.return_value = None
864865
self._test_model_manager(yaml_str, expected_ranges)
865866

867+
def test_report_failure_no_measurements(self):
868+
"""
869+
Test that MA takes an exception if we detect no measurements returned from
870+
PA at the start of profile
871+
"""
872+
873+
yaml_str = ("""
874+
profile_models: test_model
875+
run_config_search_max_concurrency: 128
876+
run_config_search_max_instance_count: 2
877+
run_config_search_min_model_batch_size: 8
878+
run_config_search_max_model_batch_size: 8
879+
run_config_search_disable: False
880+
""")
881+
882+
with patch.object(MetricsManagerSubclass,
883+
"_get_next_perf_throughput_value") as mock_method:
884+
mock_method.return_value = None
885+
with self.assertRaises(TritonModelAnalyzerException):
886+
self._test_model_manager(yaml_str, None)
887+
866888
def test_lower_throughput_early_batch_size_exit(self):
867889
"""
868-
Test that there is an early backoff for throughput decreasing
890+
Test that there is an early back off for throughput decreasing
869891
when sweeping max_batch_size
870892
871893
If a list of measurements is provided with a lower max throughput than the previous
@@ -919,10 +941,10 @@ def test_lower_throughput_early_batch_size_exit(self):
919941
#yapf: disable
920942
mock_method.side_effect = [
921943
1, 2, 4, # Default config, concurrency 1,2,4
922-
1, 2, 4, # 1 Instance, Batch size 8, concurency 1,2,4
944+
1, 2, 4, # 1 Instance, Batch size 8, concurrency 1,2,4
923945
2, 4, 8, # 1 Instance, Batch size 16, concurrency 1,2,4
924946
2, 4, 8, # 1 Instance, Batch size 32, concurrency 1,2,4
925-
1, 2, 4, # 1 Instance, Batch size 8, concurency 1,2,4
947+
1, 2, 4, # 1 Instance, Batch size 8, concurrency 1,2,4
926948
8, 4, 2, # 1 Instance, Batch size 16, concurrency 1,2,4
927949
4, 8, 16, # 1 Instance, Batch size 32, concurrency 1,2,4
928950
4, 8, 16 # 1 Instance, Batch size 64, concurrency 1,2,4

0 commit comments

Comments (0)