Add Binary Concurrency Search to Quick Search (#679)

nv-braf · web-flow · commit 9d86539601a9 · 2023-05-04T10:14:18.000-07:00
* Config Search Class (#675) * Framework of class and testing env created * Testing and logic for objective saturation * Adding check to ensure measurements are added * Scaffolding completed for binary search * Binary search code and testing * Fixing type checking * Refactoring * Full sweep before bcs * Minor refactoring * Changes based on Tim's review * Adding TMA to fixme * Adding config option of max BCS steps (#676) * Adding new config option for max BCS steps * Adding documentation * Changing config name * Integrating concurrency search into quick search (#677) * Replacing magic numbers with constants * Adding constraints to L0 quick search. Added checks for cases when PA returns no result (#678) * Fixing QL import error * Fixing another QL error * Using config's max binary search steps instead of default
diff --git a/docs/config.md b/docs/config.md
@@ -212,6 +212,9 @@ bls_composing_models: <comma-delimited-string-list>
 # Maximum request rate used for the automatic/quick config search
 [ run_config_search_max_request_rate: <int> | default: 8092 ]
 
+# Maximum number of steps taken during a binary search
+[ run_config_search_max_binary_search_steps: <int> | default: 5 ]
+
 # Disables automatic config search
 [ run_config_search_disable: <bool> | default: false ]
 
diff --git a/model_analyzer/config/generate/perf_analyzer_config_generator.py b/model_analyzer/config/generate/perf_analyzer_config_generator.py
@@ -259,9 +259,14 @@ def _done_walking_parameters(self) -> bool:
         if self._early_exit_enable and not self._parameter_throughput_gain_valid(
         ):
             if not self._parameter_warning_printed:
-                logger.info(
-                    "No longer increasing concurrency as throughput has plateaued"
-                )
+                if self._config_specifies_request_rate():
+                    logger.info(
+                        "No longer increasing request rate as throughput has plateaued"
+                    )
+                else:
+                    logger.info(
+                        "No longer increasing concurrency as throughput has plateaued"
+                    )
                 self._parameter_warning_printed = True
             return True
         return False
diff --git a/model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py b/model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py
@@ -27,6 +27,7 @@
 from model_analyzer.config.generate.model_profile_spec import ModelProfileSpec
 from model_analyzer.result.result_manager import ResultManager
 from model_analyzer.result.run_config_measurement import RunConfigMeasurement
+from model_analyzer.result.concurrency_search import ConcurrencySearch
 
 from model_analyzer.constants import LOGGER_NAME
 
@@ -130,26 +131,14 @@ def _sweep_concurrency_over_top_results(
                 n=self._config.num_configs_per_model,
                 include_default=True)
 
-            for count, result in enumerate(top_results):
+            for result in top_results:
                 run_config = deepcopy(result.run_config())
-
-                max_concurrency_index = int(
-                    log2(self._config.run_config_search_max_concurrency))
-
-                run_config_measurements = []
-                for concurrency in (
-                        2**i for i in range(0, max_concurrency_index + 1)):
+                concurrency_search = ConcurrencySearch(self._config)
+                for concurrency in concurrency_search.search_concurrencies():
                     run_config = self._set_concurrency(run_config, concurrency)
                     yield run_config
-
-                    run_config_measurements.append(self._last_measurement)
-
-                    if not PerfAnalyzerConfigGenerator.throughput_gain_valid_helper(
-                            throughputs=run_config_measurements):
-                        logger.info(
-                            "Terminating concurrency sweep - throughput is decreasing"
-                        )
-                        break
+                    concurrency_search.add_run_config_measurement(
+                        self._last_measurement)
 
     def _set_concurrency(self, run_config: RunConfig,
                          concurrency: int) -> RunConfig:
diff --git a/model_analyzer/config/input/config_command_profile.py b/model_analyzer/config/input/config_command_profile.py
@@ -36,7 +36,7 @@
     DEFAULT_PERF_OUTPUT_FLAG, DEFAULT_RUN_CONFIG_MAX_CONCURRENCY, DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, \
     DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE, DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE, \
     DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE, DEFAULT_RUN_CONFIG_SEARCH_MODE, \
-    DEFAULT_REQUEST_RATE_SEARCH_ENABLE, \
+    DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS, DEFAULT_REQUEST_RATE_SEARCH_ENABLE, \
     DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT, DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT, \
     DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE, DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE, \
     DEFAULT_RUN_CONFIG_SEARCH_DISABLE, DEFAULT_TRITON_DOCKER_IMAGE, DEFAULT_TRITON_GRPC_ENDPOINT, \
@@ -596,6 +596,15 @@ def _add_run_search_configs(self):
                 description=
                 "Value for the model's max_batch_size that run config search will start from."
             ))
+        self._add_config(
+            ConfigField(
+                'run_config_search_max_binary_search_steps',
+                flags=['--run-config-search-max-binary-search-steps'],
+                field_type=ConfigPrimitive(int),
+                default_value=DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS,
+                description=
+                "Maximum number of steps take during the binary concurrency search."
+            ))
         self._add_config(
             ConfigField(
                 'run_config_search_mode',
diff --git a/model_analyzer/config/input/config_defaults.py b/model_analyzer/config/input/config_defaults.py
@@ -47,6 +47,7 @@
 DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT = 1
 DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE = 1
 DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE = 128
+DEFAULT_RUN_CONFIG_MAX_BINARY_SEARCH_STEPS = 5
 DEFAULT_RUN_CONFIG_SEARCH_DISABLE = False
 DEFAULT_RUN_CONFIG_SEARCH_MODE = 'brute'
 DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE = False
diff --git a/model_analyzer/result/concurrency_search.py b/model_analyzer/result/concurrency_search.py
@@ -0,0 +1,194 @@
+# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Optional, Generator
+
+from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException
+from model_analyzer.config.input.config_command_profile import ConfigCommandProfile
+from model_analyzer.result.run_config_measurement import RunConfigMeasurement
+
+from math import log2
+
+import logging
+from model_analyzer.constants import LOGGER_NAME, THROUGHPUT_MINIMUM_GAIN, THROUGHPUT_MINIMUM_CONSECUTIVE_CONCURRENCY_TRIES
+
+logger = logging.getLogger(LOGGER_NAME)
+
+
+class ConcurrencySearch:
+    """
+    Generates the next concurrency value to use when searching through
+    RunConfigMeasurements for the best value (according to the user's objective)
+      - Will sweep by powers of two from min to max concurrency
+      - If a constraint is violated during the sweep, the algorithm will
+        perform a binary search between the passing and failing concurrencies
+        found at the violation boundary
+
+    Invariant: After each concurrency is generated the user must add its
+    measurement (None if there was no result) before requesting the next one
+    """
+
+    def __init__(self, config: ConfigCommandProfile) -> None:
+        """
+        Parameters
+        ----------
+        config: ConfigCommandProfile
+            Profile configuration information
+        """
+        # The sweep visits 2**index for each index in [min_index, max_index].
+        # int() truncates, so a bound that is not a power of two is rounded
+        # down to the nearest one.
+        self._min_concurrency_index = int(
+            log2(config.run_config_search_min_concurrency))
+        self._max_concurrency_index = int(
+            log2(config.run_config_search_max_concurrency))
+        self._max_binary_search_steps = config.run_config_search_max_binary_search_steps
+
+        # Kept in lockstep: measurement [i] belongs to concurrency [i]
+        self._run_config_measurements: List[Optional[RunConfigMeasurement]] = []
+        self._concurrencies: List[int] = []
+
+        # Bounds of the binary search. A passing value of 0 means that no
+        # passing concurrency is known.
+        self._last_failing_concurrency = 0
+        self._last_passing_concurrency = 0
+
+    def add_run_config_measurement(
+            self,
+            run_config_measurement: Optional[RunConfigMeasurement]) -> None:
+        """
+        Adds a new RunConfigMeasurement
+        Invariant: Assumed that RCMs are added in the same order they are measured
+
+        Parameters
+        ----------
+        run_config_measurement: RunConfigMeasurement or None
+            Measurement for the most recently generated concurrency, or
+            None if no result was obtained for it
+        """
+        self._run_config_measurements.append(run_config_measurement)
+
+    def search_concurrencies(self) -> Generator[int, None, None]:
+        """
+        First performs a concurrency sweep, and then, if necessary, performs
+        a binary concurrency search around the point where the constraint
+        was violated
+
+        Yields
+        ------
+        int
+            The next concurrency to measure
+
+        Raises
+        ------
+        TritonModelAnalyzerException
+            If a measurement was not added for a generated concurrency
+        """
+        yield from self._perform_concurrency_sweep()
+
+        if self._was_constraint_violated():
+            yield from self._perform_binary_concurrency_search()
+
+    def _perform_concurrency_sweep(self) -> Generator[int, None, None]:
+        """
+        Yields increasing powers of two until the maximum is reached or the
+        objective stops improving
+        """
+        for index in range(self._min_concurrency_index,
+                           self._max_concurrency_index + 1):
+            if not self._should_continue_concurrency_sweep():
+                logger.info(
+                    "Terminating concurrency sweep - throughput is decreasing")
+                return
+
+            concurrency = 2**index
+            self._concurrencies.append(concurrency)
+            yield concurrency
+
+    def _should_continue_concurrency_sweep(self) -> bool:
+        """
+        The sweep continues until the minimum number of tries has been made,
+        and after that for as long as the objective gain has not saturated
+        """
+        self._check_measurement_count()
+
+        return (not self._are_minimum_tries_reached() or
+                not self._has_objective_gain_saturated())
+
+    def _check_measurement_count(self) -> None:
+        """
+        Raises an exception if the number of measurements added does not
+        match the number of concurrencies generated
+        """
+        num_concurrencies = len(self._concurrencies)
+        num_measurements = len(self._run_config_measurements)
+
+        if num_measurements != num_concurrencies:
+            raise TritonModelAnalyzerException(
+                f"Internal Measurement count: {num_concurrencies}, doesn't match number "
+                f"of measurements added: {num_measurements}.")
+
+    def _are_minimum_tries_reached(self) -> bool:
+        """
+        Returns true once enough measurements exist to evaluate the gain
+        """
+        return len(self._run_config_measurements
+                  ) >= THROUGHPUT_MINIMUM_CONSECUTIVE_CONCURRENCY_TRIES
+
+    def _has_objective_gain_saturated(self) -> bool:
+        """
+        Returns true if the gain over the most recent measurements is below
+        THROUGHPUT_MINIMUM_GAIN
+        """
+        return self._calculate_gain() < THROUGHPUT_MINIMUM_GAIN
+
+    def _calculate_gain(self) -> float:
+        """
+        Compares the oldest measurement in the window of the last
+        THROUGHPUT_MINIMUM_CONSECUTIVE_CONCURRENCY_TRIES measurements against
+        the best measurement in that window
+
+        Returns
+        -------
+        float
+            The value returned by compare_measurements(), or 0 / 1 / -1 when
+            one or both of the measurements are missing
+        """
+        first_rcm = self._run_config_measurements[
+            -THROUGHPUT_MINIMUM_CONSECUTIVE_CONCURRENCY_TRIES]
+        best_rcm = self._get_best_rcm()
+
+        # These cover the cases where we don't get a result from PA
+        if not first_rcm and not best_rcm:
+            return 0
+        if not first_rcm:
+            return 1
+        if not best_rcm:
+            return -1
+
+        return first_rcm.compare_measurements(best_rcm)
+
+    def _get_best_rcm(self) -> Optional[RunConfigMeasurement]:
+        """
+        Returns the maximum measurement in the window of the most recent
+        measurements, or None if none of them has a result
+        """
+        # Need to remove entries (None) with no result from PA before comparing
+        pruned_rcms = [
+            rcm for rcm in self._run_config_measurements[
+                -THROUGHPUT_MINIMUM_CONSECUTIVE_CONCURRENCY_TRIES:] if rcm
+        ]
+
+        return max(pruned_rcms) if pruned_rcms else None
+
+    def _was_constraint_violated(self) -> bool:
+        """
+        Looks for the point in the sweep where the constraints started
+        failing and, if one is found, records the binary search bounds
+
+        Returns
+        -------
+        bool
+            True if a passing -> failing boundary was found, or if the
+            first measurement of the sweep failed its constraints
+        """
+        measurements = self._run_config_measurements
+        if not measurements:
+            return False
+
+        # Walk backwards so that the highest boundary is the one found.
+        # Index 1 must be included: it is the boundary between the first
+        # and second points of the sweep.
+        for i in range(len(measurements) - 1, 0, -1):
+            if self._at_constraint_failure_boundary(i):
+                self._last_failing_concurrency = self._concurrencies[i]
+                self._last_passing_concurrency = self._concurrencies[i - 1]
+                return True
+
+        # No boundary: the constraint is only violated if the very first
+        # measurement fails, in which case nothing is known to pass
+        first_rcm = measurements[0]
+        if first_rcm and not first_rcm.is_passing_constraints():
+            self._last_failing_concurrency = self._concurrencies[0]
+            self._last_passing_concurrency = 0
+            return True
+
+        return False
+
+    def _at_constraint_failure_boundary(self, index: int) -> bool:
+        """
+        Returns true if the measurement at index fails its constraints and
+        the one before it passes. A missing measurement on either side is
+        never treated as a boundary.
+        """
+        current_rcm = self._run_config_measurements[index]
+        previous_rcm = self._run_config_measurements[index - 1]
+
+        if not current_rcm or not previous_rcm:
+            return False
+
+        return (not current_rcm.is_passing_constraints() and
+                previous_rcm.is_passing_constraints())
+
+    def _perform_binary_concurrency_search(self) -> Generator[int, None, None]:
+        """
+        Yields up to max-binary-search-steps concurrencies, halving the
+        interval between the last passing and last failing concurrency
+        after each measurement
+        """
+        for _ in range(self._max_binary_search_steps):
+            concurrency = self._determine_next_binary_concurrency()
+
+            # Stop once the interval holds no unmeasured value. This also
+            # guarantees that a concurrency of 0 is never generated.
+            if not (self._last_passing_concurrency < concurrency <
+                    self._last_failing_concurrency):
+                return
+
+            self._concurrencies.append(concurrency)
+            yield concurrency
+
+            # The caller must have added the measurement for this step
+            self._check_measurement_count()
+            measurement = self._run_config_measurements[-1]
+
+            if not measurement:
+                # No result from PA: there is no way to tell which half of
+                # the interval to keep
+                return
+
+            if measurement.is_passing_constraints():
+                self._last_passing_concurrency = concurrency
+            else:
+                self._last_failing_concurrency = concurrency
+
+    def _determine_next_binary_concurrency(self) -> int:
+        """
+        Returns the midpoint (rounded down) between the last passing and
+        last failing concurrency
+        """
+        return (self._last_passing_concurrency +
+                self._last_failing_concurrency) // 2
diff --git a/qa/L0_quick_search/test.sh b/qa/L0_quick_search/test.sh
@@ -52,6 +52,7 @@ MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --triton-metrics-url http://localhost:
 MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --output-model-repository-path $OUTPUT_MODEL_REPOSITORY --override-output-model-repository"
 MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS -e $EXPORT_PATH --filename-server-only=$FILENAME_SERVER_ONLY"
 MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --filename-model-inference=$FILENAME_INFERENCE_MODEL --filename-model-gpu=$FILENAME_GPU_MODEL"
+MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --latency-budget 10"
 MODEL_ANALYZER_ARGS="$MODEL_ANALYZER_ARGS --skip-summary-reports"
 MODEL_ANALYZER_SUBCOMMAND="profile"
 run_analyzer
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -85,6 +85,7 @@ def get_test_options():
         OptionStruct("int", "profile", "--run-config-search-max-model-batch-size", None, "100", "128"),
         OptionStruct("int", "profile", "--run-config-search-min-instance-count", None, "2", "1"),
         OptionStruct("int", "profile", "--run-config-search-max-instance-count", None, "10", "5"),
+        OptionStruct("int", "profile", "--run-config-search-max-binary-search-steps", None, "10", "5"),
         OptionStruct("float", "profile", "--monitoring-interval", "-i", "10.0", "1.0"),
         OptionStruct("float", "profile", "--perf-analyzer-cpu-util", None, "10.0", str(psutil.cpu_count() * 80.0)),
         OptionStruct("int", "profile", "--num-configs-per-model", None, "10", "3"),
diff --git a/tests/test_concurrency_search.py b/tests/test_concurrency_search.py