New config/CLI options for request-rate-range (#665)

nv-braf · tgerdesnv · commit 985fc23bbd64 · 2023-05-01T09:56:03.000-05:00
* Added config options for request-rate-range (RRR)

* Unit testing

* Adding in missing request-rate-range
diff --git a/docs/config.md b/docs/config.md
@@ -110,6 +110,9 @@ bls_composing_models: <comma-delimited-string-list>
 # Batch size values to be used
 [ batch_sizes: <comma-delimited-string|list|range> | default: 1 ]
 
+# Request rate values to be used
+[ request_rate_range: <comma-delimited-string|list|range> ]
+
 # Specifies the maximum number of retries for any retry attempt
 [ client_max_retries: <int> | default: 50 ]
 
@@ -185,30 +188,39 @@ bls_composing_models: <comma-delimited-string-list>
 # Search mode. Options are "brute" and "quick"
 [ run_config_search_mode: <string> | default: brute]
 
-# Minimum concurrency used for the automatic config search
+# Minimum concurrency used for the automatic/quick config search
 [ run_config_search_min_concurrency: <int> | default: 1 ]
 
-# Maximum concurrency used for the automatic config search
+# Maximum concurrency used for the automatic/quick config search
 [ run_config_search_max_concurrency: <int> | default: 1024 ]
 
-# Minimum max_batch_size used for the automatic config search
+# Minimum max_batch_size used for the automatic/quick config search
 [ run_config_search_min_model_batch_size: <int> | default: 1 ]
 
-# Maximum max_batch_size used for the automatic config search
+# Maximum max_batch_size used for the automatic/quick config search
 [ run_config_search_max_model_batch_size: <int> | default: 128 ]
 
-# Minimum instance group count used for the automatic config search
+# Minimum instance group count used for the automatic/quick config search
 [ run_config_search_min_instance_count: <int> | default: 1 ]
 
-# Maximum instance group count used for the automatic config search
+# Maximum instance group count used for the automatic/quick config search
 [ run_config_search_max_instance_count: <int> | default: 5 ]
 
+# Minimum request rate used for the automatic/quick config search
+[ run_config_search_min_request_rate_range: <int> | default: 1 ]
+
+# Maximum request rate used for the automatic/quick config search
+[ run_config_search_max_request_rate_range: <int> | default: 1024 ]
+
 # Disables automatic config search
 [ run_config_search_disable: <bool> | default: false ]
 
 # Enables the profiling of all supplied models concurrently
 [ run_config_profile_models_concurrently_enable: <bool> | default: false]
 
+# Enables searching over request rate values (instead of concurrency)
+[ request_rate_range_search_enable: <bool> | default: false]
+
 # Skips the generation of summary reports and tables
 [ skip_summary_reports: <bool> | default: false]
 
diff --git a/model_analyzer/config/input/config_command.py b/model_analyzer/config/input/config_command.py
@@ -120,6 +120,8 @@ def _check_for_illegal_config_settings(
         self._check_for_multi_model_incompatibility(args, yaml_config)
         self._check_for_quick_search_incompatibility(args, yaml_config)
         self._check_for_bls_incompatibility(args, yaml_config)
+        self._check_for_concurrency_rate_request_conflicts(args, yaml_config)
+        self._check_for_config_search_rate_request_conflicts(args, yaml_config)
 
     def _set_field_values(self, args: Namespace,
                           yaml_config: Optional[Dict[str, List]]) -> None:
@@ -288,6 +290,63 @@ def _check_no_concurrent_search(
                 '\nPlease remove `--run-config-profile-models-concurrently-enable from the config/CLI.'
             )
 
+    def _check_for_concurrency_rate_request_conflicts(
+            self, args: Namespace, yaml_config: Optional[Dict[str,
+                                                              List]]) -> None:
+        if self._get_config_value('concurrency', args, yaml_config):
+            if self._get_config_value('request_rate_range_search_enable', args,
+                                      yaml_config):
+                raise TritonModelAnalyzerException(
+                    f'\nCannot have both `request-rate-range-search-enable` and `concurrency` specified in the config/CLI.'
+                )
+            elif self._get_config_value('request_rate_range', args,
+                                        yaml_config):
+                raise TritonModelAnalyzerException(
+                    f'\nCannot have both `request-rate-range` and `concurrency` specified in the config/CLI.'
+                )
+            elif self._get_config_value(
+                    'run_config_search_min_request_rate_range', args,
+                    yaml_config):
+                raise TritonModelAnalyzerException(
+                    f'\nCannot have both `run-config-search-min-request-rate-range` and `concurrency` specified in the config/CLI.'
+                )
+            elif self._get_config_value(
+                    'run_config_search_max_request_rate_range', args,
+                    yaml_config):
+                raise TritonModelAnalyzerException(
+                    f'\nCannot have both `run-config-search-max-request-rate-range` and `concurrency` specified in the config/CLI.'
+                )
+
+    def _check_for_config_search_rate_request_conflicts(
+            self, args: Namespace, yaml_config: Optional[Dict[str,
+                                                              List]]) -> None:
+        if self._get_config_value('run_config_search_max_concurrency', args,
+                                  yaml_config) or self._get_config_value(
+                                      'run_config_search_min_concurrency', args,
+                                      yaml_config):
+            if self._get_config_value('request_rate_range_search_enable', args,
+                                      yaml_config):
+                raise TritonModelAnalyzerException(
+                    f'\nCannot have both `request-rate-range-search-enable` and `run-config-search-min/max-concurrency` specified in the config/CLI.'
+                )
+            elif self._get_config_value('request_rate_range', args,
+                                        yaml_config):
+                raise TritonModelAnalyzerException(
+                    f'\nCannot have both `request-rate-range` and `run-config-search-min/max-concurrency` specified in the config/CLI.'
+                )
+            elif self._get_config_value(
+                    'run_config_search_min_request_rate_range', args,
+                    yaml_config):
+                raise TritonModelAnalyzerException(
+                    f'\nCannot have both `run-config-search-min-request-rate-range` and `run-config-search-min/max-concurrency` specified in the config/CLI.'
+                )
+            elif self._get_config_value(
+                    'run_config_search_max_request_rate_range', args,
+                    yaml_config):
+                raise TritonModelAnalyzerException(
+                    f'\nCannot have both `run-config-search-max-request-rate-range` and `run-config-search-min/max-concurrency` specified in the config/CLI.'
+                )
+
     def _preprocess_and_verify_arguments(self):
         """
         Enforces some rules on the config.
diff --git a/model_analyzer/config/input/config_command_profile.py b/model_analyzer/config/input/config_command_profile.py
@@ -34,7 +34,9 @@
     DEFAULT_OUTPUT_MODEL_REPOSITORY, DEFAULT_OVERRIDE_OUTPUT_REPOSITORY_FLAG, \
     DEFAULT_PERF_ANALYZER_CPU_UTIL, DEFAULT_PERF_ANALYZER_PATH, DEFAULT_PERF_MAX_AUTO_ADJUSTS, \
     DEFAULT_PERF_OUTPUT_FLAG, DEFAULT_RUN_CONFIG_MAX_CONCURRENCY, DEFAULT_RUN_CONFIG_MIN_CONCURRENCY, \
+    DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE_RANGE, DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE_RANGE, \
     DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE, DEFAULT_RUN_CONFIG_SEARCH_MODE, \
+    DEFAULT_REQUEST_RATE_RANGE_SEARCH_ENABLE, \
     DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT, DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT, \
     DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE, DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE, \
     DEFAULT_RUN_CONFIG_SEARCH_DISABLE, DEFAULT_TRITON_DOCKER_IMAGE, DEFAULT_TRITON_GRPC_ENDPOINT, \
@@ -440,6 +442,14 @@ def _add_profile_models_configs(self):
                 description=
                 "Comma-delimited list of concurrency values or ranges <start:end:step>"
                 " to be used during profiling"))
+        self._add_config(
+            ConfigField(
+                'request_rate_range',
+                flags=['-rrr', '--request-rate-range'],
+                field_type=ConfigListNumeric(int),
+                description=
+                "Comma-delimited list of request rate range values or ranges <start:end:step>"
+                " to be used during profiling"))
         self._add_config(
             ConfigField(
                 'reload_model_disable',
@@ -509,7 +519,7 @@ def _add_run_search_configs(self):
                 default_value=False,
                 flags=['--early-exit-enable'],
                 description=
-                'Flag to indicate if Model Analyzer can skip some configurations when manually searching concurrency or max_batch_size'
+                'Flag to indicate if Model Analyzer can skip some configurations when manually searching concurrency/request rate range,  or max_batch_size'
             ))
         self._add_config(
             ConfigField(
@@ -529,6 +539,24 @@ def _add_run_search_configs(self):
                 description=
                 "Min concurrency value that run config search should start with."
             ))
+        self._add_config(
+            ConfigField(
+                'run_config_search_max_request_rate_range',
+                flags=['--run-config-search-max-request-rate-range'],
+                field_type=ConfigPrimitive(int),
+                default_value=DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE_RANGE,
+                description=
+                "Max request rate range value that run config search should not go beyond that."
+            ))
+        self._add_config(
+            ConfigField(
+                'run_config_search_min_request_rate_range',
+                flags=['--run-config-search-min-request-rate-range'],
+                field_type=ConfigPrimitive(int),
+                default_value=DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE_RANGE,
+                description=
+                "Min request rate range value that run config search should start with."
+            ))
         self._add_config(
             ConfigField(
                 'run_config_search_max_instance_count',
@@ -595,6 +623,16 @@ def _add_run_search_configs(self):
                 DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE,
                 description=
                 "Enable the profiling of all supplied models concurrently."))
+        self._add_config(
+            ConfigField(
+                'request_rate_range_search_enable',
+                flags=['--request-rate-range-search-enable'],
+                field_type=ConfigPrimitive(bool),
+                parser_args={'action': 'store_true'},
+                default_value=DEFAULT_REQUEST_RATE_RANGE_SEARCH_ENABLE,
+                description=
+                "Enables the searching of request rate range (instead of concurrency)."
+            ))
 
     def _add_triton_configs(self):
         """
diff --git a/model_analyzer/config/input/config_defaults.py b/model_analyzer/config/input/config_defaults.py
@@ -41,13 +41,16 @@
 DEFAULT_CLIENT_PROTOCOL = 'grpc'
 DEFAULT_RUN_CONFIG_MAX_CONCURRENCY = 1024
 DEFAULT_RUN_CONFIG_MIN_CONCURRENCY = 1
+DEFAULT_RUN_CONFIG_MAX_REQUEST_RATE_RANGE = 1024
+DEFAULT_RUN_CONFIG_MIN_REQUEST_RATE_RANGE = 1
 DEFAULT_RUN_CONFIG_MAX_INSTANCE_COUNT = 5
 DEFAULT_RUN_CONFIG_MIN_INSTANCE_COUNT = 1
 DEFAULT_RUN_CONFIG_MIN_MODEL_BATCH_SIZE = 1
 DEFAULT_RUN_CONFIG_MAX_MODEL_BATCH_SIZE = 128
 DEFAULT_RUN_CONFIG_SEARCH_DISABLE = False
 DEFAULT_RUN_CONFIG_SEARCH_MODE = 'brute'
 DEFAULT_RUN_CONFIG_PROFILE_MODELS_CONCURRENTLY_ENABLE = False
+DEFAULT_REQUEST_RATE_RANGE_SEARCH_ENABLE = False
 DEFAULT_TRITON_LAUNCH_MODE = 'local'
 DEFAULT_TRITON_DOCKER_IMAGE = 'nvcr.io/nvidia/tritonserver:23.04-py3'
 DEFAULT_TRITON_HTTP_ENDPOINT = 'localhost:8000'
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -63,6 +63,7 @@ def get_test_options():
         OptionStruct("bool", "profile","--perf-output"),
         OptionStruct("bool", "profile","--run-config-search-disable"),
         OptionStruct("bool", "profile","--run-config-profile-models-concurrently-enable"),
+        OptionStruct("bool", "profile","--request-rate-range-search-enable"),
         OptionStruct("bool", "profile","--reload-model-disable"),
         OptionStruct("bool", "profile","--early-exit-enable"),
         OptionStruct("bool", "profile","--skip-summary-reports"),
@@ -78,6 +79,8 @@ def get_test_options():
         OptionStruct("int", "profile", "--perf-analyzer-max-auto-adjusts", None, "100", "10"),
         OptionStruct("int", "profile", "--run-config-search-min-concurrency", None, "2", "1"),
         OptionStruct("int", "profile", "--run-config-search-max-concurrency", None, "100", "1024"),
+        OptionStruct("int", "profile", "--run-config-search-min-request-rate-range", None, "2", "1"),
+        OptionStruct("int", "profile", "--run-config-search-max-request-rate-range", None, "100", "1024"),
         OptionStruct("int", "profile", "--run-config-search-min-model-batch-size", None, "100", "1"),
         OptionStruct("int", "profile", "--run-config-search-max-model-batch-size", None, "100", "128"),
         OptionStruct("int", "profile", "--run-config-search-min-instance-count", None, "2", "1"),
@@ -132,6 +135,7 @@ def get_test_options():
         #   expected_default_value
         OptionStruct("intlist", "profile", "--batch-sizes", "-b", "2, 4, 6", "1"),
         OptionStruct("intlist", "profile", "--concurrency", "-c", "1, 2, 3", None),
+        OptionStruct("intlist", "profile", "--request-rate-range", "-rrr", "1, 2, 3", None),
         OptionStruct("stringlist", "profile", "--triton-docker-mounts", None, "a:b:c, d:e:f", None, extra_commands=["--triton-launch-mode", "docker"]),
         OptionStruct("stringlist", "profile", "--gpus", None, "a, b, c", "all"),
         OptionStruct("stringlist", "profile", "--inference-output-fields", None, "a, b, c",
diff --git a/tests/test_config.py b/tests/test_config.py
@@ -18,7 +18,7 @@
 from .mocks.mock_numba import MockNumba
 from .mocks.mock_os import MockOSMethods
 
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Any
 from argparse import Namespace
 
 from .common import test_result_collector as trc
@@ -1980,6 +1980,63 @@ def test_bls_illegal_config_combinations(self):
         with self.assertRaises(TritonModelAnalyzerException):
             self._evaluate_config(args, yaml_content)
 
+    def test_concurrency_rate_request_config_combinations(self):
+        """
+        Test for concurrency with rate request conflicts
+        """
+        base_args = [
+            'model-analyzer', 'profile', '--model-repository', 'cli-repository',
+            '--profile-models', 'modelA', '-c', '1,2,3'
+        ]
+        yaml_content = ''
+
+        self._test_request_rate_config_conflicts(base_args, yaml_content)
+
+    def test_config_search_min_rate_request_config_combinations(self):
+        """
+        Test for concurrency min request with rate request conflicts
+        """
+        base_args = [
+            'model-analyzer', 'profile', '--model-repository', 'cli-repository',
+            '--profile-models', 'modelA', '--run-config-search-min-concurrency',
+            '1'
+        ]
+        yaml_content = ''
+
+        self._test_request_rate_config_conflicts(base_args, yaml_content)
+
+    def test_config_search_max_rate_request_config_combinations(self):
+        """
+        Test for concurrency max request with rate request conflicts
+        """
+        base_args = [
+            'model-analyzer', 'profile', '--model-repository', 'cli-repository',
+            '--profile-models', 'modelA', '--run-config-search-max-concurrency',
+            '1'
+        ]
+        yaml_content = ''
+
+        self._test_request_rate_config_conflicts(base_args, yaml_content)
+
+    def _test_request_rate_config_conflicts(self, base_args: List[Any],
+                                            yaml_content: str) -> None:
+        self._test_arg_conflict(base_args, yaml_content,
+                                ['--request-rate-range-search-enable'])
+        self._test_arg_conflict(base_args, yaml_content,
+                                ['--request-rate-range', '1,2,3'])
+        self._test_arg_conflict(
+            base_args, yaml_content,
+            ['--run-config-search-min-request-rate-range', '1'])
+        self._test_arg_conflict(
+            base_args, yaml_content,
+            ['--run-config-search-max-request-rate-range', '1'])
+
+    def _test_arg_conflict(self, base_args: List[Any], yaml_content: str,
+                           new_args: List[Any]) -> None:
+        args = base_args.copy() + new_args
+        with self.assertRaises(TritonModelAnalyzerException):
+            self._evaluate_config(args, yaml_content)
+
 
 if __name__ == '__main__':
     unittest.main()