Add cpu_only option for ensemble composing models (#683)

nv-braf · web-flow · commit ca96f102166f · 2023-05-10T16:11:51.000-07:00
* Adding config option to specify cpu only composing models

* Adding config option to specify cpu only composing models

* Changing composing model to be cpu only w/ unit testing

* Updating documentation

* Remove duplicated test

* Enabling cpu_only for BLS composing models

* Removing ensemble from description
diff --git a/docs/config.md b/docs/config.md
@@ -92,6 +92,9 @@ profile_models: <comma-delimited-string-list>
 # List of composing models for BLS models
 bls_composing_models: <comma-delimited-string-list>
 
+# List of composing models that should be profiled using CPU instances only
+cpu_only_composing_models: <comma-delimited-string-list>
+
 # Full path to directory to which to read and write checkpoints and profile data
 [ checkpoint_directory: <string> | default: './checkpoints' ]
 
@@ -273,6 +276,9 @@ profile_models: <comma-delimited-string-list|list|profile_model>
 # List of composing models for BLS models
 bls_composing_models: <comma-delimited-string-list>
 
+# List of composing models that should be profiled using CPU instances only
+cpu_only_composing_models: <comma-delimited-string-list>
+
 # List of constraints placed on the config search results
 [ constraints: <constraint> ]
 
diff --git a/model_analyzer/config/generate/model_profile_spec.py b/model_analyzer/config/generate/model_profile_spec.py
@@ -34,6 +34,9 @@ def __init__(self, spec: ConfigModelProfileSpec,
         self._default_model_config = ModelConfig.create_model_config_dict(
             config, client, gpus, config.model_repository, spec.model_name())
 
+        if spec.model_name() in config.cpu_only_composing_models:
+            self._cpu_only = True
+
     def get_default_config(self) -> dict:
         """ Returns the default configuration for this model """
         return deepcopy(self._default_model_config)
diff --git a/model_analyzer/config/input/config_command_profile.py b/model_analyzer/config/input/config_command_profile.py
@@ -483,6 +483,14 @@ def _add_profile_models_configs(self):
                                        required=True),
                 default_value=[],
                 description='List of the models to be profiled'))
+        self._add_config(
+            ConfigField(
+                'cpu_only_composing_models',
+                field_type=ConfigListString(),
+                flags=['--cpu-only-composing-models'],
+                description=
+                ("A list of strings representing composing models that should be profiled using CPU instances only. "
+                )))
 
     def _add_client_configs(self):
         """
diff --git a/model_analyzer/model_manager.py b/model_analyzer/model_manager.py
@@ -179,6 +179,11 @@ def _check_for_ensemble_model_incompatibility(
                         )
                     else:
                         self._config.run_config_search_mode = 'quick'
+            elif not self._config.bls_composing_models:
+                if len(self._config.cpu_only_composing_models) > 0:
+                    raise TritonModelAnalyzerException(
+                        f'\nCan only specify --cpu-only-composing-models for ensemble or BLS models.'
+                    )
 
     def _init_state(self):
         """
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -156,6 +156,7 @@ def get_test_options():
         OptionStruct("noop", "profile", "--model-repository"),
         OptionStruct("noop", "profile", "--profile-models"),
         OptionStruct("noop", "profile", "--bls-composing-models"),
+        OptionStruct("noop", "profile", "--cpu-only-composing-models"),
 
         OptionStruct("noop", "report", "--report-model-configs"),
         OptionStruct("noop", "report", "--output-formats", "-o", ["pdf", "csv", "png"], "pdf", "SHOULD_FAIL"),
diff --git a/tests/test_model_manager.py b/tests/test_model_manager.py
@@ -1095,6 +1095,41 @@ def test_ensemble_makes_quick_default(self, *args):
 
         self.assertEqual(config.run_config_search_mode, "quick")
 
+    @patch('model_analyzer.triton.model.model_config.ModelConfig.is_ensemble',
+           return_value=False)
+    def test_cpu_only_composing_models_error(self, *args):
+        """
+        Test that --cpu-only-composing-models raises an error when
+        set for a model that is neither an ensemble nor a BLS model
+        """
+        yaml_str = ("""
+                  profile_models: test_model
+                  """)
+
+        args = self._args.copy()
+        args.append('--cpu-only-composing-models')
+        args.append('composing_modelA,composing_modelB')
+
+        self.mock_model_config = MockModelConfig(self._model_config_protobuf)
+        self.mock_model_config.start()
+        config = evaluate_mock_config(args, yaml_str, subcommand="profile")
+
+        state_manager = AnalyzerStateManager(config, MagicMock())
+        metrics_manager = MetricsManagerSubclass(config, MagicMock(),
+                                                 MagicMock(), MagicMock(),
+                                                 MagicMock(), state_manager)
+        model_manager = ModelManager(config, MagicMock(), MagicMock(),
+                                     MagicMock(), metrics_manager, MagicMock(),
+                                     state_manager, MagicMock())
+
+        # A non-ensemble model with no BLS composing models must reject the option
+        models = [
+            ConfigModelProfileSpec('test_model'),
+        ]
+        with self.assertRaises(TritonModelAnalyzerException):
+            model_manager._check_for_ensemble_model_incompatibility(models)
+        self.mock_model_config.stop()
+
     def _test_model_manager(self, yaml_content, expected_ranges, args=None):
         """ 
         Test helper function that passes the given yaml_str into
diff --git a/tests/test_quick_run_config_generator.py b/tests/test_quick_run_config_generator.py
@@ -899,22 +899,23 @@ def _get_next_run_config_ensemble(self,
         Test that get_next_run_config() creates a proper RunConfig for ensemble
 
         Sets up a case where the coordinate is [1,2,4,5], which corresponds to
-          - composing model 1 max_batch_size = 2
-          - composing model 1 instance_count = 3
-          - composing model 1 concurrency = 2*3*2 = 12
-          - composing model 2 max_batch_size = 16
-          - composing model 2 instance_count = 6
-          - composing model 2 concurrency = 16*6*2 = 192
+          - composing model A max_batch_size = 2
+          - composing model A instance_count = 3
+          - composing model A concurrency = 2*3*2 = 12
+          - composing model B max_batch_size = 16
+          - composing model B instance_count = 6
+          - composing model B concurrency = 16*6*2 = 192
           - ensemble model concurrency = 12 (minimum value of [12, 192])
 
         Also,
-        - sequence batching should be on for model 1
-        - dynamic batching should be on for model 2
+        - sequence batching should be on for model A
+        - dynamic batching should be on for model B
+        - cpu_only should be set for model B
         - existing values from the base model config should persist if they aren't overwritten
         - existing values for perf-analyzer config should persist if they aren't overwritten
         """
 
-        additional_args = []
+        additional_args = ['--cpu-only-composing-models', 'fake_model_B']
         if max_concurrency:
             additional_args.append('--run-config-search-max-concurrency')
             additional_args.append(f'{max_concurrency}')
@@ -923,7 +924,7 @@ def _get_next_run_config_ensemble(self,
             additional_args.append(f'{min_concurrency}')
 
         #yapf: disable
-        expected_model_config0 = {
+        expected_model_A_config_0 = {
             'cpu_only': False,
             'instanceGroup': [{
                 'count': 3,
@@ -939,12 +940,12 @@ def _get_next_run_config_ensemble(self,
             }]
         }
 
-        expected_model_config1 = {
-            'cpu_only': False,
+        expected_model_B_config_0 = {
+            'cpu_only': True,
             'dynamicBatching': {},
             'instanceGroup': [{
                 'count': 6,
-                'kind': 'KIND_GPU',
+                'kind': 'KIND_CPU',
             }],
             'maxBatchSize': 16,
             'name': 'fake_model_B_config_0',
@@ -1004,15 +1005,15 @@ def _get_next_run_config_ensemble(self,
 
         model_config = run_config.model_run_configs()[0].model_config()
         perf_config = run_config.model_run_configs()[0].perf_config()
-        composing_model_config0 = run_config.model_run_configs(
+        composing_model_A_config_0 = run_config.model_run_configs(
         )[0].composing_configs()[0]
-        composing_model_config1 = run_config.model_run_configs(
+        composing_model_B_config_0 = run_config.model_run_configs(
         )[0].composing_configs()[1]
 
-        self.assertEqual(composing_model_config0.to_dict(),
-                         expected_model_config0)
-        self.assertEqual(composing_model_config1.to_dict(),
-                         expected_model_config1)
+        self.assertEqual(composing_model_A_config_0.to_dict(),
+                         expected_model_A_config_0)
+        self.assertEqual(composing_model_B_config_0.to_dict(),
+                         expected_model_B_config_0)
 
         if max_concurrency:
             self.assertEqual(perf_config['concurrency-range'], max_concurrency)