Skip to content

Commit 265cbae

Browse files
authored
Add Optuna Seed to Checkpoint (#913)
* Adding seed * Adding checkpointing of seed to optuna * Fixing issue if checkpoint doesn't contain optuna seed
1 parent 020ecb6 commit 265cbae

File tree

6 files changed

+62
-14
lines changed

6 files changed

+62
-14
lines changed

model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
from model_analyzer.result.parameter_search import ParameterSearch
3333
from model_analyzer.result.result_manager import ResultManager
3434
from model_analyzer.result.run_config_measurement import RunConfigMeasurement
35+
from model_analyzer.state.analyzer_state_manager import AnalyzerStateManager
3536

3637
from .config_generator_interface import ConfigGeneratorInterface
3738

@@ -48,6 +49,7 @@ class OptunaPlusConcurrencySweepRunConfigGenerator(ConfigGeneratorInterface):
4849
def __init__(
4950
self,
5051
config: ConfigCommandProfile,
52+
state_manager: AnalyzerStateManager,
5153
gpu_count: int,
5254
models: List[ModelProfileSpec],
5355
composing_models: List[ModelProfileSpec],
@@ -61,6 +63,8 @@ def __init__(
6163
----------
6264
config: ConfigCommandProfile
6365
Profile configuration information
66+
state_manager: AnalyzerStateManager
67+
The object that allows control and update of checkpoint state
6468
gpu_count: Number of gpus in the system
6569
models: List of ModelProfileSpec
6670
List of models to profile
@@ -76,6 +80,7 @@ def __init__(
7680
The object that handles the users configuration search parameters for composing models
7781
"""
7882
self._config = config
83+
self._state_manager = state_manager
7984
self._gpu_count = gpu_count
8085
self._models = models
8186
self._composing_models = composing_models
@@ -123,6 +128,7 @@ def _execute_optuna_search(self) -> Generator[RunConfig, None, None]:
123128
def _create_optuna_run_config_generator(self) -> OptunaRunConfigGenerator:
124129
return OptunaRunConfigGenerator(
125130
config=self._config,
131+
state_manager=self._state_manager,
126132
gpu_count=self._gpu_count,
127133
models=self._models,
128134
composing_models=self._composing_models,

model_analyzer/config/generate/optuna_run_config_generator.py

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
# limitations under the License.
1616

1717
import logging
18+
from random import randint
1819
from sys import maxsize
1920
from typing import Any, Dict, Generator, List, Optional, TypeAlias, Union
2021

@@ -43,6 +44,7 @@
4344
from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException
4445
from model_analyzer.perf_analyzer.perf_config import PerfAnalyzerConfig
4546
from model_analyzer.result.run_config_measurement import RunConfigMeasurement
47+
from model_analyzer.state.analyzer_state_manager import AnalyzerStateManager
4648
from model_analyzer.triton.model.model_config import ModelConfig
4749
from model_analyzer.triton.model.model_config_variant import ModelConfigVariant
4850

@@ -82,19 +84,22 @@ class OptunaRunConfigGenerator(ConfigGeneratorInterface):
8284
def __init__(
8385
self,
8486
config: ConfigCommandProfile,
87+
state_manager: AnalyzerStateManager,
8588
gpu_count: int,
8689
models: List[ModelProfileSpec],
8790
composing_models: List[ModelProfileSpec],
8891
model_variant_name_manager: ModelVariantNameManager,
8992
search_parameters: Dict[str, SearchParameters],
9093
composing_search_parameters: Dict[str, SearchParameters],
91-
seed: Optional[int] = None,
94+
user_seed: Optional[int] = None,
9295
):
9396
"""
9497
Parameters
9598
----------
9699
config: ConfigCommandProfile
97100
Profile configuration information
101+
state_manager: AnalyzerStateManager
102+
The object that allows control and update of checkpoint state
98103
gpu_count: Number of gpus in the system
99104
models: List of ModelProfileSpec
100105
List of models to profile
@@ -105,8 +110,11 @@ def __init__(
105110
The object that handles the users configuration search parameters
106111
composing_search_parameters: SearchParameters
107112
The object that handles the users configuration search parameters for composing models
113+
user_seed: int
114+
The seed to use. If not provided, one will be generated (fresh run) or read from checkpoint
108115
"""
109116
self._config = config
117+
self._state_manager = state_manager
110118
self._gpu_count = gpu_count
111119
self._models = models
112120
self._composing_models = composing_models
@@ -132,10 +140,9 @@ def __init__(
132140

133141
self._done = False
134142

135-
if seed is not None:
136-
self._sampler = optuna.samplers.TPESampler(seed=seed)
137-
else:
138-
self._sampler = optuna.samplers.TPESampler()
143+
self._seed = self._create_seed(user_seed)
144+
145+
self._sampler = optuna.samplers.TPESampler(seed=self._seed)
139146

140147
self._study_name = ",".join([model.model_name() for model in self._models])
141148

@@ -145,6 +152,24 @@ def __init__(
145152
sampler=self._sampler,
146153
)
147154

155+
self._init_state()
156+
157+
def _get_seed(self) -> int:
158+
return self._state_manager.get_state_variable("OptunaRunConfigGenerator.seed")
159+
160+
def _create_seed(self, user_seed: Optional[int]) -> int:
161+
if self._state_manager.starting_fresh_run():
162+
seed = randint(0, 10000) if user_seed is None else user_seed
163+
else:
164+
seed = self._get_seed() if user_seed is None else user_seed
165+
166+
return seed
167+
168+
def _init_state(self) -> None:
169+
self._state_manager.set_state_variable(
170+
"OptunaRunConfigGenerator.seed", self._seed
171+
)
172+
148173
def _is_done(self) -> bool:
149174
return self._done
150175

model_analyzer/config/generate/run_config_generator_factory.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from model_analyzer.device.gpu_device import GPUDevice
3030
from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException
3131
from model_analyzer.result.result_manager import ResultManager
32+
from model_analyzer.state.analyzer_state_manager import AnalyzerStateManager
3233
from model_analyzer.triton.client.client import TritonClient
3334
from model_analyzer.triton.model.model_config import ModelConfig
3435

@@ -55,6 +56,7 @@ class RunConfigGeneratorFactory:
5556
@staticmethod
5657
def create_run_config_generator(
5758
command_config: ConfigCommandProfile,
59+
state_manager: AnalyzerStateManager,
5860
gpus: List[GPUDevice],
5961
models: List[ConfigModelProfileSpec],
6062
client: TritonClient,
@@ -68,6 +70,8 @@ def create_run_config_generator(
6870
----------
6971
command_config: ConfigCommandProfile
7072
The Model Analyzer config file for the profile step
73+
state_manager: AnalyzerStateManager
74+
The object that allows control and update of checkpoint state
7175
gpus: List of GPUDevices
7276
models: list of ConfigModelProfileSpec
7377
The models to generate RunConfigs for
@@ -107,6 +111,7 @@ def create_run_config_generator(
107111
if command_config.run_config_search_mode == "optuna":
108112
return RunConfigGeneratorFactory._create_optuna_plus_concurrency_sweep_run_config_generator(
109113
command_config=command_config,
114+
state_manager=state_manager,
110115
gpu_count=len(gpus),
111116
models=new_models,
112117
composing_models=composing_models,
@@ -159,6 +164,7 @@ def _create_brute_plus_binary_parameter_search_run_config_generator(
159164
@staticmethod
160165
def _create_optuna_plus_concurrency_sweep_run_config_generator(
161166
command_config: ConfigCommandProfile,
167+
state_manager: AnalyzerStateManager,
162168
gpu_count: int,
163169
models: List[ModelProfileSpec],
164170
composing_models: List[ModelProfileSpec],
@@ -169,6 +175,7 @@ def _create_optuna_plus_concurrency_sweep_run_config_generator(
169175
) -> ConfigGeneratorInterface:
170176
return OptunaPlusConcurrencySweepRunConfigGenerator(
171177
config=command_config,
178+
state_manager=state_manager,
172179
gpu_count=gpu_count,
173180
composing_models=composing_models,
174181
models=models,

model_analyzer/model_manager.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ def run_models(self, models: List[ConfigModelProfileSpec]) -> None:
136136

137137
rcg = RunConfigGeneratorFactory.create_run_config_generator(
138138
command_config=self._config,
139+
state_manager=self._state_manager,
139140
gpus=self._gpus,
140141
models=models,
141142
client=self._client,

model_analyzer/state/analyzer_state.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,11 @@ def from_dict(cls, state_dict):
5959
# GPU data
6060
state._state_dict["MetricsManager.gpus"] = state_dict["MetricsManager.gpus"]
6161

62+
# Optuna Seed
63+
state._state_dict["OptunaRunConfigGenerator.seed"] = state_dict.get(
64+
"OptunaRunConfigGenerator.seed", 0
65+
)
66+
6267
return state
6368

6469
def get(self, name):

tests/test_optuna_run_config_generator.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -68,13 +68,14 @@ def setUp(self):
6868

6969
self._rcg = OptunaRunConfigGenerator(
7070
config=config,
71+
state_manager=MagicMock(),
7172
gpu_count=1,
7273
models=self._mock_models,
7374
composing_models=[],
7475
model_variant_name_manager=ModelVariantNameManager(),
7576
search_parameters={"add_sub": search_parameters},
7677
composing_search_parameters={},
77-
seed=100,
78+
user_seed=100,
7879
)
7980

8081
def test_max_number_of_configs_to_search_percentage(self):
@@ -204,7 +205,7 @@ def test_create_objective_based_run_config(self):
204205

205206
self.assertEqual(model_config.to_dict()["name"], self._test_config_dict["name"])
206207

207-
# These values are the result of using a fixed seed of 100
208+
# These values are the result of using a fixed user_seed of 100
208209
self.assertEqual(model_config.to_dict()["maxBatchSize"], 16)
209210
self.assertEqual(model_config.to_dict()["instanceGroup"][0]["count"], 2)
210211
self.assertEqual(
@@ -232,13 +233,14 @@ def test_create_run_config_with_concurrency_formula(self):
232233

233234
rcg = OptunaRunConfigGenerator(
234235
config=config,
236+
state_manager=MagicMock(),
235237
gpu_count=1,
236238
models=self._mock_models,
237239
composing_models=[],
238240
model_variant_name_manager=ModelVariantNameManager(),
239241
search_parameters={"add_sub": search_parameters},
240242
composing_search_parameters={},
241-
seed=100,
243+
user_seed=100,
242244
)
243245

244246
trial = rcg._study.ask()
@@ -250,7 +252,7 @@ def test_create_run_config_with_concurrency_formula(self):
250252

251253
self.assertEqual(model_config.to_dict()["name"], self._test_config_dict["name"])
252254

253-
# These values are the result of using a fixed seed of 100
255+
# These values are the result of using a fixed user_seed of 100
254256
self.assertEqual(model_config.to_dict()["maxBatchSize"], 16)
255257
self.assertEqual(model_config.to_dict()["instanceGroup"][0]["count"], 2)
256258
self.assertEqual(
@@ -291,6 +293,7 @@ def test_create_run_bls_config(self):
291293
)
292294
rcg = OptunaRunConfigGenerator(
293295
config=config,
296+
state_manager=MagicMock(),
294297
gpu_count=1,
295298
models=[bls_model],
296299
composing_models=[add_model, sub_model],
@@ -300,7 +303,7 @@ def test_create_run_bls_config(self):
300303
"add": add_search_parameters,
301304
"sub": sub_search_parameters,
302305
},
303-
seed=100,
306+
user_seed=100,
304307
)
305308

306309
trial = rcg._study.ask()
@@ -315,7 +318,7 @@ def test_create_run_bls_config(self):
315318
sub_model_config = run_config.model_run_configs()[0].composing_configs()[1]
316319
perf_config = run_config.model_run_configs()[0].perf_config()
317320

318-
# BLS (Top Level Model) + PA Config (Seed=100)
321+
# BLS (Top Level Model) + PA Config (user_seed=100)
319322
# =====================================================================
320323
self.assertEqual(bls_model_config.to_dict()["name"], "bls")
321324
self.assertEqual(bls_model_config.to_dict()["instanceGroup"][0]["count"], 3)
@@ -364,6 +367,7 @@ def test_create_run_multi_model_config(self):
364367
)
365368
rcg = OptunaRunConfigGenerator(
366369
config=config,
370+
state_manager=MagicMock(),
367371
gpu_count=1,
368372
models=[add_model, vgg_model],
369373
composing_models=[],
@@ -373,7 +377,7 @@ def test_create_run_multi_model_config(self):
373377
"vgg19_libtorch": vgg_search_parameters,
374378
},
375379
composing_search_parameters={},
376-
seed=100,
380+
user_seed=100,
377381
)
378382

379383
trial = rcg._study.ask()
@@ -388,7 +392,7 @@ def test_create_run_multi_model_config(self):
388392
add_perf_config = run_config.model_run_configs()[0].perf_config()
389393
vgg_perf_config = run_config.model_run_configs()[0].perf_config()
390394

391-
# ADD_SUB + PA Config (Seed=100)
395+
# ADD_SUB + PA Config (user_seed=100)
392396
# =====================================================================
393397
self.assertEqual(add_model_config.to_dict()["name"], "add_sub")
394398
self.assertEqual(add_model_config.to_dict()["maxBatchSize"], 16)
@@ -400,7 +404,7 @@ def test_create_run_multi_model_config(self):
400404
self.assertEqual(add_perf_config["batch-size"], DEFAULT_BATCH_SIZES)
401405
self.assertEqual(add_perf_config["concurrency-range"], 16)
402406

403-
# VGG19_LIBTORCH + PA Config (Seed=100)
407+
# VGG19_LIBTORCH + PA Config (user_seed=100)
404408
# =====================================================================
405409
self.assertEqual(vgg_model_config.to_dict()["name"], "vgg19_libtorch")
406410
self.assertEqual(vgg_model_config.to_dict()["instanceGroup"][0]["count"], 4)

0 commit comments

Comments (0)