Periodic checkpointing in MA (#710)

pskiran1 · web-flow · commit 650d118aaf0c · 2023-06-13T08:42:12.000-07:00
* Initial test

* simple tests modification

* Capture test.log

* Capturing test log

* Capturing test log

* Capture test log

* Capture test logs

* Typo correction

* Collecting summary reports and checkpoints

* Create export_path and checkpoints directories

* L0_config file modifications

* Test L0_config

* undo L0_config

* Undo L0_config

* Modified L0_unit_tests

* Update tests

* Errors fix

* L0_profile_quick and L0_results modifications

* fix errors

* L0_profile_request_rate and L0_quick_search_multi_model modifications

* Error fixes

* Common function for creating results paths

* common function utilization

* stability scripts

* Quick search script

* Modifications

* modifications

* fix errors

* Fix errors

* Modifications

* Modify L0_state_management

* Modifications

* Fix errors

* Fix errors

* Fix errors

* Fix errors

* Fix errors

* Fix errors

* Fix errors

* Fix errors

* Fix unit tests error

* Fix errors

* Remove usage

* Periodic Checkpointing

* Fix L0 state_management

* Fix L0_state_management

* Fix L0_state_management

* Fix L0_state_management

* Fix L0_state_management

* Modify comments
diff --git a/model_analyzer/model_manager.py b/model_analyzer/model_manager.py
@@ -143,6 +143,7 @@ def run_models(self, models: List[ConfigModelProfileSpec]) -> None:
                     model_config_weights=weightings)
 
             rcg.set_last_results([measurement])
+            self._state_manager.save_checkpoint()
 
         self._metrics_manager.finalize()
 
diff --git a/model_analyzer/state/analyzer_state_manager.py b/model_analyzer/state/analyzer_state_manager.py
@@ -165,7 +165,6 @@ def save_checkpoint(self):
                 json.dump(self._current_state, f, default=self.default_encode)
             logger.info(f"Saved checkpoint to {ckpt_filename}")
 
-            self._checkpoint_index += 1
             self._state_changed = False
         else:
             logger.info(
diff --git a/qa/L0_state_management/check_results.py b/qa/L0_state_management/check_results.py
@@ -46,7 +46,7 @@ def check_num_checkpoints(self):
         """
 
         checkpoint_files = os.listdir(self._checkpoint_dir)
-        return len(checkpoint_files) == len(self._profile_models)
+        return len(checkpoint_files) == 1
 
     def check_loading_checkpoints(self):
         """
@@ -74,7 +74,7 @@ def check_interrupt_handling(self):
         """
 
         checkpoint_files = os.listdir(self._checkpoint_dir)
-        if len(checkpoint_files) != 2:
+        if len(checkpoint_files) != 1:
             return False
 
         with open(self._analyzer_log, 'r') as f:
@@ -85,8 +85,8 @@ def check_interrupt_handling(self):
         if log_contents.find(token) == -1:
             return False
 
-        # check that 2nd model is profiled once
-        token = f"Profiling {self._profile_models[1]}"
+        # check that 1st model is profiled twice
+        token = f"Profiling {self._profile_models[0]}"
         token_idx = 0
         found_count = 0
         while True:
@@ -95,7 +95,7 @@ def check_interrupt_handling(self):
                 break
             found_count += 1
 
-        return found_count == 1
+        return found_count == 2
 
     def check_early_exit(self):
         """
@@ -117,7 +117,7 @@ def check_early_exit(self):
         return True
 
     def check_continue_after_checkpoint(self,
-                                        expected_resnet_count=3,
+                                        expected_resnet_count=4,
                                         expected_vgg_count=2):
         """
         Check that the 2nd model onwards have been run the correct
@@ -136,8 +136,6 @@ def check_continue_after_checkpoint(self,
 
         # resnet50 libtorch normally has 4 runs:
         #   ([2 models, one of which is default] x [2 concurrencies])
-        # but 1 was checkpointed from the previous interrupted run, so it
-        # will do the remaining 3
         #
         # vgg19 will have 2 runs:
         #   ([2 models, one of which is default] x [1 concurrency])
diff --git a/qa/L0_state_management/test.sh b/qa/L0_state_management/test.sh
@@ -92,7 +92,11 @@ else
 fi
 set -e
 
-# TEST CASE: run config multple and send SIGINT after 2 models run
+# TEST CASE: run config-multi.yml (first model contains two configurations) and
+# wait for the first checkpoint file to be generated before sending a SIGINT signal.
+# The above checkpoint file ensures the first config of the first model is profiled.
+# This interruption ensures the completion of the second configuration for the first model.
+
 TEST_NAME="interrupt_handling"
 
 # Create new EXPORT_PATH, CHECKPOINT_DIRECTORY and ANALYZER_LOG