Skip to content

Commit 650d118

Browse files
authored
Periodic checkpointing in MA (#710)
* Initial test * simple tests modification * Capture test.log * Captureing test log * Captuering test log * Capture test log * Capture test logs * Typo correction * Collecting summary reports and checkpoints * Create export_path and checkpoints directories * L0_config file modifications * Test L0_config * undo L0_config * Undo L0_config * Modified L0_unit_tests * Update tests * Errors fix * L0_profile_quick and L0_results modifications * fix errors * L0_profile_request_rate and L0_quick_search_multi_model modifications * Error fixes * Common function for creating results paths * common function utilization * stability scripts * Quick search script * Modifications * modifications * fix errors * Fix errors * Modifications * Modify L0_state_management * Modifications * Fix errors * Fix errors * Fix errors * Fix errors * Fix errors * Fix errors * Fix errors * Fix errors * Fix unit tests error * Fix errors * Remove uaage * Periodic Checkpointing * Fix L0 state_management * Fix L0_state_management * Fix L0_state_management * Fix L0_state_management * Fix L0_state_management * Modify comments
1 parent 9945cd8 commit 650d118

File tree

4 files changed

+12
-10
lines changed

4 files changed

+12
-10
lines changed

model_analyzer/model_manager.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ def run_models(self, models: List[ConfigModelProfileSpec]) -> None:
143143
model_config_weights=weightings)
144144

145145
rcg.set_last_results([measurement])
146+
self._state_manager.save_checkpoint()
146147

147148
self._metrics_manager.finalize()
148149

model_analyzer/state/analyzer_state_manager.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,6 @@ def save_checkpoint(self):
165165
json.dump(self._current_state, f, default=self.default_encode)
166166
logger.info(f"Saved checkpoint to {ckpt_filename}")
167167

168-
self._checkpoint_index += 1
169168
self._state_changed = False
170169
else:
171170
logger.info(

qa/L0_state_management/check_results.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def check_num_checkpoints(self):
4646
"""
4747

4848
checkpoint_files = os.listdir(self._checkpoint_dir)
49-
return len(checkpoint_files) == len(self._profile_models)
49+
return len(checkpoint_files) == 1
5050

5151
def check_loading_checkpoints(self):
5252
"""
@@ -74,7 +74,7 @@ def check_interrupt_handling(self):
7474
"""
7575

7676
checkpoint_files = os.listdir(self._checkpoint_dir)
77-
if len(checkpoint_files) != 2:
77+
if len(checkpoint_files) != 1:
7878
return False
7979

8080
with open(self._analyzer_log, 'r') as f:
@@ -85,8 +85,8 @@ def check_interrupt_handling(self):
8585
if log_contents.find(token) == -1:
8686
return False
8787

88-
# check that 2nd model is profiled once
89-
token = f"Profiling {self._profile_models[1]}"
88+
# check that 1st model is profiled twice
89+
token = f"Profiling {self._profile_models[0]}"
9090
token_idx = 0
9191
found_count = 0
9292
while True:
@@ -95,7 +95,7 @@ def check_interrupt_handling(self):
9595
break
9696
found_count += 1
9797

98-
return found_count == 1
98+
return found_count == 2
9999

100100
def check_early_exit(self):
101101
"""
@@ -117,7 +117,7 @@ def check_early_exit(self):
117117
return True
118118

119119
def check_continue_after_checkpoint(self,
120-
expected_resnet_count=3,
120+
expected_resnet_count=4,
121121
expected_vgg_count=2):
122122
"""
123123
Check that the 2nd model onwards have been run the correct
@@ -136,8 +136,6 @@ def check_continue_after_checkpoint(self,
136136

137137
# resnet50 libtorch normally has 4 runs:
138138
# ([2 models, one of which is default] x [2 concurrencies])
139-
# but 1 was checkpointed from the previous interrupted run, so it
140-
# will do the remaining 3
141139
#
142140
# vgg19 will have 2 runs:
143141
# ([2 models, one of which is default] x [1 concurrency])

qa/L0_state_management/test.sh

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,11 @@ else
9292
fi
9393
set -e
9494

95-
# TEST CASE: run config multple and send SIGINT after 2 models run
95+
# TEST CASE: run config-multi.yml (first model contains two configurations) and
96+
# wait for the first checkpoint file to be generated before sending a SIGINT signal.
97+
# The above checkpoint file ensures the first config of the first model is profiled.
98+
# This interruption ensures the completion of the second configuration for the first model.
99+
96100
TEST_NAME="interrupt_handling"
97101

98102
# Create new EXPORT_PATH, CHECKPOINT_DIRECTORY and ANALYZER_LOG

0 commit comments

Comments
 (0)