Skip to content

Commit d57c450

Browse files
authored
Merge pull request #238 from david-thrower/237-copy-purge-model-storage-functionality-for-main
237 copy purge model storage functionality for main
2 parents 1ff1e75 + ec9a04c commit d57c450

File tree

5 files changed

+109
-16
lines changed

5 files changed

+109
-16
lines changed

.github/workflows/automerge.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ name: Python application
66
on:
77
push:
88

9-
branches: [ "main", "218-fix-padding-token-mismatch-and-logging-tokenization-metadata" ]
9+
branches: [ "main", "237-copy-purge-model-storage-functionality-for-main" ]
1010

1111

1212
permissions:

cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py

Lines changed: 77 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from multiprocessing import Process, Lock
1515
import os
1616
from gc import collect
17+
from shutil import rmtree
1718

1819

1920
# import optuna
@@ -565,18 +566,31 @@ def run_random_search(self):
565566
print(f"metric_to_rank_by is: '{self.metric_to_rank_by}'")
566567
print(
567568
f"Type of metric_to_rank_by is: {str(type(self.metric_to_rank_by))}")
569+
def has_valid_metric(num):
    """Report whether ``num`` is usable as a numeric metric value.

    A value is considered valid when ``float(num)`` succeeds and the result
    is not NaN. Rejecting NaN keeps this helper consistent with the
    ``pd.to_numeric(..., errors="coerce").isna()`` filter used to select the
    rows that are ranked.

    Params:
        - num: The raw metric cell (number, numeric string, or anything else).

    Returns:
        True if ``num`` converts to a non-NaN float, otherwise False.
    """
    # Local import so this helper does not depend on a file-level import.
    from math import isnan

    try:
        value = float(num)
    except (TypeError, ValueError, OverflowError) as exc:
        # Only conversion failures mean "not a metric"; anything else is a
        # genuine bug and should propagate instead of being swallowed.
        print(exc)
        return False
    return not isnan(value)
576+
# ~ pd.to_numeric(x['a'], errors="coerce").astype(float).isna()
577+
# rows_having_a_valid_metric = oracles[self.metric_to_rank_by].apply(lambda x: has_valid_metric(x))
578+
rows_having_a_valid_metric = ~ pd.to_numeric(oracles[self.metric_to_rank_by], errors="coerce").isna()
579+
oracles_having_valid_metrics = oracles[rows_having_a_valid_metric]
580+
568581
if self.direction == "maximize" or self.direction == "max":
569-
570-
best = float(oracles[oracles[self.metric_to_rank_by]
571-
!= self.metric_to_rank_by]
572-
[self.metric_to_rank_by].astype(float).max())
582+
best = float(oracles_having_valid_metrics[self.metric_to_rank_by].astype(float).max())
583+
# best = float(oracles[oracles[self.metric_to_rank_by]
584+
# != self.metric_to_rank_by]
585+
# [self.metric_to_rank_by].astype(float).max())
573586
else:
574587
print(f"metric_to_rank_by is: '{self.metric_to_rank_by}'")
575588
print(
576589
f"Type of metric_to_rank_by is: {str(type(self.metric_to_rank_by))}")
577-
best = float(oracles[oracles[self.metric_to_rank_by]
578-
!= self.metric_to_rank_by]
579-
[self.metric_to_rank_by].astype(float).min())
590+
best = float(oracles_having_valid_metrics[self.metric_to_rank_by].astype(float).min())
591+
# best = float(oracles[oracles[self.metric_to_rank_by]
592+
# != self.metric_to_rank_by]
593+
# [self.metric_to_rank_by].astype(float).min())
580594
print(f"Best result this trial was: {best}")
581595
print(f"Type of best result: {type(best)}")
582596
self.best_model_path =\
@@ -585,8 +599,63 @@ def run_random_search(self):
585599
print(f"Best model name: {self.best_model_path}")
586600
return best
587601

588-
def get_best_model(self):
602+
def purge_model_storage(self) -> None:
    """Delete the whole model cache directory, ``<project_name>/models``.

    Recommended when running in a container without a mounted volume.
    It is recommended to use an artifact registry to store the best model
    first, because everything under the cache directory is removed.

    Calling this when the cache directory does not exist is a no-op, so the
    method can safely be called more than once.
    """
    model_cache_path = f"{self.project_name}/models"
    try:
        rmtree(model_cache_path)
    except FileNotFoundError:
        # Already purged (or never created): nothing left to delete.
        pass
609+
610+
611+
def purge_models_except_best_model(self) -> None:
    """Delete every entry in the model cache except the best model.

    Recommended when running in a container without a mounted volume and
    building models that take considerable time to reproduce.
    It is recommended to use an artifact registry to store the best model,
    but this preserves a redundant copy in case storing it in a registry is
    unsuccessful.

    Raises:
        ValueError: If ``self.best_model_path`` is not set, i.e. the method
            was called before a best model was selected.
    """
    if not self.best_model_path:
        # This must be raised, not returned: returning the exception object
        # would let a premature call pass silently.
        raise ValueError("The function purge_models_except_best_model was called prematurely: self.best_model_path is not set, maining there is no 'Best model'.")
    model_cache_path = f"{self.project_name}/models"
    # Compare normalized paths so that cosmetic differences ("./", doubled
    # separators) can never cause the best model to be deleted.
    best_model_path = os.path.normpath(self.best_model_path)
    print("Files in model cache:")
    for entry in os.listdir(model_cache_path):
        model_file_path = f"{model_cache_path}/{entry}"
        print(f" {model_file_path}")
        if os.path.normpath(model_file_path) == best_model_path:
            print(f"Not removing {model_file_path}")
            continue
        print(f"Removing: {model_file_path}")
        if os.path.isdir(model_file_path) and not os.path.islink(model_file_path):
            # Models saved in a directory-based format cannot be removed
            # with os.remove.
            rmtree(model_file_path)
        else:
            os.remove(model_file_path)
632+
633+
634+
def get_best_model(self, purge_model_storage_files=0) -> tf.keras.Model:
    """Return the best model from this meta-trial.

    Optionally purges the cache of models stored on disk after the best
    model has been loaded.

    Params:
        - purge_model_storage_files Union[str, int]
            - Set to 0: Does not purge the cached models, just returns the
              best model.
            - Set to 1: Purges all models except the best model found.
            - Set to "slate": Removes all models, whether the best or
              otherwise.
          When running ephemeral trials in a container without a mounted
          volume (to prevent memory pressure accumulating from ephemeral
          files in memory), or when otherwise working with hard disk space
          limitations, we recommend setting this to:
            - 'slate': If you are working on models that are quick to
              reproduce and an accidental model loss is not problematic as
              long as you have the parameters to reproduce it approximately.
            - 1: If you are working on models that take considerable time
              to reproduce, or a small performance difference from another
              model built from the same parameters is problematic.
            - 0: If you have unlimited disk space and are not in a
              container, or are in one with a suitable mounted volume.

    Returns:
        The model loaded from ``self.best_model_path``.

    Raises:
        ValueError: If ``purge_model_storage_files`` is not 0, 1 or 'slate'.
    """
    # Validate the option before the (potentially slow) model load so that a
    # bad argument fails fast instead of after the work has been done.
    if purge_model_storage_files not in (0, 1, "slate"):
        raise ValueError("The paramerter purge_model_storage_files in the method get_best_model() has 3 values: 0 (Don't purge),1 (Purge all but the best model), 'slate' (remove all cached models) ")
    # Load first: 'slate' deletes the file this model is read from.
    best_model = tf.keras.models.load_model(self.best_model_path)
    if purge_model_storage_files == 1:
        self.purge_models_except_best_model()
    elif purge_model_storage_files == "slate":
        self.purge_model_storage()
    return best_model
591660

592661
# ->

phishing_email_detection_gpt2.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -515,6 +515,7 @@ def from_config(cls, config):
515515

516516

517517

518+
518519
print(f"Cerebros trained {models_tried} models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.")
519520
print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model for 3 epochs. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.")
520521
print(f'Cerebros best accuracy achieved is {result}')
@@ -524,7 +525,7 @@ def from_config(cls, config):
524525

525526
MODEL_FILE_NAME = "cerebros-foundation-model.keras"
526527

527-
best_model_found = cerebros_automl.get_best_model()
528+
best_model_found = cerebros_automl.get_best_model(purge_model_storage_files=1)
528529
best_model_found.save(MODEL_FILE_NAME)
529530
del(best_model_found)
530531
del(cerebros_automl)

regression-example-ames-no-preproc-val-set.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,13 @@
1010
from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component\
1111
import zero_7_exp_decay, zero_95_exp_decay, simple_sigmoid
1212
from ast import literal_eval
13+
from os.path import exists
1314

1415
NUMBER_OF_TRAILS_PER_BATCH = 2
1516
NUMBER_OF_BATCHES_OF_TRIALS = 2
1617

18+
META_TRIAL_NUMBER = 1
19+
1720
###
1821

1922
LABEL_COLUMN = 'price'
@@ -24,6 +27,7 @@
2427
.replace(':', '_')\
2528
.replace('-', '_')
2629
PROJECT_NAME = f'{TIME}_cerebros_auto_ml_test'
30+
PROJECT_NAME = f"{PROJECT_NAME}-meta-{META_TRIAL_NUMBER}"
2731

2832
def hash_a_row(row):
2933
"""casts a row of a Pandas DataFrame as a String, hashes it, and casts it
@@ -207,16 +211,23 @@ def hash_based_split(df, # Pandas dataframe
207211
metrics=[tf.keras.metrics.RootMeanSquaredError()],
208212
epochs=epochs,
209213
patience=7,
210-
project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
214+
project_name=PROJECT_NAME,
211215
# use_multiprocessing_for_multiple_neural_networks=False, # pull this param
212216
model_graphs='model_graphs',
213217
batch_size=batch_size,
214218
meta_trial_number=meta_trial_number)
215219
result = cerebros.run_random_search()
216220

217221
print("Best model: (May need to re-initialize weights, and retrain with early stopping callback)")
218-
best_model_found = cerebros.get_best_model()
222+
best_model_found = cerebros.get_best_model(purge_model_storage_files='slate')
219223
print(best_model_found.summary())
220224

225+
226+
# Verify purge_model_storage_files works:
227+
model_storage_path = f"{PROJECT_NAME}/models"
228+
if exists(model_storage_path):
229+
raise ValueError(f"Failed test: Parh {model_storage_path} should have beed deleted and was not.")
230+
231+
221232
print("result extracted from cerebros")
222233
print(f"Final result was (val_root_mean_squared_error): {result}")

regression-example-ames-no-preproc.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component\
1111
import zero_7_exp_decay, zero_95_exp_decay, simple_sigmoid
1212
from ast import literal_eval
13+
from os import listdir
14+
from os.path import exists
15+
1316

1417
NUMBER_OF_TRAILS_PER_BATCH = 2
1518
NUMBER_OF_BATCHES_OF_TRIALS = 2
@@ -20,15 +23,15 @@
2023

2124
## your data:
2225

26+
META_TRIAL_NUMBER = 1
2327

2428
TIME = pendulum.now().__str__()[:16]\
2529
.replace('T', '_')\
2630
.replace(':', '_')\
2731
.replace('-', '_')
28-
PROJECT_NAME = f'{TIME}_cerebros_auto_ml_test'
29-
32+
PROJECT_NAME = f"{TIME}_cerebros_auto_ml_test"
33+
PROJECT_NAME = f"{PROJECT_NAME}_meta_{META_TRIAL_NUMBER}"
3034

31-
# white = pd.read_csv('wine_data.csv')
3235

3336
raw_data = pd.read_csv('ames.csv')
3437
needed_cols = [
@@ -110,7 +113,7 @@
110113
metrics=[tf.keras.metrics.RootMeanSquaredError()],
111114
epochs=epochs,
112115
patience=7,
113-
project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
116+
project_name=PROJECT_NAME,
114117
# use_multiprocessing_for_multiple_neural_networks=False, # pull this param
115118
model_graphs='model_graphs',
116119
batch_size=batch_size,
@@ -121,5 +124,14 @@
121124
best_model_found = cerebros.get_best_model()
122125
print(best_model_found.summary())
123126

127+
# Validate that purge_model_storage is NOT active by default
128+
129+
model_storage_path = f"{PROJECT_NAME}/models"
130+
assert exists(model_storage_path)
131+
num_items = len(listdir(model_storage_path))
132+
print(f"There are {num_items} items in {model_storage_path}.")
133+
if num_items <= 0:
134+
raise ValueError(f"Failed test: {model_storage_path} was deleted and should not have been.")
135+
124136
print("result extracted from cerebros")
125137
print(f"Final result was (val_root_mean_squared_error): {result}")

0 commit comments

Comments
 (0)