Merge pull request #238 from david-thrower/237-copy-purge-model-storage-functionality-for-main

Aidyn-Lopez · web-flow · commit d57c450b2ef5 · 2025-09-24T03:03:33.000-04:00
237 copy purge model storage functionality for main
diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml
@@ -6,7 +6,7 @@ name: Python application
 on:
   push:
 
-    branches: [ "main", "218-fix-padding-token-mismatch-and-logging-tokenization-metadata" ]
+    branches: [ "main", "237-copy-purge-model-storage-functionality-for-main" ]
 
 
 permissions:
diff --git a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py
@@ -14,6 +14,7 @@
 from multiprocessing import Process, Lock
 import os
 from gc import collect
+from shutil import rmtree
 
 
 # import optuna
@@ -565,18 +566,31 @@ def run_random_search(self):
         print(f"metric_to_rank_by is: '{self.metric_to_rank_by}'")
         print(
             f"Type of metric_to_rank_by is: {str(type(self.metric_to_rank_by))}")
+        def has_valid_metric(num):
+            try:
+                float(num)
+                return True
+            except Exception as exc:
+                print(exc)
+                return False
+        # ~ pd.to_numeric(x['a'], errors="coerce").astype(float).isna()
+        # rows_having_a_valid_metric = oracles[self.metric_to_rank_by].apply(lambda x: has_valid_metric(x))
+        rows_having_a_valid_metric = ~ pd.to_numeric(oracles[self.metric_to_rank_by], errors="coerce").isna()
+        oracles_having_valid_metrics = oracles[rows_having_a_valid_metric]
+        
         if self.direction == "maximize" or self.direction == "max":
-
-            best = float(oracles[oracles[self.metric_to_rank_by]
-                         != self.metric_to_rank_by]
-                         [self.metric_to_rank_by].astype(float).max())
+            best = float(oracles_having_valid_metrics[self.metric_to_rank_by].astype(float).max())
+            # best = float(oracles[oracles[self.metric_to_rank_by]
+            #              != self.metric_to_rank_by]
+            #              [self.metric_to_rank_by].astype(float).max())
         else:
             print(f"metric_to_rank_by is: '{self.metric_to_rank_by}'")
             print(
                 f"Type of metric_to_rank_by is: {str(type(self.metric_to_rank_by))}")
-            best = float(oracles[oracles[self.metric_to_rank_by]
-                                 != self.metric_to_rank_by]
-                         [self.metric_to_rank_by].astype(float).min())
+            best = float(oracles_having_valid_metrics[self.metric_to_rank_by].astype(float).min())
+            # best = float(oracles[oracles[self.metric_to_rank_by]
+            #                      != self.metric_to_rank_by]
+            #              [self.metric_to_rank_by].astype(float).min())
         print(f"Best result this trial was: {best}")
         print(f"Type of best result: {type(best)}")
         self.best_model_path =\
@@ -585,8 +599,63 @@ def run_random_search(self):
         print(f"Best model name: {self.best_model_path}")
         return best
 
-    def get_best_model(self):
+    def purge_model_storage(self) -> None:
+        """Remove the entire model cache directory, '<project_name>/models'.
+
+        Recommended when running in a container without a mounted volume.
+        It is recommended to store the best model in an artifact registry.
+        """
+        rmtree(f"{self.project_name}/models")
+
+
+    def purge_models_except_best_model(self) -> None:
+        """Delete every cached model in '<project_name>/models' except the best model.
+
+        Intended for containers without a mounted volume where models are slow to reproduce;
+        keeps a redundant local copy in case saving the best model to a registry fails.
+        Raises ValueError if self.best_model_path is not set (no best model exists yet).
+        """
+        if not getattr(self, "best_model_path", None):
+            raise ValueError("The function purge_models_except_best_model was called prematurely: self.best_model_path is not set, meaning there is no 'Best model'.")
+        model_cache_path = f"{self.project_name}/models"
+        # Compare normalized paths so a differently spelled path to the best model is never deleted.
+        best_model_abspath = os.path.abspath(self.best_model_path)
+        print("Files in model cache:")
+        for file in os.listdir(model_cache_path):
+            model_file_path = f"{model_cache_path}/{file}"
+            print(f"  {model_file_path}")
+            if os.path.abspath(model_file_path) != best_model_abspath:
+                print(f"Removing: {model_file_path}")
+                os.remove(model_file_path)
+            else:
+                print(f"Not removing {model_file_path}")
+
+
+    def get_best_model(self, purge_model_storage_files=0) -> tf.keras.Model:
+        """Load and return the best model from this meta-trial, optionally purging the model cache.
+
+        Purging happens only after the best model has been loaded into memory.
+        Params:
+            - purge_model_storage_files Union[str, int]
+                - 0: Do not purge the cached models; just return the best model.
+                - 1: Purge all cached models except the best model found.
+                - "slate": Remove all cached models, including the best one.
+        Guidance when disk space is limited (e.g. a container without a mounted volume):
+            - 'slate': models are quick to reproduce and losing one is not a problem.
+            - 1: models are slow to reproduce, or small run-to-run differences matter.
+            - 0: disk space is not a concern, or a suitable volume is mounted.
+
+        Raises:
+            ValueError: if purge_model_storage_files is not 0, 1, or 'slate'.
+        """
+        # Validate first so a bad argument fails before the costly model load.
+        if purge_model_storage_files not in (0, 1, "slate"):
+            raise ValueError("The parameter purge_model_storage_files in the method get_best_model() has 3 values: 0 (Don't purge),1 (Purge all but the best model), 'slate' (remove all cached models) ")
         best_model = tf.keras.models.load_model(self.best_model_path)
+        if purge_model_storage_files == 1:
+            self.purge_models_except_best_model()
+        elif purge_model_storage_files == "slate":
+            self.purge_model_storage()
         return best_model
 
 # ->
diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
@@ -515,6 +515,7 @@ def from_config(cls, config):
 
 
 
+
 print(f"Cerebros trained {models_tried} models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.")
 print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model for 3 epochs. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.")
 print(f'Cerebros best accuracy achieved is {result}')
@@ -524,7 +525,7 @@ def from_config(cls, config):
 
 MODEL_FILE_NAME = "cerebros-foundation-model.keras"
 
-best_model_found = cerebros_automl.get_best_model()
+best_model_found = cerebros_automl.get_best_model(purge_model_storage_files=1)
 best_model_found.save(MODEL_FILE_NAME)
 del(best_model_found)
 del(cerebros_automl)
diff --git a/regression-example-ames-no-preproc-val-set.py b/regression-example-ames-no-preproc-val-set.py
@@ -10,10 +10,13 @@
 from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component\
     import zero_7_exp_decay, zero_95_exp_decay, simple_sigmoid
 from ast import literal_eval
+from os.path import exists
 
 NUMBER_OF_TRAILS_PER_BATCH = 2
 NUMBER_OF_BATCHES_OF_TRIALS = 2
 
+META_TRIAL_NUMBER = 1
+
 ###
 
 LABEL_COLUMN = 'price'
@@ -24,6 +27,7 @@
     .replace(':', '_')\
     .replace('-', '_')
 PROJECT_NAME = f'{TIME}_cerebros_auto_ml_test'
+PROJECT_NAME = f"{PROJECT_NAME}-meta-{META_TRIAL_NUMBER}"
 
 def hash_a_row(row):
     """casts a row of a Pandas DataFrame as a String, hashes it, and casts it
@@ -207,16 +211,23 @@ def hash_based_split(df,  # Pandas dataframe
         metrics=[tf.keras.metrics.RootMeanSquaredError()],
         epochs=epochs,
         patience=7,
-        project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
+        project_name=PROJECT_NAME,
         # use_multiprocessing_for_multiple_neural_networks=False,  # pull this param
         model_graphs='model_graphs',
         batch_size=batch_size,
         meta_trial_number=meta_trial_number)
 result = cerebros.run_random_search()
 
 print("Best model: (May need to re-initialize weights, and retrain with early stopping callback)")
-best_model_found = cerebros.get_best_model()
+best_model_found = cerebros.get_best_model(purge_model_storage_files='slate')
 print(best_model_found.summary())
 
+
+# Verify purge_model_storage_files works:
+model_storage_path = f"{PROJECT_NAME}/models"
+if exists(model_storage_path):
+    raise ValueError(f"Failed test: Parh {model_storage_path} should have beed deleted and was not.")
+
+
 print("result extracted from cerebros")
 print(f"Final result was (val_root_mean_squared_error): {result}")
diff --git a/regression-example-ames-no-preproc.py b/regression-example-ames-no-preproc.py
@@ -10,6 +10,9 @@
 from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component\
     import zero_7_exp_decay, zero_95_exp_decay, simple_sigmoid
 from ast import literal_eval
+from os import listdir
+from os.path import exists
+
 
 NUMBER_OF_TRAILS_PER_BATCH = 2
 NUMBER_OF_BATCHES_OF_TRIALS = 2
@@ -20,15 +23,15 @@
 
 ## your data:
 
+META_TRIAL_NUMBER = 1
 
 TIME = pendulum.now().__str__()[:16]\
     .replace('T', '_')\
     .replace(':', '_')\
     .replace('-', '_')
-PROJECT_NAME = f'{TIME}_cerebros_auto_ml_test'
-
+PROJECT_NAME = f"{TIME}_cerebros_auto_ml_test"
+PROJECT_NAME = f"{PROJECT_NAME}_meta_{META_TRIAL_NUMBER}"
 
-# white = pd.read_csv('wine_data.csv')
 
 raw_data = pd.read_csv('ames.csv')
 needed_cols = [
@@ -110,7 +113,7 @@
         metrics=[tf.keras.metrics.RootMeanSquaredError()],
         epochs=epochs,
         patience=7,
-        project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
+        project_name=PROJECT_NAME,
         # use_multiprocessing_for_multiple_neural_networks=False,  # pull this param
         model_graphs='model_graphs',
         batch_size=batch_size,
@@ -121,5 +124,14 @@
 best_model_found = cerebros.get_best_model()
 print(best_model_found.summary())
 
+# Validate that purge_model_storage is NOT active by default 
+
+model_storage_path = f"{PROJECT_NAME}/models"
+assert exists(model_storage_path)
+num_items = len(listdir(model_storage_path))
+print(f"There are {num_items} items in {model_storage_path}.")
+if num_items <= 0:
+    raise ValueError(f"Failed test: {model_storage_path} was deleted and should not have been.")
+
 print("result extracted from cerebros")
 print(f"Final result was (val_root_mean_squared_error): {result}")