david-thrower · Aidyn-Lopez · Sep 24, 2025 · Sep 21, 2025 · Sep 21, 2025 · Sep 21, 2025
diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml
@@ -6,7 +6,7 @@ name: Python application
 on:
   push:
 
-    branches: [ "main", "218-fix-padding-token-mismatch-and-logging-tokenization-metadata" ]
+    branches: [ "main", "237-copy-purge-model-storage-functionality-for-main" ]
 
 
 permissions:

diff --git a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py
@@ -14,6 +14,7 @@
 from multiprocessing import Process, Lock
 import os
 from gc import collect
+from shutil import rmtree
 
 
 # import optuna
@@ -565,18 +566,31 @@ def run_random_search(self):
         print(f"metric_to_rank_by is: '{self.metric_to_rank_by}'")
         print(
             f"Type of metric_to_rank_by is: {str(type(self.metric_to_rank_by))}")
+        def has_valid_metric(num):
+            try:
+                float(num)
+                return True
+            except Exception as exc:
+                print(exc)
+                return False
+        # ~ pd.to_numeric(x['a'], errors="coerce").astype(float).isna()
+        # rows_having_a_valid_metric = oracles[self.metric_to_rank_by].apply(lambda x: has_valid_metric(x))
+        rows_having_a_valid_metric = ~ pd.to_numeric(oracles[self.metric_to_rank_by], errors="coerce").isna()
+        oracles_having_valid_metrics = oracles[rows_having_a_valid_metric]
+
         if self.direction == "maximize" or self.direction == "max":
-
-            best = float(oracles[oracles[self.metric_to_rank_by]
-                         != self.metric_to_rank_by]
-                         [self.metric_to_rank_by].astype(float).max())
+            best = float(oracles_having_valid_metrics[self.metric_to_rank_by].astype(float).max())
+            # best = float(oracles[oracles[self.metric_to_rank_by]
+            #              != self.metric_to_rank_by]
+            #              [self.metric_to_rank_by].astype(float).max())
         else:
             print(f"metric_to_rank_by is: '{self.metric_to_rank_by}'")
             print(
                 f"Type of metric_to_rank_by is: {str(type(self.metric_to_rank_by))}")
-            best = float(oracles[oracles[self.metric_to_rank_by]
-                                 != self.metric_to_rank_by]
-                         [self.metric_to_rank_by].astype(float).min())
+            best = float(oracles_having_valid_metrics[self.metric_to_rank_by].astype(float).min())
+            # best = float(oracles[oracles[self.metric_to_rank_by]
+            #                      != self.metric_to_rank_by]
+            #              [self.metric_to_rank_by].astype(float).min())
         print(f"Best result this trial was: {best}")
         print(f"Type of best result: {type(best)}")
         self.best_model_path =\
@@ -585,8 +599,32 @@ def run_random_search(self):
         print(f"Best model name: {self.best_model_path}")
         return best
 
-    def get_best_model(self):
+    def purge_model_storage(self):
+        """Delete the ``<project_name>/models`` directory and everything in it.
+
+        Calling this when the directory is already gone is a no-op.
+        """
+        models_dir = os.path.join(self.project_name, "models")
+        try:
+            rmtree(models_dir)
+        except FileNotFoundError:
+            # Nothing left to purge (e.g. storage was already purged).
+            pass
+
+    def get_best_model(self, purge_model_storage_files: bool = False):
+        """Load and return the model saved at ``self.best_model_path``.
+
+        Args:
+            purge_model_storage_files: When True, call
+                ``purge_model_storage()`` after the model has been loaded
+                into memory. Defaults to False (files are kept).
+
+        Returns:
+            The model returned by ``tf.keras.models.load_model``.
+        """
         best_model = tf.keras.models.load_model(self.best_model_path)
+        if purge_model_storage_files:
+            self.purge_model_storage()
         return best_model
 
 # ->

diff --git a/regression-example-ames-no-preproc-val-set.py b/regression-example-ames-no-preproc-val-set.py
@@ -10,10 +10,13 @@
 from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component\
     import zero_7_exp_decay, zero_95_exp_decay, simple_sigmoid
 from ast import literal_eval
+from os.path import exists
 
 NUMBER_OF_TRAILS_PER_BATCH = 2
 NUMBER_OF_BATCHES_OF_TRIALS = 2
 
+META_TRIAL_NUMBER = 1
+
 ###
 
 LABEL_COLUMN = 'price'
@@ -24,6 +27,7 @@
     .replace(':', '_')\
     .replace('-', '_')
 PROJECT_NAME = f'{TIME}_cerebros_auto_ml_test'
+PROJECT_NAME = f"{PROJECT_NAME}_meta_{META_TRIAL_NUMBER}"
 
 def hash_a_row(row):
     """casts a row of a Pandas DataFrame as a String, hashes it, and casts it
@@ -207,16 +211,23 @@ def hash_based_split(df,  # Pandas dataframe
         metrics=[tf.keras.metrics.RootMeanSquaredError()],
         epochs=epochs,
         patience=7,
-        project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
+        project_name=PROJECT_NAME,
         # use_multiprocessing_for_multiple_neural_networks=False,  # pull this param
         model_graphs='model_graphs',
         batch_size=batch_size,
         meta_trial_number=meta_trial_number)
 result = cerebros.run_random_search()
 
 print("Best model: (May need to re-initialize weights, and retrain with early stopping callback)")
-best_model_found = cerebros.get_best_model()
+best_model_found = cerebros.get_best_model(purge_model_storage_files=True)
 print(best_model_found.summary())
 
+
+# Verify that get_best_model(purge_model_storage_files=True) removed the model storage directory:
+model_storage_path = f"{PROJECT_NAME}/models"
+if exists(model_storage_path):
+    raise ValueError(f"Failed test: Path {model_storage_path} should have been deleted and was not.")
+
+
 print("result extracted from cerebros")
 print(f"Final result was (val_root_mean_squared_error): {result}")
diff --git a/regression-example-ames-no-preproc.py b/regression-example-ames-no-preproc.py
@@ -10,6 +10,9 @@
 from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component\
     import zero_7_exp_decay, zero_95_exp_decay, simple_sigmoid
 from ast import literal_eval
+from os import listdir
+from os.path import exists
+
 
 NUMBER_OF_TRAILS_PER_BATCH = 2
 NUMBER_OF_BATCHES_OF_TRIALS = 2
@@ -20,15 +23,15 @@
 
 ## your data:
 
+META_TRIAL_NUMBER = 1
 
 TIME = pendulum.now().__str__()[:16]\
     .replace('T', '_')\
     .replace(':', '_')\
     .replace('-', '_')
-PROJECT_NAME = f'{TIME}_cerebros_auto_ml_test'
-
+PROJECT_NAME = f"{TIME}_cerebros_auto_ml_test"
+PROJECT_NAME = f"{PROJECT_NAME}_meta_{META_TRIAL_NUMBER}"
 
-# white = pd.read_csv('wine_data.csv')
 
 raw_data = pd.read_csv('ames.csv')
 needed_cols = [
@@ -110,7 +113,7 @@
         metrics=[tf.keras.metrics.RootMeanSquaredError()],
         epochs=epochs,
         patience=7,
-        project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
+        project_name=PROJECT_NAME,
         # use_multiprocessing_for_multiple_neural_networks=False,  # pull this param
         model_graphs='model_graphs',
         batch_size=batch_size,
@@ -121,5 +124,14 @@
 best_model_found = cerebros.get_best_model()
 print(best_model_found.summary())
 
+# Validate that get_best_model() does NOT purge model storage by default
+
+model_storage_path = f"{PROJECT_NAME}/models"
+assert exists(model_storage_path)
+num_items = len(listdir(model_storage_path))
+print(f"There are {num_items} items in {model_storage_path}.")
+if num_items <= 0:
+    raise ValueError(f"Failed test: {model_storage_path} was deleted and should not have been.")
+
 print("result extracted from cerebros")
 print(f"Final result was (val_root_mean_squared_error): {result}")
-Original file line number
+Diff line change
@@ Expand Up / @@ -6,7 +6,7 @@ name: Python application @@
     on:
       push:
-        branches: [ "main", "218-fix-padding-token-mismatch-and-logging-tokenization-metadata" ]
+        branches: [ "main", "237-copy-purge-model-storage-functionality-for-main" ]
     permissions:
@@ Expand Down @@