Skip to content
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/automerge.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ name: Python application
on:
push:

branches: [ "main", "218-fix-padding-token-mismatch-and-logging-tokenization-metadata" ]
branches: [ "main", "237-copy-purge-model-storage-functionality-for-main" ]


permissions:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from multiprocessing import Process, Lock
import os
from gc import collect
from shutil import rmtree


# import optuna
Expand Down Expand Up @@ -565,18 +566,31 @@ def run_random_search(self):
print(f"metric_to_rank_by is: '{self.metric_to_rank_by}'")
print(
f"Type of metric_to_rank_by is: {str(type(self.metric_to_rank_by))}")
def has_valid_metric(num):
    """Return True when ``num`` can be ranked as a real-valued metric.

    A value is valid when ``float(num)`` succeeds and the result is not
    NaN. NaN is rejected so this predicate agrees with the vectorized
    ``pd.to_numeric(..., errors="coerce").isna()`` mask, which also treats
    NaN as "no metric". Infinite values are accepted.

    Args:
        num: Candidate metric value (number, numeric string, or anything
            else found in the metric column).

    Returns:
        bool: True if ``num`` converts to a non-NaN float, else False.
    """
    from math import isnan

    try:
        value = float(num)
    except (TypeError, ValueError, OverflowError) as exc:
        # Only the errors float() itself raises for unconvertible input:
        # wrong type, unparsable string, or an int too large for a float.
        print(exc)
        return False
    return not isnan(value)
# ~ pd.to_numeric(x['a'], errors="coerce").astype(float).isna()
# rows_having_a_valid_metric = oracles[self.metric_to_rank_by].apply(lambda x: has_valid_metric(x))
rows_having_a_valid_metric = ~ pd.to_numeric(oracles[self.metric_to_rank_by], errors="coerce").isna()
oracles_having_valid_metrics = oracles[rows_having_a_valid_metric]

if self.direction == "maximize" or self.direction == "max":

best = float(oracles[oracles[self.metric_to_rank_by]
!= self.metric_to_rank_by]
[self.metric_to_rank_by].astype(float).max())
best = float(oracles_having_valid_metrics[self.metric_to_rank_by].astype(float).max())
# best = float(oracles[oracles[self.metric_to_rank_by]
# != self.metric_to_rank_by]
# [self.metric_to_rank_by].astype(float).max())
else:
print(f"metric_to_rank_by is: '{self.metric_to_rank_by}'")
print(
f"Type of metric_to_rank_by is: {str(type(self.metric_to_rank_by))}")
best = float(oracles[oracles[self.metric_to_rank_by]
!= self.metric_to_rank_by]
[self.metric_to_rank_by].astype(float).min())
best = float(oracles_having_valid_metrics[self.metric_to_rank_by].astype(float).min())
# best = float(oracles[oracles[self.metric_to_rank_by]
# != self.metric_to_rank_by]
# [self.metric_to_rank_by].astype(float).min())
print(f"Best result this trial was: {best}")
print(f"Type of best result: {type(best)}")
self.best_model_path =\
Expand All @@ -585,8 +599,14 @@ def run_random_search(self):
print(f"Best model name: {self.best_model_path}")
return best

def get_best_model(self):
def purge_model_storage(self):
    """Delete this project's saved-model directory, ``<project_name>/models``.

    The whole directory tree is removed. If the directory (or an entry
    inside it) is already gone, the call is a no-op, so a purge can be
    repeated safely. Any other failure, such as a permission error, still
    propagates to the caller.
    """
    models_dir = f"{self.project_name}/models"
    try:
        rmtree(models_dir)
    except FileNotFoundError:
        # Nothing left to purge: the directory was never created or has
        # already been removed.
        pass

def get_best_model(self, purge_model_storage_files: bool=False):
    """Load and return the model stored at ``self.best_model_path``.

    Args:
        purge_model_storage_files: When true, ``self.purge_model_storage()``
            is called after the model has been loaded, before returning.

    Returns:
        The object returned by ``tf.keras.models.load_model``.
    """
    loaded_model = tf.keras.models.load_model(self.best_model_path)
    if not purge_model_storage_files:
        return loaded_model
    # The model is already loaded at this point, so it is returned
    # regardless of what the purge removes.
    self.purge_model_storage()
    return loaded_model

# ->
Expand Down
15 changes: 13 additions & 2 deletions regression-example-ames-no-preproc-val-set.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,13 @@
from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component\
import zero_7_exp_decay, zero_95_exp_decay, simple_sigmoid
from ast import literal_eval
from os.path import exists

NUMBER_OF_TRAILS_PER_BATCH = 2
NUMBER_OF_BATCHES_OF_TRIALS = 2

META_TRIAL_NUMBER = 1

###

LABEL_COLUMN = 'price'
Expand All @@ -24,6 +27,7 @@
.replace(':', '_')\
.replace('-', '_')
PROJECT_NAME = f'{TIME}_cerebros_auto_ml_test'
PROJECT_NAME = f"{PROJECT_NAME}-meta-{META_TRIAL_NUMBER}"

def hash_a_row(row):
"""casts a row of a Pandas DataFrame as a String, hashes it, and casts it
Expand Down Expand Up @@ -207,16 +211,23 @@ def hash_based_split(df, # Pandas dataframe
metrics=[tf.keras.metrics.RootMeanSquaredError()],
epochs=epochs,
patience=7,
project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
project_name=PROJECT_NAME,
# use_multiprocessing_for_multiple_neural_networks=False, # pull this param
model_graphs='model_graphs',
batch_size=batch_size,
meta_trial_number=meta_trial_number)
result = cerebros.run_random_search()

print("Best model: (May need to re-initialize weights, and retrain with early stopping callback)")
best_model_found = cerebros.get_best_model()
best_model_found = cerebros.get_best_model(purge_model_storage_files=True)
print(best_model_found.summary())


# Verify that get_best_model(purge_model_storage_files=True) removed the
# model storage directory:
model_storage_path = f"{PROJECT_NAME}/models"
if exists(model_storage_path):
    raise ValueError(
        f"Failed test: Path {model_storage_path} should have been deleted and was not.")


print("result extracted from cerebros")
print(f"Final result was (val_root_mean_squared_error): {result}")
20 changes: 16 additions & 4 deletions regression-example-ames-no-preproc.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component\
import zero_7_exp_decay, zero_95_exp_decay, simple_sigmoid
from ast import literal_eval
from os import listdir
from os.path import exists


NUMBER_OF_TRAILS_PER_BATCH = 2
NUMBER_OF_BATCHES_OF_TRIALS = 2
Expand All @@ -20,15 +23,15 @@

## your data:

META_TRIAL_NUMBER = 1

TIME = pendulum.now().__str__()[:16]\
.replace('T', '_')\
.replace(':', '_')\
.replace('-', '_')
PROJECT_NAME = f'{TIME}_cerebros_auto_ml_test'

PROJECT_NAME = f"{TIME}_cerebros_auto_ml_test"
PROJECT_NAME = f"{PROJECT_NAME}_meta_{META_TRIAL_NUMBER}"

# white = pd.read_csv('wine_data.csv')

raw_data = pd.read_csv('ames.csv')
needed_cols = [
Expand Down Expand Up @@ -110,7 +113,7 @@
metrics=[tf.keras.metrics.RootMeanSquaredError()],
epochs=epochs,
patience=7,
project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
project_name=PROJECT_NAME,
# use_multiprocessing_for_multiple_neural_networks=False, # pull this param
model_graphs='model_graphs',
batch_size=batch_size,
Expand All @@ -121,5 +124,14 @@
best_model_found = cerebros.get_best_model()
print(best_model_found.summary())

# Validate that purge_model_storage is NOT active by default: the model
# storage directory must still exist and must still contain saved items.

model_storage_path = f"{PROJECT_NAME}/models"
# Raise explicitly instead of using `assert`, which is stripped when Python
# runs with -O and would let a missing directory surface as an unrelated
# error from listdir().
if not exists(model_storage_path):
    raise ValueError(f"Failed test: {model_storage_path} was deleted and should not have been.")
num_items = len(listdir(model_storage_path))
print(f"There are {num_items} items in {model_storage_path}.")
if num_items <= 0:
    raise ValueError(f"Failed test: {model_storage_path} was deleted and should not have been.")

print("result extracted from cerebros")
print(f"Final result was (val_root_mean_squared_error): {result}")