diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml
index 29b75cc..d7ecd0a 100644
--- a/.github/workflows/automerge.yml
+++ b/.github/workflows/automerge.yml
@@ -5,7 +5,7 @@ name: Python application
 
 on:
   push:
-    branches: [ "main", "148-tensorflow-upgrades" ]
+    branches: [ "main", "162-add-a-tqdm-global-progress-bar-to-nas-search-task" ]
 
 permissions:
   contents: read
@@ -33,25 +33,16 @@ jobs:
         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
         # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
-#    - name: Test by running.
-#      run: python3 cerebros.py
-#    - name: Test distributed by running.
-#      run: python3 test_simple_cerebros_gridsearch.py
-#    - name: Test distributed random search wine by running.
-#      run: python3 random_search.py
-#    - name: Test CerebrosRealNeuronNetwork
-#      run: python3 realnn-regression-example-ames-no-preproc.py
-#      timeout-minutes: 45
     - name: Test distributed random search Ames by running
       run: python3 regression-example-ames-no-preproc.py
     - name: Test distributed random search Ames by running - Val set
       run: python3 regression-example-ames-no-preproc-val-set.py
-    - name: Test text classifier - random search - ham-spam
-      run: python3 text-class-ham-or-spam.py
-      timeout-minutes: 90
-    - name: Test image classifier - small subset of CIFAR10
+    # - name: Test text classifier - random search - ham-spam
+    #   run: python3 text-class-ham-or-spam.py
+    #   timeout-minutes: 90
+    - name: Test image classifier - small subset of CIFAR10  # add back timeout-minutes: 90
       run: python3 cifar10-example.py
     - name: Phishing email detection with GPT2 embedding
-      timeout-minutes: 120
+      timeout-minutes: 420
       run: python3 phishing_email_detection_gpt2.py
diff --git a/cerebros/neuralnetworkfuture/neural_network_future.py b/cerebros/neuralnetworkfuture/neural_network_future.py
index b91adf6..4643794 100644
--- a/cerebros/neuralnetworkfuture/neural_network_future.py
+++ b/cerebros/neuralnetworkfuture/neural_network_future.py
@@ -332,8 +332,10 @@ def compile_neural_network(self):
         self.materialized_neural_network.compile(
             loss=self.loss,
             metrics=self.metrics,
-            optimizer=tf.keras.optimizers.Adam(
-                learning_rate=self.learning_rate),
+            optimizer=tf.keras.optimizers.AdamW(
+                learning_rate=self.learning_rate,
+                weight_decay=0.004  # Decoupled weight decay for regularization
+            ),
             jit_compile=jit_compile)
 
     def util_parse_connectivity_csv(self):
diff --git a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py
index 125582c..0874e99 100644
--- a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py
+++ b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py
@@ -2,6 +2,7 @@
 import numpy as np
 import pandas as pd
 import tensorflow as tf
+from tqdm import tqdm
 from cerebros.denseautomlstructuralcomponent.\
     dense_automl_structural_component \
     import DenseAutoMlStructuralComponent, DenseLateralConnectivity, \
@@ -519,7 +520,10 @@ def run_moity_permutations(self, spec, subtrial_number, lock):
 
     def run_random_search(self):
         processes = []
-        for i in np.arange(self.number_of_architecture_moities_to_try):
+        # tqdm ticks once per architecture moity tried (global search progress)
+        for i in tqdm(np.arange(self.number_of_architecture_moities_to_try),
+                      desc="Global task progress",
+                      colour="#16ceeb"):
             self.parse_neural_network_structural_spec_random()
             spec = self.get_neural_network_spec()
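
For reviewers who want to sanity-check the optimizer change above in isolation, here is a minimal, hedged sketch of compiling a model with decoupled weight decay. The toy model and values are illustrative only, not part of the patch; tf.keras.optimizers.AdamW assumes TensorFlow >= 2.11.

import tensorflow as tf

# Illustrative stand-in for the materialized Cerebros network
toy_model = tf.keras.Sequential([
    tf.keras.layers.Dense(8, activation="relu", input_shape=(4,)),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

toy_model.compile(
    loss="binary_crossentropy",
    metrics=[tf.keras.metrics.BinaryAccuracy()],
    optimizer=tf.keras.optimizers.AdamW(
        learning_rate=0.001,  # the search passes self.learning_rate here
        weight_decay=0.004,   # decoupled weight decay, as in the change above
    ),
)
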
diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
index 9920ce9..91c1451 100644
--- a/phishing_email_detection_gpt2.py
+++ b/phishing_email_detection_gpt2.py
@@ -14,6 +14,7 @@
 import tensorflow as tf
 import tensorflow_text
 from keras_nlp.models import GPT2Tokenizer, GPT2Preprocessor, GPT2Backbone
+from keras_nlp.layers import PositionEmbedding
 from sklearn.model_selection import train_test_split
 from sklearn.utils import shuffle
 from tensorflow.keras.utils import to_categorical
@@ -29,6 +30,8 @@ from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component\
     import zero_7_exp_decay, zero_95_exp_decay, simple_sigmoid
 from ast import literal_eval
+import time
+
 
 #
 # Load the email data
@@ -65,8 +68,15 @@
 #
 # Tensors for training data and labels
 #
-training_x = [tf.constant(X_train)]
-train_labels = [tf.constant(y_train)]
+
+# Training data for the baseline model
+baseline_train_x = tf.constant(X_train)
+baseline_train_y = tf.constant(y_train, dtype=tf.int8)
+
+# Packaged for Cerebros (multimodal, takes inputs as a list)
+training_x = [baseline_train_x]
+train_labels = [baseline_train_y]
+
 
 #
 # Input and output shapes
 #
@@ -75,6 +85,7 @@
 
 """### A custom GPT2 encoder layer for text embedding"""
 
+
 class GPT2Layer(tf.keras.layers.Layer):
 
     def __init__(self, max_seq_length, **kwargs):
@@ -90,9 +101,9 @@ def __init__(self, max_seq_length, **kwargs):
         # Set whether the GPT2 model's layers are trainable
         #self.encoder.trainable = False
         for layer in self.encoder.layers:
-            layer.trainable = False
+            layer.trainable = True
         #
-        self.encoder.layers[-2].trainable = True
+        # self.encoder.layers[-2].trainable = True
         #
         # Set the maximum sequence length for tokenization
         self.max_seq_length = max_seq_length
@@ -121,30 +132,147 @@ def from_config(cls, config):
 
 # GPT2 configurables
 max_seq_length = 96
 
-# Base model
+# GPT2 baseline model: fine-tune the full encoder with a sigmoid head
 input_layer = Input(shape=(), dtype=tf.string)
 gpt2_layer = GPT2Layer(max_seq_length)(input_layer)
 #output = Flatten()(gpt2_layer)
-base_model = Model(inputs=input_layer, outputs=gpt2_layer)
-base_model.summary()
+binary_output = tf.keras.layers.Dense(1, activation='sigmoid')(gpt2_layer)
+
+gpt_baseline_model = Model(inputs=input_layer, outputs=binary_output)
+
+gpt_baseline_model.compile(
+    optimizer=Adam(learning_rate=1e-4),  # Small learning rate, since we are fine-tuning GPT2
+    loss='binary_crossentropy',
+    # metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
+    metrics=[tf.keras.metrics.BinaryAccuracy(),
+             tf.keras.metrics.Precision(),
+             tf.keras.metrics.Recall()]
+)
+
+gpt_t0 = time.time()
+
+gpt_baseline_model.summary()
+
+history = gpt_baseline_model.fit(
+    x=X_train,             # Input data
+    y=y_train,             # Labels
+    epochs=3,              # Number of training iterations
+    batch_size=16,         # Small batch size due to GPU memory constraints
+    validation_split=0.2,  # Hold out 20% of the training data for validation
+    shuffle=True,          # Shuffle the data at each epoch
+    callbacks=[
+        tf.keras.callbacks.EarlyStopping(
+            monitor='val_loss',
+            patience=3,
+            restore_best_weights=True,
+            min_delta=0.001
+        ),
+        tf.keras.callbacks.ReduceLROnPlateau(
+            monitor='val_loss',
+            factor=0.2,
+            patience=2,
+            min_lr=1e-6
+        )
+    ]
+)
+
+gpt_t1 = time.time()
+gpt_time_on_one_model_min = (gpt_t1 - gpt_t0) / 60
+
+hy_df = pd.DataFrame(history.history)
+print(hy_df)
+
+
+### Cerebros model:
+
+# TokenizerLayer handles tokenization and returns only the token_ids
+class TokenizerLayer(tf.keras.layers.Layer):
+
+    def __init__(self, max_seq_length, **kwargs):
+        super(TokenizerLayer, self).__init__(**kwargs)
+        self.tokenizer = GPT2Tokenizer.from_preset("gpt2_extra_large_en")
+        self.preprocessor = GPT2Preprocessor(self.tokenizer, sequence_length=max_seq_length)
+        self.max_seq_length = max_seq_length
+
+    def call(self, inputs):
+        prep = self.preprocessor([inputs])
+        return prep['token_ids']
+
+    def get_config(self):
+        config = super(TokenizerLayer, self).get_config()
+        config.update({'max_seq_length': self.max_seq_length})
+        return config
+
+    @classmethod
+    def from_config(cls, config):
+        return cls(max_seq_length=config['max_seq_length'])
+
+# GPT2 configurables
+
+# Optimal for accuracy thus far:
+# max_seq_length = 900
+max_seq_length = 1024
+
+inp = tf.keras.layers.Input(shape=(), dtype=tf.string)
+gp2_tokenizer = TokenizerLayer(max_seq_length=max_seq_length)
+VOCABULARY_SIZE = gp2_tokenizer.tokenizer.vocabulary_size()
+tokens = gp2_tokenizer(inp)
+
+# On larger hardware, this could probably be increased considerably,
+# which probably would improve performance ...
+EMBEDDING_DIM = 23  # Must match the embedding layer's output_dim
+
+embedded = tf.keras.layers.Embedding(
+    input_dim=VOCABULARY_SIZE,
+    output_dim=EMBEDDING_DIM,
+    input_length=max_seq_length,
+    mask_zero=True)(tokens)
+
+position_embedding = PositionEmbedding(
+    sequence_length=max_seq_length,
+    initializer="uniform",
+)(embedded)
+
+# As an FYI, we tried an Add layer here, both with and without
+# LayerNorm; it degraded accuracy. Noted for anyone trying to apply
+# conventional wisdom, to save you the time ...
+x = tf.keras.layers.Concatenate()([embedded, position_embedding])
+x = tf.keras.layers.Dropout(0.4)(x)
+flattened = tf.keras.layers.Flatten()(x)
+
+cerebros_base_model = tf.keras.Model(
+    inputs=inp,
+    outputs=flattened  # Position-enriched token embeddings, flattened
+)
+
 
 """### Cerebros search for the best model"""
 
 #
 # Cerebros configurables
 #
-activation = 'gelu'
-predecessor_level_connection_affinity_factor_first = 49.9999
-predecessor_level_connection_affinity_factor_main = 0.31456
-max_consecutive_lateral_connections = 22
-p_lateral_connection = 0.39256
-num_lateral_connection_tries_per_unit = 10
-learning_rate = 0.0000511065
-epochs = 6  # [1, 100]
-batch_size = 13
-maximum_levels = 4  # [3,7]
-maximum_units_per_level = 8  # [2,10]
-maximum_neurons_per_unit = 5  # [2,20]
+activation = "relu"
+predecessor_level_connection_affinity_factor_first = 10
+predecessor_level_connection_affinity_factor_main = 40
+max_consecutive_lateral_connections = 20
+p_lateral_connection = 30
+num_lateral_connection_tries_per_unit = 25
+learning_rate = 3 * 10 ** -3
+epochs = 15  # [1, 100]
+batch_size = 17
+minimum_levels = 2
+maximum_levels = 2  # [3,7]
+
+minimum_units_per_level = 4
+maximum_units_per_level = 7
+
+minimum_neurons_per_unit = 1
+maximum_neurons_per_unit = 2
+
+moities_to_try = 5
+tries_per_moity = 1
 
 #
 # Logging
@@ -157,6 +285,7 @@ def from_config(cls, config):
 
 meta_trial_number = 42  # irrelevant unless in distributed training
 
+
 cerebros_automl = SimpleCerebrosRandomSearch(
     unit_type=DenseUnit,
     input_shapes=INPUT_SHAPES,
@@ -166,16 +295,16 @@ def from_config(cls, config):
     validation_split=0.35,
     direction='maximize',
     metric_to_rank_by="val_binary_accuracy",
-    minimum_levels=2,
+    minimum_levels=minimum_levels,
     maximum_levels=maximum_levels,
-    minimum_units_per_level=1,
+    minimum_units_per_level=minimum_units_per_level,
     maximum_units_per_level=maximum_units_per_level,
-    minimum_neurons_per_unit=1,
+    minimum_neurons_per_unit=minimum_neurons_per_unit,
     maximum_neurons_per_unit=maximum_neurons_per_unit,
     activation=activation,
     final_activation='sigmoid',
-    number_of_architecture_moities_to_try=2,
-    number_of_tries_per_architecture_moity=1,
+    number_of_architecture_moities_to_try=moities_to_try,
+    number_of_tries_per_architecture_moity=tries_per_moity,
     minimum_skip_connection_depth=1,
     maximum_skip_connection_depth=7,
     predecessor_level_connection_affinity_factor_first=predecessor_level_connection_affinity_factor_first,
@@ -191,31 +320,32 @@ def from_config(cls, config):
     p_lateral_connection_decay=zero_95_exp_decay,
     num_lateral_connection_tries_per_unit=num_lateral_connection_tries_per_unit,
     learning_rate=learning_rate,
-    loss=tf.keras.losses.CategoricalHinge(),
-    metrics=[tf.keras.metrics.BinaryAccuracy(),
-             tf.keras.metrics.Precision(),
-             tf.keras.metrics.Recall()],
+    loss=tf.keras.losses.BinaryCrossentropy(),
+    # loss=tf.keras.losses.CategoricalHinge(),
+    metrics=[tf.keras.metrics.BinaryAccuracy(),
+             tf.keras.metrics.Precision(),
+             tf.keras.metrics.Recall()],
     epochs=epochs,
     project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
     model_graphs='model_graphs',
     batch_size=batch_size,
     meta_trial_number=meta_trial_number,
-    base_models=[base_model],
+    base_models=[cerebros_base_model],
     train_data_dtype=tf.string)
 
+cerebros_t0 = time.time()
 result = cerebros_automl.run_random_search()
+cerebros_t1 = time.time()
+cerebros_time_all_models_min = (cerebros_t1 - cerebros_t0) / 60
+models_tried = moities_to_try * tries_per_moity
+cerebros_time_per_model = cerebros_time_all_models_min / models_tried
 
-print(f'Best accuracy achieved is {result}')
-print(f'binary accuracy')
+print(f"Cerebros trained {models_tried} models FROM A COLD START in ONLY {cerebros_time_all_models_min} min, averaging only {cerebros_time_per_model} minutes per model.")
+print(f"GPT2 took {gpt_time_on_one_model_min} min just to FINE-TUNE one PRE-TRAINED model for 3 epochs. Although this is a small-scale test, it shows the advantage of scaling with O(N) timing vs O(N**2) timing.")
 
-"""### Testing the best model found"""
-
-#
-# Load the best model (taking into account that it has a custom layer)
-#
-best_model_found =\
-tf.keras.models.load_model(cerebros_automl.best_model_path,\
-custom_objects={'GPT2Layer': GPT2Layer(max_seq_length)})
+print(f'Cerebros best accuracy achieved is {result}')
+print('(val set binary accuracy)')
+
+# """### Testing the best model found"""
 
-print('Evaluating on the test dataset')
-best_model_found.evaluate(X_test, y_test)
diff --git a/requirements.txt b/requirements.txt
index 1964f13..146b1e5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,3 +8,4 @@ pyvis==0.3.2
 plotly==5.20.0
 matplotlib==3.8.4
 imageio==2.34.0
+tqdm==4.67.1
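
The diff comments out the final "Testing the best model found" step. For reference, here is a minimal sketch of how that held-out evaluation could be restored in the new script, assuming the saved best model registers the custom TokenizerLayer under its class name; this mirrors the pre-diff load pattern, and the names may need adjusting.

# Hypothetical sketch: reload and evaluate the best model found.
best_model_found = tf.keras.models.load_model(
    cerebros_automl.best_model_path,
    custom_objects={'TokenizerLayer': TokenizerLayer})

print('Evaluating on the test dataset')
best_model_found.evaluate(X_test, y_test)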