Merged

48 commits
30164c7
Update automerge.yml
david-thrower Mar 22, 2025
8904966
Update automerge.yml
david-thrower Mar 22, 2025
c7e8b30
Update phishing_email_detection_gpt2.py
david-thrower Mar 22, 2025
b790e64
Update phishing_email_detection_gpt2.py
david-thrower Mar 22, 2025
15ec9c2
Update phishing_email_detection_gpt2.py
david-thrower Mar 22, 2025
0cfb488
Update phishing_email_detection_gpt2.py
david-thrower Mar 22, 2025
6f86959
Update phishing_email_detection_gpt2.py
david-thrower Mar 22, 2025
830a2dc
Update automerge.yml
david-thrower Mar 22, 2025
407f90c
Update phishing_email_detection_gpt2.py
david-thrower Mar 22, 2025
d5bdbce
Update phishing_email_detection_gpt2.py
david-thrower Mar 22, 2025
d8db0f1
Update phishing_email_detection_gpt2.py
david-thrower Mar 22, 2025
014b3c3
Update phishing_email_detection_gpt2.py
david-thrower Mar 22, 2025
0b67f88
Update phishing_email_detection_gpt2.py
david-thrower Mar 22, 2025
a480dfd
Update phishing_email_detection_gpt2.py
david-thrower Mar 22, 2025
0e72e61
Update phishing_email_detection_gpt2.py
david-thrower Mar 22, 2025
3cd5945
Update phishing_email_detection_gpt2.py
david-thrower Mar 23, 2025
6a9e88d
Update phishing_email_detection_gpt2.py
david-thrower Mar 23, 2025
f24a858
Update phishing_email_detection_gpt2.py
david-thrower Mar 23, 2025
4e15756
Update automerge.yml
david-thrower Mar 23, 2025
9a4db15
Update phishing_email_detection_gpt2.py
david-thrower Mar 25, 2025
59cfa23
Update phishing_email_detection_gpt2.py
david-thrower Mar 25, 2025
d928a54
Update phishing_email_detection_gpt2.py
david-thrower Mar 25, 2025
3c25a22
Update phishing_email_detection_gpt2.py
david-thrower Mar 25, 2025
88a1bd5
Update phishing_email_detection_gpt2.py
david-thrower Mar 26, 2025
42d9c4f
Update phishing_email_detection_gpt2.py
david-thrower Mar 26, 2025
ed4641e
Update phishing_email_detection_gpt2.py
david-thrower Mar 26, 2025
cdb4455
Update phishing_email_detection_gpt2.py
david-thrower Mar 26, 2025
048eb1b
Update phishing_email_detection_gpt2.py
david-thrower Mar 26, 2025
b800cf7
Update phishing_email_detection_gpt2.py
david-thrower Mar 27, 2025
7930a2d
Update automerge.yml
david-thrower Mar 27, 2025
e6ae27c
Update automerge.yml
david-thrower Mar 30, 2025
0eab09e
Update neural_network_future.py
david-thrower Mar 30, 2025
8939f3c
Update phishing_email_detection_gpt2.py
david-thrower Mar 30, 2025
966f714
Update phishing_email_detection_gpt2.py
david-thrower Apr 2, 2025
9724e9d
Update automerge.yml
david-thrower Apr 2, 2025
380928d
Update automerge.yml
david-thrower Apr 2, 2025
9323f5f
Update phishing_email_detection_gpt2.py
david-thrower Apr 2, 2025
f683fb8
Update phishing_email_detection_gpt2.py
david-thrower Apr 2, 2025
69d9d1d
Update requirements.txt
david-thrower Apr 3, 2025
ffb0e90
Update simple_cerebros_random_search.py
david-thrower Apr 3, 2025
9fafccc
Update automerge.yml
david-thrower Apr 3, 2025
aba7589
Update simple_cerebros_random_search.py
david-thrower Apr 3, 2025
f628498
Update simple_cerebros_random_search.py
david-thrower Apr 3, 2025
689f003
Update simple_cerebros_random_search.py
david-thrower Apr 3, 2025
40c3582
Update simple_cerebros_random_search.py
david-thrower Apr 3, 2025
642452f
Update simple_cerebros_random_search.py
david-thrower Apr 3, 2025
6f7c1f0
Update simple_cerebros_random_search.py
david-thrower Apr 3, 2025
713ac96
Update simple_cerebros_random_search.py
david-thrower Apr 3, 2025
21 changes: 6 additions & 15 deletions .github/workflows/automerge.yml
@@ -5,7 +5,7 @@ name: Python application

on:
push:
branches: [ "main", "148-tensorflow-upgrades" ]
branches: [ "main", "162-add-a-tqdm-global-progress-bar-to-nas-search-task" ]

permissions:
contents: read
@@ -33,25 +33,16 @@ jobs:
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
# flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
# - name: Test by running.
# run: python3 cerebros.py
# - name: Test distributed by running.
# run: python3 test_simple_cerebros_gridsearch.py
# - name: Test distributed random search wine by running.
# run: python3 random_search.py
# - name: Test CerebrosRealNeuronNetwork
# run: python3 realnn-regression-example-ames-no-preproc.py
# timeout-minutes: 45
- name: Test distributed random search Ames by running
run: python3 regression-example-ames-no-preproc.py
- name: Test distributed random search Ames by running - Val set
run: python3 regression-example-ames-no-preproc-val-set.py
- name: Test text classifier - random search - ham-spam
run: python3 text-class-ham-or-spam.py
timeout-minutes: 90
- name: Test image classifier - small subset of CIFAR10
# - name: Test text classifier - random search - ham-spam
# run: python3 text-class-ham-or-spam.py
# timeout-minutes: 90
- name: Test image classifier - small subset of CIFAR10 # add back
timeout-minutes: 90
run: python3 cifar10-example.py
- name: Phishing email detection with GPT2 embedding
timeout-minutes: 120
timeout-minutes: 420
run: python3 phishing_email_detection_gpt2.py
6 changes: 4 additions & 2 deletions cerebros/neuralnetworkfuture/neural_network_future.py
@@ -332,8 +332,10 @@ def compile_neural_network(self):
self.materialized_neural_network.compile(
loss=self.loss,
metrics=self.metrics,
optimizer=tf.keras.optimizers.Adam(
learning_rate=self.learning_rate),
optimizer=tf.keras.optimizers.AdamW(
learning_rate=self.learning_rate,
weight_decay=0.004 # Add weight decay parameter
),
jit_compile=jit_compile)

def util_parse_connectivity_csv(self):
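The switch from Adam to AdamW above decouples weight decay from the gradient update. Below is a minimal sketch of the same compile pattern outside Cerebros (assuming TensorFlow >= 2.11, where tf.keras.optimizers.AdamW is available; the toy model and hyperparameter values are placeholders, not the project's):

import tensorflow as tf

toy_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(4,)),
    tf.keras.layers.Dense(8, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

toy_model.compile(
    loss="binary_crossentropy",
    metrics=[tf.keras.metrics.BinaryAccuracy()],
    optimizer=tf.keras.optimizers.AdamW(
        learning_rate=1e-3,  # Cerebros passes self.learning_rate here
        weight_decay=0.004,  # decay applied directly to the weights each step
    ),
)

Unlike an L2 penalty under plain Adam, AdamW's decay is not rescaled by the adaptive learning rate, which generally regularizes more predictably.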
simple_cerebros_random_search.py
@@ -2,6 +2,7 @@
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
from cerebros.denseautomlstructuralcomponent.\
dense_automl_structural_component \
import DenseAutoMlStructuralComponent, DenseLateralConnectivity, \
@@ -519,7 +520,10 @@ def run_moity_permutations(self, spec, subtrial_number, lock):

def run_random_search(self):
processes = []
for i in np.arange(self.number_of_architecture_moities_to_try):
for i in tqdm(np.arange(self.number_of_architecture_moities_to_try),
desc="Global task progress",
colour="#16ceeb"):

self.parse_neural_network_structural_spec_random()
spec = self.get_neural_network_spec()

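A minimal sketch of the progress-bar pattern added above, with the loop body stubbed out (tqdm >= 4.46 supplies the colour keyword; the trial count here is a placeholder):

import numpy as np
from tqdm import tqdm

number_of_architecture_moities_to_try = 5  # placeholder
for i in tqdm(np.arange(number_of_architecture_moities_to_try),
              desc="Global task progress",
              colour="#16ceeb"):
    pass  # one architecture moity is searched per iteration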
210 changes: 170 additions & 40 deletions phishing_email_detection_gpt2.py
@@ -14,6 +14,7 @@
import tensorflow as tf
import tensorflow_text
from keras_nlp.models import GPT2Tokenizer, GPT2Preprocessor, GPT2Backbone
from keras_nlp.layers import PositionEmbedding
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tensorflow.keras.utils import to_categorical
@@ -29,6 +30,8 @@
from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component\
import zero_7_exp_decay, zero_95_exp_decay, simple_sigmoid
from ast import literal_eval
import time


#
# Load the email data
@@ -65,8 +68,15 @@
#
# Tensors for training data and labels
#
training_x = [tf.constant(X_train)]
train_labels = [tf.constant(y_train)]

# Training data for baseline model
baseline_train_x = tf.constant(X_train)
baseline_train_y = tf.constant(y_train, dtype=tf.int8)

# Packaged for Cerebros (multimodal, takes inputs as a list)
training_x = [baseline_train_x]
train_labels = [baseline_train_y]
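The extra list wrapping is deliberate: Cerebros treats inputs as multimodal, one list entry per modality. A toy sketch of the convention (names and data are illustrative, not the project's):

toy_x = tf.constant(["cheap meds now!!!", "minutes from today's standup"])
toy_y = tf.constant([1, 0], dtype=tf.int8)
toy_training_x = [toy_x]    # one entry per input modality
toy_train_labels = [toy_y]  # labels packaged the same way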

#
# Input and output shapes
#
@@ -75,6 +85,7 @@

"""### A custom GPT2 encoder layer for text embedding"""


class GPT2Layer(tf.keras.layers.Layer):

def __init__(self, max_seq_length, **kwargs):
@@ -90,9 +101,9 @@ def __init__(self, max_seq_length, **kwargs):
# Set whether the GPT2 model's layers are trainable
#self.encoder.trainable = False
for layer in self.encoder.layers:
layer.trainable = False
layer.trainable = True
#
self.encoder.layers[-2].trainable = True
# self.encoder.layers[-2].trainable = True
#
# Set the maximum sequence length for tokenization
self.max_seq_length = max_seq_length
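The flip above moves from partial to full fine-tuning of the GPT-2 encoder. A self-contained sketch of the freeze/unfreeze pattern, on a stand-in Keras model rather than the actual GPT2Backbone:

import tensorflow as tf

def set_backbone_trainable(backbone: tf.keras.Model, full_fine_tune: bool) -> None:
    # New behavior: every layer updates during training.
    for layer in backbone.layers:
        layer.trainable = full_fine_tune
    # Old behavior: freeze everything, then unfreeze one late layer.
    if not full_fine_tune and len(backbone.layers) >= 2:
        backbone.layers[-2].trainable = True

Full fine-tuning raises per-epoch compute and memory considerably, which is consistent with the workflow timeout increase from 120 to 420 minutes.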
@@ -121,30 +132,147 @@ def from_config(cls, config):
# GPT2 configurables
max_seq_length = 96

# Base model
# GPT Baseline Model
input_layer = Input(shape=(), dtype=tf.string)
gpt2_layer = GPT2Layer(max_seq_length)(input_layer)
#output = Flatten()(gpt2_layer)
base_model = Model(inputs=input_layer, outputs=gpt2_layer)
base_model.summary()
binary_output = tf.keras.layers.Dense(1, activation='sigmoid')(gpt2_layer)

gpt_baseline_model = Model(inputs=input_layer, outputs=binary_output)


gpt_baseline_model.compile(
optimizer=Adam(learning_rate=1e-4), # Small LR since we're fine-tuning GPT
loss='binary_crossentropy',
# metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
metrics=[tf.keras.metrics.BinaryAccuracy(),
tf.keras.metrics.Precision(),
tf.keras.metrics.Recall()]
)

gpt_t0 = time.time()

gpt_baseline_model.summary()

history = gpt_baseline_model.fit(
x=X_train, # Input data
y=y_train, # Labels
epochs=3, # Number of training iterations
batch_size=16, # Batch size small due to GPU memory constraints
validation_split=0.2, # Hold out 20% of training data for validation
shuffle=True, # Shuffle data at each epoch
callbacks=[
tf.keras.callbacks.EarlyStopping(
monitor='val_loss',
patience=3,
restore_best_weights=True,
min_delta=0.001
),
tf.keras.callbacks.ReduceLROnPlateau(
monitor='val_loss',
factor=0.2,
patience=2,
min_lr=1e-6
)
]
)

gpt_t1 = time.time()
gpt_time_on_one_model_min = (gpt_t1 - gpt_t0) / 60

hy_df = pd.DataFrame(history.history)
print(hy_df)


### Cerebros model:

# TokenizerLayer class to handle tokenization and return only token_ids
class TokenizerLayer(tf.keras.layers.Layer):

def __init__(self, max_seq_length, **kwargs):
super(TokenizerLayer, self).__init__(**kwargs)
self.tokenizer = GPT2Tokenizer.from_preset("gpt2_extra_large_en")
self.preprocessor = GPT2Preprocessor(self.tokenizer, sequence_length=max_seq_length)
self.max_seq_length = max_seq_length

def call(self, inputs):
prep = self.preprocessor([inputs])
return prep['token_ids']

def get_config(self):
config = super(TokenizerLayer, self).get_config()
config.update({'max_seq_length': self.max_seq_length})
return config

@classmethod
def from_config(cls, config):
return cls(max_seq_length=config['max_seq_length'])
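Hypothetical usage of the pieces inside TokenizerLayer (assumes the keras_nlp gpt2_extra_large_en preset can be downloaded; shapes follow the GPT2Preprocessor contract):

tokenizer = GPT2Tokenizer.from_preset("gpt2_extra_large_en")
preprocessor = GPT2Preprocessor(tokenizer, sequence_length=64)
features = preprocessor(tf.constant(["free crypto, click here"]))
token_ids = features["token_ids"]        # int tensor, shape (1, 64)
padding_mask = features["padding_mask"]  # True where a real token is present

The layer returns only token_ids so that a downstream Embedding layer can consume them directly.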

# GPT2 configurables

# Optimal for accuracy thus far:
# max_seq_length = 900
max_seq_length = 1024

inp = tf.keras.layers.Input(shape=(), dtype=tf.string)
gp2_tokenizer = TokenizerLayer(max_seq_length=max_seq_length)
VOCABULARY_SIZE = gp2_tokenizer.tokenizer.vocabulary_size()
tokens = gp2_tokenizer(inp)

# On larger hardware, this could probably be increased considerably and
# would likely improve performance ...
EMBEDDING_DIM = 23  # Must match the embedding layer's output_dim below.

embedded = tf.keras.layers.Embedding(
input_dim=VOCABULARY_SIZE,
output_dim=EMBEDDING_DIM,
input_length=max_seq_length,
mask_zero=True)(tokens)

position_embedding = PositionEmbedding(
sequence_length=max_seq_length,
initializer="uniform",
)(embedded)

# FYI: we tried an Add layer here, both with and without LayerNorm;
# it degraded accuracy. Noted to save anyone trying to apply the
# conventional wisdom the time ...
x = tf.keras.layers.Concatenate()([embedded, position_embedding])
x = tf.keras.layers.Dropout(0.4)(x) # AI suggested 0.4
flattened = tf.keras.layers.Flatten()(x)

cerebros_base_model = tf.keras.Model(
inputs=inp,
outputs=flattened # Output enhanced embeddings now
)
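Under the configuration above, the shapes work out as follows; the final lines are a hypothetical sanity check (running it requires the GPT-2 preset to be available):

# tokens:             (b, 1024)      int token ids
# embedded:           (b, 1024, 23)  token embeddings
# position_embedding: (b, 1024, 23)  learned position embeddings
# x (concatenated):   (b, 1024, 46)  channels stacked, not summed
# flattened:          (b, 47104)     1024 * 46 features handed to Cerebros
probe = cerebros_base_model(tf.constant(["hello world"]))
print(probe.shape)  # expected: (1, 47104)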


"""### Cerebros search for the best model"""

#
# Cerebros configurables
#
activation = 'gelu'
predecessor_level_connection_affinity_factor_first = 49.9999
predecessor_level_connection_affinity_factor_main = 0.31456
max_consecutive_lateral_connections = 22
p_lateral_connection = 0.39256
num_lateral_connection_tries_per_unit = 10
learning_rate = 0.0000511065
epochs = 6 # [1, 100]
batch_size = 13
maximum_levels = 4 # [3,7]
maximum_units_per_level = 8 # [2,10]
maximum_neurons_per_unit = 5 # [2,20]
activation = "relu"
predecessor_level_connection_affinity_factor_first = 10
predecessor_level_connection_affinity_factor_main = 40
max_consecutive_lateral_connections = 20
p_lateral_connection = 30
num_lateral_connection_tries_per_unit = 25
learning_rate = 3 * 10 ** -3
epochs = 15 # [1, 100]
batch_size = 17
minimum_levels = 2
maximum_levels = 2 # [3,7]

minimum_units_per_level = 4
maximum_units_per_level = 7

minimum_neurons_per_unit = 1
maximum_neurons_per_unit = 2

moities_to_try = 5
tries_per_moity = 1

#
# Logging
@@ -157,6 +285,7 @@ def from_config(cls, config):

meta_trial_number = 42 # irrelevant unless in distributed training


cerebros_automl = SimpleCerebrosRandomSearch(
unit_type=DenseUnit,
input_shapes=INPUT_SHAPES,
@@ -166,16 +295,16 @@ def from_config(cls, config):
validation_split=0.35,
direction='maximize',
metric_to_rank_by="val_binary_accuracy",
minimum_levels=2,
minimum_levels=minimum_levels,
maximum_levels=maximum_levels,
minimum_units_per_level=1,
minimum_units_per_level=minimum_units_per_level,
maximum_units_per_level=maximum_units_per_level,
minimum_neurons_per_unit=1,
minimum_neurons_per_unit=minimum_neurons_per_unit,
maximum_neurons_per_unit=maximum_neurons_per_unit,
activation=activation,
final_activation='sigmoid',
number_of_architecture_moities_to_try=2,
number_of_tries_per_architecture_moity=1,
number_of_architecture_moities_to_try=moities_to_try,
number_of_tries_per_architecture_moity=tries_per_moity,
minimum_skip_connection_depth=1,
maximum_skip_connection_depth=7,
predecessor_level_connection_affinity_factor_first=predecessor_level_connection_affinity_factor_first,
@@ -191,31 +320,32 @@ def from_config(cls, config):
p_lateral_connection_decay=zero_95_exp_decay,
num_lateral_connection_tries_per_unit=num_lateral_connection_tries_per_unit,
learning_rate=learning_rate,
loss=tf.keras.losses.CategoricalHinge(),
metrics=[tf.keras.metrics.BinaryAccuracy(),
tf.keras.metrics.Precision(),
tf.keras.metrics.Recall()],
loss=tf.keras.losses.BinaryCrossentropy(),
# loss=tf.keras.losses.CategoricalHinge(),
metrics=[tf.keras.metrics.BinaryAccuracy(),
tf.keras.metrics.Precision(),
tf.keras.metrics.Recall()],
epochs=epochs,
project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
model_graphs='model_graphs',
batch_size=batch_size,
meta_trial_number=meta_trial_number,
base_models=[base_model],
base_models=[cerebros_base_model],
train_data_dtype=tf.string)

cerebros_t0 = time.time()
result = cerebros_automl.run_random_search()
cerebros_t1 = time.time()
cerebros_time_all_models_min = (cerebros_t1 - cerebros_t0) / 60
models_tried = moities_to_try * tries_per_moity
cerebros_time_per_model = cerebros_time_all_models_min / models_tried

print(f'Best accuracy achieved is {result}')
print(f'binary accuracy')
print(f"Cerebros trained {models_tried} models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.")
print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model for 3 epochs. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.")

"""### Testing the best model found"""

#
# Load the best model (taking into account that it has a custom layer)
#
best_model_found =\
tf.keras.models.load_model(cerebros_automl.best_model_path,\
custom_objects={'GPT2Layer': GPT2Layer(max_seq_length)})
print(f'Cerebros best accuracy achieved is {result}')
print(f'val set accuracy')

# """### Testing the best model found"""

print('Evaluating on the test dataset')
best_model_found.evaluate(X_test, y_test)
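The explicit re-load of the best model was removed above, yet evaluate() still assumes a model object. In a fresh session, a hypothetical equivalent of the removed lines would register the new custom layer (TokenizerLayer rather than GPT2Layer):

best_model_found = tf.keras.models.load_model(
    cerebros_automl.best_model_path,
    custom_objects={'TokenizerLayer': TokenizerLayer})
best_model_found.evaluate(X_test, y_test)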
1 change: 1 addition & 0 deletions requirements.txt
@@ -8,3 +8,4 @@ pyvis==0.3.2
plotly==5.20.0
matplotlib==3.8.4
imageio==2.34.0
tqdm==4.67.1