@@ -63,7 +63,8 @@ def objective(trial: optuna.Trial) -> float:
6363 # Number of text samples (each of approximately max_seq_len tokens) to create
6464 # Raises RAM in a linear fashion
6565
66- SAMPLES_TO_CREATE = 20 # 681
66+ PHASE_I_A_SAMPLES_TO_CREATE = 20 # 681
67+ PHASE_I_B_SAMPLES_TO_CREATE = 50
6768
6869 # How many tokens to provide before expecting the next token to be predicted.
6970 # Half this = double RAM (inversely proportional to RAM requirement)
@@ -157,7 +158,7 @@ def objective(trial: optuna.Trial) -> float:
157158
158159 # Prepare a record of params:
159160 # Log sampled hyperparameters to MLflow
160- params = {"SAMPLES_TO_CREATE" : SAMPLES_TO_CREATE ,
161+ params = {"PHASE_I_A_SAMPLES_TO_CREATE" : PHASE_I_A_SAMPLES_TO_CREATE ,
161162 "PROMPT_LENGTH" :PROMPT_LENGTH ,
162163 "MAX_SEQ_LENGTH" :MAX_SEQ_LENGTH ,
163164 "POSITIONAL_EMBEDDING_DROPOUT" :POSITIONAL_EMBEDDING_DROPOUT ,
@@ -341,7 +342,8 @@ def package_non_instruct_text(text: str, desired_samples: int, max_length_tokens
341342
342343 # del(bible)
343344 # collect()
344- non_instruct_samples = bible [:SAMPLES_TO_CREATE ]
345+ non_instruct_samples = bible [:PHASE_I_A_SAMPLES_TO_CREATE ]
346+ phase_i_b_samples = bible [PHASE_I_A_SAMPLES_TO_CREATE :PHASE_I_B_SAMPLES_TO_CREATE + PHASE_I_A_SAMPLES_TO_CREATE ]
345347 print (f"Samples from KJV bible consisting of { len (non_instruct_samples )} look like this (sub-sample of 3): { non_instruct_samples [:3 ]} " )
346348
347349
@@ -722,9 +724,9 @@ def reset_state(self):
722724 train_data_dtype = tf .int32 ) # Changed from tf.string to tf.int32
723725
724726 cerebros_t0 = time .time ()
725- result = cerebros_automl .run_random_search ()
727+ phase_i_a_result_0 = cerebros_automl .run_random_search ()
726728 # Replace "inf" / "nan" with "worst result that can be numerically registered"
727- result = float (result ) # Deep copy that survives del() of parent object ...
729+ phase_i_a_result = float (phase_i_a_result_0 ) # Deep copy that survives del() of parent object ...
728730 cerebros_t1 = time .time ()
729731 cerebros_time_all_models_min = (cerebros_t1 - cerebros_t0 ) / 60
730732 models_tried = moities_to_try * tries_per_moity
@@ -740,9 +742,10 @@ def reset_state(self):
740742
741743 """
742744
743- print (f'Cerebros best accuracy achieved is { result } ' )
745+ print (f'Cerebros best accuracy achieved in Phase I-a is { phase_i_a_result } ' )
744746 print (f'val set perplexity' )
745-
747+ # Log the metric to MlFLow
748+ mlflow .log_metric ("phase-i-a-perplexity" , phase_i_a_result , step = trial .number )
746749 """### Testing the best model found"""
747750
748751 MODEL_FILE_NAME = "cerebros-foundation-model.keras"
@@ -1299,7 +1302,7 @@ def test_text(test_prompt: str, max_new_tokens: int, sample_number: int, result:
12991302
13001303 # print(f"PROMPT number {counter}: {half_sample}; RESPONSE: {full_generated_text}")
13011304
1302- mlflow . log_metric ( "phase-i-a-perplexity" , result , step = trial . number )
1305+
13031306 # del(best_model_found)
13041307 del (generator )
13051308 collect ()
@@ -1354,7 +1357,7 @@ def __next__(self):
13541357
13551358
13561359 # Create the tf.data.Dataset
1357- def create_dataset (raw_text_samples , tokenizer , sample_expansion_batch_size = 100 ) :
1357- def create_dataset (raw_text_samples , tokenizer , sample_expansion_batch_size = 100 ) :
1360+ def create_dataset (raw_text_samples , tokenizer , sample_expansion_batch_size = 10 ) -> tf . data . Dataset :
13581361 generator = SampleExpansionGenerator (raw_text_samples , tokenizer , sample_expansion_batch_size )
13591362
13601363 dataset = tf .data .Dataset .from_generator (
@@ -1366,13 +1369,21 @@ def create_dataset(raw_text_samples, tokenizer, sample_expansion_batch_size=100)
13661369 )
13671370 return dataset
13681371
1369-
1370-
1371-
1372+ phase_i_b_dataset = create_dataset (phase_i_b_samples , tokenizer , sample_expansion_batch_size = 10 )
13721373
1373- # To Do: Set .fit() params <------<<<
1374- # phase_i_b_history = best_model_found.fit()
1375-
1374+ # To Do: Set .fit() params <------<<<
1375+
1376+ phase_i_b_history = \
1377+ best_model_found .fit (
1378+ x = phase_i_b_dataset ,
1379+ # NOTE(review): batch_size= and validation_split= are rejected by Keras when x is a
1379+ # tf.data.Dataset — batch inside the dataset pipeline and pass validation_data= instead.
1380+ epochs = epochs )
1382+
1383+ phase_i_b_history = \
1384+ pd .DataFrame (phase_i_b_history .history )
1385+ # To Do: Find best metric: Reference: cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py: Line ~ 590
1386+ # result = phase_i_b_history.
13761387
13771388 return phase_i_a_result # FIXME: `result` was renamed above; swap in the best Phase I-b metric once extracted from phase_i_b_history
13781389
0 commit comments