
Commit 807c891

Update generative-proof-of-concept-CPU-preprocessing-in-memory.py
Save work
1 parent c49e8a9 commit 807c891

1 file changed: +26 −15 lines changed

generative-proof-of-concept-CPU-preprocessing-in-memory.py

Lines changed: 26 additions & 15 deletions
@@ -63,7 +63,8 @@ def objective(trial: optuna.Trial) -> float:
 # Number of text samples to create: # Number of text samples (of approximately max_seq_len) to create
 # Raises RAM in a linear fashion

-SAMPLES_TO_CREATE = 20 # 681
+PHASE_I_A_SAMPLES_TO_CREATE = 20 # 681
+PHASE_I_B_SAMPLES_TO_CREATE = 50

 # How many tokens to provide before expecting the next token to be predicted.
 # Half this = double RAM (inversely proportional to RAM requirement)
@@ -157,7 +158,7 @@ def objective(trial: optuna.Trial) -> float:

 # Prepare a record of params:
 # Log sampled hyperparameters to MLflow
-params = {"SAMPLES_TO_CREATE":SAMPLES_TO_CREATE,
+params = {"PHASE_I_A_SAMPLES_TO_CREATE":PHASE_I_A_SAMPLES_TO_CREATE,
           "PROMPT_LENGTH":PROMPT_LENGTH,
           "MAX_SEQ_LENGTH":MAX_SEQ_LENGTH,
           "POSITIONAL_EMBEDDING_DROPOUT":POSITIONAL_EMBEDDING_DROPOUT,
@@ -341,7 +342,8 @@ def package_non_instruct_text(text: str, desired_samples: int, max_length_tokens

 # del(bible)
 # collect()
-non_instruct_samples = bible[:SAMPLES_TO_CREATE]
+non_instruct_samples = bible[:PHASE_I_A_SAMPLES_TO_CREATE]
+phase_i_b_samples = bible[PHASE_I_A_SAMPLES_TO_CREATE:PHASE_I_B_SAMPLES_TO_CREATE + PHASE_I_A_SAMPLES_TO_CREATE]
 print(f"Samples from KJV bible consisting of {len(non_instruct_samples)} look like this (sub-sample of 3): {non_instruct_samples[:3]}")

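The two slices are contiguous and non-overlapping: Phase I-a takes the first PHASE_I_A_SAMPLES_TO_CREATE samples and Phase I-b takes the next PHASE_I_B_SAMPLES_TO_CREATE. A toy illustration with stand-in values:

    # Toy stand-ins for the constants and the corpus
    PHASE_I_A_SAMPLES_TO_CREATE = 3
    PHASE_I_B_SAMPLES_TO_CREATE = 4
    bible = list(range(10))  # pretend each int is one text sample

    phase_i_a = bible[:PHASE_I_A_SAMPLES_TO_CREATE]
    phase_i_b = bible[PHASE_I_A_SAMPLES_TO_CREATE:
                      PHASE_I_B_SAMPLES_TO_CREATE + PHASE_I_A_SAMPLES_TO_CREATE]

    print(phase_i_a)  # [0, 1, 2]
    print(phase_i_b)  # [3, 4, 5, 6] -- starts where Phase I-a ends, no overlap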
@@ -722,9 +724,9 @@ def reset_state(self):
     train_data_dtype=tf.int32) # Changed from tf.string to tf.int32

 cerebros_t0 = time.time()
-result = cerebros_automl.run_random_search()
+phase_i_a_result_0 = cerebros_automl.run_random_search()
 # Replace "inf" / "nan" with "worst result that can be numerically registered"
-result = float(result) # Deep copy that survives del() of parent object ...
+phase_i_a_result = float(phase_i_a_result_0) # Deep copy that survives del() of parent object ...
 cerebros_t1 = time.time()
 cerebros_time_all_models_min = (cerebros_t1 - cerebros_t0) / 60
 models_tried = moities_to_try * tries_per_moity
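The comment about replacing "inf" / "nan" describes clamping a non-finite metric to the worst value that can be numerically registered, but the clamp itself is not shown in this hunk. A minimal sketch of one way to do it (the helper name is hypothetical; for a lower-is-better metric like perplexity, "worst" is the largest representable float):

    import math
    import sys

    def clamp_non_finite(value: float, worst: float = sys.float_info.max) -> float:
        """Replace inf/nan with the worst numerically representable result."""
        value = float(value)
        return value if math.isfinite(value) else worst

    phase_i_a_result = clamp_non_finite(float("nan"))  # -> 1.7976931348623157e+308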
@@ -740,9 +742,10 @@ def reset_state(self):

 """

-print(f'Cerebros best accuracy achieved is {result}')
+print(f'Cerebros best accuracy achieved in Phase I-a is {phase_i_a_result}')
 print(f'val set perplexity')
-
+# Log the metric to MLflow
+mlflow.log_metric("phase-i-a-perplexity", phase_i_a_result, step=trial.number)
 """### Testing the best model found"""

 MODEL_FILE_NAME = "cerebros-foundation-model.keras"
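mlflow.log_metric takes a key, a float value, and an optional step; using step=trial.number records one point per Optuna trial on the same metric series. A minimal sketch, not part of this commit, of that wiring in isolation (the value is a stand-in):

    import mlflow
    import optuna

    def objective(trial: optuna.Trial) -> float:
        phase_i_a_result = 42.0  # stand-in for the real Phase I-a perplexity
        # One point per trial on the "phase-i-a-perplexity" curve
        mlflow.log_metric("phase-i-a-perplexity", phase_i_a_result, step=trial.number)
        return phase_i_a_result

    with mlflow.start_run():
        optuna.create_study(direction="minimize").optimize(objective, n_trials=3)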
@@ -1299,7 +1302,7 @@ def test_text(test_prompt: str, max_new_tokens: int, sample_number: int, result:

 # print(f"PROMPT number {counter}: {half_sample}; RESPONSE: {full_generated_text}")

-mlflow.log_metric("phase-i-a-perplexity", result, step=trial.number)
+
 # del(best_model_found)
 del(generator)
 collect()
@@ -1354,7 +1357,7 @@ def __next__(self):


 # Create the tf.data.Dataset
-def create_dataset(raw_text_samples, tokenizer, sample_expansion_batch_size=100):
+def create_dataset(raw_text_samples, tokenizer, sample_expansion_batch_size=10) -> tf.data.Dataset:
     generator = SampleExpansionGenerator(raw_text_samples, tokenizer, sample_expansion_batch_size)

     dataset = tf.data.Dataset.from_generator(
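The from_generator call is truncated in this hunk. A generator-backed tf.data.Dataset needs an output_signature so TensorFlow knows shapes and dtypes up front. A minimal self-contained sketch, where the shapes and the (input, target) structure are assumptions rather than the file's actual spec:

    import tensorflow as tf

    def toy_generator():
        # Yields (input_ids, next_token_id) pairs, already tokenized
        for i in range(5):
            yield tf.fill([8], i), tf.constant(i, dtype=tf.int32)

    dataset = tf.data.Dataset.from_generator(
        toy_generator,
        output_signature=(
            tf.TensorSpec(shape=(8,), dtype=tf.int32),
            tf.TensorSpec(shape=(), dtype=tf.int32),
        ),
    ).batch(2)

    for x, y in dataset.take(1):
        print(x.shape, y.shape)  # (2, 8) (2,)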
@@ -1366,13 +1369,21 @@ def create_dataset(raw_text_samples, tokenizer, sample_expansion_batch_size=100)
     )
     return dataset

-
-
-
+phase_i_b_dataset = create_dataset(raw_text_samples=phase_i_b_samples, tokenizer=tokenizer, sample_expansion_batch_size=10)

-# To Do: Set .fit() params <------<<<
-# phase_i_b_history = best_model_found.fit()
-
+# To Do: Set .fit() params <------<<<
+
+phase_i_b_history =\
+    best_model_found.fit(
+        x=phase_i_b_dataset,
+        # batch_size / validation_split are not valid with a tf.data.Dataset input
+        epochs=epochs)
+
+phase_i_b_history =\
+    pd.DataFrame(phase_i_b_history.history)
+# To Do: Find best metric: Reference: cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py: Line ~ 590
+# result = phase_i_b_history.
+result = phase_i_a_result  # Interim: return the Phase I-a metric until the Phase I-b metric is wired in

 return result

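For the "Find best metric" To Do: Keras History.history is a dict of per-epoch lists, so after wrapping it in a DataFrame the best value is just a column min (or max). A minimal sketch; the "loss" column name is an assumption, and the real code should use whatever metric Phase I-b is compiled with:

    import pandas as pd

    # Stand-in for pd.DataFrame(phase_i_b_history.history)
    phase_i_b_history = pd.DataFrame({"loss": [2.31, 1.87, 1.92]})

    # Lower is better for loss/perplexity, so take the column minimum
    result = phase_i_b_history["loss"].min()
    print(result)  # 1.87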