
Commit 995ae94

Update generative-proof-of-concept-CPU-preprocessing-in-memory.py
Some code cleanup.
1 parent d6fb1c0 commit 995ae94

File tree: 1 file changed (+20 −87 lines)

generative-proof-of-concept-CPU-preprocessing-in-memory.py

Lines changed: 20 additions & 87 deletions
@@ -641,7 +641,6 @@ def objective(trial: optuna.Trial) -> float:
 
 cerebros_t0 = time.time()
 phase_i_a_result_0 = cerebros_automl.run_random_search()
-# Replace "inf" / "nan" with "worst result that can be bumerically registered"
 phase_i_a_result = float(phase_i_a_result_0) # Deep copy that survives del() of parent object ...
 cerebros_t1 = time.time()
 cerebros_time_all_models_min = (cerebros_t1 - cerebros_t0) / 60
@@ -650,19 +649,12 @@ def objective(trial: optuna.Trial) -> float:
 
 
 
-print(f"Cerebros trained {models_tried} models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.")
-""" ADD BACK
-
-
-print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model for 3 epochs. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.")
-
-"""
-
-print(f'Cerebros best accuracy achieved in Phase I-a is {phase_i_a_result}')
-print(f'val set perplexity')
+print(f"Cerebros trained {models_tried} models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.")
+print(f'Cerebros best perplexity achieved in Phase I-a is {phase_i_a_result}')
 # Log the metric to MlFLow
 mlflow.log_metric("phase-i-a-perplexity", phase_i_a_result, step=trial.number)
-"""### Testing the best model found"""
+
+"""### Testing the best model found"""
 
 MODEL_FILE_NAME = "cerebros-foundation-model.keras"
 
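The surviving mlflow.log_metric call above records each trial's Phase I-a perplexity, keyed by the Optuna trial number. A minimal sketch of that logging pattern in isolation (the metric value here is a placeholder; in the script it comes from the random search result computed earlier):

    import mlflow
    import optuna

    def objective(trial: optuna.Trial) -> float:
        # Placeholder value; the real script uses the best Phase I-a validation perplexity.
        phase_i_a_result = 123.4
        # One point per trial, indexed by trial number so MLflow can plot the series.
        mlflow.log_metric("phase-i-a-perplexity", phase_i_a_result, step=trial.number)
        return phase_i_a_result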
@@ -683,12 +675,6 @@ def objective(trial: optuna.Trial) -> float:
 print("GENERATED TEXT SAMPLES")
 print("="*50)
 
-
-
-
-
-
-# Replace the generation code block with this:
 
 # Create config and generator
 config = CerebrosNotGPTConfig(
@@ -746,12 +732,7 @@ def complete_text_beam(text: str,
 generated_text =\
 tokenizer.decode(generated_tokens).replace(text, "")
 return generated_text
-
-# test_text_block = "I saw the sun and it was as shining on the"
-# response = complete_text_greedy(test_text_block)
-# print(f"I ask the generator (greedy): {test_text_block}... It responds: '{response}'.")
-# response = complete_text_beam(test_text_block)
-# print(f"I ask the generator (Beam defaults - max_new_tokens: 10, temperature: 0.75, top_k: 75, top_p: 0.98, repetition_penalty: None, presence_penalty: 1.3, frequency_penalty: 1.4): {test_text_block}... It responds: '{response}'.")
+
 
 trial_number = int(trial.number)
 def test_text(test_prompt: str, max_new_tokens: int, sample_number: int, result_cutoff: float, trial_id: int, test_sample_number: int, result_0: float) -> None:
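The end of complete_text_beam shown above decodes the generated tokens and then strips the echoed prompt from the result. A minimal standalone sketch of that post-processing step, assuming a Hugging Face-style tokenizer with a decode() method (the helper name strip_prompt is hypothetical):

    def strip_prompt(tokenizer, generated_tokens, prompt_text: str) -> str:
        # Decode the full token sequence, then drop the echoed prompt so that
        # only the newly generated continuation is returned.
        full_text = tokenizer.decode(generated_tokens)
        return full_text.replace(prompt_text, "")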
@@ -889,16 +870,10 @@ def test_text(test_prompt: str, max_new_tokens: int, sample_number: int, result_
 presence_penalty=perm_0['presence_penalty'],
 frequency_penalty=perm_0['frequency_penalty'])
 print(f"Trial #: {trial_id} Text Sample #: {test_sample_number} Perplexity: {result_0} GENERATE PARAMS: max_new_tokens={perm_0['max_new_tokens']} temperature={perm_0['temperature']}, top_k={perm_0['top_k']}, top_p={perm_0['top_p']}, repetition_penalty={perm_0['repetition_penalty']} presence_penalty={perm_0['presence_penalty']} frequency_penalty{perm_0['frequency_penalty']} PROMPT: '{test_prompt}' RESPONSE: '{response_0}'")
-#
-# print(f"Sample {sample_number}: I ask the generator (Beam: - max_new_tokens: 10, temperature=0.6, top_k=75, top_p=0.98, repetition_penalty=None, presence_penalty = 1.3, frequency_penalty = 1.4): {test_prompt}... It responds: '{response_3}'.")
-# response_4 = complete_text_beam(text=test_prompt, max_new_tokens=max_new_tokens, temperature=0.7, top_k=75, top_p=0.98, repetition_penalty=None, presence_penalty = 1.3, frequency_penalty = 1.4)
-# print(f"Sample {sample_number}: I ask the generator (Beam: - max_new_tokens: 10, temperature=0.7, top_k=75, top_p=0.98, repetition_penalty=None, presence_penalty = 1.3, frequency_penalty = 1.4): {test_prompt}... It responds: '{response_4}'.")
-# response_5 = complete_text_beam(text=test_prompt, max_new_tokens=max_new_tokens, temperature=0.7, top_k=75, top_p=0.97, repetition_penalty=None, presence_penalty = 1.3, frequency_penalty = 1.4)
-# print(f"Sample {sample_number}: I ask the generator (Beam: - max_new_tokens: 10, temperature=0.7, top_k=75, top_p=0.97, repetition_penalty=None, presence_penalty = 1.3, frequency_penalty = 1.4): {test_prompt}... It responds: '{response_5}'.")
 
 # Sample prompts to test:
 
-print("########### Phase I-a Model Checkpoint Generation Samples: ")
+print("########### Phase I-a Model Checkpoint Generation Samples: ###########")
 
 prompt_samples = [
 "I saw the sun and it was as shining on the",
@@ -929,47 +904,6 @@ def test_text(test_prompt: str, max_new_tokens: int, sample_number: int, result_
 result_0=phase_i_a_result)
 counter += 1
 
-# # Tokenize the text without padding first to get actual tokens
-# sample_tokenized = tokenizer(
-# sample,
-# add_special_tokens=False
-# )['input_ids']
-# start_generate_index = int(np.ceil(len(sample_tokenized) * 0.5))
-# half_sample_tokenized = sample_tokenized[:start_generate_index]
-
-# # Convert to Python list of integers
-# if hasattr(half_sample_tokenized, 'numpy'):
-# token_ids = half_sample_tokenized.numpy().tolist()
-# else:
-# token_ids = [int(token_id) for token_id in half_sample_tokenized]
-
-# print(f"Actual token count: {len(token_ids)}")
-# print(f"First 10 tokens: {token_ids[:10]}")
-
-# # Now pass the list of integers to your generate method
-# generated_tokens = generator.generate(
-# token_ids=token_ids, # Just the actual tokens, no padding
-# do_sample=True,
-# max_new_tokens=20,
-# temperature=0.73,
-# # One set of recommendations
-# top_k=75,
-# top_p=0.97,
-# # Previous semi-working values
-# # top_k=40,
-# # top_p=0.985,
-# # repetition_penalty=1.2,
-# presence_penalty=1.2,
-# frequency_penalty=1.4
-# )
-
-# # Decode the result
-# half_sample = tokenizer.decode(half_sample_tokenized)
-# full_generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=False)\
-# .replace(half_sample, "")
-
-# print(f"PROMPT number {counter}: {half_sample}; RESPONSE: {full_generated_text}")
-
 
 # del(best_model_found)
 # del(generator)
@@ -980,11 +914,19 @@ def test_text(test_prompt: str, max_new_tokens: int, sample_number: int, result_
 
 # Create the Dataset Generaror:
 class SampleExpansionGenerator:
-def __init__(self, raw_text_samples, tokenizer, sample_expansion_batch_size=50, model_batch_size=10, prompt_length_0=PROMPT_LENGTH, max_seq_length=MAX_SEQ_LENGTH, vocabulary_size=VOCABULARY_SIZE):
+def __init__(self,
+raw_text_samples,
+tokenizer,
+sample_expansion_batch_size=50,
+model_batch_size=10,
+prompt_length_0=PROMPT_LENGTH,
+max_seq_length=MAX_SEQ_LENGTH,
+vocabulary_size=VOCABULARY_SIZE):
+
 self.raw_text_samples = raw_text_samples
 self.tokenizer = tokenizer
 self.sample_expansion_batch_size = sample_expansion_batch_size
-self.model_batch_size = model_batch_size # Add this parameter
+self.model_batch_size = model_batch_size
 self.prompt_length_0 = prompt_length_0
 self.max_seq_length = max_seq_length
 self.vocabulary_size = vocabulary_size
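This hunk only reflows the constructor signature to one parameter per line and drops a stale comment; behavior is unchanged. For illustration, instantiating the class would look roughly like this (a sketch: the values shown are the signature defaults, and raw_text_samples, tokenizer, PROMPT_LENGTH, MAX_SEQ_LENGTH, and VOCABULARY_SIZE are assumed to be defined earlier in the script):

    # Hypothetical usage of the reformatted constructor; argument values are illustrative.
    sample_generator = SampleExpansionGenerator(
        raw_text_samples=raw_text_samples,
        tokenizer=tokenizer,
        sample_expansion_batch_size=50,
        model_batch_size=10,
        prompt_length_0=PROMPT_LENGTH,
        max_seq_length=MAX_SEQ_LENGTH,
        vocabulary_size=VOCABULARY_SIZE,
    )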
@@ -1088,7 +1030,7 @@ def create_dataset(raw_text_samples, tokenizer, sample_expansion_batch_size=50,
 result_phase_i_b = float(phase_i_b_history['perplexity'].min())
 mlflow.log_metric("phase_i_b-perplexity", result_phase_i_b, step=trial_number)
 
-print("########### Phase I-b Model Checkpoint Generation Samples: ")
+print("########### Phase I-b Model Checkpoint Generation Samples: ###########")
 
 # Text samples after Phase I-b training
 counter = 0
@@ -1102,25 +1044,16 @@ def create_dataset(raw_text_samples, tokenizer, sample_expansion_batch_size=50,
 test_sample_number=counter,
 result_0=result_phase_i_b)
 counter += 1
-
+
+# Return the final result to Optuna
 return result_phase_i_b
 
 
 def main():
-# Optional fast path for CI / smoke tests
-# fast = os.getenv("CEREBROS_FAST", "0") == "1"
-# n_trials = int(os.getenv("CEREBROS_N_TRIALS", "3" if fast else "20"))
 n_trials = N_TRIALS
-# mlflow_parent = mlflow.start_run(run_name=os.getenv("MLFLOW_PARENT_RUN_NAME", "cerebros_poc_parent"), tags={"phase": "poc", "mode": "fast" if fast else "full"})
 sampler = optuna.samplers.TPESampler(multivariate=True, n_startup_trials=5)
 study = optuna.create_study(direction="minimize", sampler=sampler, storage=optuna_storage)
-study.optimize(objective, n_trials=n_trials)
-# mlflow.log_param("n_trials", n_trials)
-# Log fixed (non-tunable) generation control param once at parent level
-# mlflow.log_param("PROMPT_LEN", PROMPT_LEN)
-# mlflow.log_metric("best_value", study.best_trial.value)
-# Log best params as params (flat)
-
+study.optimize(objective, n_trials=N_TRIALS)
 print('Best trial:')
 best_trial = study.best_trial
 print(' Value: ', best_trial.value)
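After the cleanup, main() reduces to the standard Optuna pattern shown above: a multivariate TPE sampler, a minimizing study, and a single optimize() call over N_TRIALS. A self-contained toy sketch of that pattern (the quadratic objective is a stand-in for the real one, which returns the Phase I-b validation perplexity; the storage argument is omitted here):

    import optuna

    def objective(trial: optuna.Trial) -> float:
        # Stand-in objective; the real script trains a model and returns its perplexity.
        x = trial.suggest_float("x", -10.0, 10.0)
        return (x - 2.0) ** 2

    # Multivariate TPE models interactions between the tuned parameters jointly.
    sampler = optuna.samplers.TPESampler(multivariate=True, n_startup_trials=5)
    study = optuna.create_study(direction="minimize", sampler=sampler)
    study.optimize(objective, n_trials=20)
    print("Best trial value:", study.best_trial.value)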
