@@ -641,7 +641,6 @@ def objective(trial: optuna.Trial) -> float:

    cerebros_t0 = time.time()
    phase_i_a_result_0 = cerebros_automl.run_random_search()
-    # Replace "inf" / "nan" with "worst result that can be bumerically registered"
    phase_i_a_result = float(phase_i_a_result_0)  # Deep copy that survives del() of parent object ...
    cerebros_t1 = time.time()
    cerebros_time_all_models_min = (cerebros_t1 - cerebros_t0) / 60
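Note: cerebros_time_per_model, used in the print statement in the next hunk, is presumably derived from these timings in the elided lines; a minimal sketch of that assumed calculation:

    # Assumption: average wall-clock minutes per candidate model tried by the random search
    cerebros_time_per_model = cerebros_time_all_models_min / models_tried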
@@ -650,19 +649,12 @@ def objective(trial: optuna.Trial) -> float:



-    print(f"Cerebros trained {models_tried} models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.")
-    """ ADD BACK
-
-
-    print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model for 3 epochs. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.")
-
-    """
-
-    print(f'Cerebros best accuracy achieved in Phase I-a is {phase_i_a_result}')
-    print(f'val set perplexity')
+    print(f"Cerebros trained {models_tried} models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.")
+    print(f'Cerebros best perplexity achieved in Phase I-a is {phase_i_a_result}')
    # Log the metric to MLflow
    mlflow.log_metric("phase-i-a-perplexity", phase_i_a_result, step=trial.number)
-    """### Testing the best model found"""
+
+    """### Testing the best model found"""

    MODEL_FILE_NAME = "cerebros-foundation-model.keras"

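The elided lines presumably save and then reload the best Phase I-a network under this file name; a minimal sketch of the assumed reload step, using standard Keras serialization (the actual script may handle custom Cerebros layers differently):

    import tensorflow as tf

    # Assumed checkpoint reload; compile=False because the model is only used for generation/evaluation here
    best_model_found = tf.keras.models.load_model(MODEL_FILE_NAME, compile=False)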
@@ -683,12 +675,6 @@ def objective(trial: optuna.Trial) -> float:
    print("GENERATED TEXT SAMPLES")
    print("=" * 50)

-
-
-
-
-
-    # Replace the generation code block with this:

    # Create config and generator
    config = CerebrosNotGPTConfig(
@@ -746,12 +732,7 @@ def complete_text_beam(text: str,
        generated_text = \
            tokenizer.decode(generated_tokens).replace(text, "")
        return generated_text
-
-    # test_text_block = "I saw the sun and it was as shining on the"
-    # response = complete_text_greedy(test_text_block)
-    # print(f"I ask the generator (greedy): {test_text_block}... It responds: '{response}'.")
-    # response = complete_text_beam(test_text_block)
-    # print(f"I ask the generator (Beam defaults - max_new_tokens: 10, temperature: 0.75, top_k: 75, top_p: 0.98, repetition_penalty: None, presence_penalty: 1.3, frequency_penalty: 1.4): {test_text_block}... It responds: '{response}'.")
+

    trial_number = int(trial.number)
    def test_text(test_prompt: str, max_new_tokens: int, sample_number: int, result_cutoff: float, trial_id: int, test_sample_number: int, result_0: float) -> None:
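Judging from the fragment in the next hunk, test_text samples a permutation of generation parameters (perm_0), calls complete_text_beam with it, and prints the prompt, the response, and the trial's perplexity. A minimal sketch of the assumed call it wraps (names follow the visible fragment; the real body is elided):

    # Assumed shape of the call inside test_text; perm_0 and response_0 appear in the fragment below
    response_0 = complete_text_beam(text=test_prompt,
                                    max_new_tokens=perm_0['max_new_tokens'],
                                    temperature=perm_0['temperature'],
                                    top_k=perm_0['top_k'],
                                    top_p=perm_0['top_p'],
                                    repetition_penalty=perm_0['repetition_penalty'],
                                    presence_penalty=perm_0['presence_penalty'],
                                    frequency_penalty=perm_0['frequency_penalty'])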
@@ -889,16 +870,10 @@ def test_text(test_prompt: str, max_new_tokens: int, sample_number: int, result_
            presence_penalty=perm_0['presence_penalty'],
            frequency_penalty=perm_0['frequency_penalty'])
        print(f"Trial #: {trial_id} Text Sample #: {test_sample_number} Perplexity: {result_0} GENERATE PARAMS: max_new_tokens={perm_0['max_new_tokens']} temperature={perm_0['temperature']}, top_k={perm_0['top_k']}, top_p={perm_0['top_p']}, repetition_penalty={perm_0['repetition_penalty']} presence_penalty={perm_0['presence_penalty']} frequency_penalty={perm_0['frequency_penalty']} PROMPT: '{test_prompt}' RESPONSE: '{response_0}'")
-        #
-        # print(f"Sample {sample_number}: I ask the generator (Beam: - max_new_tokens: 10, temperature=0.6, top_k=75, top_p=0.98, repetition_penalty=None, presence_penalty = 1.3, frequency_penalty = 1.4): {test_prompt}... It responds: '{response_3}'.")
-        # response_4 = complete_text_beam(text=test_prompt, max_new_tokens=max_new_tokens, temperature=0.7, top_k=75, top_p=0.98, repetition_penalty=None, presence_penalty = 1.3, frequency_penalty = 1.4)
-        # print(f"Sample {sample_number}: I ask the generator (Beam: - max_new_tokens: 10, temperature=0.7, top_k=75, top_p=0.98, repetition_penalty=None, presence_penalty = 1.3, frequency_penalty = 1.4): {test_prompt}... It responds: '{response_4}'.")
-        # response_5 = complete_text_beam(text=test_prompt, max_new_tokens=max_new_tokens, temperature=0.7, top_k=75, top_p=0.97, repetition_penalty=None, presence_penalty = 1.3, frequency_penalty = 1.4)
-        # print(f"Sample {sample_number}: I ask the generator (Beam: - max_new_tokens: 10, temperature=0.7, top_k=75, top_p=0.97, repetition_penalty=None, presence_penalty = 1.3, frequency_penalty = 1.4): {test_prompt}... It responds: '{response_5}'.")

    # Sample prompts to test:

-    print("########### Phase I-a Model Checkpoint Generation Samples: ")
+    print("########### Phase I-a Model Checkpoint Generation Samples: ###########")

    prompt_samples = [
        "I saw the sun and it was as shining on the",
@@ -929,47 +904,6 @@ def test_text(test_prompt: str, max_new_tokens: int, sample_number: int, result_
                  result_0=phase_i_a_result)
        counter += 1

-        # # Tokenize the text without padding first to get actual tokens
-        # sample_tokenized = tokenizer(
-        #     sample,
-        #     add_special_tokens=False
-        # )['input_ids']
-        # start_generate_index = int(np.ceil(len(sample_tokenized) * 0.5))
-        # half_sample_tokenized = sample_tokenized[:start_generate_index]
-
-        # # Convert to Python list of integers
-        # if hasattr(half_sample_tokenized, 'numpy'):
-        #     token_ids = half_sample_tokenized.numpy().tolist()
-        # else:
-        #     token_ids = [int(token_id) for token_id in half_sample_tokenized]
-
-        # print(f"Actual token count: {len(token_ids)}")
-        # print(f"First 10 tokens: {token_ids[:10]}")
-
-        # # Now pass the list of integers to your generate method
-        # generated_tokens = generator.generate(
-        #     token_ids=token_ids,  # Just the actual tokens, no padding
-        #     do_sample=True,
-        #     max_new_tokens=20,
-        #     temperature=0.73,
-        #     # One set of recommendations
-        #     top_k=75,
-        #     top_p=0.97,
-        #     # Previous semi-working values
-        #     # top_k=40,
-        #     # top_p=0.985,
-        #     # repetition_penalty=1.2,
-        #     presence_penalty=1.2,
-        #     frequency_penalty=1.4
-        # )
-
-        # # Decode the result
-        # half_sample = tokenizer.decode(half_sample_tokenized)
-        # full_generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=False)\
-        #     .replace(half_sample, "")
-
-        # print(f"PROMPT number {counter}: {half_sample}; RESPONSE: {full_generated_text}")
-

    # del(best_model_found)
    # del(generator)
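The commented-out del calls above hint at freeing the trained model between Optuna trials; a minimal sketch of one way to release Keras/TensorFlow memory at the end of a trial (an assumption about intent, not necessarily what the script does):

    import gc
    import tensorflow as tf

    # Assumed cleanup between trials: drop references, clear the Keras graph, force garbage collection
    del best_model_found, generator
    tf.keras.backend.clear_session()
    gc.collect()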
@@ -980,11 +914,19 @@ def test_text(test_prompt: str, max_new_tokens: int, sample_number: int, result_

    # Create the Dataset Generator:
    class SampleExpansionGenerator:
-        def __init__(self, raw_text_samples, tokenizer, sample_expansion_batch_size=50, model_batch_size=10, prompt_length_0=PROMPT_LENGTH, max_seq_length=MAX_SEQ_LENGTH, vocabulary_size=VOCABULARY_SIZE):
+        def __init__(self,
+                     raw_text_samples,
+                     tokenizer,
+                     sample_expansion_batch_size=50,
+                     model_batch_size=10,
+                     prompt_length_0=PROMPT_LENGTH,
+                     max_seq_length=MAX_SEQ_LENGTH,
+                     vocabulary_size=VOCABULARY_SIZE):
+
            self.raw_text_samples = raw_text_samples
            self.tokenizer = tokenizer
            self.sample_expansion_batch_size = sample_expansion_batch_size
-            self.model_batch_size = model_batch_size  # Add this parameter
+            self.model_batch_size = model_batch_size
            self.prompt_length_0 = prompt_length_0
            self.max_seq_length = max_seq_length
            self.vocabulary_size = vocabulary_size
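For orientation, a generator class like this is typically consumed through tf.data; a hypothetical sketch of how it could be wired in, assuming it is iterable and yields already-batched (inputs, labels) pairs shaped by MAX_SEQ_LENGTH — the real create_dataset() in the elided lines may differ:

    # Hypothetical wiring; the output_signature shapes and dtypes are assumptions
    gen = SampleExpansionGenerator(raw_text_samples, tokenizer)
    dataset = tf.data.Dataset.from_generator(
        lambda: gen,  # assumes the class implements __iter__
        output_signature=(
            tf.TensorSpec(shape=(None, MAX_SEQ_LENGTH), dtype=tf.int32),  # assumed model inputs
            tf.TensorSpec(shape=(None, MAX_SEQ_LENGTH), dtype=tf.int32),  # assumed labels
        ),
    )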
@@ -1088,7 +1030,7 @@ def create_dataset(raw_text_samples, tokenizer, sample_expansion_batch_size=50,
    result_phase_i_b = float(phase_i_b_history['perplexity'].min())
    mlflow.log_metric("phase_i_b-perplexity", result_phase_i_b, step=trial_number)

-    print("########### Phase I-b Model Checkpoint Generation Samples: ")
+    print("########### Phase I-b Model Checkpoint Generation Samples: ###########")

    # Text samples after Phase I-b training
    counter = 0
@@ -1102,25 +1044,16 @@ def create_dataset(raw_text_samples, tokenizer, sample_expansion_batch_size=50,
                  test_sample_number=counter,
                  result_0=result_phase_i_b)
        counter += 1
-
+
+    # Return the final result to Optuna
    return result_phase_i_b


def main():
-    # Optional fast path for CI / smoke tests
-    # fast = os.getenv("CEREBROS_FAST", "0") == "1"
-    # n_trials = int(os.getenv("CEREBROS_N_TRIALS", "3" if fast else "20"))
    n_trials = N_TRIALS
-    # mlflow_parent = mlflow.start_run(run_name=os.getenv("MLFLOW_PARENT_RUN_NAME", "cerebros_poc_parent"), tags={"phase": "poc", "mode": "fast" if fast else "full"})
    sampler = optuna.samplers.TPESampler(multivariate=True, n_startup_trials=5)
    study = optuna.create_study(direction="minimize", sampler=sampler, storage=optuna_storage)
-    study.optimize(objective, n_trials=n_trials)
-    # mlflow.log_param("n_trials", n_trials)
-    # Log fixed (non-tunable) generation control param once at parent level
-    # mlflow.log_param("PROMPT_LEN", PROMPT_LEN)
-    # mlflow.log_metric("best_value", study.best_trial.value)
-    # Log best params as params (flat)
-
+    study.optimize(objective, n_trials=N_TRIALS)
    print('Best trial:')
    best_trial = study.best_trial
    print(' Value: ', best_trial.value)
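For readers who want to reproduce the driver outside this script, a minimal self-contained sketch of the same Optuna pattern used in main(); the objective body here is a stand-in for the Cerebros pipeline, and the storage comment shows a placeholder rather than the project's optuna_storage:

    import optuna

    def objective(trial: optuna.Trial) -> float:
        # Stand-in for the training/evaluation pipeline; returns the value to minimize (perplexity in the real script)
        x = trial.suggest_float("x", 0.0, 10.0)
        return (x - 3.0) ** 2

    sampler = optuna.samplers.TPESampler(multivariate=True, n_startup_trials=5)
    study = optuna.create_study(direction="minimize", sampler=sampler)  # add storage="sqlite:///optuna.db" to persist trials
    study.optimize(objective, n_trials=20)
    print("Best value:", study.best_trial.value)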