@@ -68,9 +68,10 @@ def objective(trial: optuna.Trial) -> float:
6868
6969 # Number of text samples (of approximately max_seq_len) to create.
7070 # RAM usage grows linearly with this value.
71-
72- PHASE_I_A_SAMPLES_TO_CREATE = 20 # 681
73- PHASE_I_B_SAMPLES_TO_CREATE = 50
71+
72+ PHASE_I_A_SAMPLES_TO_CREATE = 10 # 681
73+ PHASE_I_B_SAMPLES_TO_CREATE = 20
74+ PHASE_I_B_VAL_SPLIT = 0.15 # Validation split for phase I-b (0.0 to 1.0)
7475
7576 # How many tokens to provide before expecting the next token to be predicted.
7677 # Halving this doubles RAM usage (the RAM requirement is inversely proportional to this value).
@@ -357,6 +358,12 @@ def package_non_instruct_text(text: str, desired_samples: int, max_length_tokens
357358 phase_i_b_samples = bible [PHASE_I_A_SAMPLES_TO_CREATE :PHASE_I_B_SAMPLES_TO_CREATE + PHASE_I_A_SAMPLES_TO_CREATE ]
358359 print (f"Samples from KJV bible consisting of { len (non_instruct_samples )} look like this (sub-sample of 3): { non_instruct_samples [:3 ]} " )
359360
361+ # Split phase_i_b_samples into train and validation sets
362+ phase_i_b_train_samples , phase_i_b_val_samples = train_test_split (
363+ phase_i_b_samples ,
364+ test_size = PHASE_I_B_VAL_SPLIT ,
365+ shuffle = False
366+ )
360367
361368 # Replace with imported text
362369
@@ -1409,15 +1416,15 @@ def create_dataset(raw_text_samples, tokenizer, sample_expansion_batch_size=10)
14091416 dataset = dataset .batch (batch_size )
14101417 return dataset
14111418
1412- phase_i_b_dataset = create_dataset (raw_text_samples = phase_i_b_samples , tokenizer = tokenizer , sample_expansion_batch_size = 10 )
1419+ phase_i_b_train_dataset = create_dataset (raw_text_samples = phase_i_b_train_samples , tokenizer = tokenizer , sample_expansion_batch_size = 10 )
1420+ phase_i_b_val_dataset = create_dataset (raw_text_samples = phase_i_b_val_samples , tokenizer = tokenizer , sample_expansion_batch_size = 10 )
14131421
14141422
14151423 phase_i_b_history = \
14161424 generator .model .fit (
1417- # best_model_found.fit(
1418- x = phase_i_b_dataset ,
1425+ x = phase_i_b_train_dataset ,
1426+ validation_data = phase_i_b_val_dataset ,
14191427 epochs = phase_i_b_epochs )
1420- # batch_size=batch_size)
14211428
14221429
14231430 phase_i_b_history = \
0 commit comments