diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml
index 6f96cd7..e210929 100644
--- a/.github/workflows/automerge.yml
+++ b/.github/workflows/automerge.yml
@@ -5,7 +5,7 @@ name: Python application
 on:
   push:
-    branches: [ "main", "260-integrate-suggested-solution-to-streaming-llm-phase-i-b-training-data" ]
+    branches: [ "main", "263-add-val-ds-to-branch-260" ]
 
 permissions:
   contents: read
diff --git a/generative-proof-of-concept-CPU-preprocessing-in-memory.py b/generative-proof-of-concept-CPU-preprocessing-in-memory.py
index 2b93eb0..7e4f175 100644
--- a/generative-proof-of-concept-CPU-preprocessing-in-memory.py
+++ b/generative-proof-of-concept-CPU-preprocessing-in-memory.py
@@ -68,9 +68,10 @@ def objective(trial: optuna.Trial) -> float:
     # Number of text samples to create:
     # Number of text samples (of approximately max_seq_len) to create
     # Raises RAM in a linear fashion
-
-    PHASE_I_A_SAMPLES_TO_CREATE = 20 # 681
-    PHASE_I_B_SAMPLES_TO_CREATE = 50
+
+    PHASE_I_A_SAMPLES_TO_CREATE = 10 # 681
+    PHASE_I_B_SAMPLES_TO_CREATE = 20
+    PHASE_I_B_VAL_SPLIT = 0.15 # Validation split for phase I-b (0.0 to 1.0)
 
     # How many tokens to provide before expecting the next token to be predicted.
     # Half this = double RAM (inversely proportional to RAM requirement)
@@ -357,6 +358,12 @@ def package_non_instruct_text(text: str, desired_samples: int, max_length_tokens
     phase_i_b_samples = bible[PHASE_I_A_SAMPLES_TO_CREATE:PHASE_I_B_SAMPLES_TO_CREATE + PHASE_I_A_SAMPLES_TO_CREATE]
     print(f"Samples from KJV bible consisting of {len(non_instruct_samples)} look like this (sub-sample of 3): {non_instruct_samples[:3]}")
 
+    # Split phase_i_b_samples into train and validation sets
+    phase_i_b_train_samples, phase_i_b_val_samples = train_test_split(
+        phase_i_b_samples,
+        test_size=PHASE_I_B_VAL_SPLIT,
+        shuffle=False
+    )
 
     # Replace with imported text
@@ -1409,15 +1416,15 @@ def create_dataset(raw_text_samples, tokenizer, sample_expansion_batch_size=10)
         dataset = dataset.batch(batch_size)
         return dataset
 
-    phase_i_b_dataset = create_dataset(raw_text_samples=phase_i_b_samples, tokenizer=tokenizer, sample_expansion_batch_size=10)
+    phase_i_b_train_dataset = create_dataset(raw_text_samples=phase_i_b_train_samples, tokenizer=tokenizer, sample_expansion_batch_size=10)
+    phase_i_b_val_dataset = create_dataset(raw_text_samples=phase_i_b_val_samples, tokenizer=tokenizer, sample_expansion_batch_size=10)
 
     phase_i_b_history =\
        generator.model.fit(
-            # best_model_found.fit(
-            x=phase_i_b_dataset,
+            x=phase_i_b_train_dataset,
+            validation_data=phase_i_b_val_dataset,
             epochs=phase_i_b_epochs)
-            # batch_size=batch_size)
 
     phase_i_b_history =\
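
For context on the split added above: with `shuffle=False`, `train_test_split` simply cuts the ordered sample list, so the trailing ~15% of the phase I-b samples (the later KJV text) become the validation set. A minimal sketch of that behavior, using placeholder strings rather than the real samples:

```python
# Minimal sketch of the ordered split used in the diff; the sample
# strings below are placeholders, not the real KJV samples.
from sklearn.model_selection import train_test_split

PHASE_I_B_VAL_SPLIT = 0.15  # same constant as in the diff

samples = [f"verse block {i}" for i in range(20)]  # stands in for phase_i_b_samples

train, val = train_test_split(samples, test_size=PHASE_I_B_VAL_SPLIT, shuffle=False)
# shuffle=False makes the split a plain cut: the first ~85% of the ordered
# samples go to train, the trailing ~15% to validation (ceil(20 * 0.15) = 3).
print(len(train), len(val))  # -> 17 3
```

Note that this assumes `from sklearn.model_selection import train_test_split` already exists at module level; the hunks shown do not add the import.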
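
On the `fit()` change: when `x` is a `tf.data.Dataset`, Keras takes the batch size from the dataset itself (consistent with the removed commented-out `batch_size` argument), and passing a second dataset as `validation_data` adds per-epoch `val_loss` to the returned history. A self-contained toy sketch, with a stand-in model and tensors in place of `generator.model` and `create_dataset()`:

```python
# Toy illustration of dataset-based fit() with validation_data; the model
# and random tensors are stand-ins for generator.model and create_dataset().
import tensorflow as tf

toy_model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
toy_model.compile(optimizer="adam", loss="mse")

train_ds = tf.data.Dataset.from_tensor_slices(
    (tf.random.normal((32, 4)), tf.random.normal((32, 1)))).batch(8)
val_ds = tf.data.Dataset.from_tensor_slices(
    (tf.random.normal((8, 4)), tf.random.normal((8, 1)))).batch(8)

# batch_size must not be passed alongside a tf.data.Dataset; the dataset's
# own batching applies, which is why the old batch_size argument is gone.
history = toy_model.fit(x=train_ds, validation_data=val_ds, epochs=2)
print(history.history.keys())  # now includes 'val_loss'
```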