Skip to content

Commit a55f74f

Browse files
Adjust sample sizes and dataset for phase I-b
Updated sample-creation parameters and dataset handling for phase I-b. Added a validation split and a validation set.
1 parent 39941bc commit a55f74f

File tree

1 file changed

+15
-8
lines changed

1 file changed

+15
-8
lines changed

generative-proof-of-concept-CPU-preprocessing-in-memory.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,10 @@ def objective(trial: optuna.Trial) -> float:
6868

6969
# Number of text samples (of approximately max_seq_len) to create
7070
# Raises RAM in a linear fashion
71-
72-
PHASE_I_A_SAMPLES_TO_CREATE = 20 # 681
73-
PHASE_I_B_SAMPLES_TO_CREATE = 50
71+
72+
PHASE_I_A_SAMPLES_TO_CREATE = 10 # 681
73+
PHASE_I_B_SAMPLES_TO_CREATE = 20
74+
PHASE_I_B_VAL_SPLIT = 0.15 # Validation split for phase I-b (0.0 to 1.0)
7475

7576
# How many tokens to provide before expecting the next token to be predicted.
7677
# Half this = double RAM (inversely proportional to RAM requirement)
@@ -357,6 +358,12 @@ def package_non_instruct_text(text: str, desired_samples: int, max_length_tokens
357358
phase_i_b_samples = bible[PHASE_I_A_SAMPLES_TO_CREATE:PHASE_I_B_SAMPLES_TO_CREATE + PHASE_I_A_SAMPLES_TO_CREATE]
358359
print(f"Samples from KJV bible consisting of {len(non_instruct_samples)} look like this (sub-sample of 3): {non_instruct_samples[:3]}")
359360

361+
# Split phase_i_b_samples into train and validation sets
362+
phase_i_b_train_samples, phase_i_b_val_samples = train_test_split(
363+
phase_i_b_samples,
364+
test_size=PHASE_I_B_VAL_SPLIT,
365+
shuffle=False
366+
)
360367

361368
# Replace with imported text
362369

@@ -1409,15 +1416,15 @@ def create_dataset(raw_text_samples, tokenizer, sample_expansion_batch_size=10)
14091416
dataset = dataset.batch(batch_size)
14101417
return dataset
14111418

1412-
phase_i_b_dataset = create_dataset(raw_text_samples=phase_i_b_samples, tokenizer=tokenizer, sample_expansion_batch_size=10)
1419+
phase_i_b_train_dataset = create_dataset(raw_text_samples=phase_i_b_train_samples, tokenizer=tokenizer, sample_expansion_batch_size=10)
1420+
phase_i_b_val_dataset = create_dataset(raw_text_samples=phase_i_b_val_samples, tokenizer=tokenizer, sample_expansion_batch_size=10)
14131421

14141422

1415-
phase_i_b_history =\
1423+
phase_i_b_history =\
14161424
generator.model.fit(
1417-
# best_model_found.fit(
1418-
x=phase_i_b_dataset,
1425+
x=phase_i_b_train_dataset,
1426+
validation_data=phase_i_b_val_dataset,
14191427
epochs=phase_i_b_epochs)
1420-
# batch_size=batch_size)
14211428

14221429

14231430
phase_i_b_history =\

0 commit comments

Comments (0)