
Commit e1f158e

Merge pull request #264 from david-thrower/263-add-val-ds-to-branch-260
263 add val ds to branch 260
2 parents: 39941bc + 262d6c7

File tree

2 files changed: +15 -8 lines


.github/workflows/automerge.yml

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@ name: Python application
 
 on:
   push:
-    branches: [ "main", "260-integrate-suggested-solution-to-streaming-llm-phase-i-b-training-data" ]
+    branches: [ "main", "263-add-val-ds-to-branch-260" ]
 
 permissions:
   contents: read

generative-proof-of-concept-CPU-preprocessing-in-memory.py

Lines changed: 14 additions & 7 deletions
@@ -68,9 +68,10 @@ def objective(trial: optuna.Trial) -> float:
 
 # Number of text samples to create: # Number of text samples (of approximately max_seq_len) to create
 # Raises RAM in a linear fashion
-
-PHASE_I_A_SAMPLES_TO_CREATE = 20 # 681
-PHASE_I_B_SAMPLES_TO_CREATE = 50
+
+PHASE_I_A_SAMPLES_TO_CREATE = 10 # 681
+PHASE_I_B_SAMPLES_TO_CREATE = 20
+PHASE_I_B_VAL_SPLIT = 0.15 # Validation split for phase I-b (0.0 to 1.0)
 
 # How many tokens to provide before expecting the next token to be predicted.
 # Half this = double RAM (inversely proportional to RAM requirement)
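
For context (not part of the diff): the lowered sample counts and the new PHASE_I_B_VAL_SPLIT constant combine in the split added further down. A minimal sanity-check sketch, using stand-in string samples; with 20 phase I-b samples and test_size=0.15, sklearn rounds the held-out share up, giving 17 train / 3 validation samples:

# Sanity-check sketch (stand-in data, not code from this commit):
# how PHASE_I_B_SAMPLES_TO_CREATE = 20 and PHASE_I_B_VAL_SPLIT = 0.15
# combine; sklearn takes ceil(20 * 0.15) = 3 samples for the test side.
from sklearn.model_selection import train_test_split

phase_i_b_samples = [f"sample_{i}" for i in range(20)]  # stand-in for the KJV slices
train, val = train_test_split(phase_i_b_samples, test_size=0.15, shuffle=False)
print(len(train), len(val))  # 17 3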
@@ -357,6 +358,12 @@ def package_non_instruct_text(text: str, desired_samples: int, max_length_tokens
 phase_i_b_samples = bible[PHASE_I_A_SAMPLES_TO_CREATE:PHASE_I_B_SAMPLES_TO_CREATE + PHASE_I_A_SAMPLES_TO_CREATE]
 print(f"Samples from KJV bible consisting of {len(non_instruct_samples)} look like this (sub-sample of 3): {non_instruct_samples[:3]}")
 
+# Split phase_i_b_samples into train and validation sets
+phase_i_b_train_samples, phase_i_b_val_samples = train_test_split(
+    phase_i_b_samples,
+    test_size=PHASE_I_B_VAL_SPLIT,
+    shuffle=False
+)
 
 # Replace with imported text
 

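Note that train_test_split is not imported in this diff, so presumably it is already imported (from sklearn.model_selection) elsewhere in the script. Because shuffle=False, the split is an ordered slice: the validation set is the tail of phase_i_b_samples, keeping contiguous KJV text together instead of scattering verses across the two sets. A toy illustration of that behavior:

# With shuffle=False, train_test_split slices in order rather than sampling
# randomly; the last ceil(n * test_size) items become the validation set.
from sklearn.model_selection import train_test_split

ordered = list(range(10))
train, val = train_test_split(ordered, test_size=0.2, shuffle=False)
print(train)  # [0, 1, 2, 3, 4, 5, 6, 7]
print(val)    # [8, 9]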
@@ -1409,15 +1416,15 @@ def create_dataset(raw_text_samples, tokenizer, sample_expansion_batch_size=10)
     dataset = dataset.batch(batch_size)
     return dataset
 
-phase_i_b_dataset = create_dataset(raw_text_samples=phase_i_b_samples, tokenizer=tokenizer, sample_expansion_batch_size=10)
+phase_i_b_train_dataset = create_dataset(raw_text_samples=phase_i_b_train_samples, tokenizer=tokenizer, sample_expansion_batch_size=10)
+phase_i_b_val_dataset = create_dataset(raw_text_samples=phase_i_b_val_samples, tokenizer=tokenizer, sample_expansion_batch_size=10)
 
 
 phase_i_b_history =\
     generator.model.fit(
-        # best_model_found.fit(
-        x=phase_i_b_dataset,
+        x=phase_i_b_train_dataset,
+        validation_data=phase_i_b_val_dataset,
         epochs=phase_i_b_epochs)
-        # batch_size=batch_size)
 
 
 phase_i_b_history =\
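
The net effect of this hunk is that fit() now receives the train split as x and the held-out split via validation_data, so Keras reports val_loss alongside loss for each phase I-b epoch in phase_i_b_history. A minimal self-contained sketch of the same pattern (toy tensors and a toy model, not the project's generator):

# Toy model and data, illustrating fit(x=..., validation_data=...) with
# tf.data datasets, as the commit now does for phase I-b.
import numpy as np
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.Input(shape=(4,)), tf.keras.layers.Dense(1)])
model.compile(optimizer="adam", loss="mse")

rng = np.random.default_rng(0)
train_ds = tf.data.Dataset.from_tensor_slices(
    (rng.random((32, 4), dtype=np.float32), rng.random((32, 1), dtype=np.float32))).batch(8)
val_ds = tf.data.Dataset.from_tensor_slices(
    (rng.random((8, 4), dtype=np.float32), rng.random((8, 1), dtype=np.float32))).batch(8)

history = model.fit(x=train_ds, validation_data=val_ds, epochs=2, verbose=0)
print(sorted(history.history))  # ['loss', 'val_loss']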
