Skip to content

Commit 1e37ca5

Browse files
Update sample counts and batch size for phase I-b
Increased the number of samples to create for phase I-b and updated the sample expansion batch size in dataset creation.
1 parent e1f158e commit 1e37ca5

File tree

1 file changed

+15
-3
lines changed

1 file changed

+15
-3
lines changed

generative-proof-of-concept-CPU-preprocessing-in-memory.py

Lines changed: 15 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -70,9 +70,11 @@ def objective(trial: optuna.Trial) -> float:
7070
# Raises RAM in a linear fashion
7171

7272
PHASE_I_A_SAMPLES_TO_CREATE = 10 # 681
73-
PHASE_I_B_SAMPLES_TO_CREATE = 20
73+
PHASE_I_B_SAMPLES_TO_CREATE = 50
7474
PHASE_I_B_VAL_SPLIT = 0.15 # Validation split for phase I-b (0.0 to 1.0)
7575

76+
PHASE_I_B_SAMPLE_EXPANSION_BATCH_SIZE = 20
77+
7678
# How many tokens to provide before expecting the next token to be predicted.
7779
# Half this = double RAM (inversely proportional to RAM requirement)
7880
PROMPT_LENGTH = 1
@@ -1416,8 +1418,18 @@ def create_dataset(raw_text_samples, tokenizer, sample_expansion_batch_size=10)
14161418
dataset = dataset.batch(batch_size)
14171419
return dataset
14181420

1419-
phase_i_b_train_dataset = create_dataset(raw_text_samples=phase_i_b_train_samples, tokenizer=tokenizer, sample_expansion_batch_size=10)
1420-
phase_i_b_val_dataset = create_dataset(raw_text_samples=phase_i_b_val_samples, tokenizer=tokenizer, sample_expansion_batch_size=10)
1421+
phase_i_b_train_dataset =\
1422+
create_dataset(
1423+
raw_text_samples=phase_i_b_train_samples,
1424+
tokenizer=tokenizer,
1425+
sample_expansion_batch_size=PHASE_I_B_SAMPLE_EXPANSION_BATCH_SIZE)
1426+
1427+
1428+
phase_i_b_val_dataset =\
1429+
create_dataset(
1430+
raw_text_samples=phase_i_b_val_samples,
1431+
tokenizer=tokenizer,
1432+
sample_expansion_batch_size=PHASE_I_B_SAMPLE_EXPANSION_BATCH_SIZE)
14211433

14221434

14231435
phase_i_b_history =\

0 commit comments

Comments
 (0)