Skip to content

Commit ef4caa0

Browse files
Update generative-proof-of-concept-CPU-preprocessing-in-memory.py
Increase sample expansion batch size to 50. Add parameter model_batch_size to create_dataset.
1 parent 1e37ca5 commit ef4caa0

File tree

1 file changed

+7
-5
lines changed

1 file changed

+7
-5
lines changed

generative-proof-of-concept-CPU-preprocessing-in-memory.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1331,7 +1331,7 @@ def test_text(test_prompt: str, max_new_tokens: int, sample_number: int, result_
13311331

13321332
# Create the Dataset Generaror:
13331333
class SampleExpansionGenerator:
1334-
def __init__(self, raw_text_samples, tokenizer, sample_expansion_batch_size=5):
1334+
def __init__(self, raw_text_samples, tokenizer, sample_expansion_batch_size=50):
13351335
self.raw_text_samples = raw_text_samples
13361336
self.tokenizer = tokenizer
13371337
self.sample_expansion_batch_size = sample_expansion_batch_size
@@ -1400,7 +1400,7 @@ def __next__(self):
14001400

14011401

14021402
# Create the tf.data.Dataset
1403-
def create_dataset(raw_text_samples, tokenizer, sample_expansion_batch_size=10) -> tf.data.Dataset:
1403+
def create_dataset(raw_text_samples, tokenizer, sample_expansion_batch_size=50, model_batch_size=10) -> tf.data.Dataset:
14041404
generator = SampleExpansionGenerator(raw_text_samples, tokenizer, sample_expansion_batch_size)
14051405

14061406
dataset = tf.data.Dataset.from_generator(
@@ -1415,21 +1415,23 @@ def create_dataset(raw_text_samples, tokenizer, sample_expansion_batch_size=10)
14151415
# Set dataset to allow multiple epochs:
14161416
# dataset = dataset.repeat()
14171417
# Batch it
1418-
dataset = dataset.batch(batch_size)
1418+
dataset = dataset.batch(model_batch_size)
14191419
return dataset
14201420

14211421
phase_i_b_train_dataset =\
14221422
create_dataset(
14231423
raw_text_samples=phase_i_b_train_samples,
14241424
tokenizer=tokenizer,
1425-
sample_expansion_batch_size=PHASE_I_B_SAMPLE_EXPANSION_BATCH_SIZE)
1425+
sample_expansion_batch_size=PHASE_I_B_SAMPLE_EXPANSION_BATCH_SIZE,
1426+
model_batch_size=batch_size)
14261427

14271428

14281429
phase_i_b_val_dataset =\
14291430
create_dataset(
14301431
raw_text_samples=phase_i_b_val_samples,
14311432
tokenizer=tokenizer,
1432-
sample_expansion_batch_size=PHASE_I_B_SAMPLE_EXPANSION_BATCH_SIZE)
1433+
sample_expansion_batch_size=PHASE_I_B_SAMPLE_EXPANSION_BATCH_SIZE,
1434+
model_batch_size=batch_size)
14331435

14341436

14351437
phase_i_b_history =\

0 commit comments

Comments (0)