
Commit a0877a9

Replace inline prepare_data with utility function
Refactor data preparation function to use external utility.
1 parent 79bb1ba commit a0877a9

File tree

1 file changed: +67 −65 lines changed


generative-proof-of-concept-CPU-preprocessing-in-memory.py

Lines changed: 67 additions & 65 deletions
@@ -54,6 +54,7 @@ def objective(trial: optuna.Trial) -> float:
 import numpy as np
 from cerebros.simplecerebrosrandomsearch.simple_cerebros_random_search\
     import SimpleCerebrosRandomSearch
+from cerebrosllmutils.llm_utils import prepare_data
 import pendulum
 from cerebros.units.units import DenseUnit
 from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component\
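
Note: the call site later in this commit passes keyword arguments (data_0, tokenizer_0, max_seq_length, prompt_length) whose names are not visible in the utility's source here. A minimal way to verify the imported prepare_data actually exposes those parameter names before running the full script — the expected names are assumptions taken from this diff, not from cerebrosllmutils itself:

import inspect
from cerebrosllmutils.llm_utils import prepare_data

# Keyword names assumed from the call site in this commit; adjust if the
# library's actual signature differs.
expected = {"data_0", "tokenizer_0", "max_seq_length", "prompt_length"}
actual = set(inspect.signature(prepare_data).parameters)
missing = expected - actual
if missing:
    raise TypeError(f"prepare_data is missing expected parameters: {missing}")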
@@ -209,83 +210,83 @@ def objective(trial: optuna.Trial) -> float:
 
 # Data Preprocessing:
 
-def prepare_data(data, max_seq_length: int = MAX_SEQ_LENGTH):
-    all_input_ids = []
-    all_labels = []
+# def prepare_data(data, max_seq_length: int = MAX_SEQ_LENGTH):
+#     all_input_ids = []
+#     all_labels = []
 
-    pad_token_id = tokenizer.pad_token_id
+#     pad_token_id = tokenizer.pad_token_id
 
-    # Tokenize all data at once for efficiency
-    tokenized_data = tokenizer(
-        data,
-        max_length=max_seq_length,
-        padding='max_length',
-        truncation=True,
-        add_special_tokens=False # We'll handle special tokens manually
-    )
+#     # Tokenize all data at once for efficiency
+#     tokenized_data = tokenizer(
+#         data,
+#         max_length=max_seq_length,
+#         padding='max_length',
+#         truncation=True,
+#         add_special_tokens=False # We'll handle special tokens manually
+#     )
 
-    # Get the token ID for </prompt>
-    end_prompt_token_id = tokenizer.encode("</prompt>", add_special_tokens=False)[0]
+#     # Get the token ID for </prompt>
+#     end_prompt_token_id = tokenizer.encode("</prompt>", add_special_tokens=False)[0]
 
-    # Process each sample
-    for sample_tokens in tokenized_data['input_ids']:
-        # Find the index of </prompt> token
-        try:
-            end_prompt_index = sample_tokens.index(end_prompt_token_id)
-        except ValueError:
-            # If </prompt> not found, treat sample as a non-instruct sample
-            end_prompt_index = (PROMPT_LENGTH - 1) # int(np.ceil(len(sample_tokens) * (1/3))) # 0 ## 1. Give it a fair starting place to predict the next word 2. reduce the number of expanded samples
+#     # Process each sample
+#     for sample_tokens in tokenized_data['input_ids']:
+#         # Find the index of </prompt> token
+#         try:
+#             end_prompt_index = sample_tokens.index(end_prompt_token_id)
+#         except ValueError:
+#             # If </prompt> not found, treat sample as a non-instruct sample
+#             end_prompt_index = (PROMPT_LENGTH - 1) # int(np.ceil(len(sample_tokens) * (1/3))) # 0 ## 1. Give it a fair starting place to predict the next word 2. reduce the number of expanded samples
 
-        # Find first pad token after </prompt>
-        first_pad_index = None
-        for i in range(end_prompt_index + 1, len(sample_tokens)):
-            if sample_tokens[i] == pad_token_id:
-                first_pad_index = i
-                break
+#         # Find first pad token after </prompt>
+#         first_pad_index = None
+#         for i in range(end_prompt_index + 1, len(sample_tokens)):
+#             if sample_tokens[i] == pad_token_id:
+#                 first_pad_index = i
+#                 break
 
-        # If no pad token found, use the end of sequence
-        if first_pad_index is None:
-            first_pad_index = len(sample_tokens)
+#         # If no pad token found, use the end of sequence
+#         if first_pad_index is None:
+#             first_pad_index = len(sample_tokens)
 
-        # Apply sliding window from after </prompt> to first pad token
-        # Start from end_prompt_index + 1 (first token to predict)
-        # End at first_pad_index - 1 (last token to predict)
-        for i in range(end_prompt_index + 1, first_pad_index):
-            # Input: from start up to (but not including) token i
-            input_ids = sample_tokens[:i]
+#         # Apply sliding window from after </prompt> to first pad token
+#         # Start from end_prompt_index + 1 (first token to predict)
+#         # End at first_pad_index - 1 (last token to predict)
+#         for i in range(end_prompt_index + 1, first_pad_index):
+#             # Input: from start up to (but not including) token i
+#             input_ids = sample_tokens[:i]
 
-            # Pad or truncate to max_seq_length
-            if len(input_ids) > max_seq_length:
-                input_ids = input_ids[:max_seq_length]
-            else:
-                input_ids = input_ids + [pad_token_id] * (max_seq_length - len(input_ids))
+#             # Pad or truncate to max_seq_length
+#             if len(input_ids) > max_seq_length:
+#                 input_ids = input_ids[:max_seq_length]
+#             else:
+#                 input_ids = input_ids + [pad_token_id] * (max_seq_length - len(input_ids))
 
-            # Label: one-hot encoding of token at position i
-            next_token = sample_tokens[i]
-            label = [0] * VOCABULARY_SIZE
-            label[next_token] = 1
+#             # Label: one-hot encoding of token at position i
+#             next_token = sample_tokens[i]
+#             label = [0] * VOCABULARY_SIZE
+#             label[next_token] = 1
 
-            all_input_ids.append(input_ids)
-            all_labels.append(label)
+#             all_input_ids.append(input_ids)
+#             all_labels.append(label)
 
-        # Add final sample with pad token as label to indicate termination
-        if first_pad_index < len(sample_tokens): # Only if there's actually a pad token
-            input_ids = sample_tokens[:first_pad_index]
+#         # Add final sample with pad token as label to indicate termination
+#         if first_pad_index < len(sample_tokens): # Only if there's actually a pad token
+#             input_ids = sample_tokens[:first_pad_index]
 
-            # Pad or truncate to max_seq_length
-            if len(input_ids) > max_seq_length:
-                input_ids = input_ids[:max_seq_length]
-            else:
-                input_ids = input_ids + [pad_token_id] * (max_seq_length - len(input_ids))
+#             # Pad or truncate to max_seq_length
+#             if len(input_ids) > max_seq_length:
+#                 input_ids = input_ids[:max_seq_length]
+#             else:
+#                 input_ids = input_ids + [pad_token_id] * (max_seq_length - len(input_ids))
 
-            # Label: one-hot encoding of pad token
-            label = [0] * VOCABULARY_SIZE
-            label[pad_token_id] = 1
+#             # Label: one-hot encoding of pad token
+#             label = [0] * VOCABULARY_SIZE
+#             label[pad_token_id] = 1
 
-            all_input_ids.append(input_ids)
-            all_labels.append(label)
+#             all_input_ids.append(input_ids)
+#             all_labels.append(label)
 
-    return all_input_ids, all_labels, VOCABULARY_SIZE
+#     return all_input_ids, all_labels, VOCABULARY_SIZE
 
 
 ## Only add re, tokenizer already in script
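
For orientation, the commented-out block above expanded every tokenized sample into one training pair per answer token: the input is each prefix ending just before position i, the label is a one-hot of the token at position i, plus a final pair whose label is the pad token so the model learns to stop. A toy sketch of that sliding-window expansion with made-up integer token ids (illustrative only, not the script's real tokenizer output):

# Toy walk-through of the sliding-window expansion; values are hypothetical.
PAD, END_PROMPT = 0, 99
sample_tokens = [11, 12, END_PROMPT, 21, 22, 23, PAD, PAD]

end_prompt_index = sample_tokens.index(END_PROMPT)          # 2
first_pad_index = next(
    (i for i in range(end_prompt_index + 1, len(sample_tokens))
     if sample_tokens[i] == PAD),
    len(sample_tokens),
)                                                           # 6

pairs = []
# One (prefix -> next token) pair per answer token.
for i in range(end_prompt_index + 1, first_pad_index):
    pairs.append((sample_tokens[:i], sample_tokens[i]))
# Final pair teaches the model to emit PAD (stop) after the answer.
if first_pad_index < len(sample_tokens):
    pairs.append((sample_tokens[:first_pad_index], PAD))

for prefix, target in pairs:
    print(prefix, "->", target)
# [11, 12, 99] -> 21
# [11, 12, 99, 21] -> 22
# [11, 12, 99, 21, 22] -> 23
# [11, 12, 99, 21, 22, 23] -> 0

In the removed function, each prefix is then padded to max_seq_length and each target is one-hot encoded over VOCABULARY_SIZE before being appended to all_input_ids and all_labels.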
@@ -439,15 +440,16 @@ def package_non_instruct_text(text: str, desired_samples: int, max_length_tokens
 # Add non-instruct samples
 # data += non_instruct_samples
 
-
-x, y, vocab_size = prepare_data(non_instruct_samples) # data)
+##<<##<<
+x, y, vocab_size = prepare_data(data_0=non_instruct_samples, tokenizer_0=tokenizer, max_seq_length=MAX_SEQ_LENGTH, prompt_length=1)
+# x, y, vocab_size = prepare_data() # data)
 
 print("Input IDs shape:", len(x), "x", len(x[0]) if x else 0)
 print("Labels shape:", len(y), "x", len(y[0]) if y else 0)
 print("Vocabulary size:", vocab_size)
 print("First few samples generated:", len(x))
 
-
+raise ValueError("Debug")
 # i = 1
 # for d,l in zip(x, y):
 #     print(f"Sample {i}:")