
Commit 71d0a31

Adjust end_prompt_index calculation for samples
Change end_prompt_index from PROMPT_LENGTH to (PROMPT_LENGTH - 1). Not sure if this is right; let testing decide.
1 parent 78b5bb3 commit 71d0a31

File tree: 1 file changed (+1, -1)


generative-proof-of-concept-CPU-preprocessing-in-memory.py

Lines changed: 1 addition & 1 deletion
@@ -224,7 +224,7 @@ def prepare_data(data, max_seq_length: int = MAX_SEQ_LENGTH):
         end_prompt_index = sample_tokens.index(end_prompt_token_id)
     except ValueError:
         # If </prompt> not found, treat sample as a non-instruct sample
-        end_prompt_index = PROMPT_LENGTH  # int(np.ceil(len(sample_tokens) * (1/3)))  # 0  ## 1. Give it a fair starting place to predict the next word 2. reduce the number of expanded samples
+        end_prompt_index = (PROMPT_LENGTH - 1)  # int(np.ceil(len(sample_tokens) * (1/3)))  # 0  ## 1. Give it a fair starting place to predict the next word 2. reduce the number of expanded samples

     # Find first pad token after </prompt>
     first_pad_index = None
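
For context on the off-by-one, here is a minimal, hypothetical sketch of how the fallback could interact with a prompt/completion split, assuming end_prompt_index is used as the inclusive index of the last prompt token. The split_prompt helper and the constant value are illustrative assumptions; only PROMPT_LENGTH, sample_tokens, and end_prompt_token_id come from the actual script.

# Hypothetical sketch; not the repository's actual code.
PROMPT_LENGTH = 4  # assumed small value for illustration

def split_prompt(sample_tokens, end_prompt_token_id):
    """Split a token list into prompt and completion around </prompt>."""
    try:
        # Instruct sample: the prompt ends at the </prompt> token.
        end_prompt_index = sample_tokens.index(end_prompt_token_id)
    except ValueError:
        # Non-instruct fallback. If the prompt slice below is inclusive of
        # end_prompt_index, then PROMPT_LENGTH - 1 yields exactly
        # PROMPT_LENGTH prompt tokens, while PROMPT_LENGTH would yield
        # PROMPT_LENGTH + 1 -- the off-by-one this commit is probing.
        end_prompt_index = PROMPT_LENGTH - 1

    prompt = sample_tokens[:end_prompt_index + 1]      # tokens 0..end_prompt_index
    completion = sample_tokens[end_prompt_index + 1:]  # everything after the prompt
    return prompt, completion

# Example: a 10-token sample where the </prompt> token (id 99) is absent,
# so the fallback applies.
tokens = list(range(10))
prompt, completion = split_prompt(tokens, end_prompt_token_id=99)
assert len(prompt) == PROMPT_LENGTH  # [0, 1, 2, 3]; completion is [4, ..., 9]

Under this inclusive-index assumption, the change keeps the fallback prompt at exactly PROMPT_LENGTH tokens; whether the script actually slices that way is what the commit's own testing is meant to confirm.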

0 commit comments