44from pathlib import Path
55from typing import Any , Literal , Optional , Union
66
7+ import random
8+ import time
79import yaml
810from datasets import (
911 Dataset ,
@@ -183,7 +185,6 @@ def __iter__(
183185 def _create_prompt (self , prompt_tokens : int , start_index : int , request_id : int ) -> str :
184186 """
185187 Create a prompt with unique prefix to prevent vLLM prefix caching.
186-
187188 Args:
188189 prompt_tokens: Target number of tokens for the prompt
189190 start_index: Starting position in the text corpus
@@ -192,10 +193,9 @@ def _create_prompt(self, prompt_tokens: int, start_index: int, request_id: int)
192193 Generated prompt string with unique prefix
193194 """
194195 if prompt_tokens <= 0 :
195- return f" { request_id } : "
196+ return self . _create_unique_prefix ( request_id )
196197
197- # Create unique prefix that will prevent any prefix caching
198- unique_prefix = f"{ request_id } : "
198+ unique_prefix = self ._create_unique_prefix (request_id )
199199
200200 # Calculate how many tokens the prefix uses
201201 prefix_tokens = len (self .processor .tokenize (unique_prefix ))
@@ -222,6 +222,21 @@ def _create_prompt(self, prompt_tokens: int, start_index: int, request_id: int)
222222 base_text = self .text_creator .create_text (start_index , left - start_index )
223223 return unique_prefix + base_text
224224
def _create_unique_prefix(self, request_id: int) -> str:
    """
    Create a prompt prefix that differs between requests and between runs.

    The prefix combines a pseudo-random six-digit component (seeded from the
    request id plus the current time in microseconds, so it varies from run
    to run) with the request id itself (so two requests with different ids
    can never share a prefix, even if their random components collide).

    Args:
        request_id: Identifier of the request the prefix is generated for

    Returns:
        Prefix string of the form "RAND<6 digits>_<request_id>: "
    """
    timestamp = int(time.time() * 1_000_000)  # microseconds

    # Use a private generator: calling random.seed() here would reseed the
    # process-wide RNG on every request and break the reproducibility of any
    # other code that relies on a seeded `random` module.
    rng = random.Random(request_id + timestamp)
    random_component = rng.randint(100000, 999999)

    prefix_parts = [
        f"RAND{random_component}",
        # The random component alone has only 900k possible values, so keep
        # the request id in the prefix to rule out collisions.
        str(request_id),
    ]

    return f"{'_'.join(prefix_parts)}: "
239+
225240
226241class SyntheticDatasetCreator (DatasetCreator ):
227242 @classmethod
0 commit comments