4
4
from pathlib import Path
5
5
from typing import Any , Literal , Optional , Union
6
6
7
+ import random
8
+ import time
7
9
import yaml
8
10
from datasets import (
9
11
Dataset ,
@@ -183,7 +185,6 @@ def __iter__(
183
185
def _create_prompt (self , prompt_tokens : int , start_index : int , request_id : int ) -> str :
184
186
"""
185
187
Create a prompt with unique prefix to prevent vLLM prefix caching.
186
-
187
188
Args:
188
189
prompt_tokens: Target number of tokens for the prompt
189
190
start_index: Starting position in the text corpus
@@ -192,10 +193,9 @@ def _create_prompt(self, prompt_tokens: int, start_index: int, request_id: int)
192
193
Generated prompt string with unique prefix
193
194
"""
194
195
if prompt_tokens <= 0 :
195
- return f" { request_id } : "
196
+ return self . _create_unique_prefix ( request_id )
196
197
197
- # Create unique prefix that will prevent any prefix caching
198
- unique_prefix = f"{ request_id } : "
198
+ unique_prefix = self ._create_unique_prefix (request_id )
199
199
200
200
# Calculate how many tokens the prefix uses
201
201
prefix_tokens = len (self .processor .tokenize (unique_prefix ))
@@ -222,6 +222,21 @@ def _create_prompt(self, prompt_tokens: int, start_index: int, request_id: int)
222
222
base_text = self .text_creator .create_text (start_index , left - start_index )
223
223
return unique_prefix + base_text
224
224
225
+ def _create_unique_prefix (self , request_id : int ) -> str :
226
+ """
227
+ Create a unique prefix that will never match any other request.
228
+ """
229
+
230
+ timestamp = int (time .time () * 1000000 ) # microseconds
231
+ random .seed (request_id + timestamp )
232
+ random_component = random .randint (100000 , 999999 )
233
+
234
+ prefix_parts = [
235
+ f"RAND{ random_component } " ,
236
+ ]
237
+
238
+ return f"{ '_' .join (prefix_parts )} : "
239
+
225
240
226
241
class SyntheticDatasetCreator (DatasetCreator ):
227
242
@classmethod
0 commit comments