Skip to content

Commit 3555123

Browse files
committed
Add fixed prefix option to synthetic data
1 parent 1261fe8 commit 3555123

File tree

1 file changed

+11
-2
lines changed

1 file changed

+11
-2
lines changed

src/guidellm/dataset/synthetic.py

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,11 @@
2525

2626

2727
class SyntheticDatasetConfig(BaseModel):
28+
prefix_tokens: int = Field(
29+
description="The number of shared prefix tokens to prepend to each prompt.",
30+
ge=0,
31+
default=0,
32+
)
2833
prompt_tokens: int = Field(
2934
description="The average number of text tokens generated for prompts.",
3035
gt=0,
@@ -164,15 +169,19 @@ def __iter__(
164169
# ensure diff distribution from output tokens
165170
rand = random.Random(self.random_seed + 2) # noqa: S311
166171

172+
prefix_index = rand.randint(0, len(self.text_creator.words))
173+
prefix_tokens = self.config.prefix_tokens
174+
prefix = self._create_prompt(prefix_tokens, prefix_index)
175+
167176
for _, prompt_tokens, output_tokens in zip(
168177
range(self.config.samples),
169178
prompt_tokens_sampler,
170179
output_tokens_sampler,
171180
):
172181
start_index = rand.randint(0, len(self.text_creator.words))
173182
yield {
174-
"prompt": self._create_prompt(prompt_tokens, start_index),
175-
"prompt_tokens_count": prompt_tokens,
183+
"prompt": prefix + self._create_prompt(prompt_tokens, start_index),
184+
"prompt_tokens_count": prefix_tokens + prompt_tokens,
176185
"output_tokens_count": output_tokens,
177186
}
178187

0 commit comments

Comments
 (0)