From 7b96175123f8b40cf9cf4b3b050d44579a321085 Mon Sep 17 00:00:00 2001 From: "guangli.bao" Date: Mon, 22 Sep 2025 16:23:29 +0800 Subject: [PATCH] fix start token Signed-off-by: guangli.bao --- src/guidellm/dataset/synthetic.py | 10 ++++++---- tests/unit/dataset/test_synthetic.py | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/guidellm/dataset/synthetic.py b/src/guidellm/dataset/synthetic.py index 8c30f0f7..f751bd43 100644 --- a/src/guidellm/dataset/synthetic.py +++ b/src/guidellm/dataset/synthetic.py @@ -194,19 +194,21 @@ def __iter__( } def _create_prompt( - self, prompt_tokens: int, start_index: int, unique_prefix: Optional[int] = None + self, + prompt_tokens: int, + start_index: int, + unique_prefix: Optional[int] = None, # noqa: ARG002 ) -> list[int]: if prompt_tokens <= 0: return [] left = start_index right = start_index + 4 * prompt_tokens - start_tokens = [unique_prefix] if unique_prefix else [] while left < right: mid = (left + right) // 2 test_prompt = self.text_creator.create_text(start_index, mid - start_index) - test_tokens = start_tokens + self.processor.encode(test_prompt) + test_tokens = self.processor.encode(test_prompt) if len(test_tokens) == prompt_tokens: return test_tokens @@ -216,7 +218,7 @@ def _create_prompt( right = mid final_text = self.text_creator.create_text(start_index, left - start_index) - return start_tokens + self.processor.encode(final_text) + return self.processor.encode(final_text) class SyntheticDatasetCreator(DatasetCreator): diff --git a/tests/unit/dataset/test_synthetic.py b/tests/unit/dataset/test_synthetic.py index e3110fa3..e57b3749 100644 --- a/tests/unit/dataset/test_synthetic.py +++ b/tests/unit/dataset/test_synthetic.py @@ -415,7 +415,7 @@ def test_create_prompt_method( # Test normal case result = generator._create_prompt(5, 0, 42) - assert result == [42, 1, 2, 3] + assert result == [1, 2, 3] # Test zero tokens result = generator._create_prompt(0, 0, 42)