Skip to content

Commit 81817f4

Browse files
committed
Address review comments
Signed-off-by: Jared O'Connell <[email protected]>
1 parent ad41a66 commit 81817f4

File tree

5 files changed

+25
-50
lines changed

5 files changed

+25
-50
lines changed

src/guidellm/data/deserializers/synthetic.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ def _create_prompt(
183183

184184
while len(prompt_token_ids) < prompt_tokens_count:
185185
attempts += 1
186-
num_chars = math.ceil(
186+
num_chars = int(
187187
prompt_tokens_count * avg_chars_per_token * margin_of_safety * attempts
188188
)
189189
text = unique + faker.text(max_nb_chars=num_chars)

src/guidellm/data/loaders.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616

1717
__all__ = ["DataLoader", "DatasetsIterator"]
1818

19-
from guidellm.schemas import GenerationRequest
2019

2120

2221
class DatasetsIterator(TorchIterableDataset):
@@ -100,11 +99,10 @@ def generator(
10099
continue
101100

102101
for preprocessor in self.preprocessors:
103-
processed_row = preprocessor(row)
104-
if isinstance(processed_row, GenerationRequest):
105-
yield processed_row
106-
else:
107-
row = processed_row
102+
# This can assign a GenerationRequest, which would then be
103+
# passed into the preprocessor, which is a type violation.
104+
# This should be fixed at some point.
105+
row = preprocessor(row) # type: ignore[assignment]
108106
yield row
109107
except Exception as err: # noqa: BLE001 # Exception logged
110108
logger.error(f"Skipping data row due to error: {err}")

src/guidellm/data/preprocessors/formatters.py

Lines changed: 20 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
DatasetPreprocessor,
88
PreprocessorRegistry,
99
)
10-
from guidellm.data.utils import text_stats
1110
from guidellm.schemas import GenerationRequest, GenerationRequestArguments, UsageMetrics
1211

1312
__all__ = [
@@ -102,10 +101,10 @@ def __call__(
102101
prefix = "".join(pre for pre in columns.get("prefix_column", []) if pre)
103102
text = "".join(txt for txt in columns.get("text_column", []) if txt)
104103
if prefix or text:
105-
arguments.body["prompt"] = prefix + text
106-
stats = text_stats(arguments.body["prompt"])
107-
input_metrics.text_characters = stats.get("num_chars")
108-
input_metrics.text_words = stats.get("num_words")
104+
prompt = prefix + text
105+
arguments.body["prompt"] = prompt
106+
input_metrics.text_characters = len(prompt)
107+
input_metrics.text_words = len(prompt.split())
109108

110109
return GenerationRequest(
111110
request_type="text_completions",
@@ -198,27 +197,25 @@ def __call__( # noqa: C901, PLR0912, PLR0915
198197
if not prefix:
199198
continue
200199

201-
stats = text_stats(prefix)
202-
if (num_chars := stats.get("num_chars")) is not None:
203-
input_metrics.text_characters = (
204-
input_metrics.text_characters or 0
205-
) + num_chars
206-
if (num_words := stats.get("num_words")) is not None:
207-
input_metrics.text_words = (input_metrics.text_words or 0) + num_words
200+
input_metrics.text_characters = (
201+
input_metrics.text_characters or 0
202+
) + len(prefix)
203+
204+
input_metrics.text_words = (input_metrics.text_words or 0) + \
205+
len(prefix.split())
208206

209207
arguments.body["messages"].append({"role": "system", "content": prefix})
210208

211209
for text in columns.get("text_column", []):
212210
if not text:
213211
continue
214212

215-
stats = text_stats(text)
216-
if (num_chars := stats.get("num_chars")) is not None:
217-
input_metrics.text_characters = (
218-
input_metrics.text_characters or 0
219-
) + num_chars
220-
if (num_words := stats.get("num_words")) is not None:
221-
input_metrics.text_words = (input_metrics.text_words or 0) + num_words
213+
input_metrics.text_characters = (
214+
input_metrics.text_characters or 0
215+
) + len(text)
216+
input_metrics.text_words = (
217+
input_metrics.text_words or 0
218+
) + len(text.split())
222219

223220
arguments.body["messages"].append(
224221
{"role": "user", "content": [{"type": "text", "text": text}]}
@@ -395,10 +392,10 @@ def __call__( # noqa: C901
395392
prefix = "".join(pre for pre in columns.get("prefix_column", []) if pre)
396393
text = "".join(txt for txt in columns.get("text_column", []) if txt)
397394
if prefix or text:
398-
arguments.body["prompt"] = prefix + text
399-
stats = text_stats(arguments.body["prompt"])
400-
input_metrics.text_characters = stats.get("num_chars")
401-
input_metrics.text_words = stats.get("num_words")
395+
prompt = prefix + text
396+
arguments.body["prompt"] = prompt
397+
input_metrics.text_characters = len(prompt)
398+
input_metrics.text_words = len(prompt.split())
402399

403400
return GenerationRequest(
404401
request_type="audio_transcriptions",
Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,6 @@
11
from .dataset import DEFAULT_SPLITS, resolve_dataset_split
2-
from .functions import (
3-
text_stats,
4-
)
52

63
__all__ = [
74
"DEFAULT_SPLITS",
85
"resolve_dataset_split",
9-
"text_stats",
106
]

src/guidellm/data/utils/functions.py

Lines changed: 0 additions & 16 deletions
This file was deleted.

0 commit comments

Comments (0)