Skip to content

Commit e922b5b

Browse files
committed
Addressed review comments
Use separate class for preprocess config. Signed-off-by: Jared O'Connell <[email protected]>
1 parent 86bca8e commit e922b5b

File tree

2 files changed

+61
-15
lines changed

2 files changed

+61
-15
lines changed

src/guidellm/data/config.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,9 @@
66
import yaml
77
from pydantic import ValidationError
88

9-
from guidellm.data.schemas import DataNotSupportedError
10-
from guidellm.schemas import StandardBaseModel
9+
from guidellm.data.schemas import DataConfig, DataNotSupportedError
1110

12-
ConfigT = TypeVar("ConfigT", bound=StandardBaseModel)
11+
ConfigT = TypeVar("ConfigT", bound=DataConfig)
1312

1413

1514
def load_config(config: Any, config_class: type[ConfigT]) -> ConfigT | None:

src/guidellm/data/schemas.py

Lines changed: 59 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from guidellm.schemas import StandardBaseModel
88

99
__all__ = [
10+
"DataConfig",
1011
"DataNotSupportedError",
1112
"GenerativeDatasetColumnType",
1213
"SyntheticTextDatasetConfig",
@@ -29,47 +30,54 @@ class DataNotSupportedError(Exception):
2930
Exception raised when the data format is not supported by deserializer or config.
3031
"""
3132

32-
class TokenCountConfig(StandardBaseModel):
33+
class DataConfig(StandardBaseModel):
34+
"""
35+
A generic parent class for various configs for the data package
36+
that can be passed in as key-value pairs or JSON.
37+
"""
38+
39+
class PreprocessDatasetConfig(DataConfig):
40+
3341
prompt_tokens: int = Field(
34-
description="The average number of text tokens in prompts.",
42+
description="The average number of text tokens retained or added to prompts.",
3543
gt=0,
3644
)
3745
prompt_tokens_stdev: int | None = Field(
38-
description="The standard deviation of the tokens in prompts.",
46+
description="The standard deviation of the number of tokens retained in or "
47+
"added to prompts.",
3948
gt=0,
4049
default=None,
4150
)
4251
prompt_tokens_min: int | None = Field(
43-
description="The minimum number of text tokens in prompts.",
52+
description="The minimum number of text tokens retained or added to prompts.",
4453
gt=0,
4554
default=None,
4655
)
4756
prompt_tokens_max: int | None = Field(
48-
description="The maximum number of text tokens in prompts.",
57+
description="The maximum number of text tokens retained or added to prompts.",
4958
gt=0,
5059
default=None,
5160
)
5261
output_tokens: int = Field(
53-
description="The average number of text tokens in outputs.",
62+
description="The average number of text tokens retained or added to outputs.",
5463
gt=0,
5564
)
5665
output_tokens_stdev: int | None = Field(
57-
description="The standard deviation of the tokens in outputs.",
66+
description="The standard deviation of the number of tokens retained or "
67+
"added to outputs.",
5868
gt=0,
5969
default=None,
6070
)
6171
output_tokens_min: int | None = Field(
62-
description="The minimum number of text tokens in outputs.",
72+
description="The minimum number of text tokens retained or added to outputs.",
6373
gt=0,
6474
default=None,
6575
)
6676
output_tokens_max: int | None = Field(
67-
description="The maximum number of text tokens in outputs.",
77+
description="The maximum number of text tokens retained or added to outputs.",
6878
gt=0,
6979
default=None,
7080
)
71-
72-
class PreprocessDatasetConfig(TokenCountConfig):
7381
prefix_tokens_max: int | None = Field(
7482
description="The maximum number of text tokens left in the prefixes.",
7583
gt=0,
@@ -94,7 +102,46 @@ class SyntheticTextPrefixBucketConfig(StandardBaseModel):
94102
)
95103

96104

97-
class SyntheticTextDatasetConfig(TokenCountConfig):
105+
class SyntheticTextDatasetConfig(DataConfig):
106+
prompt_tokens: int = Field(
107+
description="The average number of text tokens generated for prompts.",
108+
gt=0,
109+
)
110+
prompt_tokens_stdev: int | None = Field(
111+
description="The standard deviation of the tokens generated for prompts.",
112+
gt=0,
113+
default=None,
114+
)
115+
prompt_tokens_min: int | None = Field(
116+
description="The minimum number of text tokens generated for prompts.",
117+
gt=0,
118+
default=None,
119+
)
120+
prompt_tokens_max: int | None = Field(
121+
description="The maximum number of text tokens generated for prompts.",
122+
gt=0,
123+
default=None,
124+
)
125+
output_tokens: int = Field(
126+
description="The average number of text tokens generated for outputs.",
127+
gt=0,
128+
)
129+
output_tokens_stdev: int | None = Field(
130+
description="The standard deviation of the tokens generated for outputs.",
131+
gt=0,
132+
default=None,
133+
)
134+
output_tokens_min: int | None = Field(
135+
description="The minimum number of text tokens generated for outputs.",
136+
gt=0,
137+
default=None,
138+
)
139+
output_tokens_max: int | None = Field(
140+
description="The maximum number of text tokens generated for outputs.",
141+
gt=0,
142+
default=None,
143+
)
144+
98145
model_config = ConfigDict(
99146
extra="allow",
100147
)

0 commit comments

Comments
 (0)