4
4
from collections .abc import Iterator
5
5
from pathlib import Path
6
6
from random import Random
7
- from typing import Any , Callable
7
+ from typing import Any , Callable , Self
8
8
9
9
import yaml
10
10
from datasets import Features , IterableDataset , Value
11
11
from faker import Faker
12
- from pydantic import Field
12
+ from pydantic import ConfigDict , Field , model_validator
13
13
from transformers import PreTrainedTokenizerBase
14
14
15
15
from guidellm .data .deserializers .deserializer import (
@@ -34,7 +34,7 @@ class SyntheticTextPrefixBucketConfig(StandardBaseModel):
34
34
default = 100 ,
35
35
)
36
36
prefix_count : int = Field (
37
- description = "The number of unique prefixs to generate for this bucket." ,
37
+ description = "The number of unique prefixes to generate for this bucket." ,
38
38
ge = 1 ,
39
39
default = 1 ,
40
40
)
@@ -46,6 +46,10 @@ class SyntheticTextPrefixBucketConfig(StandardBaseModel):
46
46
47
47
48
48
class SyntheticTextDatasetConfig (StandardBaseModel ):
49
+ model_config = ConfigDict (
50
+ extra = "allow" ,
51
+ )
52
+
49
53
prefix_buckets : list [SyntheticTextPrefixBucketConfig ] | None = Field (
50
54
description = "Buckets for the prefix tokens distribution." ,
51
55
default = None ,
@@ -93,6 +97,26 @@ class SyntheticTextDatasetConfig(StandardBaseModel):
93
97
default = "data:prideandprejudice.txt.gz" ,
94
98
)
95
99
100
@model_validator(mode="after")
def check_prefix_options(self) -> Self:
    """
    Expand the shorthand ``prefix_count`` / ``prefix_tokens`` options into a
    single-entry ``prefix_buckets`` list.

    Neither option is read as a declared field here; both are looked up in
    the model's extra values. When at least one of them is supplied,
    ``prefix_buckets`` is replaced with one ``SyntheticTextPrefixBucketConfig``
    built from them, using 1 for a missing (or falsy) ``prefix_count`` and 0
    for a missing (or falsy) ``prefix_tokens``.

    :return: This instance, with ``prefix_buckets`` populated when a
        shorthand option was given.
    :raises ValueError: If ``prefix_buckets`` is non-empty and either
        shorthand option is also supplied.
    """
    # Per the pydantic docs the extras mapping is None unless extra="allow"
    # is in effect, so fall back to an empty dict instead of failing on .get.
    extras = self.__pydantic_extra__ or {}  # type: ignore[attr-defined]
    prefix_count = extras.get("prefix_count")
    # Read the "prefix_tokens" key here; the earlier code looked up
    # "prefix_count" a second time, so a supplied prefix_tokens was ignored.
    prefix_tokens = extras.get("prefix_tokens")

    if prefix_count is None and prefix_tokens is None:
        return self

    if self.prefix_buckets:
        raise ValueError(
            "prefix_buckets is mutually exclusive"
            " with prefix_count and prefix_tokens"
        )

    self.prefix_buckets = [
        SyntheticTextPrefixBucketConfig(
            prefix_count=prefix_count or 1,
            prefix_tokens=prefix_tokens or 0,
        )
    ]

    return self
119
+
96
120
97
121
class SyntheticTextGenerator :
98
122
def __init__ (
0 commit comments