@@ -41,6 +41,19 @@ class SamplerType(str, Enum):
4141
4242
4343class CategorySamplerParams (ConfigBase ):
44+ """Parameters for categorical sampling with optional probability weighting.
45+
46+ Samples values from a discrete set of categories. When weights are provided, values are
47+ sampled according to their assigned probabilities. Without weights, uniform sampling is used.
48+
49+ Attributes:
50+ values: List of possible categorical values to sample from. Can contain strings, integers,
51+ or floats. Must contain at least one value.
52+ weights: Optional unnormalized probability weights for each value. If provided, must be
53+ the same length as `values`. Weights are automatically normalized to sum to 1.0.
54+ Larger weights result in higher sampling probability for the corresponding value.
55+ """
56+
4457 values : list [Union [str , int , float ]] = Field (
4558 ...,
4659 min_length = 1 ,
@@ -68,6 +81,25 @@ def _validate_equal_lengths(self) -> Self:
6881
6982
7083class DatetimeSamplerParams (ConfigBase ):
84+ """Parameters for uniform datetime sampling within a specified range.
85+
86+ Samples datetime values uniformly between a start and end date with a specified granularity.
87+ The sampling unit determines the smallest possible time interval between consecutive samples.
88+
89+ Attributes:
90+ start: Earliest possible datetime for the sampling range (inclusive). Must be a valid
91+ datetime string parseable by pandas.to_datetime().
92+ end: Latest possible datetime for the sampling range (inclusive). Must be a valid
93+ datetime string parseable by pandas.to_datetime().
94+ unit: Time unit for sampling granularity. Options:
95+ - "Y": Years
96+ - "M": Months
97+ - "D": Days (default)
98+ - "h": Hours
99+ - "m": Minutes
100+ - "s": Seconds
101+ """
102+
71103 start : str = Field (..., description = "Earliest possible datetime for sampling range, inclusive." )
72104 end : str = Field (..., description = "Latest possible datetime for sampling range, inclusive." )
73105 unit : Literal ["Y" , "M" , "D" , "h" , "m" , "s" ] = Field (
@@ -86,6 +118,19 @@ def _validate_param_is_datetime(cls, value: str) -> str:
86118
87119
88120class SubcategorySamplerParams (ConfigBase ):
121+ """Parameters for subcategory sampling conditioned on a parent category column.
122+
123+ Samples subcategory values based on the value of a parent category column. Each parent
124+ category value maps to its own list of possible subcategory values, enabling hierarchical
125+ or conditional sampling patterns.
126+
127+ Attributes:
128+ category: Name of the parent category column that this subcategory depends on.
129+ The parent column must be generated before this subcategory column.
130+ values: Mapping from each parent category value to a list of possible subcategory values.
131+ Each key must correspond to a value that appears in the parent category column.
132+ """
133+
89134 category : str = Field (..., description = "Name of parent category to this subcategory." )
90135 values : dict [str , list [Union [str , int , float ]]] = Field (
91136 ...,
@@ -94,6 +139,30 @@ class SubcategorySamplerParams(ConfigBase):
94139
95140
96141class TimeDeltaSamplerParams (ConfigBase ):
142+ """Parameters for sampling time deltas relative to a reference datetime column.
143+
144+ Samples time offsets within a specified range and adds them to values from a reference
145+ datetime column. This is useful for generating related datetime columns like order dates
146+ and delivery dates, or event start times and end times.
147+
148+ Note:
149+ Years and months are not supported as timedelta units because they have variable lengths.
150+ See: [pandas timedelta documentation](https://pandas.pydata.org/docs/user_guide/timedeltas.html)
151+
152+ Attributes:
153+ dt_min: Minimum time-delta value (inclusive). Must be non-negative and less than `dt_max`.
154+ Specified in units defined by the `unit` parameter.
155+ dt_max: Maximum time-delta value (exclusive). Must be positive and greater than `dt_min`.
156+ Specified in units defined by the `unit` parameter.
157+ reference_column_name: Name of an existing datetime column to add the time-delta to.
158+ This column must be generated before the timedelta column.
159+ unit: Time unit for the delta values. Options:
160+ - "D": Days (default)
161+ - "h": Hours
162+ - "m": Minutes
163+ - "s": Seconds
164+ """
165+
97166 dt_min : int = Field (
98167 ...,
99168 ge = 0 ,
@@ -127,6 +196,20 @@ def _validate_min_less_than_max(self) -> Self:
127196
128197
129198class UUIDSamplerParams (ConfigBase ):
199+ """Parameters for generating UUID (Universally Unique Identifier) values.
200+
201+ Generates UUID4 (random) identifiers with optional prefix, short-form, and uppercase formatting. UUIDs are useful
202+ for creating unique identifiers for records, entities, or transactions.
203+
204+ Attributes:
205+ prefix: Optional string to prepend to each UUID. Useful for creating namespaced or
206+ typed identifiers (e.g., "user-", "order-", "txn-").
207+ short_form: If True, truncates UUIDs to 8 characters (first segment only). Default is False
208+ for full 32-character UUIDs (excluding hyphens).
209+ uppercase: If True, converts all hexadecimal letters to uppercase. Default is False for
210+ lowercase UUIDs.
211+ """
212+
130213 prefix : Optional [str ] = Field (default = None , description = "String prepended to the front of the UUID." )
131214 short_form : bool = Field (
132215 default = False ,
@@ -148,6 +231,24 @@ def last_index(self) -> int:
148231
149232
150233class ScipySamplerParams (ConfigBase ):
234+ """Parameters for sampling from any scipy.stats continuous or discrete distribution.
235+
236+ Provides a flexible interface to sample from the wide range of probability distributions
237+ available in scipy.stats. This enables advanced statistical sampling beyond the built-in
238+ distribution types (Gaussian, Uniform, etc.).
239+
240+ See: [scipy.stats documentation](https://docs.scipy.org/doc/scipy/reference/stats.html)
241+
242+ Attributes:
243+ dist_name: Name of the scipy.stats distribution to sample from (e.g., "beta", "gamma",
244+ "lognorm", "expon"). Must be a valid distribution name from scipy.stats.
245+ dist_params: Dictionary of parameters for the specified distribution. Parameter names
246+ and values must match the scipy.stats distribution specification (e.g., {"a": 2, "b": 5}
247+ for beta distribution, {"scale": 1.5} for exponential).
248+ decimal_places: Optional number of decimal places to round sampled values to. If None,
249+ values are not rounded.
250+ """
251+
151252 dist_name : str = Field (..., description = "Name of a scipy.stats distribution." )
152253 dist_params : dict = Field (
153254 ...,
@@ -159,15 +260,55 @@ class ScipySamplerParams(ConfigBase):
159260
160261
161262class BinomialSamplerParams (ConfigBase ):
263+ """Parameters for sampling from a Binomial distribution.
264+
265+ Samples integer values representing the number of successes in a fixed number of independent
266+ Bernoulli trials, each with the same probability of success. Commonly used to model the number
267+ of successful outcomes in repeated experiments.
268+
269+ Attributes:
270+ n: Number of independent trials. Expected to be a positive integer (unlike `p`, the field declares no bound).
271+ p: Probability of success on each trial. Must be between 0.0 and 1.0 (inclusive).
272+ """
273+
162274 n : int = Field (..., description = "Number of trials." )
163275 p : float = Field (..., description = "Probability of success on each trial." , ge = 0.0 , le = 1.0 )
164276
165277
166278class BernoulliSamplerParams (ConfigBase ):
279+ """Parameters for sampling from a Bernoulli distribution.
280+
281+ Samples binary values (0 or 1) representing the outcome of a single trial with a fixed
282+ probability of success. This is the simplest discrete probability distribution, useful for
283+ modeling binary outcomes like success/failure, yes/no, or true/false.
284+
285+ Attributes:
286+ p: Probability of success (sampling 1). Must be between 0.0 and 1.0 (inclusive).
287+ The probability of failure (sampling 0) is automatically 1 - p.
288+ """
289+
167290 p : float = Field (..., description = "Probability of success." , ge = 0.0 , le = 1.0 )
168291
169292
170293class BernoulliMixtureSamplerParams (ConfigBase ):
294+ """Parameters for sampling from a Bernoulli mixture distribution.
295+
296+ Combines a Bernoulli distribution with a second (typically continuous) distribution, creating a mixture
297+ where values are either 0 (with probability 1-p) or sampled from the specified distribution
298+ (with probability p). This is useful for modeling scenarios with many zero values mixed with
299+ a continuous distribution of non-zero values.
300+
301+ Common use cases include modeling sparse events, zero-inflated data, or situations where
302+ an outcome either doesn't occur (0) or follows a specific distribution when it does occur.
303+
304+ Attributes:
305+ p: Probability of sampling from the distribution named by `dist_name` (non-zero outcome).
306+ Must be between 0.0 and 1.0 (inclusive). With probability 1-p, the sample is 0.
307+ dist_name: Name of the scipy.stats distribution to sample from when outcome is non-zero.
308+ Must be a valid scipy.stats distribution name (e.g., "norm", "gamma", "expon").
309+ dist_params: Parameters for the specified scipy.stats distribution.
310+ """
311+
171312 p : float = Field (
172313 ...,
173314 description = "Bernoulli distribution probability of success." ,
@@ -189,6 +330,21 @@ class BernoulliMixtureSamplerParams(ConfigBase):
189330
190331
191332class GaussianSamplerParams (ConfigBase ):
333+ """Parameters for sampling from a Gaussian (Normal) distribution.
334+
335+ Samples continuous values from a normal distribution characterized by its mean and standard
336+ deviation. The Gaussian distribution is one of the most commonly used probability distributions,
337+ appearing naturally in many real-world phenomena due to the Central Limit Theorem.
338+
339+ Attributes:
340+ mean: Mean (center) of the Gaussian distribution. This is the expected value and the
341+ location of the distribution's peak.
342+ stddev: Standard deviation of the Gaussian distribution. Controls the spread or width
343+ of the distribution. Expected to be positive (the field itself declares no bound).
344+ decimal_places: Optional number of decimal places to round sampled values to. If None,
345+ values are not rounded.
346+ """
347+
192348 mean : float = Field (..., description = "Mean of the Gaussian distribution" )
193349 stddev : float = Field (..., description = "Standard deviation of the Gaussian distribution" )
194350 decimal_places : Optional [int ] = Field (
@@ -197,10 +353,38 @@ class GaussianSamplerParams(ConfigBase):
197353
198354
199355class PoissonSamplerParams (ConfigBase ):
356+ """Parameters for sampling from a Poisson distribution.
357+
358+ Samples non-negative integer values representing the number of events occurring in a fixed
359+ interval of time or space. The Poisson distribution is commonly used to model count data
360+ like the number of arrivals, occurrences, or events per time period.
361+
362+ The distribution is characterized by a single parameter (mean/rate), and both the mean and
363+ variance equal this parameter value.
364+
365+ Attributes:
366+ mean: Mean number of events in the fixed interval (also called rate parameter λ).
367+ Expected to be positive (the field itself declares no bound). This represents both the expected value and the variance of the
368+ distribution.
369+ """
370+
200371 mean : float = Field (..., description = "Mean number of events in a fixed interval." )
201372
202373
203374class UniformSamplerParams (ConfigBase ):
375+ """Parameters for sampling from a continuous Uniform distribution.
376+
377+ Samples continuous values uniformly from a specified range, where every value in the range
378+ has equal probability of being sampled. This is useful when all values within a range are
379+ equally likely, such as random percentages, proportions, or unbiased measurements.
380+
381+ Attributes:
382+ low: Lower bound of the uniform distribution (inclusive). Can be any real number.
383+ high: Upper bound of the uniform distribution (inclusive). Must be greater than `low`.
384+ decimal_places: Optional number of decimal places to round sampled values to. If None,
385+ values are not rounded and may have many decimal places.
386+ """
387+
204388 low : float = Field (..., description = "Lower bound of the uniform distribution, inclusive." )
205389 high : float = Field (..., description = "Upper bound of the uniform distribution, inclusive." )
206390 decimal_places : Optional [int ] = Field (
@@ -216,6 +400,35 @@ class UniformSamplerParams(ConfigBase):
216400
217401
218402class PersonSamplerParams (ConfigBase ):
403+ """Parameters for sampling synthetic person data with demographic attributes.
404+
405+ Generates realistic synthetic person data including names, addresses, phone numbers, and other
406+ demographic information. Data can be sampled from managed datasets (when available) or generated
407+ using Faker. The sampler supports filtering by locale, sex, age, geographic location, and can
408+ optionally include synthetic persona descriptions.
409+
410+ Attributes:
411+ locale: Locale string determining the language and geographic region for synthetic people.
412+ Format: language_COUNTRY (e.g., "en_US", "en_GB", "fr_FR", "de_DE", "es_ES", "ja_JP").
413+ Defaults to "en_US".
414+ sex: If specified, filters to only sample people of the specified sex. Options: "Male" or
415+ "Female". If None, samples both sexes.
416+ city: If specified, filters to only sample people from the specified city or cities. Can be
417+ a single city name (string) or a list of city names.
418+ age_range: Two-element list [min_age, max_age] specifying the age range to sample from
419+ (inclusive). Defaults to a standard age range. Both values must be between minimum and
420+ maximum allowed ages.
421+ state: Only supported for "en_US" locale. Filters to sample people from specified US state(s).
422+ Must be provided as two-letter state abbreviations (e.g., "CA", "NY", "TX"). Can be a
423+ single state or a list of states.
424+ with_synthetic_personas: If True, appends additional synthetic persona columns including
425+ personality traits, interests, and background descriptions. Only supported for certain
426+ locales with managed datasets.
427+ sample_dataset_when_available: If True, samples from curated managed datasets when available
428+ for the specified locale. If False or unavailable, falls back to Faker-generated data.
429+ Managed datasets typically provide more realistic and diverse synthetic people.
430+ """
431+
219432 locale : str = Field (
220433 default = "en_US" ,
221434 description = (
0 commit comments