Skip to content

Commit 9765643

Browse files
committed
Add docstring to seedconfig
1 parent 483363d commit 9765643

File tree

1 file changed

+50
-0
lines changed

1 file changed

+50
-0
lines changed

src/data_designer/config/seed.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,56 @@ def to_index_range(self, dataset_size: int) -> IndexRange:
5656

5757

5858
class SeedConfig(ConfigBase):
59+
"""Configuration for sampling data from a seed dataset.
60+
61+
Args:
62+
dataset: Path or identifier for the seed dataset.
63+
sampling_strategy: Strategy for how to sample rows from the dataset.
64+
- ORDERED: Read rows sequentially in their original order.
65+
- SHUFFLE: Randomly shuffle rows before sampling. When used with
66+
selection_strategy, shuffling occurs within the selected range/partition.
67+
selection_strategy: Optional strategy to select a subset of the dataset.
68+
- IndexRange: Select a specific range of indices (e.g., start=100, end=199 selects rows 100-199).
69+
- PartitionBlock: Select a partition by splitting the dataset into N equal parts.
70+
Partition indices are zero-based (index=0 is the first partition, index=1 is
71+
the second, etc.).
72+
73+
Examples:
74+
Read rows sequentially from start to end:
75+
SeedConfig(dataset="my_data.parquet", sampling_strategy=SamplingStrategy.ORDERED)
76+
77+
Read rows in random order:
78+
SeedConfig(dataset="my_data.parquet", sampling_strategy=SamplingStrategy.SHUFFLE)
79+
80+
Read a specific index range (rows 100-199):
81+
SeedConfig(
82+
dataset="my_data.parquet",
83+
sampling_strategy=SamplingStrategy.ORDERED,
84+
selection_strategy=IndexRange(start=100, end=199)
85+
)
86+
87+
Read random rows from a specific index range (shuffles within rows 100-199):
88+
SeedConfig(
89+
dataset="my_data.parquet",
90+
sampling_strategy=SamplingStrategy.SHUFFLE,
91+
selection_strategy=IndexRange(start=100, end=199)
92+
)
93+
94+
Read from partition 2 (3rd partition, zero-based) of 5 partitions (20% of dataset):
95+
SeedConfig(
96+
dataset="my_data.parquet",
97+
sampling_strategy=SamplingStrategy.ORDERED,
98+
selection_strategy=PartitionBlock(index=2, num_partitions=5)
99+
)
100+
101+
Read shuffled rows from partition 0 of 10 partitions (shuffles within the partition):
102+
SeedConfig(
103+
dataset="my_data.parquet",
104+
sampling_strategy=SamplingStrategy.SHUFFLE,
105+
selection_strategy=PartitionBlock(index=0, num_partitions=10)
106+
)
107+
"""
108+
59109
dataset: str
60110
sampling_strategy: SamplingStrategy = SamplingStrategy.ORDERED
61111
selection_strategy: Optional[Union[IndexRange, PartitionBlock]] = None

0 commit comments

Comments
 (0)