@@ -56,6 +56,56 @@ def to_index_range(self, dataset_size: int) -> IndexRange:
5656
5757
class SeedConfig(ConfigBase):
    """Describe how rows are drawn from a seed dataset.

    Two independent knobs control the rows that are read: an optional
    selection step narrows the dataset to a subset, and the sampling
    strategy decides the order in which rows of that subset are visited.

    Attributes:
        dataset: Path or identifier of the seed dataset.
        sampling_strategy: Order in which rows are read.
            ORDERED keeps the rows in their original, sequential order.
            SHUFFLE randomizes the row order; when a selection_strategy is
            also given, the shuffle is confined to the selected
            range/partition.
        selection_strategy: Optional restriction to a subset of the dataset.
            An IndexRange picks a specific range of row indices.
            A PartitionBlock splits the dataset into N equal parts and picks
            one of them; partition indices are zero-based, so index=0 is the
            first partition and index=1 is the second.
            When left as None, no subset selection is configured.

    Examples:
        Sequential read of the whole dataset:
            SeedConfig(
                dataset="my_data.parquet",
                sampling_strategy=SamplingStrategy.ORDERED,
            )

        Whole dataset in random order:
            SeedConfig(
                dataset="my_data.parquet",
                sampling_strategy=SamplingStrategy.SHUFFLE,
            )

        Rows 100-199, in order:
            SeedConfig(
                dataset="my_data.parquet",
                sampling_strategy=SamplingStrategy.ORDERED,
                selection_strategy=IndexRange(start=100, end=199),
            )

        Rows 100-199, shuffled within that range:
            SeedConfig(
                dataset="my_data.parquet",
                sampling_strategy=SamplingStrategy.SHUFFLE,
                selection_strategy=IndexRange(start=100, end=199),
            )

        Third of five partitions (index 2, i.e. 20% of the dataset), in order:
            SeedConfig(
                dataset="my_data.parquet",
                sampling_strategy=SamplingStrategy.ORDERED,
                selection_strategy=PartitionBlock(index=2, num_partitions=5),
            )

        First of ten partitions, shuffled within that partition:
            SeedConfig(
                dataset="my_data.parquet",
                sampling_strategy=SamplingStrategy.SHUFFLE,
                selection_strategy=PartitionBlock(index=0, num_partitions=10),
            )
    """

    # Where the seed rows come from.
    dataset: str
    # Row ordering; sequential unless overridden.
    sampling_strategy: SamplingStrategy = SamplingStrategy.ORDERED
    # Optional subset selector; None means no selector is configured.
    selection_strategy: Union[IndexRange, PartitionBlock, None] = None
0 commit comments