Skip to content

Commit 483363d

Browse files
committed
Optimize duckdb seed dataset select based on limit and offset
1 parent c32b278 commit 483363d

File tree

1 file changed

+10
-12
lines changed

1 file changed

+10
-12
lines changed

src/data_designer/engine/column_generators/generators/seed_dataset.py

Lines changed: 10 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -92,22 +92,20 @@ def _reset_batch_reader(self, num_records: int) -> None:
9292
shuffle_query = " ORDER BY RANDOM()" if shuffle else ""
9393

9494
if self._index_range is not None:
95-
# Use subquery with row_number() window function to filter by index range
96-
# IndexRange uses 0-based indexing [start, end] inclusive, row_number() is 1-based
97-
# To convert 0-based index i to 1-based row_number: row_number = i + 1
98-
# For inclusive range [start, end], we want: row_number > start AND row_number <= end + 1
99-
# This gives us 1-based rows [start+1, end+1] which maps to 0-based indices [start, end]
95+
# Use LIMIT and OFFSET for efficient index range filtering
96+
# IndexRange uses 0-based indexing [start, end] inclusive
97+
# OFFSET skips the first 'start' rows (0-based)
98+
# LIMIT takes 'end - start + 1' rows to include both start and end (inclusive)
99+
offset_value = self._index_range.start
100+
limit_value = self._index_range.end - self._index_range.start + 1
100101
read_query = f"""
101-
SELECT * EXCLUDE (row_num) FROM (
102-
SELECT *, row_number() OVER () as row_num
103-
FROM '{self._dataset_uri}'
104-
)
105-
WHERE row_num > {self._index_range.start} AND row_num <= {self._index_range.end + 1}
106-
{shuffle_query}
102+
SELECT * FROM '{self._dataset_uri}'
103+
LIMIT {limit_value} OFFSET {offset_value}
107104
"""
105+
106+
read_query = f"SELECT * FROM ({read_query}){shuffle_query}"
108107
else:
109108
read_query = f"SELECT * FROM '{self._dataset_uri}'{shuffle_query}"
110-
111109
self._batch_reader = self.duckdb_conn.query(read_query).record_batch(batch_size=num_records)
112110

113111
def _sample_records(self, num_records: int) -> pd.DataFrame:

0 commit comments

Comments
 (0)