@@ -92,22 +92,20 @@ def _reset_batch_reader(self, num_records: int) -> None:
         shuffle_query = " ORDER BY RANDOM()" if shuffle else ""
 
         if self._index_range is not None:
-            # Use subquery with row_number() window function to filter by index range
-            # IndexRange uses 0-based indexing [start, end] inclusive, row_number() is 1-based
-            # To convert 0-based index i to 1-based row_number: row_number = i + 1
-            # For inclusive range [start, end], we want: row_number > start AND row_number <= end + 1
-            # This gives us 1-based rows [start+1, end+1] which maps to 0-based indices [start, end]
+            # Use LIMIT and OFFSET for efficient index range filtering
+            # IndexRange uses 0-based indexing [start, end] inclusive
+            # OFFSET skips the first 'start' rows (0-based)
+            # LIMIT takes 'end - start + 1' rows to include both start and end (inclusive)
+            offset_value = self._index_range.start
+            limit_value = self._index_range.end - self._index_range.start + 1
             read_query = f"""
-                SELECT * EXCLUDE (row_num) FROM (
-                    SELECT *, row_number() OVER () as row_num
-                    FROM '{self._dataset_uri}'
-                )
-                WHERE row_num > {self._index_range.start} AND row_num <= {self._index_range.end + 1}
-                {shuffle_query}
+                SELECT * FROM '{self._dataset_uri}'
+                LIMIT {limit_value} OFFSET {offset_value}
             """
+
+            read_query = f"SELECT * FROM ({read_query}){shuffle_query}"
         else:
             read_query = f"SELECT * FROM '{self._dataset_uri}'{shuffle_query}"
-
         self._batch_reader = self.duckdb_conn.query(read_query).record_batch(batch_size=num_records)
 
     def _sample_records(self, num_records: int) -> pd.DataFrame:
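Below is a minimal standalone sketch of the LIMIT/OFFSET pattern this change adopts, so the generated SQL can be seen in isolation. It assumes the dataset URI points at a local Parquet file; the file name, the `id` column, and the range values are illustrative only, not taken from this repository.

```python
# Hedged sketch of the new query construction; `example.parquet`, the `id`
# column, and the range values are assumptions for illustration only.
import duckdb
import pyarrow as pa
import pyarrow.parquet as pq

# Build a tiny fixture file with 10 rows.
pq.write_table(pa.table({"id": list(range(10))}), "example.parquet")

dataset_uri = "example.parquet"
start, end = 2, 6              # 0-based inclusive range -> rows 2..6
offset_value = start           # OFFSET skips the first `start` rows
limit_value = end - start + 1  # LIMIT keeps `end - start + 1` rows

# Inner query slices the range; the outer query shuffles within that slice only,
# mirroring how the diff wraps `read_query` with `shuffle_query`.
inner = f"SELECT * FROM '{dataset_uri}' LIMIT {limit_value} OFFSET {offset_value}"
read_query = f"SELECT * FROM ({inner}) ORDER BY RANDOM()"

conn = duckdb.connect()
reader = conn.query(read_query).record_batch(batch_size=2)
for batch in reader:
    print(batch.to_pydict())   # ids 2..6, shuffled, at most two rows per batch
```

One effect of this construction worth noting in review: because `ORDER BY RANDOM()` is applied outside the LIMIT/OFFSET subquery, shuffling happens only within the selected index range rather than across the whole dataset.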