Skip to content

Commit 9c66cf3

Browse files
authored
Fix double-offset bug in chunked sparse CSV row indices (#2279)
1 parent 8ca68fd commit 9c66cf3

File tree

2 files changed, with 45 additions and 0 deletions

tiledb/dataframe_.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -967,6 +967,9 @@ def from_csv(uri: str, csv_file: Union[str, List[str]], **kwargs):
967967
break
968968
df = pandas.concat(df_list)
969969
if "index_col" not in tiledb_args and df.index.name is None:
970+
# Reset to a fresh 0-based index so row_start_idx can be applied correctly
971+
# (pandas.concat keeps each input frame's own index labels rather than renumbering them)
972+
df.reset_index(drop=True, inplace=True)
970973
df.index.name = "__tiledb_rows"
971974

972975
tiledb_args["row_start_idx"] = rows_written
@@ -986,6 +989,9 @@ def from_csv(uri: str, csv_file: Union[str, List[str]], **kwargs):
986989
df = next(df_iter, None)
987990
while df is not None:
988991
if "index_col" not in tiledb_args and df.index.name is None:
992+
# Reset each chunk to a 0-based index so row_start_idx is the only offset applied
993+
# (with chunksize, pandas.read_csv keeps numbering rows by their position in the whole file, so adding row_start_idx on top would offset them twice)
994+
df.reset_index(drop=True, inplace=True)
989995
df.index.name = "__tiledb_rows"
990996

991997
# tell from_pandas what row to start the next write

tiledb/tests/test_pandas_dataframe.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1178,6 +1178,45 @@ def test_dataframe_csv_chunked(self):
11781178
df_idx_res = A.query(coords=False).df[int(ned[0]) : int(ned[1])]
11791179
tm.assert_frame_equal(df_idx_res, df.reset_index(drop=True))
11801180

1181+
def test_dataframe_csv_chunked_sparse_row_indices(self):
1182+
"""Test that chunked sparse CSV writes produce correct row indices.
1183+
1184+
Regression test for issue TileDB-Py#2278 where row_start_idx was being applied twice
1185+
when using chunksize with sparse arrays, causing indices to jump.
1186+
"""
1187+
# Create a simple CSV with 9 rows
1188+
df = pd.DataFrame({"char": list("foobarbaz")})
1189+
1190+
tmp_dir = self.path("csv_chunked_sparse_indices")
1191+
self.vfs.create_dir(tmp_dir)
1192+
tmp_csv = os.path.join(tmp_dir, "source.csv")
1193+
1194+
with tiledb.FileIO(self.vfs, tmp_csv, "wb") as fio:
1195+
df.to_csv(fio, index=False)
1196+
1197+
# Write without chunking (as baseline)
1198+
tmp_array_unchunked = os.path.join(tmp_dir, "unchunked")
1199+
tiledb.from_csv(tmp_array_unchunked, csv_file=tmp_csv, sparse=True)
1200+
1201+
# Write with chunking (3 rows per chunk)
1202+
tmp_array_chunked = os.path.join(tmp_dir, "chunked")
1203+
tiledb.from_csv(tmp_array_chunked, csv_file=tmp_csv, sparse=True, chunksize=3)
1204+
1205+
# Read back both arrays
1206+
with tiledb.open(tmp_array_unchunked) as A:
1207+
df_unchunked = A.df[:]
1208+
1209+
with tiledb.open(tmp_array_chunked) as A:
1210+
df_chunked = A.df[:]
1211+
1212+
# Both should have the same indices: [0, 1, 2, 3, 4, 5, 6, 7, 8]
1213+
expected_index = list(range(9))
1214+
self.assertEqual(list(df_unchunked.index), expected_index)
1215+
self.assertEqual(list(df_chunked.index), expected_index)
1216+
1217+
# DataFrames should be identical
1218+
tm.assert_frame_equal(df_chunked, df_unchunked)
1219+
11811220
def test_csv_fillna(self):
11821221
if pytest.tiledb_vfs == "s3":
11831222
pytest.skip(

0 commit comments

Comments
 (0)