Skip to content

Commit 6833c13

Browse files
author
wangzheyan
committed
add test, avoid empty last chunk
1 parent cdb8e4c commit 6833c13

File tree

2 files changed

+30
-1
lines changed

2 files changed

+30
-1
lines changed

src/daft-json/src/local.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -420,7 +420,9 @@ impl<'a> JsonReader<'a> {
420420
last_pos = end_pos;
421421
}
422422

423-
offsets.push((last_pos, total_len));
423+
if last_pos < total_len {
424+
offsets.push((last_pos, total_len));
425+
}
424426

425427
offsets
426428
}

tests/io/test_jsonl_chunking.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,33 @@ def test_jsonl_chunk_size_one_reads_correctly(tmp_path: os.PathLike[str]) -> Non
7272
assert df.to_pylist() == rows
7373

7474

75+
def test_jsonl_chunk_size_mid_line_splits_correctly(tmp_path: os.PathLike[str]) -> None:
    """Check that split points landing mid-line still yield one intact row per line.

    The chunk size is smaller than any single serialized line, so every
    nominal split offset falls inside a record. The test expects exactly one
    partition per line and that the rows read back match the rows written.
    """
    rows = [
        {"id": 1, "val": "hello"},
        {"id": 2, "val": "world"},
        {"id": 3, "val": "foo"},
        {"id": 4, "val": "bar"},
        {"id": 5, "val": "baz"},
    ]

    # Serialized lines are 21-23 bytes (newline included), so a chunk size of
    # 10 always cuts inside a record, e.g. `{"id":1,"v|al":"hello"}\n`.
    # newline="" keeps "\n" from being translated on any platform, so the
    # byte counts above hold.
    file_path = os.fspath(tmp_path / "mid.jsonl")
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        f.writelines(f'{{"id":{row["id"]},"val":"{row["val"]}"}}\n' for row in rows)

    expected_count = len(rows)
    df = daft.read_json(file_path, _chunk_size=10)

    partition_count = len(list(df.iter_partitions()))
    assert partition_count == expected_count, "each line should become its own partition"
    assert df.to_pylist() == rows
    assert df.count_rows() == expected_count
100+
101+
75102
def test_jsonl_gzip_chunk_size_controls_partitioning(tmp_path: os.PathLike[str]) -> None:
76103
file_path = os.fspath(tmp_path / "data.jsonl.gz")
77104
rows = 20_000

0 commit comments

Comments
 (0)