@@ -72,6 +72,33 @@ def test_jsonl_chunk_size_one_reads_correctly(tmp_path: os.PathLike[str]) -> Non
7272 assert df .to_pylist () == rows
7373
7474
def test_jsonl_chunk_size_mid_line_splits_correctly(tmp_path: os.PathLike[str]) -> None:
    """chunk_size smaller than a single line forces every split point to land mid-line.

    The reader must align each chunk to the next line boundary so
    that no row is lost or truncated.
    """
    expected = [
        {"id": 1, "val": "hello"},
        {"id": 2, "val": "world"},
        {"id": 3, "val": "foo"},
        {"id": 4, "val": "bar"},
        {"id": 5, "val": "baz"},
    ]
    file_path = os.fspath(tmp_path / "mid.jsonl")
    # Each line is 21-23 bytes; chunk_size=10 forces every split point to land
    # mid-line (e.g. byte 10 is inside `{"id":1,"v|al":"hello"}\n`).
    # newline="" keeps the written bytes exactly as spelled (no \n translation).
    jsonl_lines = [f'{{"id":{row["id"]},"val":"{row["val"]}"}}\n' for row in expected]
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        f.writelines(jsonl_lines)

    df = daft.read_json(file_path, _chunk_size=10)
    partitions = list(df.iter_partitions())
    assert len(partitions) == len(expected), "each line should become its own partition"
    assert df.to_pylist() == expected
    assert df.count_rows() == len(expected)
100+
101+
75102def test_jsonl_gzip_chunk_size_controls_partitioning (tmp_path : os .PathLike [str ]) -> None :
76103 file_path = os .fspath (tmp_path / "data.jsonl.gz" )
77104 rows = 20_000
0 commit comments