@@ -153,3 +153,98 @@ def test_read_with_encoding_setting(tmp_path):
153153
154154 with pytest .raises (Exception , match = r"(?i)utf-?8" ):
155155 daft .read_text (str (path )).to_pydict ()
156+
157+
def test_read_whole_text_from_single_file(tmp_path):
    """Check that ``whole_text=True`` returns a file's content as one row.

    Writes a multi-line file, reads it back, and verifies both the schema
    (a single string column named ``text``) and that the content comes back
    as one unsplit value.
    """
    content = "hello\n world\n foo"
    source = tmp_path / "sample.txt"
    source.write_text(content, encoding="utf-8")

    expected_schema = Schema.from_pyarrow_schema(pa.schema([("text", pa.string())]))

    frame = daft.read_text(str(source), whole_text=True)
    assert frame.schema() == expected_schema
    # The embedded newlines must not split the content into several rows.
    assert frame.to_pydict()["text"] == [content]
166+
167+
def test_read_whole_text_from_multiple_files(tmp_path):
    """Check that a list of paths yields one whole-text row per file.

    Two files are written and read in a single call. The assertions are
    membership-based, so the test does not depend on the order in which the
    rows come back.
    """
    contents = {
        "a.txt": "content of file a\n with multiple lines",
        "b.txt": "content of file b",
    }
    paths = []
    for name, body in contents.items():
        target = tmp_path / name
        target.write_text(body, encoding="utf-8")
        paths.append(str(target))

    texts = daft.read_text(paths, whole_text=True).to_pydict()["text"]
    assert len(texts) == 2
    for body in contents.values():
        assert body in texts
179+
180+
def test_read_whole_text_with_path_column(tmp_path):
    """Check that ``file_path_column`` pairs each whole-file row with its path.

    Reads two files with ``whole_text=True`` and ``file_path_column="path"``,
    then verifies the two-column string schema (``text``, ``path``) and that
    every content value is paired with the path of the file it came from.
    The comparison is set-based, so row order does not matter.
    """
    file_a = tmp_path / "a.txt"
    file_b = tmp_path / "b.txt"
    file_a.write_text("content a", encoding="utf-8")
    file_b.write_text("content b", encoding="utf-8")

    df = daft.read_text([str(file_a), str(file_b)], whole_text=True, file_path_column="path")
    assert df.schema() == Schema.from_pyarrow_schema(pa.schema([("text", pa.string()), ("path", pa.string())]))

    data = df.to_pydict()
    assert len(data["text"]) == 2
    assert len(data["path"]) == 2

    # Expected paths are the exact strings handed to read_text, not paths
    # assembled by hand with a hard-coded separator. This also matches how the
    # glob test in this module keys its results by str(file_x).
    rows = set(zip(data["text"], data["path"]))
    assert rows == {
        ("content a", str(file_a)),
        ("content b", str(file_b)),
    }
199+
200+
def test_read_whole_text_from_empty_file(tmp_path):
    """Check how ``skip_blank_lines`` treats an empty file in whole-text mode.

    With ``skip_blank_lines=False`` the empty file is expected to produce a
    single empty-string row; with ``skip_blank_lines=True`` it is expected to
    produce no rows at all.
    """
    empty = tmp_path / "empty.txt"
    empty.write_text("", encoding="utf-8")
    location = str(empty)

    keep_blank = daft.read_text(location, whole_text=True, skip_blank_lines=False)
    assert keep_blank.to_pydict()["text"] == [""]

    drop_blank = daft.read_text(location, whole_text=True, skip_blank_lines=True)
    assert drop_blank.to_pydict()["text"] == []
212+
213+
def test_read_whole_text_with_glob_patterns(tmp_path):
    """Check whole-text reads through a ``*.txt`` glob with blank skipping.

    Four files are written, one of them empty. Reading the glob with
    ``skip_blank_lines=True`` is expected to return three rows, and the rows
    for ``a.txt``, ``b.txt`` and ``c.txt`` must carry their content exactly as
    written, including trailing newlines, tabs and spaces.
    """
    contents = {
        "a.txt": "content a1",
        "b.txt": "content b1\n content b2\t ",
        "c.txt": "content c1\n content c2\n content c3\n \t ",
        "d.txt": "",
    }
    for name, body in contents.items():
        (tmp_path / name).write_text(body, encoding="utf-8")

    data = daft.read_text(
        str(tmp_path / "*.txt"),
        skip_blank_lines=True,
        whole_text=True,
        file_path_column="path",
    ).to_pydict()
    assert len(data["text"]) == 3
    assert len(data["path"]) == 3

    # Look rows up by path so the check does not depend on row order.
    by_path = dict(zip(data["path"], data["text"]))
    for name in ("a.txt", "b.txt", "c.txt"):
        assert by_path[str(tmp_path / name)] == contents[name]
238+
239+
def test_read_whole_text_with_gzip(tmp_path):
    """Check that a gzip-compressed file can be read in whole-text mode.

    The payload is written through ``gzip.open`` to a ``.txt.gz`` file; the
    read is expected to return the decompressed text as a single row.
    """
    archive = tmp_path / "compressed.txt.gz"
    with gzip.open(archive, "wb") as stream:
        stream.write(b"line1\n line2\n line3")

    texts = daft.read_text(str(archive), whole_text=True).to_pydict()["text"]
    assert texts == ["line1\n line2\n line3"]
0 commit comments