@@ -225,3 +225,67 @@ def test_filename_pattern_with_uuid(self, pd):
         result = duckdb.sql(f"FROM read_parquet('{temp_file_name}/*/*.parquet', hive_partitioning=TRUE)")
         expected = [("rei", 321.0, "a"), ("shinji", 123.0, "a"), ("asuka", 23.0, "b"), ("kaworu", 340.0, "c")]
         assert result.execute().fetchall() == expected
+
+    @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
+    @pytest.mark.parametrize("file_size_bytes", [1000, "1k"])
+    def test_file_size_bytes_basic(self, pd, file_size_bytes):
+        temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
+        # Create a larger dataset to ensure multiple files are created
+        df = pd.DataFrame(
+            {
+                "name": [f"name_{i}" for i in range(100)],
+                "value": [i * 100.0 for i in range(100)],
+                "description": [f"description_{i}_with_more_text" for i in range(100)],
+            }
+        )
+        rel = duckdb.from_df(df)
+        rel.to_parquet(temp_file_name, file_size_bytes=file_size_bytes, per_thread_output=True)
+
+        # Check that multiple files were created
+        files = list(pathlib.Path(temp_file_name).iterdir())
+        assert len(files) > 1, f"Expected multiple files, got {len(files)}"
+
+        # Verify data integrity
+        result = duckdb.read_parquet(f"{temp_file_name}/*.parquet")
+        assert len(result.execute().fetchall()) == 100
+
+    @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
+    def test_file_size_bytes_with_partition(self, pd):
+        temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
+        # Create a dataset with enough data to trigger file splitting
+        df = pd.DataFrame(
+            {
+                "name": [f"name_{i}" for i in range(100)],
+                "value": [i * 100.0 for i in range(100)],
+                "category": ["a" if i < 50 else "b" for i in range(100)],
+                "description": [f"description_{i}_with_more_text_to_increase_size" for i in range(100)],
+            }
+        )
+        rel = duckdb.from_df(df)
+        rel.to_parquet(temp_file_name, partition_by=["category"], file_size_bytes="2k", per_thread_output=True)
+
+        # Check that files were created in partition directories
+        assert pathlib.Path(f"{temp_file_name}/category=a").exists()
+        assert pathlib.Path(f"{temp_file_name}/category=b").exists()
+
+        # Verify data integrity
+        result = duckdb.sql(f"FROM read_parquet('{temp_file_name}/*/*.parquet', hive_partitioning=TRUE)")
+        assert len(result.execute().fetchall()) == 100
+
+    @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
+    @pytest.mark.parametrize("file_size_bytes", ["1M", "1G"])
+    def test_file_size_bytes_human_readable(self, pd, file_size_bytes):
+        temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
+        df = pd.DataFrame(
+            {
+                "name": ["rei", "shinji", "asuka", "kaworu"],
+                "float": [321.0, 123.0, 23.0, 340.0],
+                "category": ["a", "a", "b", "c"],
+            }
+        )
+        rel = duckdb.from_df(df)
+        rel.to_parquet(temp_file_name, file_size_bytes=file_size_bytes)
+
+        # With large file size limits, should create just one file
+        parquet_rel = duckdb.read_parquet(temp_file_name)
+        assert rel.execute().fetchall() == parquet_rel.execute().fetchall()