@@ -226,54 +226,24 @@ def test_filename_pattern_with_uuid(self, pd):
         expected = [("rei", 321.0, "a"), ("shinji", 123.0, "a"), ("asuka", 23.0, "b"), ("kaworu", 340.0, "c")]
         assert result.execute().fetchall() == expected
 
-    @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
     @pytest.mark.parametrize("file_size_bytes", [1000, "1k"])
-    def test_file_size_bytes_basic(self, pd, file_size_bytes):
+    def test_file_size_bytes_basic(self, file_size_bytes):
         temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
-        # Create a larger dataset to ensure multiple files are created
-        df = pd.DataFrame(
-            {
-                "name": [f"name_{i}" for i in range(100)],
-                "value": [i * 100.0 for i in range(100)],
-                "description": [f"description_{i}_with_more_text" for i in range(100)],
-            }
-        )
-        rel = duckdb.from_df(df)
-        rel.to_parquet(temp_file_name, file_size_bytes=file_size_bytes, per_thread_output=True)
+
+        # use same test data as external/duckdb/test/sql/copy/file_size_bytes.test
+        rel = duckdb.from_query("SELECT i AS col_a, i AS col_b FROM range(0,10000) tbl(i);")
+        rel.to_parquet(temp_file_name, file_size_bytes=file_size_bytes, row_group_size=2000)
 
         # Check that multiple files were created
         files = list(pathlib.Path(temp_file_name).iterdir())
         assert len(files) > 1, f"Expected multiple files, got {len(files)}"
 
         # Verify data integrity
         result = duckdb.read_parquet(f"{temp_file_name}/*.parquet")
-        assert len(result.execute().fetchall()) == 100
-
-    @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
-    def test_file_size_bytes_with_partition(self, pd):
-        temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
-        # Create a dataset with enough data to trigger file splitting
-        df = pd.DataFrame(
-            {
-                "name": [f"name_{i}" for i in range(100)],
-                "value": [i * 100.0 for i in range(100)],
-                "category": ["a" if i < 50 else "b" for i in range(100)],
-                "description": [f"description_{i}_with_more_text_to_increase_size" for i in range(100)],
-            }
-        )
-        rel = duckdb.from_df(df)
-        rel.to_parquet(temp_file_name, partition_by=["category"], file_size_bytes="2k", per_thread_output=True)
-
-        # Check that files were created in partition directories
-        assert pathlib.Path(f"{temp_file_name}/category=a").exists()
-        assert pathlib.Path(f"{temp_file_name}/category=b").exists()
-
-        # Verify data integrity
-        result = duckdb.sql(f"FROM read_parquet('{temp_file_name}/*/*.parquet', hive_partitioning=TRUE)")
-        assert len(result.execute().fetchall()) == 100
+        assert len(result.execute().fetchall()) == 10000
 
     @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
-    @pytest.mark.parametrize("file_size_bytes", ["1M", "1G"])
+    @pytest.mark.parametrize("file_size_bytes", ["256MB", "1G"])
    def test_file_size_bytes_human_readable(self, pd, file_size_bytes):
         temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
         df = pd.DataFrame(
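
For reference, here is a minimal standalone sketch of the behavior the revised test asserts. It uses only calls that appear in the diff above (duckdb.from_query, to_parquet with file_size_bytes and row_group_size, duckdb.read_parquet); the "1k" threshold is one of the parametrized values, and the path handling mirrors the test's use of tempfile:

    import os
    import pathlib
    import tempfile

    import duckdb

    # Fresh, not-yet-existing path: with file_size_bytes set, to_parquet
    # treats the target as a directory and starts a new file whenever the
    # current one grows past the threshold.
    out_dir = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))

    rel = duckdb.from_query("SELECT i AS col_a, i AS col_b FROM range(0,10000) tbl(i);")
    rel.to_parquet(out_dir, file_size_bytes="1k", row_group_size=2000)

    # The 10000 rows should have been split across several parquet files.
    assert len(list(pathlib.Path(out_dir).iterdir())) > 1
    assert len(duckdb.read_parquet(f"{out_dir}/*.parquet").execute().fetchall()) == 10000

The small row_group_size appears deliberate: the upstream SQL test pairs it with file_size_bytes, presumably because the size threshold can only be checked when a row group is flushed, so very large row groups would yield fewer, larger files.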