@@ -76,6 +76,8 @@ impl DataFrame {
 
         let file_type = format_as_file_type(format);
 
+        let copy_options = options.build_sink_options();
+
         let plan = if options.sort_by.is_empty() {
             self.plan
         } else {
@@ -88,7 +90,7 @@ impl DataFrame {
             plan,
             path.into(),
             file_type,
-            Default::default(),
+            copy_options,
             options.partition_by,
         )?
         .build()?;
@@ -324,4 +326,156 @@ mod tests {
 
         Ok(())
     }
+
+    /// Test FileOutputMode::SingleFile - explicitly request single file output
+    /// for paths WITHOUT file extensions. This verifies the fix for the regression
+    /// where extension heuristics ignored the explicit with_single_file_output(true).
+    #[tokio::test]
+    async fn test_file_output_mode_single_file() -> Result<()> {
+        use arrow::array::Int32Array;
+        use arrow::datatypes::{DataType, Field, Schema};
+        use arrow::record_batch::RecordBatch;
+
+        let ctx = SessionContext::new();
+        let tmp_dir = TempDir::new()?;
+
+        // Path WITHOUT .parquet extension - this is the key scenario
+        let output_path = tmp_dir.path().join("data_no_ext");
+        let output_path_str = output_path.to_str().unwrap();
+
+        let df = ctx.read_batch(RecordBatch::try_new(
+            Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])),
+            vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
+        )?)?;
+
+        // Explicitly request single file output
+        df.write_parquet(
+            output_path_str,
+            DataFrameWriteOptions::new().with_single_file_output(true),
+            None,
+        )
+        .await?;
+
+        // Verify: output should be a FILE, not a directory
+        assert!(
+            output_path.is_file(),
+            "Expected single file at {:?}, but got is_file={}, is_dir={}",
+            output_path,
+            output_path.is_file(),
+            output_path.is_dir()
+        );
+
+        // Verify the file is readable as parquet
+        let file = std::fs::File::open(&output_path)?;
+        let reader = parquet::file::reader::SerializedFileReader::new(file)?;
+        let metadata = reader.metadata();
+        assert_eq!(metadata.num_row_groups(), 1);
+        assert_eq!(metadata.file_metadata().num_rows(), 3);
+
+        Ok(())
+    }
+
+    /// Test FileOutputMode::Automatic - uses extension heuristic.
+    /// Path WITH extension -> single file; path WITHOUT extension -> directory.
+    #[tokio::test]
+    async fn test_file_output_mode_automatic() -> Result<()> {
+        use arrow::array::Int32Array;
+        use arrow::datatypes::{DataType, Field, Schema};
+        use arrow::record_batch::RecordBatch;
+
+        let ctx = SessionContext::new();
+        let tmp_dir = TempDir::new()?;
+
+        let schema =
+            Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
+        let batch = RecordBatch::try_new(
+            schema,
+            vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
+        )?;
+
+        // Case 1: Path WITH extension -> should create single file (Automatic mode)
+        let output_with_ext = tmp_dir.path().join("data.parquet");
+        let df = ctx.read_batch(batch.clone())?;
+        df.write_parquet(
+            output_with_ext.to_str().unwrap(),
+            DataFrameWriteOptions::new(), // Automatic mode (default)
+            None,
+        )
+        .await?;
+
+        assert!(
+            output_with_ext.is_file(),
+            "Path with extension should be a single file, got is_file={}, is_dir={}",
+            output_with_ext.is_file(),
+            output_with_ext.is_dir()
+        );
+
+        // Case 2: Path WITHOUT extension -> should create directory (Automatic mode)
+        let output_no_ext = tmp_dir.path().join("data_dir");
+        let df = ctx.read_batch(batch)?;
+        df.write_parquet(
+            output_no_ext.to_str().unwrap(),
+            DataFrameWriteOptions::new(), // Automatic mode (default)
+            None,
+        )
+        .await?;
+
+        assert!(
+            output_no_ext.is_dir(),
+            "Path without extension should be a directory, got is_file={}, is_dir={}",
+            output_no_ext.is_file(),
+            output_no_ext.is_dir()
+        );
+
+        Ok(())
+    }
+
+    /// Test FileOutputMode::Directory - explicitly request directory output
+    /// even for paths WITH file extensions.
+    #[tokio::test]
+    async fn test_file_output_mode_directory() -> Result<()> {
+        use arrow::array::Int32Array;
+        use arrow::datatypes::{DataType, Field, Schema};
+        use arrow::record_batch::RecordBatch;
+
+        let ctx = SessionContext::new();
+        let tmp_dir = TempDir::new()?;
+
+        // Path WITH .parquet extension but explicitly requesting directory output
+        let output_path = tmp_dir.path().join("output.parquet");
+        let output_path_str = output_path.to_str().unwrap();
+
+        let df = ctx.read_batch(RecordBatch::try_new(
+            Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])),
+            vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
+        )?)?;
+
+        // Explicitly request directory output (single_file_output = false)
+        df.write_parquet(
+            output_path_str,
+            DataFrameWriteOptions::new().with_single_file_output(false),
+            None,
+        )
+        .await?;
+
+        // Verify: output should be a DIRECTORY, not a single file
+        assert!(
+            output_path.is_dir(),
+            "Expected directory at {:?}, but got is_file={}, is_dir={}",
+            output_path,
+            output_path.is_file(),
+            output_path.is_dir()
+        );
+
+        // Verify the directory contains parquet file(s)
+        let entries: Vec<_> = std::fs::read_dir(&output_path)?
+            .filter_map(|e| e.ok())
+            .collect();
+        assert!(
+            !entries.is_empty(),
+            "Directory should contain at least one file"
+        );
+
+        Ok(())
+    }
 }
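
The diff above calls options.build_sink_options() but does not show its definition. A minimal sketch of what such a helper might look like, assuming the change adds a FileOutputMode enum on the write options and that the sink options are passed down as a string key/value map; the type names, field, and the "single_file_output" key below are illustrative assumptions, not the actual DataFusion API.

use std::collections::HashMap;

/// Hypothetical output-mode enum matching the three behaviors the tests exercise.
#[derive(Clone, Copy, Debug, Default)]
pub enum FileOutputMode {
    /// Decide from the path: an extension means a single file, no extension means a directory.
    #[default]
    Automatic,
    /// Always write exactly one file, regardless of the path's extension.
    SingleFile,
    /// Always write a directory of files, even if the path has an extension.
    Directory,
}

/// Hypothetical stand-in for the write options, reduced to the field this change touches.
pub struct WriteOptionsSketch {
    pub output_mode: FileOutputMode,
}

impl WriteOptionsSketch {
    /// Sketch of what build_sink_options might do: translate the explicitly requested
    /// output mode into a sink option, so the explicit setting takes precedence over
    /// the extension heuristic that previously decided single-file vs. directory output.
    pub fn build_sink_options(&self) -> HashMap<String, String> {
        let mut opts = HashMap::new();
        match self.output_mode {
            FileOutputMode::SingleFile => {
                opts.insert("single_file_output".to_string(), "true".to_string());
            }
            FileOutputMode::Directory => {
                opts.insert("single_file_output".to_string(), "false".to_string());
            }
            FileOutputMode::Automatic => {
                // No explicit option: leave the decision to the existing extension heuristic.
            }
        }
        opts
    }
}

Under this reading, with_single_file_output(true/false) would set the SingleFile or Directory variant, while the default options keep Automatic, which matches the three test cases above.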