
Commit 0d191f6

more testing around writer options
1 parent: 6720ee5

File tree

4 files changed, +92 −72 lines changed


python/datafusion/__init__.py

Lines changed: 9 additions & 1 deletion
@@ -45,7 +45,13 @@
     SessionContext,
     SQLOptions,
 )
-from .dataframe import DataFrame, ParquetColumnOptions, ParquetWriterOptions
+from .dataframe import (
+    DataFrame,
+    DataFrameWriteOptions,
+    InsertOp,
+    ParquetColumnOptions,
+    ParquetWriterOptions,
+)
 from .dataframe_formatter import configure_formatter
 from .expr import (
     Expr,
@@ -75,9 +81,11 @@
     "Config",
     "DFSchema",
     "DataFrame",
+    "DataFrameWriteOptions",
     "Database",
     "ExecutionPlan",
     "Expr",
+    "InsertOp",
     "LogicalPlan",
     "ParquetColumnOptions",
     "ParquetWriterOptions",

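For context, the two new re-exports make the writer options usable without reaching into datafusion.dataframe directly. A minimal usage sketch, assuming data shaped like the test fixture; the output path is illustrative, and InsertOp only matters for table writes:

from datafusion import DataFrameWriteOptions, SessionContext, column

ctx = SessionContext()
df = ctx.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6], "c": [8, 5, 8]})

# Sort the output by column "c" and force a single output file.
opts = DataFrameWriteOptions(single_file_output=True, sort_by=column("c").sort(ascending=True))
df.write_csv("/tmp/sorted.csv", with_header=True, write_options=opts)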
python/datafusion/dataframe.py

Lines changed: 25 additions & 16 deletions
@@ -941,7 +941,10 @@ def write_csv(
             with_header: If true, output the CSV header row.
             write_options: Options that impact how the DataFrame is written.
         """
-        self.df.write_csv(str(path), with_header, write_options._raw_write_options)
+        raw_write_options = (
+            write_options._raw_write_options if write_options is not None else None
+        )
+        self.df.write_csv(str(path), with_header, raw_write_options)

     @overload
     def write_parquet(
@@ -1013,11 +1016,14 @@ def write_parquet(
         ):
             compression_level = compression.get_default_level()

+        raw_write_options = (
+            write_options._raw_write_options if write_options is not None else None
+        )
         self.df.write_parquet(
             str(path),
             compression.value,
             compression_level,
-            write_options._raw_write_options,
+            raw_write_options,
         )

     def write_parquet_with_options(
@@ -1070,11 +1076,14 @@ def write_parquet_with_options(
                 bloom_filter_ndv=opts.bloom_filter_ndv,
             )

+        raw_write_options = (
+            write_options._raw_write_options if write_options is not None else None
+        )
         self.df.write_parquet_with_options(
             str(path),
             options_internal,
             column_specific_options_internal,
-            write_options._raw_write_options,
+            raw_write_options,
         )

     def write_json(
@@ -1088,7 +1097,10 @@ def write_json(
             path: Path of the JSON file to write.
             write_options: Options that impact how the DataFrame is written.
         """
-        self.df.write_json(str(path), write_options=write_options._raw_write_options)
+        raw_write_options = (
+            write_options._raw_write_options if write_options is not None else None
+        )
+        self.df.write_json(str(path), write_options=raw_write_options)

     def write_table(
         self, table_name: str, write_options: DataFrameWriteOptions | None = None
@@ -1099,7 +1111,10 @@ def write_table(
         Not all table providers support writing operations. See the individual
         implementations for details.
         """
-        self.df.write_table(table_name, write_options._raw_write_options)
+        raw_write_options = (
+            write_options._raw_write_options if write_options is not None else None
+        )
+        self.df.write_table(table_name, raw_write_options)

     def to_arrow_table(self) -> pa.Table:
         """Execute the :py:class:`DataFrame` and convert it into an Arrow Table.
@@ -1284,17 +1299,11 @@ def __init__(
         sort_by: Expr | SortExpr | Sequence[Expr] | Sequence[SortExpr] | None = None,
     ) -> None:
         """Instantiate writer options for DataFrame."""
-        write_options = DataFrameWriteOptionsInternal()
-        if insert_operation is not None:
-            write_options = write_options.with_insert_operation(insert_operation.value)
-        write_options = write_options.with_single_file_output(single_file_output)
-        if partition_by is not None:
-            if isinstance(partition_by, str):
-                partition_by = [partition_by]
-            write_options = write_options.with_partition_by(partition_by)
+        if isinstance(partition_by, str):
+            partition_by = [partition_by]

         sort_by_raw = sort_list_to_raw_sort_list(sort_by)
-        if sort_by_raw is not None:
-            write_options = write_options.with_sort_by(sort_by_raw)

-        self._raw_write_options = write_options
+        self._raw_write_options = DataFrameWriteOptionsInternal(
+            insert_operation, single_file_output, partition_by, sort_by_raw
+        )
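The recurring raw_write_options guard is the functional change here: every writer method now accepts write_options=None (its documented default) instead of unconditionally dereferencing ._raw_write_options, and DataFrameWriteOptions.__init__ builds the internal object in a single call. A short calling-side sketch, with illustrative data and paths:

from datafusion import DataFrameWriteOptions, SessionContext, column

ctx = SessionContext()
df = ctx.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6], "c": [8, 5, 8]})

# Omitting write_options previously dereferenced None and raised AttributeError;
# with the guard above, None is forwarded to the Rust layer untouched.
df.write_json("/tmp/plain.json")

# Passing options still works and exercises the new one-shot constructor.
sorted_opts = DataFrameWriteOptions(sort_by=column("a").sort(ascending=False))
df.write_json("/tmp/sorted.json", write_options=sorted_opts)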

python/tests/test_dataframe.py

Lines changed: 42 additions & 20 deletions
@@ -16,6 +16,7 @@
 # under the License.
 import ctypes
 import datetime
+import itertools
 import os
 import re
 import threading
@@ -59,9 +60,7 @@ def ctx():


 @pytest.fixture
-def df():
-    ctx = SessionContext()
-
+def df(ctx):
     # create a RecordBatch and a new DataFrame from it
     batch = pa.RecordBatch.from_arrays(
         [pa.array([1, 2, 3]), pa.array([4, 5, 6]), pa.array([8, 5, 8])],
@@ -1831,29 +1830,52 @@ def test_write_csv(ctx, df, tmp_path, path_to_str):
     assert result == expected


+sort_by_cases = [
+    (None, [1, 2, 3], "unsorted"),
+    (column("c"), [2, 1, 3], "single_column_expr"),
+    (column("a").sort(ascending=False), [3, 2, 1], "single_sort_expr"),
+    ([column("c"), column("b")], [2, 1, 3], "list_col_expr"),
+    (
+        [column("c").sort(ascending=False), column("b").sort(ascending=False)],
+        [3, 1, 2],
+        "list_sort_expr",
+    ),
+]
+
+formats = ["csv", "json", "parquet", "table"]
+
+
 @pytest.mark.parametrize(
-    ("sort_by", "expected_a"),
+    ("format", "sort_by", "expected_a"),
     [
-        pytest.param(None, [1, 2, 3], id="unsorted"),
-        pytest.param(column("c"), [2, 1, 3], id="single_column_expr"),
-        pytest.param(
-            column("a").sort(ascending=False), [3, 2, 1], id="single_sort_expr"
-        ),
-        pytest.param([column("c"), column("b")], [2, 1, 3], id="list_col_expr"),
-        pytest.param(
-            [column("c").sort(ascending=False), column("b").sort(ascending=False)],
-            [3, 1, 2],
-            id="list_sort_expr",
-        ),
+        pytest.param(format, sort_by, expected_a, id=f"{format}_{test_id}")
+        for format, (sort_by, expected_a, test_id) in itertools.product(
+            formats, sort_by_cases
+        )
     ],
 )
-def test_write_csv_with_options(ctx, df, tmp_path, sort_by, expected_a) -> None:
+def test_write_files_with_options(
+    ctx, df, tmp_path, format, sort_by, expected_a
+) -> None:
     write_options = DataFrameWriteOptions(sort_by=sort_by)
-    df.write_csv(tmp_path, with_header=True, write_options=write_options)

-    ctx.register_csv("csv", tmp_path)
-    result = ctx.table("csv").to_pydict()["a"]
-    ctx.table("csv").show()
+    if format == "csv":
+        df.write_csv(tmp_path, with_header=True, write_options=write_options)
+        ctx.register_csv("test_table", tmp_path)
+    elif format == "json":
+        df.write_json(tmp_path, write_options=write_options)
+        ctx.register_json("test_table", tmp_path)
+    elif format == "parquet":
+        df.write_parquet(tmp_path, write_options=write_options)
+        ctx.register_parquet("test_table", tmp_path)
+    elif format == "table":
+        batch = pa.RecordBatch.from_arrays([[], [], []], schema=df.schema())
+        ctx.register_record_batches("test_table", [[batch]])
+        ctx.table("test_table").show()
+        df.write_table("test_table", write_options=write_options)
+
+    result = ctx.table("test_table").to_pydict()["a"]
+    ctx.table("test_table").show()

     assert result == expected_a
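The old CSV-only test is folded into a single parametrization that crosses every writer path with every sort case; itertools.product yields one pytest id per (format, case) pair. A small, self-contained illustration of how those ids expand (case list truncated, purely illustrative):

import itertools

formats = ["csv", "json", "parquet", "table"]
sort_by_cases = [(None, [1, 2, 3], "unsorted"), ("c", [2, 1, 3], "single_column_expr")]

ids = [
    f"{fmt}_{case_id}"
    for fmt, (_sort_by, _expected_a, case_id) in itertools.product(formats, sort_by_cases)
]
# ['csv_unsorted', 'csv_single_column_expr', 'json_unsorted', ...,
#  'table_unsorted', 'table_single_column_expr']
print(ids)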

src/dataframe.rs

Lines changed: 16 additions & 35 deletions
@@ -1078,44 +1078,25 @@ impl From<PyDataFrameWriteOptions> for DataFrameWriteOptions {
 #[pymethods]
 impl PyDataFrameWriteOptions {
     #[new]
-    fn new() -> Self {
+    fn new(
+        insert_operation: Option<PyInsertOp>,
+        single_file_output: bool,
+        partition_by: Option<Vec<String>>,
+        sort_by: Option<Vec<PySortExpr>>,
+    ) -> Self {
+        let insert_operation = insert_operation.map(Into::into).unwrap_or(InsertOp::Append);
+        let sort_by = sort_by
+            .unwrap_or_default()
+            .into_iter()
+            .map(Into::into)
+            .collect();
         Self {
-            insert_operation: InsertOp::Append,
-            single_file_output: false,
-            partition_by: vec![],
-            sort_by: vec![],
+            insert_operation,
+            single_file_output,
+            partition_by: partition_by.unwrap_or_default(),
+            sort_by,
         }
     }
-
-    pub fn with_insert_operation(&self, insert_operation: PyInsertOp) -> Self {
-        let mut result = self.clone();
-
-        result.insert_operation = insert_operation.into();
-        result
-    }
-
-    pub fn with_single_file_output(&self, single_file_output: bool) -> Self {
-        let mut result = self.clone();
-
-        result.single_file_output = single_file_output;
-        result
-    }
-
-    /// Sets the partition_by columns for output partitioning
-    pub fn with_partition_by(&self, partition_by: Vec<String>) -> Self {
-        let mut result = self.clone();
-
-        result.partition_by = partition_by;
-        result
-    }
-
-    /// Sets the sort_by columns for output sorting
-    pub fn with_sort_by(&self, sort_by: Vec<PySortExpr>) -> Self {
-        let mut result = self.clone();
-
-        result.sort_by = sort_by.into_iter().map(Into::into).collect();
-        result
-    }
 }

 /// Print DataFrame
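With the with_* builders removed, the internal options object is configured entirely through its constructor, which is the single call python/datafusion/dataframe.py now makes. A hedged Python-side sketch of that call shape; the _internal import path and class alias are assumptions, and the positional arguments mirror fn new above:

from datafusion._internal import (  # import path assumed
    DataFrameWriteOptions as DataFrameWriteOptionsInternal,
)

# None lets the Rust side fall back to its defaults: InsertOp::Append,
# an empty partition_by vector, and an empty sort_by vector.
raw = DataFrameWriteOptionsInternal(
    None,        # insert_operation: Option<PyInsertOp>
    False,       # single_file_output: bool
    ["year"],    # partition_by: Option<Vec<String>>
    None,        # sort_by: Option<Vec<PySortExpr>>
)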
