apache · kazantsev-maksim · Dec 14, 2025 · Dec 16, 2025 · Dec 16, 2025 · Dec 17, 2025
diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml
@@ -167,6 +167,7 @@ jobs:
               org.apache.comet.CometStringExpressionSuite
               org.apache.comet.CometBitwiseExpressionSuite
               org.apache.comet.CometMapExpressionSuite
+              org.apache.comet.CometCsvExpressionSuite
               org.apache.comet.CometJsonExpressionSuite
               org.apache.comet.expressions.conditional.CometIfSuite
               org.apache.comet.expressions.conditional.CometCoalesceSuite

diff --git a/.github/workflows/pr_build_macos.yml b/.github/workflows/pr_build_macos.yml
@@ -131,6 +131,7 @@ jobs:
               org.apache.comet.CometBitwiseExpressionSuite
               org.apache.comet.CometMapExpressionSuite
               org.apache.comet.CometJsonExpressionSuite
+              org.apache.comet.CometCsvExpressionSuite
               org.apache.comet.expressions.conditional.CometIfSuite
               org.apache.comet.expressions.conditional.CometCoalesceSuite
               org.apache.comet.expressions.conditional.CometCaseWhenSuite

diff --git a/docs/source/user-guide/latest/configs.md b/docs/source/user-guide/latest/configs.md
@@ -324,6 +324,7 @@ These settings can be used to determine which parts of the plan are accelerated
 | `spark.comet.expression.StringTrimBoth.enabled` | Enable Comet acceleration for `StringTrimBoth` | true |
 | `spark.comet.expression.StringTrimLeft.enabled` | Enable Comet acceleration for `StringTrimLeft` | true |
 | `spark.comet.expression.StringTrimRight.enabled` | Enable Comet acceleration for `StringTrimRight` | true |
+| `spark.comet.expression.StructsToCsv.enabled` | Enable Comet acceleration for `StructsToCsv` | true |
 | `spark.comet.expression.StructsToJson.enabled` | Enable Comet acceleration for `StructsToJson` | true |
 | `spark.comet.expression.Substring.enabled` | Enable Comet acceleration for `Substring` | true |
 | `spark.comet.expression.Subtract.enabled` | Enable Comet acceleration for `Subtract` | true |

diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs
@@ -70,8 +70,8 @@ use datafusion::{
 };
 use datafusion_comet_spark_expr::{
     create_comet_physical_fun, create_comet_physical_fun_with_eval_mode, BinaryOutputStyle,
-    BloomFilterAgg, BloomFilterMightContain, EvalMode, SparkHour, SparkMinute, SparkSecond,
-    SumInteger,
+    BloomFilterAgg, BloomFilterMightContain, CsvWriteOptions, EvalMode, SparkHour, SparkMinute,
+    SparkSecond, SumInteger, ToCsv,
 };
 use iceberg::expr::Bind;
 
@@ -644,6 +644,25 @@ impl PhysicalPlanner {
             ExprStruct::MonotonicallyIncreasingId(_) => Ok(Arc::new(
                 MonotonicallyIncreasingId::from_partition_id(self.partition),
             )),
+            ExprStruct::ToCsv(expr) => {
+                let csv_struct_expr =
+                    self.create_expr(expr.child.as_ref().unwrap(), Arc::clone(&input_schema))?;
+                let options = expr.options.clone().unwrap();
+                let csv_write_options = CsvWriteOptions::new(
+                    options.delimiter,
+                    options.quote,
+                    options.escape,
+                    options.null_value,
+                    options.quote_all,
+                    options.ignore_leading_white_space,
+                    options.ignore_trailing_white_space,
+                );
+                Ok(Arc::new(ToCsv::new(
+                    csv_struct_expr,
+                    &options.timezone,
+                    csv_write_options,
+                )))
+            }
             expr => Err(GeneralError(format!("Not implemented: {expr:?}"))),
         }
     }

diff --git a/native/proto/src/proto/expr.proto b/native/proto/src/proto/expr.proto
@@ -86,6 +86,7 @@ message Expr {
     EmptyExpr spark_partition_id = 63;
     EmptyExpr monotonically_increasing_id = 64;
     FromJson from_json = 89;
+    ToCsv to_csv = 90;
   }
 }
 
@@ -275,6 +276,22 @@ message FromJson {
   string timezone = 3;
 }
 
+// Serialized `to_csv` expression. The native planner turns it into a `ToCsv`
+// physical expression built from `child` and `options`.
+message ToCsv {
+  // Input expression. Required: the native planner calls `unwrap()` on it
+  // before compiling it into a physical expression.
+  Expr child = 1;
+  // CSV formatting options. Required: the native planner calls `unwrap()` on it.
+  CsvWriteOptions options = 2;
+}
+
+// Options that accompany a `ToCsv` expression.
+//
+// Fields 1-7 are passed by the native planner, in this order, to the Rust
+// `CsvWriteOptions::new` constructor.
+message CsvWriteOptions {
+  string delimiter = 1;
+  string quote = 2;
+  string escape = 3;
+  string null_value = 4;
+  bool quote_all = 5;
+  bool ignore_leading_white_space = 6;
+  bool ignore_trailing_white_space = 7;
+  // Not forwarded to `CsvWriteOptions::new`; the native planner passes it to
+  // `ToCsv::new` as a separate argument.
+  string timezone = 8;
+}
+
 enum BinaryOutputStyle {
   UTF8 = 0;
   BASIC = 1;

diff --git a/native/spark-expr/Cargo.toml b/native/spark-expr/Cargo.toml
@@ -88,6 +88,10 @@ harness = false
 name = "normalize_nan"
 harness = false
 
+[[bench]]
+name = "to_csv"
+harness = false
+
 [[test]]
 name = "test_udf_registration"
 path = "tests/spark_expr_reg.rs"
diff --git a/native/spark-expr/benches/to_csv.rs b/native/spark-expr/benches/to_csv.rs
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{
+    BooleanBuilder, Int16Builder, Int32Builder, Int64Builder, Int8Builder, StringBuilder,
+    StructArray, StructBuilder,
+};
+use arrow::datatypes::{DataType, Field};
+use criterion::{criterion_group, criterion_main, Criterion};
+use datafusion_comet_spark_expr::{to_csv_inner, CsvWriteOptions, EvalMode, SparkCastOptions};
+use std::hint::black_box;
+
+/// Builds the benchmark input: a struct array with `array_size` rows and six nullable
+/// columns `f1`..`f6` typed Boolean, Int8, Int16, Int32, Int64 and Utf8.
+///
+/// Rows whose index is a multiple of ten hold null in every column; the struct slot
+/// itself is always appended as valid.
+fn create_struct_array(array_size: usize) -> StructArray {
+    let schema = vec![
+        Field::new("f1", DataType::Boolean, true),
+        Field::new("f2", DataType::Int8, true),
+        Field::new("f3", DataType::Int16, true),
+        Field::new("f4", DataType::Int32, true),
+        Field::new("f5", DataType::Int64, true),
+        Field::new("f6", DataType::Utf8, true),
+    ];
+    let mut builder = StructBuilder::from_fields(schema, array_size);
+    for row in 0..array_size {
+        // Decide once per row whether the columns carry values or nulls.
+        let has_value = row % 10 != 0;
+
+        let f1 = has_value.then_some(row % 2 == 0);
+        builder.field_builder::<BooleanBuilder>(0).unwrap().append_option(f1);
+
+        // Reduced modulo 128 so the value always fits in an i8.
+        let f2 = has_value.then_some((row % 128) as i8);
+        builder.field_builder::<Int8Builder>(1).unwrap().append_option(f2);
+
+        let f3 = has_value.then_some(row as i16);
+        builder.field_builder::<Int16Builder>(2).unwrap().append_option(f3);
+
+        let f4 = has_value.then_some(row as i32);
+        builder.field_builder::<Int32Builder>(3).unwrap().append_option(f4);
+
+        let f5 = has_value.then_some(row as i64);
+        builder.field_builder::<Int64Builder>(4).unwrap().append_option(f5);
+
+        // Lazy `then` so the string is only formatted for non-null rows.
+        let f6 = has_value.then(|| format!("string_{}", row));
+        builder.field_builder::<StringBuilder>(5).unwrap().append_option(f6);
+
+        builder.append(true);
+    }
+    builder.finish()
+}
+
+/// Registers the `to_csv` benchmark: converts an 8192-row struct array with
+/// `to_csv_inner`, using legacy-mode cast options in the UTC timezone and the
+/// delimiter `,`, quote `"`, escape `\` and an empty null value.
+fn criterion_benchmark(c: &mut Criterion) {
+    const ROWS: usize = 8192;
+    const NULL_VALUE: &str = "";
+
+    let input = create_struct_array(ROWS);
+
+    // The cast options get the same null representation as the CSV options.
+    let cast_options = {
+        let mut opts = SparkCastOptions::new(EvalMode::Legacy, "UTC", false);
+        opts.null_string = String::from(NULL_VALUE);
+        opts
+    };
+
+    let write_options = CsvWriteOptions::new(
+        String::from(","),
+        String::from("\""),
+        String::from("\\"),
+        String::from(NULL_VALUE),
+        false,
+        true,
+        true,
+    );
+
+    c.bench_function("to_csv", |bencher| {
+        bencher.iter(|| black_box(to_csv_inner(&input, &cast_options, &write_options).unwrap()))
+    });
+}
+
+// Criterion entry point: `criterion_group!` collects `criterion_benchmark` into the
+// `benches` group and `criterion_main!` generates the `main` function that runs it.
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/native/spark-expr/src/csv_funcs/csv_write_options.rs b/native/spark-expr/src/csv_funcs/csv_write_options.rs
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::fmt::{Display, Formatter};
+
+/// Formatting options handed to the `to_csv` implementation.
+///
+/// NOTE(review): how each option is applied is defined in `to_csv.rs`, which is not
+/// part of this file; the per-field notes below are inferred from the field names and
+/// should be confirmed there.
+#[derive(Debug, Clone, Hash, PartialEq, Eq)]
+pub struct CsvWriteOptions {
+    /// Presumably the separator written between field values.
+    pub delimiter: String,
+    /// Presumably the quoting string placed around quoted values.
+    pub quote: String,
+    /// Presumably the string used to escape quotes inside quoted values.
+    pub escape: String,
+    /// Presumably the text written in place of a null value.
+    pub null_value: String,
+    /// Presumably forces quoting of every value when `true`.
+    pub quote_all: bool,
+    /// Presumably trims leading whitespace from values when `true`.
+    pub ignore_leading_white_space: bool,
+    /// Presumably trims trailing whitespace from values when `true`.
+    pub ignore_trailing_white_space: bool,
+}
+
+impl Display for CsvWriteOptions {
+    /// Writes all seven options as `csv_write_options(name=value, ...)`.
+    ///
+    /// `delimiter` is included so that two option sets differing only in their
+    /// delimiter do not render identically.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "csv_write_options(delimiter={}, quote={}, escape={}, null_value={}, quote_all={}, ignore_leading_white_space={}, ignore_trailing_white_space={})",
+            self.delimiter, self.quote, self.escape, self.null_value, self.quote_all, self.ignore_leading_white_space, self.ignore_trailing_white_space
+        )
+    }
+}
+
+impl CsvWriteOptions {
+    /// Creates a `CsvWriteOptions` by storing each argument in the field of the same
+    /// name. No validation or normalization is performed on the values.
+    pub fn new(
+        delimiter: String,
+        quote: String,
+        escape: String,
+        null_value: String,
+        quote_all: bool,
+        ignore_leading_white_space: bool,
+        ignore_trailing_white_space: bool,
+    ) -> Self {
+        Self {
+            delimiter,
+            quote,
+            escape,
+            null_value,
+            quote_all,
+            ignore_leading_white_space,
+            ignore_trailing_white_space,
+        }
+    }
+}
diff --git a/native/spark-expr/src/csv_funcs/mod.rs b/native/spark-expr/src/csv_funcs/mod.rs
@@ -0,0 +1,22 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod csv_write_options;
+mod to_csv;
+
+pub use csv_write_options::CsvWriteOptions;
+pub use to_csv::{to_csv_inner, ToCsv};