From 3232a51e52cdc1f9caef47be3abd1ee79c8b0a67 Mon Sep 17 00:00:00 2001 From: shekharrajak Date: Fri, 14 Nov 2025 01:13:12 +0530 Subject: [PATCH 01/15] string split implemented --- spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index 6bf3776a23..6497f4186d 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -162,6 +162,7 @@ object QueryPlanSerde extends Logging with CometExprShim { classOf[StringRPad] -> CometStringRPad, classOf[StringLPad] -> CometStringLPad, classOf[StringSpace] -> CometScalarFunction("string_space"), + classOf[StringSplit] -> CometScalarFunction("string_to_array"), classOf[StringTranslate] -> CometScalarFunction("translate"), classOf[StringTrim] -> CometScalarFunction("trim"), classOf[StringTrimBoth] -> CometScalarFunction("btrim"), From 0cea0b1f6f3648092625746a7218fffcdc208fc1 Mon Sep 17 00:00:00 2001 From: shekharrajak Date: Fri, 14 Nov 2025 01:13:23 +0530 Subject: [PATCH 02/15] tests added --- .../comet/CometStringExpressionSuite.scala | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala index f9882780c8..44e9b11b14 100644 --- a/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala @@ -148,6 +148,44 @@ class CometStringExpressionSuite extends CometTestBase { } } + test("split string basic") { + // Basic split tests with 2 arguments (no limit) + withParquetTable((0 until 5).map(i => (s"value$i,test$i", i)), "tbl") { + checkSparkAnswerAndOperator("SELECT split(col, ',') FROM tbl") + checkSparkAnswerAndOperator("SELECT split('one,two,three', ',') FROM tbl") + checkSparkAnswerAndOperator("SELECT split(col, '-') FROM tbl") + } + } + + test("split string with limit") { + // Split tests with 3 arguments (with limit) + withParquetTable((0 until 5).map(i => (s"a,b,c,d,e", i)), "tbl") { + checkSparkAnswerAndOperator("SELECT split(col, ',', 2) FROM tbl") + checkSparkAnswerAndOperator("SELECT split(col, ',', 3) FROM tbl") + checkSparkAnswerAndOperator("SELECT split(col, ',', -1) FROM tbl") + checkSparkAnswerAndOperator("SELECT split(col, ',', 0) FROM tbl") + } + } + + test("split string with regex patterns") { + // Test with various regex patterns + withParquetTable((0 until 5).map(i => (s"word1 word2 word3", i)), "tbl") { + checkSparkAnswerAndOperator("SELECT split(col, ' ') FROM tbl") + checkSparkAnswerAndOperator("SELECT split(col, '\\\\s+') FROM tbl") + } + + withParquetTable((0 until 5).map(i => (s"foo123bar456baz", i)), "tbl2") { + checkSparkAnswerAndOperator("SELECT split(col, '\\\\d+') FROM tbl2") + } + } + + test("split string edge cases") { + // Test edge cases: empty strings, nulls, single character + withParquetTable(Seq(("", 0), ("single", 1), (null, 2), ("a", 3)), "tbl") { + checkSparkAnswerAndOperator("SELECT split(col, ',') FROM tbl") + } + } + test("Various String scalar functions") { val table = "names" withTable(table) { From 6afe842b6e670c075d23154e627abe433f02cfa7 Mon Sep 17 00:00:00 2001 From: shekharrajak Date: Fri, 14 Nov 2025 01:52:18 +0530 Subject: [PATCH 03/15] StringSplit support with fuzz testing --- 
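Note (not part of the patch itself): the lines below are a minimal spark-shell sketch of the `split` limit semantics that the new test suite and the native kernel are expected to reproduce; they assume an active SparkSession bound to `spark`, and the expected arrays shown in the comments mirror the unit tests added later in this series.

    spark.sql("SELECT split('a,b,c,d,e', ',', 3)").show(false)  // => [a, b, c,d,e]  (limit > 0: at most limit-1 splits)
    spark.sql("SELECT split('a,b,c,,', ',', 0)").show(false)    // => [a, b, c]      (limit = 0: trailing empty strings dropped)
    spark.sql("SELECT split('a,b,c,,', ',', -1)").show(false)   // => [a, b, c, , ]  (limit < 0: trailing empty strings kept)
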
fuzz-testing/src/main/scala/org/apache/comet/fuzz/Meta.scala | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fuzz-testing/src/main/scala/org/apache/comet/fuzz/Meta.scala b/fuzz-testing/src/main/scala/org/apache/comet/fuzz/Meta.scala index 2e29cb930b..a510584b9b 100644 --- a/fuzz-testing/src/main/scala/org/apache/comet/fuzz/Meta.scala +++ b/fuzz-testing/src/main/scala/org/apache/comet/fuzz/Meta.scala @@ -184,6 +184,11 @@ object Meta { FunctionSignature(Seq(SparkStringType, SparkIntegralType)), FunctionSignature(Seq(SparkStringType, SparkIntegralType, SparkStringType)))), createUnaryStringFunction("rtrim"), + createFunctions( + "split", + Seq( + FunctionSignature(Seq(SparkStringType, SparkStringType)), + FunctionSignature(Seq(SparkStringType, SparkStringType, SparkIntType)))), createFunctionWithInputTypes("starts_with", Seq(SparkStringType, SparkStringType)), createFunctionWithInputTypes("string_space", Seq(SparkIntType)), createFunctionWithInputTypes("substring", Seq(SparkStringType, SparkIntType, SparkIntType)), From 8a4c2005592a1f1488f59e3205ab7d9e9ba6e0ca Mon Sep 17 00:00:00 2001 From: shekharrajak Date: Fri, 14 Nov 2025 02:14:03 +0530 Subject: [PATCH 04/15] more tests added for UTF-8 characters --- .../comet/CometStringExpressionSuite.scala | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala index 44e9b11b14..09e3165365 100644 --- a/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala @@ -186,6 +186,88 @@ class CometStringExpressionSuite extends CometTestBase { } } + test("split string with UTF-8 characters") { + // Test with multi-byte UTF-8 characters to verify regex engine compatibility + // between Java (Spark) and Rust (Comet) + + // CJK characters + withParquetTable(Seq(("你好,世界", 0), ("こんにちは,世界", 1)), "tbl_cjk") { + checkSparkAnswerAndOperator("SELECT split(col, ',') FROM tbl_cjk") + } + + // Emoji and symbols + withParquetTable(Seq(("😀,😃,😄", 0), ("🔥,💧,🌍", 1), ("α,β,γ", 2)), "tbl_emoji") { + checkSparkAnswerAndOperator("SELECT split(col, ',') FROM tbl_emoji") + } + + // Combining characters / grapheme clusters + // "é" as combining character (e + combining acute accent) + // vs "é" as single character (precomposed) + withParquetTable( + Seq( + ("café,naïve", 0), // precomposed + ("café,naïve", 1), // combining (if your editor supports it) + ("मानक,हिन्दी", 2) + ), // Devanagari script + "tbl_graphemes") { + checkSparkAnswerAndOperator("SELECT split(col, ',') FROM tbl_graphemes") + } + + // Mixed ASCII and multi-byte with regex patterns + withParquetTable( + Seq(("hello世界test你好", 0), ("foo😀bar😃baz", 1), ("abc한글def", 2)), // Korean Hangul + "tbl_mixed") { + // Split on ASCII word boundaries + checkSparkAnswerAndOperator("SELECT split(col, '[a-z]+') FROM tbl_mixed") + } + + // RTL (Right-to-Left) characters + withParquetTable(Seq(("مرحبا,عالم", 0), ("שלום,עולם", 1)), "tbl_rtl") { // Arabic, Hebrew + checkSparkAnswerAndOperator("SELECT split(col, ',') FROM tbl_rtl") + } + + // Zero-width characters and special Unicode + withParquetTable( + Seq( + ("test\u200Bword", 0), // Zero-width space + ("foo\u00ADbar", 1) + ), // Soft hyphen + "tbl_special") { + checkSparkAnswerAndOperator("SELECT split(col, '\u200B') FROM tbl_special") + } + + // Surrogate pairs (4-byte UTF-8) + withParquetTable( + Seq( + ("𝐇𝐞𝐥𝐥𝐨,𝐖𝐨𝐫𝐥𝐝", 0), // 
Mathematical bold letters (U+1D400 range) + ("𠜎,𠜱,𠝹", 1) + ), // CJK Extension B + "tbl_surrogate") { + checkSparkAnswerAndOperator("SELECT split(col, ',') FROM tbl_surrogate") + } + } + + test("split string with UTF-8 regex patterns") { + // Test regex patterns that involve UTF-8 characters + + // Split on Unicode character classes + withParquetTable( + Seq( + ("word1 word2 word3", 0), // Regular space and ideographic space (U+3000) + ("test1\u00A0test2", 1) + ), // Non-breaking space + "tbl_space") { + // Split on any whitespace (should match all Unicode whitespace) + checkSparkAnswerAndOperator("SELECT split(col, '\\\\s+') FROM tbl_space") + } + + // Split with limit on UTF-8 strings + withParquetTable(Seq(("你,好,世,界", 0), ("😀,😃,😄,😁", 1)), "tbl_utf8_limit") { + checkSparkAnswerAndOperator("SELECT split(col, ',', 2) FROM tbl_utf8_limit") + checkSparkAnswerAndOperator("SELECT split(col, ',', -1) FROM tbl_utf8_limit") + } + } + test("Various String scalar functions") { val table = "names" withTable(table) { From 63b92c8b8c3e4789ccb4eb209f657c0a8f74c9c8 Mon Sep 17 00:00:00 2001 From: shekharrajak Date: Fri, 14 Nov 2025 16:27:55 +0530 Subject: [PATCH 05/15] rust native implementation --- native/spark-expr/src/comet_scalar_funcs.rs | 4 + native/spark-expr/src/string_funcs/mod.rs | 2 + native/spark-expr/src/string_funcs/split.rs | 312 ++++++++++++++++++++ 3 files changed, 318 insertions(+) create mode 100644 native/spark-expr/src/string_funcs/split.rs diff --git a/native/spark-expr/src/comet_scalar_funcs.rs b/native/spark-expr/src/comet_scalar_funcs.rs index 021bb1c78f..d8c31dd647 100644 --- a/native/spark-expr/src/comet_scalar_funcs.rs +++ b/native/spark-expr/src/comet_scalar_funcs.rs @@ -185,6 +185,10 @@ pub fn create_comet_physical_fun_with_eval_mode( let func = Arc::new(abs); make_comet_scalar_udf!("abs", func, without data_type) } + "string_to_array" => { + let func = Arc::new(crate::string_funcs::spark_split); + make_comet_scalar_udf!("string_to_array", func, without data_type) + } _ => registry.udf(fun_name).map_err(|e| { DataFusionError::Execution(format!( "Function {fun_name} not found in the registry: {e}", diff --git a/native/spark-expr/src/string_funcs/mod.rs b/native/spark-expr/src/string_funcs/mod.rs index aac8204e29..ae00349ba1 100644 --- a/native/spark-expr/src/string_funcs/mod.rs +++ b/native/spark-expr/src/string_funcs/mod.rs @@ -15,8 +15,10 @@ // specific language governing permissions and limitations // under the License. +mod split; mod string_space; mod substring; +pub use split::spark_split; pub use string_space::SparkStringSpace; pub use substring::SubstringExpr; diff --git a/native/spark-expr/src/string_funcs/split.rs b/native/spark-expr/src/string_funcs/split.rs new file mode 100644 index 0000000000..760ed41362 --- /dev/null +++ b/native/spark-expr/src/string_funcs/split.rs @@ -0,0 +1,312 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ArrayRef, GenericStringArray, Int32Array, ListArray, OffsetSizeTrait}; +use arrow::datatypes::{DataType, Field}; +use datafusion::common::{ + cast::as_generic_string_array, exec_err, DataFusionError, Result as DataFusionResult, + ScalarValue, +}; +use datafusion::logical_expr::ColumnarValue; +use regex::Regex; +use std::sync::Arc; + +/// Spark-compatible split function +/// Splits a string around matches of a regex pattern with optional limit +/// +/// Arguments: +/// - string: The string to split +/// - pattern: The regex pattern to split on +/// - limit (optional): Controls the number of splits +/// - limit > 0: At most limit-1 splits, array length <= limit +/// - limit = 0: As many splits as possible, trailing empty strings removed +/// - limit < 0: As many splits as possible, trailing empty strings kept +pub fn spark_split(args: &[ColumnarValue]) -> DataFusionResult { + if args.len() < 2 || args.len() > 3 { + return exec_err!( + "split expects 2 or 3 arguments (string, pattern, [limit]), got {}", + args.len() + ); + } + + // Get limit parameter (default to -1 if not provided) + let limit = if args.len() == 3 { + match &args[2] { + ColumnarValue::Scalar(ScalarValue::Int32(Some(l))) => *l, + ColumnarValue::Scalar(ScalarValue::Int32(None)) => { + // NULL limit, return NULL + return Ok(ColumnarValue::Scalar(ScalarValue::Null)); + } + _ => { + return exec_err!("split limit argument must be an Int32 scalar"); + } + } + } else { + -1 + }; + + match (&args[0], &args[1]) { + (ColumnarValue::Array(string_array), ColumnarValue::Scalar(ScalarValue::Utf8(pattern))) + | ( + ColumnarValue::Array(string_array), + ColumnarValue::Scalar(ScalarValue::LargeUtf8(pattern)), + ) => { + if pattern.is_none() { + // NULL pattern returns NULL + let null_array = new_null_list_array(string_array.len()); + return Ok(ColumnarValue::Array(null_array)); + } + + let pattern_str = pattern.as_ref().unwrap(); + split_array(string_array.as_ref(), pattern_str, limit) + } + (ColumnarValue::Scalar(ScalarValue::Utf8(string)), ColumnarValue::Scalar(pattern_val)) + | ( + ColumnarValue::Scalar(ScalarValue::LargeUtf8(string)), + ColumnarValue::Scalar(pattern_val), + ) => { + if string.is_none() { + return Ok(ColumnarValue::Scalar(ScalarValue::Null)); + } + + let pattern_str = match pattern_val { + ScalarValue::Utf8(Some(p)) | ScalarValue::LargeUtf8(Some(p)) => p, + ScalarValue::Utf8(None) | ScalarValue::LargeUtf8(None) => { + return Ok(ColumnarValue::Scalar(ScalarValue::Null)); + } + _ => { + return exec_err!("split pattern must be a string"); + } + }; + + let result = split_string(string.as_ref().unwrap(), pattern_str, limit)?; + let string_array = GenericStringArray::::from(result); + let list_array = create_list_array(Arc::new(string_array)); + + Ok(ColumnarValue::Scalar(ScalarValue::List(Arc::new(list_array)))) + } + _ => exec_err!("split expects (array, scalar) or (scalar, scalar) arguments"), + } +} + +fn split_array( + string_array: &dyn arrow::array::Array, + pattern: &str, + limit: i32, +) -> DataFusionResult { + // Compile regex once for the entire array + let regex = 
Regex::new(pattern).map_err(|e| { + DataFusionError::Execution(format!("Invalid regex pattern '{}': {}", pattern, e)) + })?; + + let string_array = match string_array.data_type() { + DataType::Utf8 => as_generic_string_array::(string_array)?, + DataType::LargeUtf8 => { + // Convert LargeUtf8 to Utf8 for processing + let large_array = as_generic_string_array::(string_array)?; + return split_large_string_array(&large_array, ®ex, limit); + } + _ => { + return exec_err!( + "split expects Utf8 or LargeUtf8 string array, got {:?}", + string_array.data_type() + ); + } + }; + + // Build the result ListArray + let mut offsets: Vec = Vec::with_capacity(string_array.len() + 1); + let mut values: Vec = Vec::new(); + offsets.push(0); + + for i in 0..string_array.len() { + if string_array.is_null(i) { + // NULL input produces empty array element (maintain position) + offsets.push(offsets[i]); + } else { + let string_val = string_array.value(i); + let parts = split_with_regex(string_val, ®ex, limit); + values.extend(parts); + offsets.push(values.len() as i32); + } + } + + let values_array = Arc::new(GenericStringArray::::from(values)) as ArrayRef; + let field = Arc::new(Field::new("item", DataType::Utf8, false)); + let list_array = ListArray::new( + field, + arrow::buffer::OffsetBuffer::new(offsets.into()), + values_array, + None, // No nulls at list level + ); + + Ok(ColumnarValue::Array(Arc::new(list_array))) +} + +fn split_large_string_array( + string_array: &GenericStringArray, + regex: &Regex, + limit: i32, +) -> DataFusionResult { + let mut offsets: Vec = Vec::with_capacity(string_array.len() + 1); + let mut values: Vec = Vec::new(); + offsets.push(0); + + for i in 0..string_array.len() { + if string_array.is_null(i) { + offsets.push(offsets[i]); + } else { + let string_val = string_array.value(i); + let parts = split_with_regex(string_val, regex, limit); + values.extend(parts); + offsets.push(values.len() as i32); + } + } + + let values_array = Arc::new(GenericStringArray::::from(values)) as ArrayRef; + let field = Arc::new(Field::new("item", DataType::Utf8, false)); + let list_array = ListArray::new( + field, + arrow::buffer::OffsetBuffer::new(offsets.into()), + values_array, + None, + ); + + Ok(ColumnarValue::Array(Arc::new(list_array))) +} + +fn split_string(string: &str, pattern: &str, limit: i32) -> DataFusionResult> { + let regex = Regex::new(pattern).map_err(|e| { + DataFusionError::Execution(format!("Invalid regex pattern '{}': {}", pattern, e)) + })?; + + Ok(split_with_regex(string, ®ex, limit)) +} + +fn split_with_regex(string: &str, regex: &Regex, limit: i32) -> Vec { + if limit == 0 { + // limit = 0: split as many times as possible, discard trailing empty strings + let mut parts: Vec = regex.split(string).map(|s| s.to_string()).collect(); + // Remove trailing empty strings + while parts.last().map_or(false, |s| s.is_empty()) { + parts.pop(); + } + if parts.is_empty() { + vec!["".to_string()] + } else { + parts + } + } else if limit > 0 { + // limit > 0: at most limit-1 splits (array length <= limit) + let mut parts: Vec = Vec::new(); + let mut last_end = 0; + let mut count = 0; + + for mat in regex.find_iter(string) { + if count >= limit - 1 { + break; + } + parts.push(string[last_end..mat.start()].to_string()); + last_end = mat.end(); + count += 1; + } + // Add the remaining string + parts.push(string[last_end..].to_string()); + parts + } else { + // limit < 0: split as many times as possible, keep trailing empty strings + regex.split(string).map(|s| s.to_string()).collect() + } +} + 
+fn create_list_array(values: ArrayRef) -> ListArray { + let field = Arc::new(Field::new("item", DataType::Utf8, false)); + let offsets = vec![0i32, values.len() as i32]; + ListArray::new( + field, + arrow::buffer::OffsetBuffer::new(offsets.into()), + values, + None, + ) +} + +fn new_null_list_array(len: usize) -> ArrayRef { + let field = Arc::new(Field::new("item", DataType::Utf8, false)); + let values = Arc::new(GenericStringArray::::from(Vec::::new())) as ArrayRef; + let offsets = vec![0i32; len + 1]; + let nulls = arrow::buffer::NullBuffer::new_null(len); + + Arc::new(ListArray::new( + field, + arrow::buffer::OffsetBuffer::new(offsets.into()), + values, + Some(nulls), + )) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::StringArray; + + #[test] + fn test_split_basic() { + let string_array = Arc::new(StringArray::from(vec!["a,b,c", "x,y,z"])) as ArrayRef; + let pattern = ColumnarValue::Scalar(ScalarValue::Utf8(Some(",".to_string()))); + let args = vec![ColumnarValue::Array(string_array), pattern]; + + let result = spark_split(&args).unwrap(); + // Should produce [["a", "b", "c"], ["x", "y", "z"]] + assert!(matches!(result, ColumnarValue::Array(_))); + } + + #[test] + fn test_split_with_limit() { + let string_array = Arc::new(StringArray::from(vec!["a,b,c,d"])) as ArrayRef; + let pattern = ColumnarValue::Scalar(ScalarValue::Utf8(Some(",".to_string()))); + let limit = ColumnarValue::Scalar(ScalarValue::Int32(Some(2))); + let args = vec![ColumnarValue::Array(string_array), pattern, limit]; + + let result = spark_split(&args).unwrap(); + // Should produce [["a", "b,c,d"]] + assert!(matches!(result, ColumnarValue::Array(_))); + } + + #[test] + fn test_split_regex() { + let parts = split_string("foo123bar456baz", r"\d+", -1).unwrap(); + assert_eq!(parts, vec!["foo", "bar", "baz"]); + } + + #[test] + fn test_split_limit_positive() { + let parts = split_string("a,b,c,d,e", ",", 3).unwrap(); + assert_eq!(parts, vec!["a", "b", "c,d,e"]); + } + + #[test] + fn test_split_limit_zero() { + let parts = split_string("a,b,c,,", ",", 0).unwrap(); + assert_eq!(parts, vec!["a", "b", "c"]); + } + + #[test] + fn test_split_limit_negative() { + let parts = split_string("a,b,c,,", ",", -1).unwrap(); + assert_eq!(parts, vec!["a", "b", "c", "", ""]); + } +} From 9ecc0a67ea185e4e4ead6ada7d737c9033bb1cf2 Mon Sep 17 00:00:00 2001 From: shekharrajak Date: Fri, 14 Nov 2025 16:38:08 +0530 Subject: [PATCH 06/15] renamed the scalar function to split --- native/spark-expr/src/comet_scalar_funcs.rs | 4 ++-- .../main/scala/org/apache/comet/serde/QueryPlanSerde.scala | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/native/spark-expr/src/comet_scalar_funcs.rs b/native/spark-expr/src/comet_scalar_funcs.rs index d8c31dd647..1844942630 100644 --- a/native/spark-expr/src/comet_scalar_funcs.rs +++ b/native/spark-expr/src/comet_scalar_funcs.rs @@ -185,9 +185,9 @@ pub fn create_comet_physical_fun_with_eval_mode( let func = Arc::new(abs); make_comet_scalar_udf!("abs", func, without data_type) } - "string_to_array" => { + "split" => { let func = Arc::new(crate::string_funcs::spark_split); - make_comet_scalar_udf!("string_to_array", func, without data_type) + make_comet_scalar_udf!("split", func, without data_type) } _ => registry.udf(fun_name).map_err(|e| { DataFusionError::Execution(format!( diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index 6497f4186d..c1c8311ba3 100644 --- 
a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -162,7 +162,7 @@ object QueryPlanSerde extends Logging with CometExprShim { classOf[StringRPad] -> CometStringRPad, classOf[StringLPad] -> CometStringLPad, classOf[StringSpace] -> CometScalarFunction("string_space"), - classOf[StringSplit] -> CometScalarFunction("string_to_array"), + classOf[StringSplit] -> CometScalarFunction("split"), classOf[StringTranslate] -> CometScalarFunction("translate"), classOf[StringTrim] -> CometScalarFunction("trim"), classOf[StringTrimBoth] -> CometScalarFunction("btrim"), From 10cee6f92f76edb4e1304db974caaa4a06001780 Mon Sep 17 00:00:00 2001 From: shekharrajak Date: Fri, 14 Nov 2025 23:21:30 +0530 Subject: [PATCH 07/15] fixes tests --- native/spark-expr/src/string_funcs/split.rs | 6 +-- .../comet/CometStringExpressionSuite.scala | 40 +++++++++---------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/native/spark-expr/src/string_funcs/split.rs b/native/spark-expr/src/string_funcs/split.rs index 760ed41362..e6a646ff14 100644 --- a/native/spark-expr/src/string_funcs/split.rs +++ b/native/spark-expr/src/string_funcs/split.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use arrow::array::{ArrayRef, GenericStringArray, Int32Array, ListArray, OffsetSizeTrait}; +use arrow::array::{Array, ArrayRef, GenericStringArray, ListArray}; use arrow::datatypes::{DataType, Field}; use datafusion::common::{ cast::as_generic_string_array, exec_err, DataFusionError, Result as DataFusionResult, @@ -157,8 +157,8 @@ fn split_array( Ok(ColumnarValue::Array(Arc::new(list_array))) } -fn split_large_string_array( - string_array: &GenericStringArray, +fn split_large_string_array( + string_array: &GenericStringArray, regex: &Regex, limit: i32, ) -> DataFusionResult { diff --git a/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala index 09e3165365..173ce74a78 100644 --- a/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala @@ -151,38 +151,38 @@ class CometStringExpressionSuite extends CometTestBase { test("split string basic") { // Basic split tests with 2 arguments (no limit) withParquetTable((0 until 5).map(i => (s"value$i,test$i", i)), "tbl") { - checkSparkAnswerAndOperator("SELECT split(col, ',') FROM tbl") + checkSparkAnswerAndOperator("SELECT split(_1, ',') FROM tbl") checkSparkAnswerAndOperator("SELECT split('one,two,three', ',') FROM tbl") - checkSparkAnswerAndOperator("SELECT split(col, '-') FROM tbl") + checkSparkAnswerAndOperator("SELECT split(_1, '-') FROM tbl") } } test("split string with limit") { // Split tests with 3 arguments (with limit) withParquetTable((0 until 5).map(i => (s"a,b,c,d,e", i)), "tbl") { - checkSparkAnswerAndOperator("SELECT split(col, ',', 2) FROM tbl") - checkSparkAnswerAndOperator("SELECT split(col, ',', 3) FROM tbl") - checkSparkAnswerAndOperator("SELECT split(col, ',', -1) FROM tbl") - checkSparkAnswerAndOperator("SELECT split(col, ',', 0) FROM tbl") + checkSparkAnswerAndOperator("SELECT split(_1, ',', 2) FROM tbl") + checkSparkAnswerAndOperator("SELECT split(_1, ',', 3) FROM tbl") + checkSparkAnswerAndOperator("SELECT split(_1, ',', -1) FROM tbl") + checkSparkAnswerAndOperator("SELECT split(_1, ',', 0) FROM tbl") } } test("split string with regex 
patterns") { // Test with various regex patterns withParquetTable((0 until 5).map(i => (s"word1 word2 word3", i)), "tbl") { - checkSparkAnswerAndOperator("SELECT split(col, ' ') FROM tbl") - checkSparkAnswerAndOperator("SELECT split(col, '\\\\s+') FROM tbl") + checkSparkAnswerAndOperator("SELECT split(_1, ' ') FROM tbl") + checkSparkAnswerAndOperator("SELECT split(_1, '\\\\s+') FROM tbl") } withParquetTable((0 until 5).map(i => (s"foo123bar456baz", i)), "tbl2") { - checkSparkAnswerAndOperator("SELECT split(col, '\\\\d+') FROM tbl2") + checkSparkAnswerAndOperator("SELECT split(_1, '\\\\d+') FROM tbl2") } } test("split string edge cases") { // Test edge cases: empty strings, nulls, single character withParquetTable(Seq(("", 0), ("single", 1), (null, 2), ("a", 3)), "tbl") { - checkSparkAnswerAndOperator("SELECT split(col, ',') FROM tbl") + checkSparkAnswerAndOperator("SELECT split(_1, ',') FROM tbl") } } @@ -192,12 +192,12 @@ class CometStringExpressionSuite extends CometTestBase { // CJK characters withParquetTable(Seq(("你好,世界", 0), ("こんにちは,世界", 1)), "tbl_cjk") { - checkSparkAnswerAndOperator("SELECT split(col, ',') FROM tbl_cjk") + checkSparkAnswerAndOperator("SELECT split(_1, ',') FROM tbl_cjk") } // Emoji and symbols withParquetTable(Seq(("😀,😃,😄", 0), ("🔥,💧,🌍", 1), ("α,β,γ", 2)), "tbl_emoji") { - checkSparkAnswerAndOperator("SELECT split(col, ',') FROM tbl_emoji") + checkSparkAnswerAndOperator("SELECT split(_1, ',') FROM tbl_emoji") } // Combining characters / grapheme clusters @@ -210,7 +210,7 @@ class CometStringExpressionSuite extends CometTestBase { ("मानक,हिन्दी", 2) ), // Devanagari script "tbl_graphemes") { - checkSparkAnswerAndOperator("SELECT split(col, ',') FROM tbl_graphemes") + checkSparkAnswerAndOperator("SELECT split(_1, ',') FROM tbl_graphemes") } // Mixed ASCII and multi-byte with regex patterns @@ -218,12 +218,12 @@ class CometStringExpressionSuite extends CometTestBase { Seq(("hello世界test你好", 0), ("foo😀bar😃baz", 1), ("abc한글def", 2)), // Korean Hangul "tbl_mixed") { // Split on ASCII word boundaries - checkSparkAnswerAndOperator("SELECT split(col, '[a-z]+') FROM tbl_mixed") + checkSparkAnswerAndOperator("SELECT split(_1, '[a-z]+') FROM tbl_mixed") } // RTL (Right-to-Left) characters withParquetTable(Seq(("مرحبا,عالم", 0), ("שלום,עולם", 1)), "tbl_rtl") { // Arabic, Hebrew - checkSparkAnswerAndOperator("SELECT split(col, ',') FROM tbl_rtl") + checkSparkAnswerAndOperator("SELECT split(_1, ',') FROM tbl_rtl") } // Zero-width characters and special Unicode @@ -233,7 +233,7 @@ class CometStringExpressionSuite extends CometTestBase { ("foo\u00ADbar", 1) ), // Soft hyphen "tbl_special") { - checkSparkAnswerAndOperator("SELECT split(col, '\u200B') FROM tbl_special") + checkSparkAnswerAndOperator("SELECT split(_1, '\u200B') FROM tbl_special") } // Surrogate pairs (4-byte UTF-8) @@ -243,7 +243,7 @@ class CometStringExpressionSuite extends CometTestBase { ("𠜎,𠜱,𠝹", 1) ), // CJK Extension B "tbl_surrogate") { - checkSparkAnswerAndOperator("SELECT split(col, ',') FROM tbl_surrogate") + checkSparkAnswerAndOperator("SELECT split(_1, ',') FROM tbl_surrogate") } } @@ -258,13 +258,13 @@ class CometStringExpressionSuite extends CometTestBase { ), // Non-breaking space "tbl_space") { // Split on any whitespace (should match all Unicode whitespace) - checkSparkAnswerAndOperator("SELECT split(col, '\\\\s+') FROM tbl_space") + checkSparkAnswerAndOperator("SELECT split(_1, '\\\\s+') FROM tbl_space") } // Split with limit on UTF-8 strings withParquetTable(Seq(("你,好,世,界", 0), ("😀,😃,😄,😁", 1)), 
"tbl_utf8_limit") { - checkSparkAnswerAndOperator("SELECT split(col, ',', 2) FROM tbl_utf8_limit") - checkSparkAnswerAndOperator("SELECT split(col, ',', -1) FROM tbl_utf8_limit") + checkSparkAnswerAndOperator("SELECT split(_1, ',', 2) FROM tbl_utf8_limit") + checkSparkAnswerAndOperator("SELECT split(_1, ',', -1) FROM tbl_utf8_limit") } } From 7c412618060dc42c55fc8cf062f7e513e115c288 Mon Sep 17 00:00:00 2001 From: shekharrajak Date: Sat, 15 Nov 2025 12:32:20 +0530 Subject: [PATCH 08/15] doc update - with build run --- docs/source/user-guide/latest/configs.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/user-guide/latest/configs.md b/docs/source/user-guide/latest/configs.md index 1767137b09..3ed8be8515 100644 --- a/docs/source/user-guide/latest/configs.md +++ b/docs/source/user-guide/latest/configs.md @@ -307,6 +307,7 @@ These settings can be used to determine which parts of the plan are accelerated | `spark.comet.expression.StringRepeat.enabled` | Enable Comet acceleration for `StringRepeat` | true | | `spark.comet.expression.StringReplace.enabled` | Enable Comet acceleration for `StringReplace` | true | | `spark.comet.expression.StringSpace.enabled` | Enable Comet acceleration for `StringSpace` | true | +| `spark.comet.expression.StringSplit.enabled` | Enable Comet acceleration for `StringSplit` | true | | `spark.comet.expression.StringTranslate.enabled` | Enable Comet acceleration for `StringTranslate` | true | | `spark.comet.expression.StringTrim.enabled` | Enable Comet acceleration for `StringTrim` | true | | `spark.comet.expression.StringTrimBoth.enabled` | Enable Comet acceleration for `StringTrimBoth` | true | From fc4b0298fe0d87c4fe79fbe187c52fd71dd0e23c Mon Sep 17 00:00:00 2001 From: shekharrajak Date: Mon, 17 Nov 2025 09:55:40 +0530 Subject: [PATCH 09/15] checkstyle fixes --- native/spark-expr/src/string_funcs/split.rs | 4 +++- .../scala/org/apache/comet/CometStringExpressionSuite.scala | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/native/spark-expr/src/string_funcs/split.rs b/native/spark-expr/src/string_funcs/split.rs index e6a646ff14..f3c2c33782 100644 --- a/native/spark-expr/src/string_funcs/split.rs +++ b/native/spark-expr/src/string_funcs/split.rs @@ -97,7 +97,9 @@ pub fn spark_split(args: &[ColumnarValue]) -> DataFusionResult { let string_array = GenericStringArray::::from(result); let list_array = create_list_array(Arc::new(string_array)); - Ok(ColumnarValue::Scalar(ScalarValue::List(Arc::new(list_array)))) + Ok(ColumnarValue::Scalar(ScalarValue::List(Arc::new( + list_array, + )))) } _ => exec_err!("split expects (array, scalar) or (scalar, scalar) arguments"), } diff --git a/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala index 173ce74a78..ae5e5a73d6 100644 --- a/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala @@ -159,7 +159,7 @@ class CometStringExpressionSuite extends CometTestBase { test("split string with limit") { // Split tests with 3 arguments (with limit) - withParquetTable((0 until 5).map(i => (s"a,b,c,d,e", i)), "tbl") { + withParquetTable((0 until 5).map(i => ("a,b,c,d,e", i)), "tbl") { checkSparkAnswerAndOperator("SELECT split(_1, ',', 2) FROM tbl") checkSparkAnswerAndOperator("SELECT split(_1, ',', 3) FROM tbl") checkSparkAnswerAndOperator("SELECT split(_1, ',', -1) FROM tbl") @@ -169,12 +169,12 @@ class 
CometStringExpressionSuite extends CometTestBase { test("split string with regex patterns") { // Test with various regex patterns - withParquetTable((0 until 5).map(i => (s"word1 word2 word3", i)), "tbl") { + withParquetTable((0 until 5).map(i => ("word1 word2 word3", i)), "tbl") { checkSparkAnswerAndOperator("SELECT split(_1, ' ') FROM tbl") checkSparkAnswerAndOperator("SELECT split(_1, '\\\\s+') FROM tbl") } - withParquetTable((0 until 5).map(i => (s"foo123bar456baz", i)), "tbl2") { + withParquetTable((0 until 5).map(i => ("foo123bar456baz", i)), "tbl2") { checkSparkAnswerAndOperator("SELECT split(_1, '\\\\d+') FROM tbl2") } } From 1f8f2b20ee10bb6f224efdc4d52de79b6a594758 Mon Sep 17 00:00:00 2001 From: shekharrajak Date: Mon, 17 Nov 2025 12:13:20 +0530 Subject: [PATCH 10/15] docs: Add StringSplit configuration to user guide --- docs/source/user-guide/latest/configs.md | 297 ++++++++++++----------- 1 file changed, 149 insertions(+), 148 deletions(-) diff --git a/docs/source/user-guide/latest/configs.md b/docs/source/user-guide/latest/configs.md index 3ed8be8515..06b9f15c69 100644 --- a/docs/source/user-guide/latest/configs.md +++ b/docs/source/user-guide/latest/configs.md @@ -179,154 +179,155 @@ These settings can be used to determine which parts of the plan are accelerated - -| Config | Description | Default Value | -|--------|-------------|---------------| -| `spark.comet.expression.Abs.enabled` | Enable Comet acceleration for `Abs` | true | -| `spark.comet.expression.Acos.enabled` | Enable Comet acceleration for `Acos` | true | -| `spark.comet.expression.Add.enabled` | Enable Comet acceleration for `Add` | true | -| `spark.comet.expression.Alias.enabled` | Enable Comet acceleration for `Alias` | true | -| `spark.comet.expression.And.enabled` | Enable Comet acceleration for `And` | true | -| `spark.comet.expression.ArrayAppend.enabled` | Enable Comet acceleration for `ArrayAppend` | true | -| `spark.comet.expression.ArrayCompact.enabled` | Enable Comet acceleration for `ArrayCompact` | true | -| `spark.comet.expression.ArrayContains.enabled` | Enable Comet acceleration for `ArrayContains` | true | -| `spark.comet.expression.ArrayDistinct.enabled` | Enable Comet acceleration for `ArrayDistinct` | true | -| `spark.comet.expression.ArrayExcept.enabled` | Enable Comet acceleration for `ArrayExcept` | true | -| `spark.comet.expression.ArrayFilter.enabled` | Enable Comet acceleration for `ArrayFilter` | true | -| `spark.comet.expression.ArrayInsert.enabled` | Enable Comet acceleration for `ArrayInsert` | true | -| `spark.comet.expression.ArrayIntersect.enabled` | Enable Comet acceleration for `ArrayIntersect` | true | -| `spark.comet.expression.ArrayJoin.enabled` | Enable Comet acceleration for `ArrayJoin` | true | -| `spark.comet.expression.ArrayMax.enabled` | Enable Comet acceleration for `ArrayMax` | true | -| `spark.comet.expression.ArrayMin.enabled` | Enable Comet acceleration for `ArrayMin` | true | -| `spark.comet.expression.ArrayRemove.enabled` | Enable Comet acceleration for `ArrayRemove` | true | -| `spark.comet.expression.ArrayRepeat.enabled` | Enable Comet acceleration for `ArrayRepeat` | true | -| `spark.comet.expression.ArrayUnion.enabled` | Enable Comet acceleration for `ArrayUnion` | true | -| `spark.comet.expression.ArraysOverlap.enabled` | Enable Comet acceleration for `ArraysOverlap` | true | -| `spark.comet.expression.Ascii.enabled` | Enable Comet acceleration for `Ascii` | true | -| `spark.comet.expression.Asin.enabled` | Enable Comet acceleration for `Asin` | true | -| 
`spark.comet.expression.Atan.enabled` | Enable Comet acceleration for `Atan` | true | -| `spark.comet.expression.Atan2.enabled` | Enable Comet acceleration for `Atan2` | true | -| `spark.comet.expression.AttributeReference.enabled` | Enable Comet acceleration for `AttributeReference` | true | -| `spark.comet.expression.BitLength.enabled` | Enable Comet acceleration for `BitLength` | true | -| `spark.comet.expression.BitwiseAnd.enabled` | Enable Comet acceleration for `BitwiseAnd` | true | -| `spark.comet.expression.BitwiseCount.enabled` | Enable Comet acceleration for `BitwiseCount` | true | -| `spark.comet.expression.BitwiseGet.enabled` | Enable Comet acceleration for `BitwiseGet` | true | -| `spark.comet.expression.BitwiseNot.enabled` | Enable Comet acceleration for `BitwiseNot` | true | -| `spark.comet.expression.BitwiseOr.enabled` | Enable Comet acceleration for `BitwiseOr` | true | -| `spark.comet.expression.BitwiseXor.enabled` | Enable Comet acceleration for `BitwiseXor` | true | -| `spark.comet.expression.CaseWhen.enabled` | Enable Comet acceleration for `CaseWhen` | true | -| `spark.comet.expression.Cast.enabled` | Enable Comet acceleration for `Cast` | true | -| `spark.comet.expression.Ceil.enabled` | Enable Comet acceleration for `Ceil` | true | -| `spark.comet.expression.CheckOverflow.enabled` | Enable Comet acceleration for `CheckOverflow` | true | -| `spark.comet.expression.Chr.enabled` | Enable Comet acceleration for `Chr` | true | -| `spark.comet.expression.Coalesce.enabled` | Enable Comet acceleration for `Coalesce` | true | -| `spark.comet.expression.Concat.enabled` | Enable Comet acceleration for `Concat` | true | -| `spark.comet.expression.ConcatWs.enabled` | Enable Comet acceleration for `ConcatWs` | true | -| `spark.comet.expression.Contains.enabled` | Enable Comet acceleration for `Contains` | true | -| `spark.comet.expression.Cos.enabled` | Enable Comet acceleration for `Cos` | true | -| `spark.comet.expression.Cot.enabled` | Enable Comet acceleration for `Cot` | true | -| `spark.comet.expression.CreateArray.enabled` | Enable Comet acceleration for `CreateArray` | true | -| `spark.comet.expression.CreateNamedStruct.enabled` | Enable Comet acceleration for `CreateNamedStruct` | true | -| `spark.comet.expression.DateAdd.enabled` | Enable Comet acceleration for `DateAdd` | true | -| `spark.comet.expression.DateSub.enabled` | Enable Comet acceleration for `DateSub` | true | -| `spark.comet.expression.DayOfMonth.enabled` | Enable Comet acceleration for `DayOfMonth` | true | -| `spark.comet.expression.DayOfWeek.enabled` | Enable Comet acceleration for `DayOfWeek` | true | -| `spark.comet.expression.DayOfYear.enabled` | Enable Comet acceleration for `DayOfYear` | true | -| `spark.comet.expression.Divide.enabled` | Enable Comet acceleration for `Divide` | true | -| `spark.comet.expression.ElementAt.enabled` | Enable Comet acceleration for `ElementAt` | true | -| `spark.comet.expression.EndsWith.enabled` | Enable Comet acceleration for `EndsWith` | true | -| `spark.comet.expression.EqualNullSafe.enabled` | Enable Comet acceleration for `EqualNullSafe` | true | -| `spark.comet.expression.EqualTo.enabled` | Enable Comet acceleration for `EqualTo` | true | -| `spark.comet.expression.Exp.enabled` | Enable Comet acceleration for `Exp` | true | -| `spark.comet.expression.Expm1.enabled` | Enable Comet acceleration for `Expm1` | true | -| `spark.comet.expression.Flatten.enabled` | Enable Comet acceleration for `Flatten` | true | -| `spark.comet.expression.Floor.enabled` | Enable 
Comet acceleration for `Floor` | true | -| `spark.comet.expression.FromUnixTime.enabled` | Enable Comet acceleration for `FromUnixTime` | true | -| `spark.comet.expression.GetArrayItem.enabled` | Enable Comet acceleration for `GetArrayItem` | true | -| `spark.comet.expression.GetArrayStructFields.enabled` | Enable Comet acceleration for `GetArrayStructFields` | true | -| `spark.comet.expression.GetMapValue.enabled` | Enable Comet acceleration for `GetMapValue` | true | -| `spark.comet.expression.GetStructField.enabled` | Enable Comet acceleration for `GetStructField` | true | -| `spark.comet.expression.GreaterThan.enabled` | Enable Comet acceleration for `GreaterThan` | true | -| `spark.comet.expression.GreaterThanOrEqual.enabled` | Enable Comet acceleration for `GreaterThanOrEqual` | true | -| `spark.comet.expression.Hex.enabled` | Enable Comet acceleration for `Hex` | true | -| `spark.comet.expression.Hour.enabled` | Enable Comet acceleration for `Hour` | true | -| `spark.comet.expression.If.enabled` | Enable Comet acceleration for `If` | true | -| `spark.comet.expression.In.enabled` | Enable Comet acceleration for `In` | true | -| `spark.comet.expression.InSet.enabled` | Enable Comet acceleration for `InSet` | true | -| `spark.comet.expression.InitCap.enabled` | Enable Comet acceleration for `InitCap` | true | -| `spark.comet.expression.IntegralDivide.enabled` | Enable Comet acceleration for `IntegralDivide` | true | -| `spark.comet.expression.IsNaN.enabled` | Enable Comet acceleration for `IsNaN` | true | -| `spark.comet.expression.IsNotNull.enabled` | Enable Comet acceleration for `IsNotNull` | true | -| `spark.comet.expression.IsNull.enabled` | Enable Comet acceleration for `IsNull` | true | -| `spark.comet.expression.Length.enabled` | Enable Comet acceleration for `Length` | true | -| `spark.comet.expression.LessThan.enabled` | Enable Comet acceleration for `LessThan` | true | -| `spark.comet.expression.LessThanOrEqual.enabled` | Enable Comet acceleration for `LessThanOrEqual` | true | -| `spark.comet.expression.Like.enabled` | Enable Comet acceleration for `Like` | true | -| `spark.comet.expression.Literal.enabled` | Enable Comet acceleration for `Literal` | true | -| `spark.comet.expression.Log.enabled` | Enable Comet acceleration for `Log` | true | -| `spark.comet.expression.Log10.enabled` | Enable Comet acceleration for `Log10` | true | -| `spark.comet.expression.Log2.enabled` | Enable Comet acceleration for `Log2` | true | -| `spark.comet.expression.Lower.enabled` | Enable Comet acceleration for `Lower` | true | -| `spark.comet.expression.MapEntries.enabled` | Enable Comet acceleration for `MapEntries` | true | -| `spark.comet.expression.MapFromArrays.enabled` | Enable Comet acceleration for `MapFromArrays` | true | -| `spark.comet.expression.MapKeys.enabled` | Enable Comet acceleration for `MapKeys` | true | -| `spark.comet.expression.MapValues.enabled` | Enable Comet acceleration for `MapValues` | true | -| `spark.comet.expression.Md5.enabled` | Enable Comet acceleration for `Md5` | true | -| `spark.comet.expression.Minute.enabled` | Enable Comet acceleration for `Minute` | true | -| `spark.comet.expression.MonotonicallyIncreasingID.enabled` | Enable Comet acceleration for `MonotonicallyIncreasingID` | true | -| `spark.comet.expression.Month.enabled` | Enable Comet acceleration for `Month` | true | -| `spark.comet.expression.Multiply.enabled` | Enable Comet acceleration for `Multiply` | true | -| `spark.comet.expression.Murmur3Hash.enabled` | Enable Comet acceleration for 
`Murmur3Hash` | true | -| `spark.comet.expression.Not.enabled` | Enable Comet acceleration for `Not` | true | -| `spark.comet.expression.OctetLength.enabled` | Enable Comet acceleration for `OctetLength` | true | -| `spark.comet.expression.Or.enabled` | Enable Comet acceleration for `Or` | true | -| `spark.comet.expression.Pow.enabled` | Enable Comet acceleration for `Pow` | true | -| `spark.comet.expression.Quarter.enabled` | Enable Comet acceleration for `Quarter` | true | -| `spark.comet.expression.RLike.enabled` | Enable Comet acceleration for `RLike` | true | -| `spark.comet.expression.Rand.enabled` | Enable Comet acceleration for `Rand` | true | -| `spark.comet.expression.Randn.enabled` | Enable Comet acceleration for `Randn` | true | -| `spark.comet.expression.RegExpReplace.enabled` | Enable Comet acceleration for `RegExpReplace` | true | -| `spark.comet.expression.Remainder.enabled` | Enable Comet acceleration for `Remainder` | true | -| `spark.comet.expression.Reverse.enabled` | Enable Comet acceleration for `Reverse` | true | -| `spark.comet.expression.Round.enabled` | Enable Comet acceleration for `Round` | true | -| `spark.comet.expression.Second.enabled` | Enable Comet acceleration for `Second` | true | -| `spark.comet.expression.Sha1.enabled` | Enable Comet acceleration for `Sha1` | true | -| `spark.comet.expression.Sha2.enabled` | Enable Comet acceleration for `Sha2` | true | -| `spark.comet.expression.ShiftLeft.enabled` | Enable Comet acceleration for `ShiftLeft` | true | -| `spark.comet.expression.ShiftRight.enabled` | Enable Comet acceleration for `ShiftRight` | true | -| `spark.comet.expression.Signum.enabled` | Enable Comet acceleration for `Signum` | true | -| `spark.comet.expression.Sin.enabled` | Enable Comet acceleration for `Sin` | true | -| `spark.comet.expression.SortOrder.enabled` | Enable Comet acceleration for `SortOrder` | true | -| `spark.comet.expression.SparkPartitionID.enabled` | Enable Comet acceleration for `SparkPartitionID` | true | -| `spark.comet.expression.Sqrt.enabled` | Enable Comet acceleration for `Sqrt` | true | -| `spark.comet.expression.StartsWith.enabled` | Enable Comet acceleration for `StartsWith` | true | -| `spark.comet.expression.StaticInvoke.enabled` | Enable Comet acceleration for `StaticInvoke` | true | -| `spark.comet.expression.StringInstr.enabled` | Enable Comet acceleration for `StringInstr` | true | -| `spark.comet.expression.StringLPad.enabled` | Enable Comet acceleration for `StringLPad` | true | -| `spark.comet.expression.StringRPad.enabled` | Enable Comet acceleration for `StringRPad` | true | -| `spark.comet.expression.StringRepeat.enabled` | Enable Comet acceleration for `StringRepeat` | true | -| `spark.comet.expression.StringReplace.enabled` | Enable Comet acceleration for `StringReplace` | true | -| `spark.comet.expression.StringSpace.enabled` | Enable Comet acceleration for `StringSpace` | true | -| `spark.comet.expression.StringSplit.enabled` | Enable Comet acceleration for `StringSplit` | true | -| `spark.comet.expression.StringTranslate.enabled` | Enable Comet acceleration for `StringTranslate` | true | -| `spark.comet.expression.StringTrim.enabled` | Enable Comet acceleration for `StringTrim` | true | -| `spark.comet.expression.StringTrimBoth.enabled` | Enable Comet acceleration for `StringTrimBoth` | true | -| `spark.comet.expression.StringTrimLeft.enabled` | Enable Comet acceleration for `StringTrimLeft` | true | -| `spark.comet.expression.StringTrimRight.enabled` | Enable Comet acceleration for 
`StringTrimRight` | true | -| `spark.comet.expression.StructsToJson.enabled` | Enable Comet acceleration for `StructsToJson` | true | -| `spark.comet.expression.Substring.enabled` | Enable Comet acceleration for `Substring` | true | -| `spark.comet.expression.Subtract.enabled` | Enable Comet acceleration for `Subtract` | true | -| `spark.comet.expression.Tan.enabled` | Enable Comet acceleration for `Tan` | true | -| `spark.comet.expression.TruncDate.enabled` | Enable Comet acceleration for `TruncDate` | true | -| `spark.comet.expression.TruncTimestamp.enabled` | Enable Comet acceleration for `TruncTimestamp` | true | -| `spark.comet.expression.UnaryMinus.enabled` | Enable Comet acceleration for `UnaryMinus` | true | -| `spark.comet.expression.Unhex.enabled` | Enable Comet acceleration for `Unhex` | true | -| `spark.comet.expression.Upper.enabled` | Enable Comet acceleration for `Upper` | true | -| `spark.comet.expression.WeekDay.enabled` | Enable Comet acceleration for `WeekDay` | true | -| `spark.comet.expression.WeekOfYear.enabled` | Enable Comet acceleration for `WeekOfYear` | true | -| `spark.comet.expression.XxHash64.enabled` | Enable Comet acceleration for `XxHash64` | true | -| `spark.comet.expression.Year.enabled` | Enable Comet acceleration for `Year` | true | - + +| Config | Description | Default Value | +| ---------------------------------------------------------- | --------------------------------------------------------- | ------------- | +| `spark.comet.expression.Abs.enabled` | Enable Comet acceleration for `Abs` | true | +| `spark.comet.expression.Acos.enabled` | Enable Comet acceleration for `Acos` | true | +| `spark.comet.expression.Add.enabled` | Enable Comet acceleration for `Add` | true | +| `spark.comet.expression.Alias.enabled` | Enable Comet acceleration for `Alias` | true | +| `spark.comet.expression.And.enabled` | Enable Comet acceleration for `And` | true | +| `spark.comet.expression.ArrayAppend.enabled` | Enable Comet acceleration for `ArrayAppend` | true | +| `spark.comet.expression.ArrayCompact.enabled` | Enable Comet acceleration for `ArrayCompact` | true | +| `spark.comet.expression.ArrayContains.enabled` | Enable Comet acceleration for `ArrayContains` | true | +| `spark.comet.expression.ArrayDistinct.enabled` | Enable Comet acceleration for `ArrayDistinct` | true | +| `spark.comet.expression.ArrayExcept.enabled` | Enable Comet acceleration for `ArrayExcept` | true | +| `spark.comet.expression.ArrayFilter.enabled` | Enable Comet acceleration for `ArrayFilter` | true | +| `spark.comet.expression.ArrayInsert.enabled` | Enable Comet acceleration for `ArrayInsert` | true | +| `spark.comet.expression.ArrayIntersect.enabled` | Enable Comet acceleration for `ArrayIntersect` | true | +| `spark.comet.expression.ArrayJoin.enabled` | Enable Comet acceleration for `ArrayJoin` | true | +| `spark.comet.expression.ArrayMax.enabled` | Enable Comet acceleration for `ArrayMax` | true | +| `spark.comet.expression.ArrayMin.enabled` | Enable Comet acceleration for `ArrayMin` | true | +| `spark.comet.expression.ArrayRemove.enabled` | Enable Comet acceleration for `ArrayRemove` | true | +| `spark.comet.expression.ArrayRepeat.enabled` | Enable Comet acceleration for `ArrayRepeat` | true | +| `spark.comet.expression.ArrayUnion.enabled` | Enable Comet acceleration for `ArrayUnion` | true | +| `spark.comet.expression.ArraysOverlap.enabled` | Enable Comet acceleration for `ArraysOverlap` | true | +| `spark.comet.expression.Ascii.enabled` | Enable Comet acceleration for `Ascii` | true | 
+| `spark.comet.expression.Asin.enabled` | Enable Comet acceleration for `Asin` | true | +| `spark.comet.expression.Atan.enabled` | Enable Comet acceleration for `Atan` | true | +| `spark.comet.expression.Atan2.enabled` | Enable Comet acceleration for `Atan2` | true | +| `spark.comet.expression.AttributeReference.enabled` | Enable Comet acceleration for `AttributeReference` | true | +| `spark.comet.expression.BitLength.enabled` | Enable Comet acceleration for `BitLength` | true | +| `spark.comet.expression.BitwiseAnd.enabled` | Enable Comet acceleration for `BitwiseAnd` | true | +| `spark.comet.expression.BitwiseCount.enabled` | Enable Comet acceleration for `BitwiseCount` | true | +| `spark.comet.expression.BitwiseGet.enabled` | Enable Comet acceleration for `BitwiseGet` | true | +| `spark.comet.expression.BitwiseNot.enabled` | Enable Comet acceleration for `BitwiseNot` | true | +| `spark.comet.expression.BitwiseOr.enabled` | Enable Comet acceleration for `BitwiseOr` | true | +| `spark.comet.expression.BitwiseXor.enabled` | Enable Comet acceleration for `BitwiseXor` | true | +| `spark.comet.expression.CaseWhen.enabled` | Enable Comet acceleration for `CaseWhen` | true | +| `spark.comet.expression.Cast.enabled` | Enable Comet acceleration for `Cast` | true | +| `spark.comet.expression.Ceil.enabled` | Enable Comet acceleration for `Ceil` | true | +| `spark.comet.expression.CheckOverflow.enabled` | Enable Comet acceleration for `CheckOverflow` | true | +| `spark.comet.expression.Chr.enabled` | Enable Comet acceleration for `Chr` | true | +| `spark.comet.expression.Coalesce.enabled` | Enable Comet acceleration for `Coalesce` | true | +| `spark.comet.expression.Concat.enabled` | Enable Comet acceleration for `Concat` | true | +| `spark.comet.expression.ConcatWs.enabled` | Enable Comet acceleration for `ConcatWs` | true | +| `spark.comet.expression.Contains.enabled` | Enable Comet acceleration for `Contains` | true | +| `spark.comet.expression.Cos.enabled` | Enable Comet acceleration for `Cos` | true | +| `spark.comet.expression.Cot.enabled` | Enable Comet acceleration for `Cot` | true | +| `spark.comet.expression.CreateArray.enabled` | Enable Comet acceleration for `CreateArray` | true | +| `spark.comet.expression.CreateNamedStruct.enabled` | Enable Comet acceleration for `CreateNamedStruct` | true | +| `spark.comet.expression.DateAdd.enabled` | Enable Comet acceleration for `DateAdd` | true | +| `spark.comet.expression.DateSub.enabled` | Enable Comet acceleration for `DateSub` | true | +| `spark.comet.expression.DayOfMonth.enabled` | Enable Comet acceleration for `DayOfMonth` | true | +| `spark.comet.expression.DayOfWeek.enabled` | Enable Comet acceleration for `DayOfWeek` | true | +| `spark.comet.expression.DayOfYear.enabled` | Enable Comet acceleration for `DayOfYear` | true | +| `spark.comet.expression.Divide.enabled` | Enable Comet acceleration for `Divide` | true | +| `spark.comet.expression.ElementAt.enabled` | Enable Comet acceleration for `ElementAt` | true | +| `spark.comet.expression.EndsWith.enabled` | Enable Comet acceleration for `EndsWith` | true | +| `spark.comet.expression.EqualNullSafe.enabled` | Enable Comet acceleration for `EqualNullSafe` | true | +| `spark.comet.expression.EqualTo.enabled` | Enable Comet acceleration for `EqualTo` | true | +| `spark.comet.expression.Exp.enabled` | Enable Comet acceleration for `Exp` | true | +| `spark.comet.expression.Expm1.enabled` | Enable Comet acceleration for `Expm1` | true | +| `spark.comet.expression.Flatten.enabled` | Enable Comet 
acceleration for `Flatten` | true | +| `spark.comet.expression.Floor.enabled` | Enable Comet acceleration for `Floor` | true | +| `spark.comet.expression.FromUnixTime.enabled` | Enable Comet acceleration for `FromUnixTime` | true | +| `spark.comet.expression.GetArrayItem.enabled` | Enable Comet acceleration for `GetArrayItem` | true | +| `spark.comet.expression.GetArrayStructFields.enabled` | Enable Comet acceleration for `GetArrayStructFields` | true | +| `spark.comet.expression.GetMapValue.enabled` | Enable Comet acceleration for `GetMapValue` | true | +| `spark.comet.expression.GetStructField.enabled` | Enable Comet acceleration for `GetStructField` | true | +| `spark.comet.expression.GreaterThan.enabled` | Enable Comet acceleration for `GreaterThan` | true | +| `spark.comet.expression.GreaterThanOrEqual.enabled` | Enable Comet acceleration for `GreaterThanOrEqual` | true | +| `spark.comet.expression.Hex.enabled` | Enable Comet acceleration for `Hex` | true | +| `spark.comet.expression.Hour.enabled` | Enable Comet acceleration for `Hour` | true | +| `spark.comet.expression.If.enabled` | Enable Comet acceleration for `If` | true | +| `spark.comet.expression.In.enabled` | Enable Comet acceleration for `In` | true | +| `spark.comet.expression.InSet.enabled` | Enable Comet acceleration for `InSet` | true | +| `spark.comet.expression.InitCap.enabled` | Enable Comet acceleration for `InitCap` | true | +| `spark.comet.expression.IntegralDivide.enabled` | Enable Comet acceleration for `IntegralDivide` | true | +| `spark.comet.expression.IsNaN.enabled` | Enable Comet acceleration for `IsNaN` | true | +| `spark.comet.expression.IsNotNull.enabled` | Enable Comet acceleration for `IsNotNull` | true | +| `spark.comet.expression.IsNull.enabled` | Enable Comet acceleration for `IsNull` | true | +| `spark.comet.expression.Length.enabled` | Enable Comet acceleration for `Length` | true | +| `spark.comet.expression.LessThan.enabled` | Enable Comet acceleration for `LessThan` | true | +| `spark.comet.expression.LessThanOrEqual.enabled` | Enable Comet acceleration for `LessThanOrEqual` | true | +| `spark.comet.expression.Like.enabled` | Enable Comet acceleration for `Like` | true | +| `spark.comet.expression.Literal.enabled` | Enable Comet acceleration for `Literal` | true | +| `spark.comet.expression.Log.enabled` | Enable Comet acceleration for `Log` | true | +| `spark.comet.expression.Log10.enabled` | Enable Comet acceleration for `Log10` | true | +| `spark.comet.expression.Log2.enabled` | Enable Comet acceleration for `Log2` | true | +| `spark.comet.expression.Lower.enabled` | Enable Comet acceleration for `Lower` | true | +| `spark.comet.expression.MapEntries.enabled` | Enable Comet acceleration for `MapEntries` | true | +| `spark.comet.expression.MapFromArrays.enabled` | Enable Comet acceleration for `MapFromArrays` | true | +| `spark.comet.expression.MapKeys.enabled` | Enable Comet acceleration for `MapKeys` | true | +| `spark.comet.expression.MapValues.enabled` | Enable Comet acceleration for `MapValues` | true | +| `spark.comet.expression.Md5.enabled` | Enable Comet acceleration for `Md5` | true | +| `spark.comet.expression.Minute.enabled` | Enable Comet acceleration for `Minute` | true | +| `spark.comet.expression.MonotonicallyIncreasingID.enabled` | Enable Comet acceleration for `MonotonicallyIncreasingID` | true | +| `spark.comet.expression.Month.enabled` | Enable Comet acceleration for `Month` | true | +| `spark.comet.expression.Multiply.enabled` | Enable Comet acceleration for `Multiply` | 
true | +| `spark.comet.expression.Murmur3Hash.enabled` | Enable Comet acceleration for `Murmur3Hash` | true | +| `spark.comet.expression.Not.enabled` | Enable Comet acceleration for `Not` | true | +| `spark.comet.expression.OctetLength.enabled` | Enable Comet acceleration for `OctetLength` | true | +| `spark.comet.expression.Or.enabled` | Enable Comet acceleration for `Or` | true | +| `spark.comet.expression.Pow.enabled` | Enable Comet acceleration for `Pow` | true | +| `spark.comet.expression.Quarter.enabled` | Enable Comet acceleration for `Quarter` | true | +| `spark.comet.expression.RLike.enabled` | Enable Comet acceleration for `RLike` | true | +| `spark.comet.expression.Rand.enabled` | Enable Comet acceleration for `Rand` | true | +| `spark.comet.expression.Randn.enabled` | Enable Comet acceleration for `Randn` | true | +| `spark.comet.expression.RegExpReplace.enabled` | Enable Comet acceleration for `RegExpReplace` | true | +| `spark.comet.expression.Remainder.enabled` | Enable Comet acceleration for `Remainder` | true | +| `spark.comet.expression.Reverse.enabled` | Enable Comet acceleration for `Reverse` | true | +| `spark.comet.expression.Round.enabled` | Enable Comet acceleration for `Round` | true | +| `spark.comet.expression.Second.enabled` | Enable Comet acceleration for `Second` | true | +| `spark.comet.expression.Sha1.enabled` | Enable Comet acceleration for `Sha1` | true | +| `spark.comet.expression.Sha2.enabled` | Enable Comet acceleration for `Sha2` | true | +| `spark.comet.expression.ShiftLeft.enabled` | Enable Comet acceleration for `ShiftLeft` | true | +| `spark.comet.expression.ShiftRight.enabled` | Enable Comet acceleration for `ShiftRight` | true | +| `spark.comet.expression.Signum.enabled` | Enable Comet acceleration for `Signum` | true | +| `spark.comet.expression.Sin.enabled` | Enable Comet acceleration for `Sin` | true | +| `spark.comet.expression.SortOrder.enabled` | Enable Comet acceleration for `SortOrder` | true | +| `spark.comet.expression.SparkPartitionID.enabled` | Enable Comet acceleration for `SparkPartitionID` | true | +| `spark.comet.expression.Sqrt.enabled` | Enable Comet acceleration for `Sqrt` | true | +| `spark.comet.expression.StartsWith.enabled` | Enable Comet acceleration for `StartsWith` | true | +| `spark.comet.expression.StaticInvoke.enabled` | Enable Comet acceleration for `StaticInvoke` | true | +| `spark.comet.expression.StringInstr.enabled` | Enable Comet acceleration for `StringInstr` | true | +| `spark.comet.expression.StringLPad.enabled` | Enable Comet acceleration for `StringLPad` | true | +| `spark.comet.expression.StringRPad.enabled` | Enable Comet acceleration for `StringRPad` | true | +| `spark.comet.expression.StringRepeat.enabled` | Enable Comet acceleration for `StringRepeat` | true | +| `spark.comet.expression.StringReplace.enabled` | Enable Comet acceleration for `StringReplace` | true | +| `spark.comet.expression.StringSpace.enabled` | Enable Comet acceleration for `StringSpace` | true | +| `spark.comet.expression.StringSplit.enabled` | Enable Comet acceleration for `StringSplit` | true | +| `spark.comet.expression.StringTranslate.enabled` | Enable Comet acceleration for `StringTranslate` | true | +| `spark.comet.expression.StringTrim.enabled` | Enable Comet acceleration for `StringTrim` | true | +| `spark.comet.expression.StringTrimBoth.enabled` | Enable Comet acceleration for `StringTrimBoth` | true | +| `spark.comet.expression.StringTrimLeft.enabled` | Enable Comet acceleration for `StringTrimLeft` | true | +| 
`spark.comet.expression.StringTrimRight.enabled` | Enable Comet acceleration for `StringTrimRight` | true | +| `spark.comet.expression.StructsToJson.enabled` | Enable Comet acceleration for `StructsToJson` | true | +| `spark.comet.expression.Substring.enabled` | Enable Comet acceleration for `Substring` | true | +| `spark.comet.expression.Subtract.enabled` | Enable Comet acceleration for `Subtract` | true | +| `spark.comet.expression.Tan.enabled` | Enable Comet acceleration for `Tan` | true | +| `spark.comet.expression.TruncDate.enabled` | Enable Comet acceleration for `TruncDate` | true | +| `spark.comet.expression.TruncTimestamp.enabled` | Enable Comet acceleration for `TruncTimestamp` | true | +| `spark.comet.expression.UnaryMinus.enabled` | Enable Comet acceleration for `UnaryMinus` | true | +| `spark.comet.expression.Unhex.enabled` | Enable Comet acceleration for `Unhex` | true | +| `spark.comet.expression.Upper.enabled` | Enable Comet acceleration for `Upper` | true | +| `spark.comet.expression.WeekDay.enabled` | Enable Comet acceleration for `WeekDay` | true | +| `spark.comet.expression.WeekOfYear.enabled` | Enable Comet acceleration for `WeekOfYear` | true | +| `spark.comet.expression.XxHash64.enabled` | Enable Comet acceleration for `XxHash64` | true | +| `spark.comet.expression.Year.enabled` | Enable Comet acceleration for `Year` | true | + + ## Enabling or Disabling Individual Aggregate Expressions From bcb6ed4185c5a920b0432247238692f8ee3c82fc Mon Sep 17 00:00:00 2001 From: shekharrajak Date: Tue, 18 Nov 2025 00:28:36 +0530 Subject: [PATCH 11/15] update doc with make --- docs/source/user-guide/latest/configs.md | 297 +++++++++++------------ 1 file changed, 148 insertions(+), 149 deletions(-) diff --git a/docs/source/user-guide/latest/configs.md b/docs/source/user-guide/latest/configs.md index 06b9f15c69..3ed8be8515 100644 --- a/docs/source/user-guide/latest/configs.md +++ b/docs/source/user-guide/latest/configs.md @@ -179,155 +179,154 @@ These settings can be used to determine which parts of the plan are accelerated - -| Config | Description | Default Value | -| ---------------------------------------------------------- | --------------------------------------------------------- | ------------- | -| `spark.comet.expression.Abs.enabled` | Enable Comet acceleration for `Abs` | true | -| `spark.comet.expression.Acos.enabled` | Enable Comet acceleration for `Acos` | true | -| `spark.comet.expression.Add.enabled` | Enable Comet acceleration for `Add` | true | -| `spark.comet.expression.Alias.enabled` | Enable Comet acceleration for `Alias` | true | -| `spark.comet.expression.And.enabled` | Enable Comet acceleration for `And` | true | -| `spark.comet.expression.ArrayAppend.enabled` | Enable Comet acceleration for `ArrayAppend` | true | -| `spark.comet.expression.ArrayCompact.enabled` | Enable Comet acceleration for `ArrayCompact` | true | -| `spark.comet.expression.ArrayContains.enabled` | Enable Comet acceleration for `ArrayContains` | true | -| `spark.comet.expression.ArrayDistinct.enabled` | Enable Comet acceleration for `ArrayDistinct` | true | -| `spark.comet.expression.ArrayExcept.enabled` | Enable Comet acceleration for `ArrayExcept` | true | -| `spark.comet.expression.ArrayFilter.enabled` | Enable Comet acceleration for `ArrayFilter` | true | -| `spark.comet.expression.ArrayInsert.enabled` | Enable Comet acceleration for `ArrayInsert` | true | -| `spark.comet.expression.ArrayIntersect.enabled` | Enable Comet acceleration for `ArrayIntersect` | true | -| 
`spark.comet.expression.ArrayJoin.enabled` | Enable Comet acceleration for `ArrayJoin` | true | -| `spark.comet.expression.ArrayMax.enabled` | Enable Comet acceleration for `ArrayMax` | true | -| `spark.comet.expression.ArrayMin.enabled` | Enable Comet acceleration for `ArrayMin` | true | -| `spark.comet.expression.ArrayRemove.enabled` | Enable Comet acceleration for `ArrayRemove` | true | -| `spark.comet.expression.ArrayRepeat.enabled` | Enable Comet acceleration for `ArrayRepeat` | true | -| `spark.comet.expression.ArrayUnion.enabled` | Enable Comet acceleration for `ArrayUnion` | true | -| `spark.comet.expression.ArraysOverlap.enabled` | Enable Comet acceleration for `ArraysOverlap` | true | -| `spark.comet.expression.Ascii.enabled` | Enable Comet acceleration for `Ascii` | true | -| `spark.comet.expression.Asin.enabled` | Enable Comet acceleration for `Asin` | true | -| `spark.comet.expression.Atan.enabled` | Enable Comet acceleration for `Atan` | true | -| `spark.comet.expression.Atan2.enabled` | Enable Comet acceleration for `Atan2` | true | -| `spark.comet.expression.AttributeReference.enabled` | Enable Comet acceleration for `AttributeReference` | true | -| `spark.comet.expression.BitLength.enabled` | Enable Comet acceleration for `BitLength` | true | -| `spark.comet.expression.BitwiseAnd.enabled` | Enable Comet acceleration for `BitwiseAnd` | true | -| `spark.comet.expression.BitwiseCount.enabled` | Enable Comet acceleration for `BitwiseCount` | true | -| `spark.comet.expression.BitwiseGet.enabled` | Enable Comet acceleration for `BitwiseGet` | true | -| `spark.comet.expression.BitwiseNot.enabled` | Enable Comet acceleration for `BitwiseNot` | true | -| `spark.comet.expression.BitwiseOr.enabled` | Enable Comet acceleration for `BitwiseOr` | true | -| `spark.comet.expression.BitwiseXor.enabled` | Enable Comet acceleration for `BitwiseXor` | true | -| `spark.comet.expression.CaseWhen.enabled` | Enable Comet acceleration for `CaseWhen` | true | -| `spark.comet.expression.Cast.enabled` | Enable Comet acceleration for `Cast` | true | -| `spark.comet.expression.Ceil.enabled` | Enable Comet acceleration for `Ceil` | true | -| `spark.comet.expression.CheckOverflow.enabled` | Enable Comet acceleration for `CheckOverflow` | true | -| `spark.comet.expression.Chr.enabled` | Enable Comet acceleration for `Chr` | true | -| `spark.comet.expression.Coalesce.enabled` | Enable Comet acceleration for `Coalesce` | true | -| `spark.comet.expression.Concat.enabled` | Enable Comet acceleration for `Concat` | true | -| `spark.comet.expression.ConcatWs.enabled` | Enable Comet acceleration for `ConcatWs` | true | -| `spark.comet.expression.Contains.enabled` | Enable Comet acceleration for `Contains` | true | -| `spark.comet.expression.Cos.enabled` | Enable Comet acceleration for `Cos` | true | -| `spark.comet.expression.Cot.enabled` | Enable Comet acceleration for `Cot` | true | -| `spark.comet.expression.CreateArray.enabled` | Enable Comet acceleration for `CreateArray` | true | -| `spark.comet.expression.CreateNamedStruct.enabled` | Enable Comet acceleration for `CreateNamedStruct` | true | -| `spark.comet.expression.DateAdd.enabled` | Enable Comet acceleration for `DateAdd` | true | -| `spark.comet.expression.DateSub.enabled` | Enable Comet acceleration for `DateSub` | true | -| `spark.comet.expression.DayOfMonth.enabled` | Enable Comet acceleration for `DayOfMonth` | true | -| `spark.comet.expression.DayOfWeek.enabled` | Enable Comet acceleration for `DayOfWeek` | true | -| 
`spark.comet.expression.DayOfYear.enabled` | Enable Comet acceleration for `DayOfYear` | true | -| `spark.comet.expression.Divide.enabled` | Enable Comet acceleration for `Divide` | true | -| `spark.comet.expression.ElementAt.enabled` | Enable Comet acceleration for `ElementAt` | true | -| `spark.comet.expression.EndsWith.enabled` | Enable Comet acceleration for `EndsWith` | true | -| `spark.comet.expression.EqualNullSafe.enabled` | Enable Comet acceleration for `EqualNullSafe` | true | -| `spark.comet.expression.EqualTo.enabled` | Enable Comet acceleration for `EqualTo` | true | -| `spark.comet.expression.Exp.enabled` | Enable Comet acceleration for `Exp` | true | -| `spark.comet.expression.Expm1.enabled` | Enable Comet acceleration for `Expm1` | true | -| `spark.comet.expression.Flatten.enabled` | Enable Comet acceleration for `Flatten` | true | -| `spark.comet.expression.Floor.enabled` | Enable Comet acceleration for `Floor` | true | -| `spark.comet.expression.FromUnixTime.enabled` | Enable Comet acceleration for `FromUnixTime` | true | -| `spark.comet.expression.GetArrayItem.enabled` | Enable Comet acceleration for `GetArrayItem` | true | -| `spark.comet.expression.GetArrayStructFields.enabled` | Enable Comet acceleration for `GetArrayStructFields` | true | -| `spark.comet.expression.GetMapValue.enabled` | Enable Comet acceleration for `GetMapValue` | true | -| `spark.comet.expression.GetStructField.enabled` | Enable Comet acceleration for `GetStructField` | true | -| `spark.comet.expression.GreaterThan.enabled` | Enable Comet acceleration for `GreaterThan` | true | -| `spark.comet.expression.GreaterThanOrEqual.enabled` | Enable Comet acceleration for `GreaterThanOrEqual` | true | -| `spark.comet.expression.Hex.enabled` | Enable Comet acceleration for `Hex` | true | -| `spark.comet.expression.Hour.enabled` | Enable Comet acceleration for `Hour` | true | -| `spark.comet.expression.If.enabled` | Enable Comet acceleration for `If` | true | -| `spark.comet.expression.In.enabled` | Enable Comet acceleration for `In` | true | -| `spark.comet.expression.InSet.enabled` | Enable Comet acceleration for `InSet` | true | -| `spark.comet.expression.InitCap.enabled` | Enable Comet acceleration for `InitCap` | true | -| `spark.comet.expression.IntegralDivide.enabled` | Enable Comet acceleration for `IntegralDivide` | true | -| `spark.comet.expression.IsNaN.enabled` | Enable Comet acceleration for `IsNaN` | true | -| `spark.comet.expression.IsNotNull.enabled` | Enable Comet acceleration for `IsNotNull` | true | -| `spark.comet.expression.IsNull.enabled` | Enable Comet acceleration for `IsNull` | true | -| `spark.comet.expression.Length.enabled` | Enable Comet acceleration for `Length` | true | -| `spark.comet.expression.LessThan.enabled` | Enable Comet acceleration for `LessThan` | true | -| `spark.comet.expression.LessThanOrEqual.enabled` | Enable Comet acceleration for `LessThanOrEqual` | true | -| `spark.comet.expression.Like.enabled` | Enable Comet acceleration for `Like` | true | -| `spark.comet.expression.Literal.enabled` | Enable Comet acceleration for `Literal` | true | -| `spark.comet.expression.Log.enabled` | Enable Comet acceleration for `Log` | true | -| `spark.comet.expression.Log10.enabled` | Enable Comet acceleration for `Log10` | true | -| `spark.comet.expression.Log2.enabled` | Enable Comet acceleration for `Log2` | true | -| `spark.comet.expression.Lower.enabled` | Enable Comet acceleration for `Lower` | true | -| `spark.comet.expression.MapEntries.enabled` | Enable Comet 
acceleration for `MapEntries` | true | -| `spark.comet.expression.MapFromArrays.enabled` | Enable Comet acceleration for `MapFromArrays` | true | -| `spark.comet.expression.MapKeys.enabled` | Enable Comet acceleration for `MapKeys` | true | -| `spark.comet.expression.MapValues.enabled` | Enable Comet acceleration for `MapValues` | true | -| `spark.comet.expression.Md5.enabled` | Enable Comet acceleration for `Md5` | true | -| `spark.comet.expression.Minute.enabled` | Enable Comet acceleration for `Minute` | true | -| `spark.comet.expression.MonotonicallyIncreasingID.enabled` | Enable Comet acceleration for `MonotonicallyIncreasingID` | true | -| `spark.comet.expression.Month.enabled` | Enable Comet acceleration for `Month` | true | -| `spark.comet.expression.Multiply.enabled` | Enable Comet acceleration for `Multiply` | true | -| `spark.comet.expression.Murmur3Hash.enabled` | Enable Comet acceleration for `Murmur3Hash` | true | -| `spark.comet.expression.Not.enabled` | Enable Comet acceleration for `Not` | true | -| `spark.comet.expression.OctetLength.enabled` | Enable Comet acceleration for `OctetLength` | true | -| `spark.comet.expression.Or.enabled` | Enable Comet acceleration for `Or` | true | -| `spark.comet.expression.Pow.enabled` | Enable Comet acceleration for `Pow` | true | -| `spark.comet.expression.Quarter.enabled` | Enable Comet acceleration for `Quarter` | true | -| `spark.comet.expression.RLike.enabled` | Enable Comet acceleration for `RLike` | true | -| `spark.comet.expression.Rand.enabled` | Enable Comet acceleration for `Rand` | true | -| `spark.comet.expression.Randn.enabled` | Enable Comet acceleration for `Randn` | true | -| `spark.comet.expression.RegExpReplace.enabled` | Enable Comet acceleration for `RegExpReplace` | true | -| `spark.comet.expression.Remainder.enabled` | Enable Comet acceleration for `Remainder` | true | -| `spark.comet.expression.Reverse.enabled` | Enable Comet acceleration for `Reverse` | true | -| `spark.comet.expression.Round.enabled` | Enable Comet acceleration for `Round` | true | -| `spark.comet.expression.Second.enabled` | Enable Comet acceleration for `Second` | true | -| `spark.comet.expression.Sha1.enabled` | Enable Comet acceleration for `Sha1` | true | -| `spark.comet.expression.Sha2.enabled` | Enable Comet acceleration for `Sha2` | true | -| `spark.comet.expression.ShiftLeft.enabled` | Enable Comet acceleration for `ShiftLeft` | true | -| `spark.comet.expression.ShiftRight.enabled` | Enable Comet acceleration for `ShiftRight` | true | -| `spark.comet.expression.Signum.enabled` | Enable Comet acceleration for `Signum` | true | -| `spark.comet.expression.Sin.enabled` | Enable Comet acceleration for `Sin` | true | -| `spark.comet.expression.SortOrder.enabled` | Enable Comet acceleration for `SortOrder` | true | -| `spark.comet.expression.SparkPartitionID.enabled` | Enable Comet acceleration for `SparkPartitionID` | true | -| `spark.comet.expression.Sqrt.enabled` | Enable Comet acceleration for `Sqrt` | true | -| `spark.comet.expression.StartsWith.enabled` | Enable Comet acceleration for `StartsWith` | true | -| `spark.comet.expression.StaticInvoke.enabled` | Enable Comet acceleration for `StaticInvoke` | true | -| `spark.comet.expression.StringInstr.enabled` | Enable Comet acceleration for `StringInstr` | true | -| `spark.comet.expression.StringLPad.enabled` | Enable Comet acceleration for `StringLPad` | true | -| `spark.comet.expression.StringRPad.enabled` | Enable Comet acceleration for `StringRPad` | true | -| 
`spark.comet.expression.StringRepeat.enabled` | Enable Comet acceleration for `StringRepeat` | true | -| `spark.comet.expression.StringReplace.enabled` | Enable Comet acceleration for `StringReplace` | true | -| `spark.comet.expression.StringSpace.enabled` | Enable Comet acceleration for `StringSpace` | true | -| `spark.comet.expression.StringSplit.enabled` | Enable Comet acceleration for `StringSplit` | true | -| `spark.comet.expression.StringTranslate.enabled` | Enable Comet acceleration for `StringTranslate` | true | -| `spark.comet.expression.StringTrim.enabled` | Enable Comet acceleration for `StringTrim` | true | -| `spark.comet.expression.StringTrimBoth.enabled` | Enable Comet acceleration for `StringTrimBoth` | true | -| `spark.comet.expression.StringTrimLeft.enabled` | Enable Comet acceleration for `StringTrimLeft` | true | -| `spark.comet.expression.StringTrimRight.enabled` | Enable Comet acceleration for `StringTrimRight` | true | -| `spark.comet.expression.StructsToJson.enabled` | Enable Comet acceleration for `StructsToJson` | true | -| `spark.comet.expression.Substring.enabled` | Enable Comet acceleration for `Substring` | true | -| `spark.comet.expression.Subtract.enabled` | Enable Comet acceleration for `Subtract` | true | -| `spark.comet.expression.Tan.enabled` | Enable Comet acceleration for `Tan` | true | -| `spark.comet.expression.TruncDate.enabled` | Enable Comet acceleration for `TruncDate` | true | -| `spark.comet.expression.TruncTimestamp.enabled` | Enable Comet acceleration for `TruncTimestamp` | true | -| `spark.comet.expression.UnaryMinus.enabled` | Enable Comet acceleration for `UnaryMinus` | true | -| `spark.comet.expression.Unhex.enabled` | Enable Comet acceleration for `Unhex` | true | -| `spark.comet.expression.Upper.enabled` | Enable Comet acceleration for `Upper` | true | -| `spark.comet.expression.WeekDay.enabled` | Enable Comet acceleration for `WeekDay` | true | -| `spark.comet.expression.WeekOfYear.enabled` | Enable Comet acceleration for `WeekOfYear` | true | -| `spark.comet.expression.XxHash64.enabled` | Enable Comet acceleration for `XxHash64` | true | -| `spark.comet.expression.Year.enabled` | Enable Comet acceleration for `Year` | true | - - + +| Config | Description | Default Value | +|--------|-------------|---------------| +| `spark.comet.expression.Abs.enabled` | Enable Comet acceleration for `Abs` | true | +| `spark.comet.expression.Acos.enabled` | Enable Comet acceleration for `Acos` | true | +| `spark.comet.expression.Add.enabled` | Enable Comet acceleration for `Add` | true | +| `spark.comet.expression.Alias.enabled` | Enable Comet acceleration for `Alias` | true | +| `spark.comet.expression.And.enabled` | Enable Comet acceleration for `And` | true | +| `spark.comet.expression.ArrayAppend.enabled` | Enable Comet acceleration for `ArrayAppend` | true | +| `spark.comet.expression.ArrayCompact.enabled` | Enable Comet acceleration for `ArrayCompact` | true | +| `spark.comet.expression.ArrayContains.enabled` | Enable Comet acceleration for `ArrayContains` | true | +| `spark.comet.expression.ArrayDistinct.enabled` | Enable Comet acceleration for `ArrayDistinct` | true | +| `spark.comet.expression.ArrayExcept.enabled` | Enable Comet acceleration for `ArrayExcept` | true | +| `spark.comet.expression.ArrayFilter.enabled` | Enable Comet acceleration for `ArrayFilter` | true | +| `spark.comet.expression.ArrayInsert.enabled` | Enable Comet acceleration for `ArrayInsert` | true | +| `spark.comet.expression.ArrayIntersect.enabled` | Enable Comet 
acceleration for `ArrayIntersect` | true | +| `spark.comet.expression.ArrayJoin.enabled` | Enable Comet acceleration for `ArrayJoin` | true | +| `spark.comet.expression.ArrayMax.enabled` | Enable Comet acceleration for `ArrayMax` | true | +| `spark.comet.expression.ArrayMin.enabled` | Enable Comet acceleration for `ArrayMin` | true | +| `spark.comet.expression.ArrayRemove.enabled` | Enable Comet acceleration for `ArrayRemove` | true | +| `spark.comet.expression.ArrayRepeat.enabled` | Enable Comet acceleration for `ArrayRepeat` | true | +| `spark.comet.expression.ArrayUnion.enabled` | Enable Comet acceleration for `ArrayUnion` | true | +| `spark.comet.expression.ArraysOverlap.enabled` | Enable Comet acceleration for `ArraysOverlap` | true | +| `spark.comet.expression.Ascii.enabled` | Enable Comet acceleration for `Ascii` | true | +| `spark.comet.expression.Asin.enabled` | Enable Comet acceleration for `Asin` | true | +| `spark.comet.expression.Atan.enabled` | Enable Comet acceleration for `Atan` | true | +| `spark.comet.expression.Atan2.enabled` | Enable Comet acceleration for `Atan2` | true | +| `spark.comet.expression.AttributeReference.enabled` | Enable Comet acceleration for `AttributeReference` | true | +| `spark.comet.expression.BitLength.enabled` | Enable Comet acceleration for `BitLength` | true | +| `spark.comet.expression.BitwiseAnd.enabled` | Enable Comet acceleration for `BitwiseAnd` | true | +| `spark.comet.expression.BitwiseCount.enabled` | Enable Comet acceleration for `BitwiseCount` | true | +| `spark.comet.expression.BitwiseGet.enabled` | Enable Comet acceleration for `BitwiseGet` | true | +| `spark.comet.expression.BitwiseNot.enabled` | Enable Comet acceleration for `BitwiseNot` | true | +| `spark.comet.expression.BitwiseOr.enabled` | Enable Comet acceleration for `BitwiseOr` | true | +| `spark.comet.expression.BitwiseXor.enabled` | Enable Comet acceleration for `BitwiseXor` | true | +| `spark.comet.expression.CaseWhen.enabled` | Enable Comet acceleration for `CaseWhen` | true | +| `spark.comet.expression.Cast.enabled` | Enable Comet acceleration for `Cast` | true | +| `spark.comet.expression.Ceil.enabled` | Enable Comet acceleration for `Ceil` | true | +| `spark.comet.expression.CheckOverflow.enabled` | Enable Comet acceleration for `CheckOverflow` | true | +| `spark.comet.expression.Chr.enabled` | Enable Comet acceleration for `Chr` | true | +| `spark.comet.expression.Coalesce.enabled` | Enable Comet acceleration for `Coalesce` | true | +| `spark.comet.expression.Concat.enabled` | Enable Comet acceleration for `Concat` | true | +| `spark.comet.expression.ConcatWs.enabled` | Enable Comet acceleration for `ConcatWs` | true | +| `spark.comet.expression.Contains.enabled` | Enable Comet acceleration for `Contains` | true | +| `spark.comet.expression.Cos.enabled` | Enable Comet acceleration for `Cos` | true | +| `spark.comet.expression.Cot.enabled` | Enable Comet acceleration for `Cot` | true | +| `spark.comet.expression.CreateArray.enabled` | Enable Comet acceleration for `CreateArray` | true | +| `spark.comet.expression.CreateNamedStruct.enabled` | Enable Comet acceleration for `CreateNamedStruct` | true | +| `spark.comet.expression.DateAdd.enabled` | Enable Comet acceleration for `DateAdd` | true | +| `spark.comet.expression.DateSub.enabled` | Enable Comet acceleration for `DateSub` | true | +| `spark.comet.expression.DayOfMonth.enabled` | Enable Comet acceleration for `DayOfMonth` | true | +| `spark.comet.expression.DayOfWeek.enabled` | Enable Comet acceleration for 
`DayOfWeek` | true | +| `spark.comet.expression.DayOfYear.enabled` | Enable Comet acceleration for `DayOfYear` | true | +| `spark.comet.expression.Divide.enabled` | Enable Comet acceleration for `Divide` | true | +| `spark.comet.expression.ElementAt.enabled` | Enable Comet acceleration for `ElementAt` | true | +| `spark.comet.expression.EndsWith.enabled` | Enable Comet acceleration for `EndsWith` | true | +| `spark.comet.expression.EqualNullSafe.enabled` | Enable Comet acceleration for `EqualNullSafe` | true | +| `spark.comet.expression.EqualTo.enabled` | Enable Comet acceleration for `EqualTo` | true | +| `spark.comet.expression.Exp.enabled` | Enable Comet acceleration for `Exp` | true | +| `spark.comet.expression.Expm1.enabled` | Enable Comet acceleration for `Expm1` | true | +| `spark.comet.expression.Flatten.enabled` | Enable Comet acceleration for `Flatten` | true | +| `spark.comet.expression.Floor.enabled` | Enable Comet acceleration for `Floor` | true | +| `spark.comet.expression.FromUnixTime.enabled` | Enable Comet acceleration for `FromUnixTime` | true | +| `spark.comet.expression.GetArrayItem.enabled` | Enable Comet acceleration for `GetArrayItem` | true | +| `spark.comet.expression.GetArrayStructFields.enabled` | Enable Comet acceleration for `GetArrayStructFields` | true | +| `spark.comet.expression.GetMapValue.enabled` | Enable Comet acceleration for `GetMapValue` | true | +| `spark.comet.expression.GetStructField.enabled` | Enable Comet acceleration for `GetStructField` | true | +| `spark.comet.expression.GreaterThan.enabled` | Enable Comet acceleration for `GreaterThan` | true | +| `spark.comet.expression.GreaterThanOrEqual.enabled` | Enable Comet acceleration for `GreaterThanOrEqual` | true | +| `spark.comet.expression.Hex.enabled` | Enable Comet acceleration for `Hex` | true | +| `spark.comet.expression.Hour.enabled` | Enable Comet acceleration for `Hour` | true | +| `spark.comet.expression.If.enabled` | Enable Comet acceleration for `If` | true | +| `spark.comet.expression.In.enabled` | Enable Comet acceleration for `In` | true | +| `spark.comet.expression.InSet.enabled` | Enable Comet acceleration for `InSet` | true | +| `spark.comet.expression.InitCap.enabled` | Enable Comet acceleration for `InitCap` | true | +| `spark.comet.expression.IntegralDivide.enabled` | Enable Comet acceleration for `IntegralDivide` | true | +| `spark.comet.expression.IsNaN.enabled` | Enable Comet acceleration for `IsNaN` | true | +| `spark.comet.expression.IsNotNull.enabled` | Enable Comet acceleration for `IsNotNull` | true | +| `spark.comet.expression.IsNull.enabled` | Enable Comet acceleration for `IsNull` | true | +| `spark.comet.expression.Length.enabled` | Enable Comet acceleration for `Length` | true | +| `spark.comet.expression.LessThan.enabled` | Enable Comet acceleration for `LessThan` | true | +| `spark.comet.expression.LessThanOrEqual.enabled` | Enable Comet acceleration for `LessThanOrEqual` | true | +| `spark.comet.expression.Like.enabled` | Enable Comet acceleration for `Like` | true | +| `spark.comet.expression.Literal.enabled` | Enable Comet acceleration for `Literal` | true | +| `spark.comet.expression.Log.enabled` | Enable Comet acceleration for `Log` | true | +| `spark.comet.expression.Log10.enabled` | Enable Comet acceleration for `Log10` | true | +| `spark.comet.expression.Log2.enabled` | Enable Comet acceleration for `Log2` | true | +| `spark.comet.expression.Lower.enabled` | Enable Comet acceleration for `Lower` | true | +| `spark.comet.expression.MapEntries.enabled` | 
Enable Comet acceleration for `MapEntries` | true | +| `spark.comet.expression.MapFromArrays.enabled` | Enable Comet acceleration for `MapFromArrays` | true | +| `spark.comet.expression.MapKeys.enabled` | Enable Comet acceleration for `MapKeys` | true | +| `spark.comet.expression.MapValues.enabled` | Enable Comet acceleration for `MapValues` | true | +| `spark.comet.expression.Md5.enabled` | Enable Comet acceleration for `Md5` | true | +| `spark.comet.expression.Minute.enabled` | Enable Comet acceleration for `Minute` | true | +| `spark.comet.expression.MonotonicallyIncreasingID.enabled` | Enable Comet acceleration for `MonotonicallyIncreasingID` | true | +| `spark.comet.expression.Month.enabled` | Enable Comet acceleration for `Month` | true | +| `spark.comet.expression.Multiply.enabled` | Enable Comet acceleration for `Multiply` | true | +| `spark.comet.expression.Murmur3Hash.enabled` | Enable Comet acceleration for `Murmur3Hash` | true | +| `spark.comet.expression.Not.enabled` | Enable Comet acceleration for `Not` | true | +| `spark.comet.expression.OctetLength.enabled` | Enable Comet acceleration for `OctetLength` | true | +| `spark.comet.expression.Or.enabled` | Enable Comet acceleration for `Or` | true | +| `spark.comet.expression.Pow.enabled` | Enable Comet acceleration for `Pow` | true | +| `spark.comet.expression.Quarter.enabled` | Enable Comet acceleration for `Quarter` | true | +| `spark.comet.expression.RLike.enabled` | Enable Comet acceleration for `RLike` | true | +| `spark.comet.expression.Rand.enabled` | Enable Comet acceleration for `Rand` | true | +| `spark.comet.expression.Randn.enabled` | Enable Comet acceleration for `Randn` | true | +| `spark.comet.expression.RegExpReplace.enabled` | Enable Comet acceleration for `RegExpReplace` | true | +| `spark.comet.expression.Remainder.enabled` | Enable Comet acceleration for `Remainder` | true | +| `spark.comet.expression.Reverse.enabled` | Enable Comet acceleration for `Reverse` | true | +| `spark.comet.expression.Round.enabled` | Enable Comet acceleration for `Round` | true | +| `spark.comet.expression.Second.enabled` | Enable Comet acceleration for `Second` | true | +| `spark.comet.expression.Sha1.enabled` | Enable Comet acceleration for `Sha1` | true | +| `spark.comet.expression.Sha2.enabled` | Enable Comet acceleration for `Sha2` | true | +| `spark.comet.expression.ShiftLeft.enabled` | Enable Comet acceleration for `ShiftLeft` | true | +| `spark.comet.expression.ShiftRight.enabled` | Enable Comet acceleration for `ShiftRight` | true | +| `spark.comet.expression.Signum.enabled` | Enable Comet acceleration for `Signum` | true | +| `spark.comet.expression.Sin.enabled` | Enable Comet acceleration for `Sin` | true | +| `spark.comet.expression.SortOrder.enabled` | Enable Comet acceleration for `SortOrder` | true | +| `spark.comet.expression.SparkPartitionID.enabled` | Enable Comet acceleration for `SparkPartitionID` | true | +| `spark.comet.expression.Sqrt.enabled` | Enable Comet acceleration for `Sqrt` | true | +| `spark.comet.expression.StartsWith.enabled` | Enable Comet acceleration for `StartsWith` | true | +| `spark.comet.expression.StaticInvoke.enabled` | Enable Comet acceleration for `StaticInvoke` | true | +| `spark.comet.expression.StringInstr.enabled` | Enable Comet acceleration for `StringInstr` | true | +| `spark.comet.expression.StringLPad.enabled` | Enable Comet acceleration for `StringLPad` | true | +| `spark.comet.expression.StringRPad.enabled` | Enable Comet acceleration for `StringRPad` | true | +| 
`spark.comet.expression.StringRepeat.enabled` | Enable Comet acceleration for `StringRepeat` | true | +| `spark.comet.expression.StringReplace.enabled` | Enable Comet acceleration for `StringReplace` | true | +| `spark.comet.expression.StringSpace.enabled` | Enable Comet acceleration for `StringSpace` | true | +| `spark.comet.expression.StringSplit.enabled` | Enable Comet acceleration for `StringSplit` | true | +| `spark.comet.expression.StringTranslate.enabled` | Enable Comet acceleration for `StringTranslate` | true | +| `spark.comet.expression.StringTrim.enabled` | Enable Comet acceleration for `StringTrim` | true | +| `spark.comet.expression.StringTrimBoth.enabled` | Enable Comet acceleration for `StringTrimBoth` | true | +| `spark.comet.expression.StringTrimLeft.enabled` | Enable Comet acceleration for `StringTrimLeft` | true | +| `spark.comet.expression.StringTrimRight.enabled` | Enable Comet acceleration for `StringTrimRight` | true | +| `spark.comet.expression.StructsToJson.enabled` | Enable Comet acceleration for `StructsToJson` | true | +| `spark.comet.expression.Substring.enabled` | Enable Comet acceleration for `Substring` | true | +| `spark.comet.expression.Subtract.enabled` | Enable Comet acceleration for `Subtract` | true | +| `spark.comet.expression.Tan.enabled` | Enable Comet acceleration for `Tan` | true | +| `spark.comet.expression.TruncDate.enabled` | Enable Comet acceleration for `TruncDate` | true | +| `spark.comet.expression.TruncTimestamp.enabled` | Enable Comet acceleration for `TruncTimestamp` | true | +| `spark.comet.expression.UnaryMinus.enabled` | Enable Comet acceleration for `UnaryMinus` | true | +| `spark.comet.expression.Unhex.enabled` | Enable Comet acceleration for `Unhex` | true | +| `spark.comet.expression.Upper.enabled` | Enable Comet acceleration for `Upper` | true | +| `spark.comet.expression.WeekDay.enabled` | Enable Comet acceleration for `WeekDay` | true | +| `spark.comet.expression.WeekOfYear.enabled` | Enable Comet acceleration for `WeekOfYear` | true | +| `spark.comet.expression.XxHash64.enabled` | Enable Comet acceleration for `XxHash64` | true | +| `spark.comet.expression.Year.enabled` | Enable Comet acceleration for `Year` | true | + ## Enabling or Disabling Individual Aggregate Expressions From 890c56eb2abab27dcff44df8ae2c259b2dc74fae Mon Sep 17 00:00:00 2001 From: Shekhar Rajak Date: Sat, 6 Dec 2025 00:01:34 +0530 Subject: [PATCH 12/15] PR check fixes --- native/spark-expr/src/string_funcs/split.rs | 10 ++++------ .../org/apache/comet/serde/CometScalarFunction.scala | 4 ++-- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/native/spark-expr/src/string_funcs/split.rs b/native/spark-expr/src/string_funcs/split.rs index f3c2c33782..35595f865f 100644 --- a/native/spark-expr/src/string_funcs/split.rs +++ b/native/spark-expr/src/string_funcs/split.rs @@ -120,7 +120,7 @@ fn split_array( DataType::LargeUtf8 => { // Convert LargeUtf8 to Utf8 for processing let large_array = as_generic_string_array::(string_array)?; - return split_large_string_array(&large_array, ®ex, limit); + return split_large_string_array(large_array, ®ex, limit); } _ => { return exec_err!( @@ -204,7 +204,7 @@ fn split_with_regex(string: &str, regex: &Regex, limit: i32) -> Vec { // limit = 0: split as many times as possible, discard trailing empty strings let mut parts: Vec = regex.split(string).map(|s| s.to_string()).collect(); // Remove trailing empty strings - while parts.last().map_or(false, |s| s.is_empty()) { + while parts.last().is_some_and(|s| s.is_empty()) 
{ parts.pop(); } if parts.is_empty() { @@ -216,15 +216,13 @@ fn split_with_regex(string: &str, regex: &Regex, limit: i32) -> Vec { // limit > 0: at most limit-1 splits (array length <= limit) let mut parts: Vec = Vec::new(); let mut last_end = 0; - let mut count = 0; - for mat in regex.find_iter(string) { - if count >= limit - 1 { + for (count, mat) in regex.find_iter(string).enumerate() { + if count >= (limit - 1) as usize { break; } parts.push(string[last_end..mat.start()].to_string()); last_end = mat.end(); - count += 1; } // Add the remaining string parts.push(string[last_end..].to_string()); diff --git a/spark/src/main/scala/org/apache/comet/serde/CometScalarFunction.scala b/spark/src/main/scala/org/apache/comet/serde/CometScalarFunction.scala index aa3bf775fb..3108bd4126 100644 --- a/spark/src/main/scala/org/apache/comet/serde/CometScalarFunction.scala +++ b/spark/src/main/scala/org/apache/comet/serde/CometScalarFunction.scala @@ -22,13 +22,13 @@ package org.apache.comet.serde import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.comet.serde.ExprOuterClass.Expr -import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithInfo, scalarFunctionExprToProto} +import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithInfo, scalarFunctionExprToProtoWithReturnType} /** Serde for scalar function. */ case class CometScalarFunction[T <: Expression](name: String) extends CometExpressionSerde[T] { override def convert(expr: T, inputs: Seq[Attribute], binding: Boolean): Option[Expr] = { val childExpr = expr.children.map(exprToProtoInternal(_, inputs, binding)) - val optExpr = scalarFunctionExprToProto(name, childExpr: _*) + val optExpr = scalarFunctionExprToProtoWithReturnType(name, expr.dataType, false, childExpr: _*) optExprWithInfo(optExpr, expr, expr.children: _*) } } From 68b1d07e378c98a603832409b320d86b1559feae Mon Sep 17 00:00:00 2001 From: shekharrajak Date: Sat, 6 Dec 2025 00:18:02 +0530 Subject: [PATCH 13/15] PR check fixes --- .../comet/serde/CometScalarFunction.scala | 4 +-- .../apache/comet/serde/QueryPlanSerde.scala | 2 +- .../org/apache/comet/serde/strings.scala | 30 +++++++++++++++++-- 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/serde/CometScalarFunction.scala b/spark/src/main/scala/org/apache/comet/serde/CometScalarFunction.scala index 3108bd4126..aa3bf775fb 100644 --- a/spark/src/main/scala/org/apache/comet/serde/CometScalarFunction.scala +++ b/spark/src/main/scala/org/apache/comet/serde/CometScalarFunction.scala @@ -22,13 +22,13 @@ package org.apache.comet.serde import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.comet.serde.ExprOuterClass.Expr -import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithInfo, scalarFunctionExprToProtoWithReturnType} +import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithInfo, scalarFunctionExprToProto} /** Serde for scalar function. 
*/ case class CometScalarFunction[T <: Expression](name: String) extends CometExpressionSerde[T] { override def convert(expr: T, inputs: Seq[Attribute], binding: Boolean): Option[Expr] = { val childExpr = expr.children.map(exprToProtoInternal(_, inputs, binding)) - val optExpr = scalarFunctionExprToProtoWithReturnType(name, expr.dataType, false, childExpr: _*) + val optExpr = scalarFunctionExprToProto(name, childExpr: _*) optExprWithInfo(optExpr, expr, expr.children: _*) } } diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index ba89b99759..06166251cc 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -163,7 +163,7 @@ object QueryPlanSerde extends Logging with CometExprShim { classOf[StringRPad] -> CometStringRPad, classOf[StringLPad] -> CometStringLPad, classOf[StringSpace] -> CometScalarFunction("string_space"), - classOf[StringSplit] -> CometScalarFunction("split"), + classOf[StringSplit] -> CometStringSplit, classOf[StringTranslate] -> CometScalarFunction("translate"), classOf[StringTrim] -> CometScalarFunction("trim"), classOf[StringTrimBoth] -> CometScalarFunction("btrim"), diff --git a/spark/src/main/scala/org/apache/comet/serde/strings.scala b/spark/src/main/scala/org/apache/comet/serde/strings.scala index 15f4b238f2..271d572dea 100644 --- a/spark/src/main/scala/org/apache/comet/serde/strings.scala +++ b/spark/src/main/scala/org/apache/comet/serde/strings.scala @@ -21,14 +21,14 @@ package org.apache.comet.serde import java.util.Locale -import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Concat, Expression, InitCap, Length, Like, Literal, Lower, RegExpReplace, RLike, StringLPad, StringRepeat, StringRPad, Substring, Upper} +import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Concat, Expression, InitCap, Length, Like, Literal, Lower, RegExpReplace, RLike, StringLPad, StringRepeat, StringRPad, StringSplit, Substring, Upper} import org.apache.spark.sql.types.{BinaryType, DataTypes, LongType, StringType} import org.apache.comet.CometConf import org.apache.comet.CometSparkSessionExtensions.withInfo import org.apache.comet.expressions.{CometCast, CometEvalMode, RegExp} import org.apache.comet.serde.ExprOuterClass.Expr -import org.apache.comet.serde.QueryPlanSerde.{createBinaryExpr, exprToProtoInternal, optExprWithInfo, scalarFunctionExprToProto} +import org.apache.comet.serde.QueryPlanSerde.{createBinaryExpr, exprToProtoInternal, optExprWithInfo, scalarFunctionExprToProto, scalarFunctionExprToProtoWithReturnType} object CometStringRepeat extends CometExpressionSerde[StringRepeat] { @@ -259,6 +259,32 @@ object CometRegExpReplace extends CometExpressionSerde[RegExpReplace] { } } +/** + * Serde for StringSplit expression. + * This is a custom Comet function (not a built-in DataFusion function), + * so we need to include the return type in the protobuf to avoid + * DataFusion registry lookup failures. 
+ */ +object CometStringSplit extends CometExpressionSerde[StringSplit] { + + override def convert( + expr: StringSplit, + inputs: Seq[Attribute], + binding: Boolean): Option[Expr] = { + val strExpr = exprToProtoInternal(expr.str, inputs, binding) + val regexExpr = exprToProtoInternal(expr.regex, inputs, binding) + val limitExpr = exprToProtoInternal(expr.limit, inputs, binding) + val optExpr = scalarFunctionExprToProtoWithReturnType( + "split", + expr.dataType, + false, + strExpr, + regexExpr, + limitExpr) + optExprWithInfo(optExpr, expr, expr.str, expr.regex, expr.limit) + } +} + trait CommonStringExprs { def stringDecode( From 2199b5f4edede66f77729d7754774a78360005bb Mon Sep 17 00:00:00 2001 From: shekharrajak Date: Sat, 20 Dec 2025 19:34:45 +0530 Subject: [PATCH 14/15] check fixes --- spark/src/main/scala/org/apache/comet/serde/strings.scala | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/serde/strings.scala b/spark/src/main/scala/org/apache/comet/serde/strings.scala index 271d572dea..7b28fd2d38 100644 --- a/spark/src/main/scala/org/apache/comet/serde/strings.scala +++ b/spark/src/main/scala/org/apache/comet/serde/strings.scala @@ -260,10 +260,9 @@ object CometRegExpReplace extends CometExpressionSerde[RegExpReplace] { } /** - * Serde for StringSplit expression. - * This is a custom Comet function (not a built-in DataFusion function), - * so we need to include the return type in the protobuf to avoid - * DataFusion registry lookup failures. + * Serde for StringSplit expression. This is a custom Comet function (not a built-in DataFusion + * function), so we need to include the return type in the protobuf to avoid DataFusion registry + * lookup failures. */ object CometStringSplit extends CometExpressionSerde[StringSplit] { From bd5b0018fadae1a92830c208e73726347520c9e6 Mon Sep 17 00:00:00 2001 From: shekharrajak Date: Mon, 12 Jan 2026 23:59:13 +0530 Subject: [PATCH 15/15] Fix split function to return NULL for NULL inputs --- native/spark-expr/src/string_funcs/split.rs | 56 +++++++++++++++++++-- 1 file changed, 51 insertions(+), 5 deletions(-) diff --git a/native/spark-expr/src/string_funcs/split.rs b/native/spark-expr/src/string_funcs/split.rs index 35595f865f..ebecb62fab 100644 --- a/native/spark-expr/src/string_funcs/split.rs +++ b/native/spark-expr/src/string_funcs/split.rs @@ -133,27 +133,31 @@ fn split_array( // Build the result ListArray let mut offsets: Vec = Vec::with_capacity(string_array.len() + 1); let mut values: Vec = Vec::new(); + let mut null_buffer_builder = arrow::array::BooleanBufferBuilder::new(string_array.len()); offsets.push(0); for i in 0..string_array.len() { if string_array.is_null(i) { - // NULL input produces empty array element (maintain position) + // NULL input produces NULL in result (Spark behavior) offsets.push(offsets[i]); + null_buffer_builder.append(false); // false = NULL } else { let string_val = string_array.value(i); let parts = split_with_regex(string_val, ®ex, limit); values.extend(parts); offsets.push(values.len() as i32); + null_buffer_builder.append(true); // true = valid } } let values_array = Arc::new(GenericStringArray::::from(values)) as ArrayRef; - let field = Arc::new(Field::new("item", DataType::Utf8, false)); + let field = Arc::new(Field::new("item", DataType::Utf8, true)); + let nulls = arrow::buffer::NullBuffer::new(null_buffer_builder.finish()); let list_array = ListArray::new( field, arrow::buffer::OffsetBuffer::new(offsets.into()), values_array, - None, // No nulls at 
list level + Some(nulls), ); Ok(ColumnarValue::Array(Arc::new(list_array))) @@ -166,26 +170,31 @@ fn split_large_string_array( ) -> DataFusionResult { let mut offsets: Vec = Vec::with_capacity(string_array.len() + 1); let mut values: Vec = Vec::new(); + let mut null_buffer_builder = arrow::array::BooleanBufferBuilder::new(string_array.len()); offsets.push(0); for i in 0..string_array.len() { if string_array.is_null(i) { + // NULL input produces NULL in result (Spark behavior) offsets.push(offsets[i]); + null_buffer_builder.append(false); // false = NULL } else { let string_val = string_array.value(i); let parts = split_with_regex(string_val, regex, limit); values.extend(parts); offsets.push(values.len() as i32); + null_buffer_builder.append(true); // true = valid } } let values_array = Arc::new(GenericStringArray::::from(values)) as ArrayRef; - let field = Arc::new(Field::new("item", DataType::Utf8, false)); + let field = Arc::new(Field::new("item", DataType::Utf8, true)); + let nulls = arrow::buffer::NullBuffer::new(null_buffer_builder.finish()); let list_array = ListArray::new( field, arrow::buffer::OffsetBuffer::new(offsets.into()), values_array, - None, + Some(nulls), ); Ok(ColumnarValue::Array(Arc::new(list_array))) @@ -309,4 +318,41 @@ mod tests { let parts = split_string("a,b,c,,", ",", -1).unwrap(); assert_eq!(parts, vec!["a", "b", "c", "", ""]); } + + #[test] + fn test_split_with_nulls() { + // Test that NULL inputs produce NULL outputs (not empty arrays) + let string_array = Arc::new(StringArray::from(vec![ + Some("a,b,c"), + None, + Some("x,y"), + None, + ])) as ArrayRef; + let pattern = ColumnarValue::Scalar(ScalarValue::Utf8(Some(",".to_string()))); + let args = vec![ColumnarValue::Array(string_array), pattern]; + + let result = spark_split(&args).unwrap(); + match result { + ColumnarValue::Array(arr) => { + let list_array = arr.as_any().downcast_ref::().unwrap(); + assert_eq!(list_array.len(), 4); + // First row: valid ["a", "b", "c"] + assert!(!list_array.is_null(0)); + // Second row: NULL + assert!(list_array.is_null(1)); + // Third row: valid ["x", "y"] + assert!(!list_array.is_null(2)); + // Fourth row: NULL + assert!(list_array.is_null(3)); + } + _ => panic!("Expected Array result"), + } + } + + #[test] + fn test_split_empty_string() { + // Test that empty string input produces array with single empty string + let parts = split_string("", ",", -1).unwrap(); + assert_eq!(parts, vec![""]); + } }
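
For context beyond the patches themselves, the Spark-level behavior targeted by PATCH 15 can be exercised with a small standalone snippet. This is an illustrative sketch only, not part of the series: the local SparkSession setup and object name are assumed for demonstration, and the expected results in the comments simply restate the semantics asserted by the Rust unit tests above (a NULL input yields NULL rather than an empty array, and an empty string yields a single-element array containing the empty string).

// Illustrative sketch (not part of the patch series above); session setup is assumed.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, split}

object SplitNullSemanticsDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("split-null-demo")
      .master("local[1]")
      .getOrCreate()
    import spark.implicits._

    // Nullable string column: a normal value, a NULL, and an empty string.
    val df = Seq(Option("a,b,c"), None, Option("")).toDF("s")

    df.select(split(col("s"), ",").as("parts")).collect().foreach { row =>
      // Expected Spark behavior (mirrors test_split_with_nulls and
      // test_split_empty_string in PATCH 15):
      //   "a,b,c" -> ["a", "b", "c"]
      //   NULL    -> NULL (not an empty array)
      //   ""      -> [""] (single-element array containing the empty string)
      println(row)
    }

    spark.stop()
  }
}

Running the same three inputs through the Comet-enabled path (as the CometStringExpressionSuite tests do via checkSparkAnswerAndOperator) should produce identical results once the NullBuffer handling from PATCH 15 is in place.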