diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index cc1d53b3aad67..9517aac8e085b 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -176,7 +176,7 @@ fn substr(args: &[ArrayRef]) -> Result { // `get_true_start_end('Hi🌏', 1, None) -> (0, 6)` // `get_true_start_end('Hi🌏', 1, 1) -> (0, 1)` // `get_true_start_end('Hi🌏', -10, 2) -> (0, 0)` -fn get_true_start_end( +pub fn get_true_start_end( input: &str, start: i64, count: Option, @@ -235,7 +235,7 @@ fn get_true_start_end( // string, such as `substr(long_str_with_1k_chars, 1, 32)`. // In such case the overhead of ASCII-validation may not be worth it, so // skip the validation for short prefix for now. -fn enable_ascii_fast_path<'a, V: StringArrayType<'a>>( +pub fn enable_ascii_fast_path<'a, V: StringArrayType<'a>>( string_array: &V, start: &Int64Array, count: Option<&Int64Array>, diff --git a/datafusion/spark/Cargo.toml b/datafusion/spark/Cargo.toml index dc5875e5a1d4c..ad2620a532f24 100644 --- a/datafusion/spark/Cargo.toml +++ b/datafusion/spark/Cargo.toml @@ -57,6 +57,7 @@ sha1 = "0.10" url = { workspace = true } [dev-dependencies] +arrow = { workspace = true, features = ["test_utils"] } criterion = { workspace = true } [[bench]] @@ -74,3 +75,7 @@ name = "hex" [[bench]] harness = false name = "slice" + +[[bench]] +harness = false +name = "substring" diff --git a/datafusion/spark/benches/substring.rs b/datafusion/spark/benches/substring.rs new file mode 100644 index 0000000000000..286758f43de3c --- /dev/null +++ b/datafusion/spark/benches/substring.rs @@ -0,0 +1,207 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +extern crate criterion; + +use arrow::array::{ArrayRef, Int64Array, OffsetSizeTrait}; +use arrow::datatypes::{DataType, Field}; +use arrow::util::bench_util::{ + create_string_array_with_len, create_string_view_array_with_len, +}; +use criterion::{Criterion, SamplingMode, criterion_group, criterion_main}; +use datafusion_common::DataFusionError; +use datafusion_common::config::ConfigOptions; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; +use datafusion_spark::function::string::substring; +use std::hint::black_box; +use std::sync::Arc; + +fn create_args_without_count( + size: usize, + str_len: usize, + start_half_way: bool, + force_view_types: bool, +) -> Vec { + let start_array = Arc::new(Int64Array::from( + (0..size) + .map(|_| { + if start_half_way { + (str_len / 2) as i64 + } else { + 1i64 + } + }) + .collect::>(), + )); + + if force_view_types { + let string_array = + Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false)); + vec![ + ColumnarValue::Array(string_array), + ColumnarValue::Array(start_array), + ] + } else { + let string_array = + Arc::new(create_string_array_with_len::(size, 0.1, str_len)); + + vec![ + ColumnarValue::Array(string_array), + ColumnarValue::Array(Arc::clone(&start_array) as ArrayRef), + ] + } +} + +fn create_args_with_count( + size: usize, + str_len: usize, + count_max: usize, + force_view_types: bool, +) -> Vec { + let 
start_array = + Arc::new(Int64Array::from((0..size).map(|_| 1).collect::>())); + let count = count_max.min(str_len) as i64; + let count_array = Arc::new(Int64Array::from( + (0..size).map(|_| count).collect::>(), + )); + + if force_view_types { + let string_array = + Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false)); + vec![ + ColumnarValue::Array(string_array), + ColumnarValue::Array(start_array), + ColumnarValue::Array(count_array), + ] + } else { + let string_array = + Arc::new(create_string_array_with_len::(size, 0.1, str_len)); + + vec![ + ColumnarValue::Array(string_array), + ColumnarValue::Array(Arc::clone(&start_array) as ArrayRef), + ColumnarValue::Array(Arc::clone(&count_array) as ArrayRef), + ] + } +} + +#[expect(clippy::needless_pass_by_value)] +fn invoke_substr_with_args( + args: Vec, + number_rows: usize, +) -> Result { + let arg_fields = args + .iter() + .enumerate() + .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into()) + .collect::>(); + let config_options = Arc::new(ConfigOptions::default()); + + substring().invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields, + number_rows, + return_field: Field::new("f", DataType::Utf8View, true).into(), + config_options: Arc::clone(&config_options), + }) +} + +fn criterion_benchmark(c: &mut Criterion) { + for size in [1024, 4096] { + // string_len = 12, substring_len=6 (see `create_args_without_count`) + let len = 12; + let mut group = c.benchmark_group("SHORTER THAN 12"); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + + let args = create_args_without_count::(size, len, true, true); + group.bench_function( + format!("substr_string_view [size={size}, strlen={len}]"), + |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), + ); + + let args = create_args_without_count::(size, len, false, false); + group.bench_function(format!("substr_string [size={size}, strlen={len}]"), |b| { + b.iter(|| 
black_box(invoke_substr_with_args(args.clone(), size))) + }); + + let args = create_args_without_count::(size, len, true, false); + group.bench_function( + format!("substr_large_string [size={size}, strlen={len}]"), + |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), + ); + + group.finish(); + + // string_len = 128, start=1, count=64, substring_len=64 + let len = 128; + let count = 64; + let mut group = c.benchmark_group("LONGER THAN 12"); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + + let args = create_args_with_count::(size, len, count, true); + group.bench_function( + format!("substr_string_view [size={size}, count={count}, strlen={len}]",), + |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), + ); + + let args = create_args_with_count::(size, len, count, false); + group.bench_function( + format!("substr_string [size={size}, count={count}, strlen={len}]",), + |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), + ); + + let args = create_args_with_count::(size, len, count, false); + group.bench_function( + format!("substr_large_string [size={size}, count={count}, strlen={len}]",), + |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), + ); + + group.finish(); + + // string_len = 128, start=1, count=6, substring_len=6 + let len = 128; + let count = 6; + let mut group = c.benchmark_group("SRC_LEN > 12, SUB_LEN < 12"); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + + let args = create_args_with_count::(size, len, count, true); + group.bench_function( + format!("substr_string_view [size={size}, count={count}, strlen={len}]",), + |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), + ); + + let args = create_args_with_count::(size, len, count, false); + group.bench_function( + format!("substr_string [size={size}, count={count}, strlen={len}]",), + |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), + ); + + 
let args = create_args_with_count::(size, len, count, false); + group.bench_function( + format!("substr_large_string [size={size}, count={count}, strlen={len}]",), + |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), + ); + + group.finish(); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/spark/src/function/string/mod.rs b/datafusion/spark/src/function/string/mod.rs index 369d381a9c35b..1f0108cf509c7 100644 --- a/datafusion/spark/src/function/string/mod.rs +++ b/datafusion/spark/src/function/string/mod.rs @@ -25,6 +25,7 @@ pub mod length; pub mod like; pub mod luhn_check; pub mod space; +pub mod substring; use datafusion_expr::ScalarUDF; use datafusion_functions::make_udf_function; @@ -40,6 +41,7 @@ make_udf_function!(like::SparkLike, like); make_udf_function!(luhn_check::SparkLuhnCheck, luhn_check); make_udf_function!(format_string::FormatStringFunc, format_string); make_udf_function!(space::SparkSpace, space); +make_udf_function!(substring::SparkSubstring, substring); pub mod expr_fn { use datafusion_functions::export_functions; @@ -90,6 +92,11 @@ pub mod expr_fn { strfmt args )); export_functions!((space, "Returns a string consisting of n spaces.", arg1)); + export_functions!(( + substring, + "Returns the substring from string `str` starting at position `pos` with length `length`.", + str pos length + )); } pub fn functions() -> Vec> { @@ -104,5 +111,6 @@ pub fn functions() -> Vec> { luhn_check(), format_string(), space(), + substring(), ] } diff --git a/datafusion/spark/src/function/string/substring.rs b/datafusion/spark/src/function/string/substring.rs new file mode 100644 index 0000000000000..524262b12f193 --- /dev/null +++ b/datafusion/spark/src/function/string/substring.rs @@ -0,0 +1,258 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ + Array, ArrayBuilder, ArrayRef, AsArray, GenericStringBuilder, Int64Array, + OffsetSizeTrait, StringArrayType, StringViewBuilder, +}; +use arrow::datatypes::DataType; +use datafusion_common::arrow::datatypes::{Field, FieldRef}; +use datafusion_common::cast::as_int64_array; +use datafusion_common::types::{ + NativeType, logical_int32, logical_int64, logical_string, +}; +use datafusion_common::{Result, exec_err}; +use datafusion_expr::{Coercion, ReturnFieldArgs, TypeSignatureClass}; +use datafusion_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, + Volatility, +}; +use datafusion_functions::unicode::substr::{enable_ascii_fast_path, get_true_start_end}; +use datafusion_functions::utils::make_scalar_function; +use std::any::Any; +use std::sync::Arc; + +/// Spark-compatible `substring` expression +/// +/// +/// Returns the substring from string starting at position pos with length len. +/// Position is 1-indexed. If pos is negative, it counts from the end of the string. +/// Returns NULL if any input is NULL. 
+#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkSubstring { + signature: Signature, + aliases: Vec, +} + +impl Default for SparkSubstring { + fn default() -> Self { + Self::new() + } +} + +impl SparkSubstring { + pub fn new() -> Self { + let string = Coercion::new_exact(TypeSignatureClass::Native(logical_string())); + let int64 = Coercion::new_implicit( + TypeSignatureClass::Native(logical_int64()), + vec![TypeSignatureClass::Native(logical_int32())], + NativeType::Int64, + ); + Self { + signature: Signature::one_of( + vec![ + TypeSignature::Coercible(vec![string.clone(), int64.clone()]), + TypeSignature::Coercible(vec![ + string.clone(), + int64.clone(), + int64.clone(), + ]), + ], + Volatility::Immutable, + ) + .with_parameter_names(vec![ + "str".to_string(), + "pos".to_string(), + "length".to_string(), + ]) + .expect("valid parameter names"), + aliases: vec![String::from("substr")], + } + } +} + +impl ScalarUDFImpl for SparkSubstring { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "substring" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn aliases(&self) -> &[String] { + &self.aliases + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + make_scalar_function(spark_substring, vec![])(&args.args) + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + datafusion_common::internal_err!( + "return_type should not be called for Spark substring" + ) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs<'_>) -> Result { + // Spark semantics: substring returns NULL if ANY input is NULL + let nullable = args.arg_fields.iter().any(|f| f.is_nullable()); + + Ok(Arc::new(Field::new( + "substring", + args.arg_fields[0].data_type().clone(), + nullable, + ))) + } +} + +fn spark_substring(args: &[ArrayRef]) -> Result { + let start_array = as_int64_array(&args[1])?; + let length_array = if args.len() > 2 { + Some(as_int64_array(&args[2])?) 
+ } else { + None + }; + + match args[0].data_type() { + DataType::Utf8 => spark_substring_impl( + &args[0].as_string::(), + start_array, + length_array, + GenericStringBuilder::::new(), + ), + DataType::LargeUtf8 => spark_substring_impl( + &args[0].as_string::(), + start_array, + length_array, + GenericStringBuilder::::new(), + ), + DataType::Utf8View => spark_substring_impl( + &args[0].as_string_view(), + start_array, + length_array, + StringViewBuilder::new(), + ), + other => exec_err!( + "Unsupported data type {other:?} for function spark_substring, expected Utf8View, Utf8 or LargeUtf8." + ), + } +} + +/// Convert Spark's start position to DataFusion's 1-based start position. +/// +/// Spark semantics: +/// - Positive start: 1-based index from beginning +/// - Zero start: treated as 1 +/// - Negative start: counts from end of string +/// +/// Returns the converted 1-based start position for use with `get_true_start_end`. +#[inline] +fn spark_start_to_datafusion_start(start: i64, len: usize) -> i64 { + if start >= 0 { + start.max(1) + } else { + let len_i64 = i64::try_from(len).unwrap_or(i64::MAX); + let start = start.saturating_add(len_i64).saturating_add(1); + start.max(1) + } +} + +trait StringArrayBuilder: ArrayBuilder { + fn append_value(&mut self, val: &str); + fn append_null(&mut self); +} + +impl StringArrayBuilder for GenericStringBuilder { + fn append_value(&mut self, val: &str) { + GenericStringBuilder::append_value(self, val); + } + fn append_null(&mut self) { + GenericStringBuilder::append_null(self); + } +} + +impl StringArrayBuilder for StringViewBuilder { + fn append_value(&mut self, val: &str) { + StringViewBuilder::append_value(self, val); + } + fn append_null(&mut self) { + StringViewBuilder::append_null(self); + } +} + +fn spark_substring_impl<'a, V, B>( + string_array: &V, + start_array: &Int64Array, + length_array: Option<&Int64Array>, + mut builder: B, +) -> Result +where + V: StringArrayType<'a>, + B: StringArrayBuilder, +{ + let is_ascii 
= enable_ascii_fast_path(string_array, start_array, length_array); + + for i in 0..string_array.len() { + if string_array.is_null(i) || start_array.is_null(i) { + builder.append_null(); + continue; + } + + if let Some(len_arr) = length_array + && len_arr.is_null(i) + { + builder.append_null(); + continue; + } + + let string = string_array.value(i); + let start = start_array.value(i); + let len_opt = length_array.map(|arr| arr.value(i)); + + // Spark: negative length returns empty string + if let Some(len) = len_opt + && len < 0 + { + builder.append_value(""); + continue; + } + + let string_len = if is_ascii { + string.len() + } else { + string.chars().count() + }; + + let adjusted_start = spark_start_to_datafusion_start(start, string_len); + + let (byte_start, byte_end) = get_true_start_end( + string, + adjusted_start, + len_opt.map(|l| l as u64), + is_ascii, + ); + let substr = &string[byte_start..byte_end]; + builder.append_value(substr); + } + + Ok(builder.finish()) +} diff --git a/datafusion/spark/src/lib.rs b/datafusion/spark/src/lib.rs index aad3ceed68ce3..f67367734cf93 100644 --- a/datafusion/spark/src/lib.rs +++ b/datafusion/spark/src/lib.rs @@ -95,6 +95,7 @@ //![`Expr`]: datafusion_expr::Expr pub mod function; +pub mod planner; use datafusion_catalog::TableFunction; use datafusion_common::Result; diff --git a/datafusion/spark/src/planner.rs b/datafusion/spark/src/planner.rs new file mode 100644 index 0000000000000..8b68617828175 --- /dev/null +++ b/datafusion/spark/src/planner.rs @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datafusion_expr::Expr; +use datafusion_expr::expr::ScalarFunction; +use datafusion_expr::planner::{ExprPlanner, PlannerResult}; + +#[derive(Default, Debug)] +pub struct SparkFunctionPlanner; + +impl ExprPlanner for SparkFunctionPlanner { + fn plan_substring( + &self, + args: Vec, + ) -> datafusion_common::Result>> { + Ok(PlannerResult::Planned(Expr::ScalarFunction( + ScalarFunction::new_udf(crate::function::string::substring(), args), + ))) + } +} diff --git a/datafusion/sqllogictest/src/test_context.rs b/datafusion/sqllogictest/src/test_context.rs index d416dc1bcfbfc..19ec3e7613942 100644 --- a/datafusion/sqllogictest/src/test_context.rs +++ b/datafusion/sqllogictest/src/test_context.rs @@ -21,6 +21,7 @@ use std::fs::File; use std::io::Write; use std::path::Path; use std::sync::Arc; +use std::vec; use arrow::array::{ Array, ArrayRef, BinaryArray, Float64Array, Int32Array, LargeBinaryArray, @@ -80,11 +81,18 @@ impl TestContext { // hardcode target partitions so plans are deterministic .with_target_partitions(4); let runtime = Arc::new(RuntimeEnv::default()); - let mut state = SessionStateBuilder::new() + + let mut state_builder = SessionStateBuilder::new() .with_config(config) - .with_runtime_env(runtime) - .with_default_features() - .build(); + .with_runtime_env(runtime); + + if is_spark_path(relative_path) { + state_builder = state_builder.with_expr_planners(vec![Arc::new( + datafusion_spark::planner::SparkFunctionPlanner, + )]); + } + + let mut state = state_builder.with_default_features().build(); if is_spark_path(relative_path) { 
info!("Registering Spark functions"); diff --git a/datafusion/sqllogictest/test_files/spark/string/substr.slt b/datafusion/sqllogictest/test_files/spark/string/substr.slt deleted file mode 100644 index 0942bdd86a4ef..0000000000000 --- a/datafusion/sqllogictest/test_files/spark/string/substr.slt +++ /dev/null @@ -1,37 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# This file was originally created by a porting script from: -# https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function -# This file is part of the implementation of the datafusion-spark function library. 
-# For more information, please see: -# https://github.com/apache/datafusion/issues/15914 - -## Original Query: SELECT substr('Spark SQL', -3); -## PySpark 3.5.5 Result: {'substr(Spark SQL, -3, 2147483647)': 'SQL', 'typeof(substr(Spark SQL, -3, 2147483647))': 'string', 'typeof(Spark SQL)': 'string', 'typeof(-3)': 'int'} -#query -#SELECT substr('Spark SQL'::string, -3::int); - -## Original Query: SELECT substr('Spark SQL', 5); -## PySpark 3.5.5 Result: {'substr(Spark SQL, 5, 2147483647)': 'k SQL', 'typeof(substr(Spark SQL, 5, 2147483647))': 'string', 'typeof(Spark SQL)': 'string', 'typeof(5)': 'int'} -#query -#SELECT substr('Spark SQL'::string, 5::int); - -## Original Query: SELECT substr('Spark SQL', 5, 1); -## PySpark 3.5.5 Result: {'substr(Spark SQL, 5, 1)': 'k', 'typeof(substr(Spark SQL, 5, 1))': 'string', 'typeof(Spark SQL)': 'string', 'typeof(5)': 'int', 'typeof(1)': 'int'} -#query -#SELECT substr('Spark SQL'::string, 5::int, 1::int); diff --git a/datafusion/sqllogictest/test_files/spark/string/substring.slt b/datafusion/sqllogictest/test_files/spark/string/substring.slt index 847ce4b6d4739..5bf2fdf2fb954 100644 --- a/datafusion/sqllogictest/test_files/spark/string/substring.slt +++ b/datafusion/sqllogictest/test_files/spark/string/substring.slt @@ -15,23 +15,189 @@ # specific language governing permissions and limitations # under the License. -# This file was originally created by a porting script from: -# https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function -# This file is part of the implementation of the datafusion-spark function library. 
-# For more information, please see: -# https://github.com/apache/datafusion/issues/15914 - -## Original Query: SELECT substring('Spark SQL', -3); -## PySpark 3.5.5 Result: {'substring(Spark SQL, -3, 2147483647)': 'SQL', 'typeof(substring(Spark SQL, -3, 2147483647))': 'string', 'typeof(Spark SQL)': 'string', 'typeof(-3)': 'int'} -#query -#SELECT substring('Spark SQL'::string, -3::int); - -## Original Query: SELECT substring('Spark SQL', 5); -## PySpark 3.5.5 Result: {'substring(Spark SQL, 5, 2147483647)': 'k SQL', 'typeof(substring(Spark SQL, 5, 2147483647))': 'string', 'typeof(Spark SQL)': 'string', 'typeof(5)': 'int'} -#query -#SELECT substring('Spark SQL'::string, 5::int); - -## Original Query: SELECT substring('Spark SQL', 5, 1); -## PySpark 3.5.5 Result: {'substring(Spark SQL, 5, 1)': 'k', 'typeof(substring(Spark SQL, 5, 1))': 'string', 'typeof(Spark SQL)': 'string', 'typeof(5)': 'int', 'typeof(1)': 'int'} -#query -#SELECT substring('Spark SQL'::string, 5::int, 1::int); + +query T +SELECT substring('Spark SQL'::string, 0::int); +---- +Spark SQL + +query T +SELECT substring('Spark SQL'::string, 5::int); +---- +k SQL + +query T +SELECT substring('Spark SQL'::string, 3::int, 1::int); +---- +a + +# Test negative start +query T +SELECT substring('Spark SQL'::string, -3::int); +---- +SQL + +query T +SELECT substring('Spark SQL'::string, -3::int, 2::int); +---- +SQ + +# Test length exceeding string length +query T +SELECT substring('Spark SQL'::string, 2::int, 700::int); +---- +park SQL + +# Test start position beyond string length +query T +SELECT substring('Spark SQL'::string, 30::int); +---- +(empty) + +query T +SELECT substring('Spark SQL'::string, -30::int); +---- +Spark SQL + +# Test negative length +query T +SELECT substring('Spark SQL'::string, 3::int, -1::int); +---- +(empty) + +query T +SELECT substring('Spark SQL'::string, 3::int, 0::int); +---- +(empty) + +# Test unicode strings +query T +SELECT substring('joséésoj'::string, 5::int); +---- +ésoj + +query 
T +SELECT substring('joséésoj'::string, 5::int, 2::int); +---- +és + +# NULL handling +query T +SELECT substring('Spark SQL'::string, NULL::int); +---- +NULL + +query T +SELECT substring(NULL::string, 5::int); +---- +NULL + +query T +SELECT substring(NULL::string, 3::int, 1::int); +---- +NULL + +query T +SELECT substring('Spark SQL'::string, NULL::int, 1::int); +---- +NULL + +query T +SELECT substring('Spark SQL'::string, 3::int, NULL::int); +---- +NULL + +query T +SELECT substring(column1, column2) +FROM VALUES +('Spark SQL'::string, 0::int), +('Spark SQL'::string, 5::int), +('Spark SQL'::string, -3::int), +('Spark SQL'::string, 500::int), +('Spark SQL'::string, -300::int), +(NULL::string, 5::int), +('Spark SQL'::string, NULL::int); +---- +Spark SQL +k SQL +SQL +(empty) +Spark SQL +NULL +NULL + +query T +SELECT substring(column1, column2, column3) +FROM VALUES +('Spark SQL'::string, -3::int, 2::int), +('Spark SQL'::string, 3::int, 1::int), +('Spark SQL'::string, 3::int, 700::int), +('Spark SQL'::string, 3::int, -1::int), +('Spark SQL'::string, 3::int, 0::int), +('Spark SQL'::string, 300::int, 3::int), +('Spark SQL'::string, -300::int, 3::int), +(NULL::string, 3::int, 1::int), +('Spark SQL'::string, NULL::int, 1::int), +('Spark SQL'::string, 3::int, NULL::int); +---- +SQ +a +ark SQL +(empty) +(empty) +(empty) +Spa +NULL +NULL +NULL + +# alias substr + +query T +SELECT substr('Spark SQL'::string, 0::int); +---- +Spark SQL + +query T +SELECT substr(column1, column2) +FROM VALUES +('Spark SQL'::string, 0::int), +('Spark SQL'::string, 5::int), +('Spark SQL'::string, -3::int), +('Spark SQL'::string, 500::int), +('Spark SQL'::string, -300::int), +(NULL::string, 5::int), +('Spark SQL'::string, NULL::int); +---- +Spark SQL +k SQL +SQL +(empty) +Spark SQL +NULL +NULL + +query T +SELECT substr(column1, column2, column3) +FROM VALUES +('Spark SQL'::string, -3::int, 2::int), +('Spark SQL'::string, 3::int, 1::int), +('Spark SQL'::string, 3::int, 700::int), +('Spark 
SQL'::string, 3::int, -1::int), +('Spark SQL'::string, 3::int, 0::int), +('Spark SQL'::string, 300::int, 3::int), +('Spark SQL'::string, -300::int, 3::int), +(NULL::string, 3::int, 1::int), +('Spark SQL'::string, NULL::int, 1::int), +('Spark SQL'::string, 3::int, NULL::int); +---- +SQ +a +ark SQL +(empty) +(empty) +(empty) +Spa +NULL +NULL +NULL