diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index cc1d53b3aad67..9517aac8e085b 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -176,7 +176,7 @@ fn substr(args: &[ArrayRef]) -> Result { // `get_true_start_end('Hi🌏', 1, None) -> (0, 6)` // `get_true_start_end('Hi🌏', 1, 1) -> (0, 1)` // `get_true_start_end('Hi🌏', -10, 2) -> (0, 0)` -fn get_true_start_end( +pub fn get_true_start_end( input: &str, start: i64, count: Option, @@ -235,7 +235,7 @@ fn get_true_start_end( // string, such as `substr(long_str_with_1k_chars, 1, 32)`. // In such case the overhead of ASCII-validation may not be worth it, so // skip the validation for short prefix for now. -fn enable_ascii_fast_path<'a, V: StringArrayType<'a>>( +pub fn enable_ascii_fast_path<'a, V: StringArrayType<'a>>( string_array: &V, start: &Int64Array, count: Option<&Int64Array>, diff --git a/datafusion/spark/Cargo.toml b/datafusion/spark/Cargo.toml index dc5875e5a1d4c..ad2620a532f24 100644 --- a/datafusion/spark/Cargo.toml +++ b/datafusion/spark/Cargo.toml @@ -57,6 +57,7 @@ sha1 = "0.10" url = { workspace = true } [dev-dependencies] +arrow = { workspace = true, features = ["test_utils"] } criterion = { workspace = true } [[bench]] @@ -74,3 +75,7 @@ name = "hex" [[bench]] harness = false name = "slice" + +[[bench]] +harness = false +name = "substring" diff --git a/datafusion/spark/benches/substring.rs b/datafusion/spark/benches/substring.rs new file mode 100644 index 0000000000000..286758f43de3c --- /dev/null +++ b/datafusion/spark/benches/substring.rs @@ -0,0 +1,207 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +extern crate criterion; + +use arrow::array::{ArrayRef, Int64Array, OffsetSizeTrait}; +use arrow::datatypes::{DataType, Field}; +use arrow::util::bench_util::{ + create_string_array_with_len, create_string_view_array_with_len, +}; +use criterion::{Criterion, SamplingMode, criterion_group, criterion_main}; +use datafusion_common::DataFusionError; +use datafusion_common::config::ConfigOptions; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; +use datafusion_spark::function::string::substring; +use std::hint::black_box; +use std::sync::Arc; + +fn create_args_without_count( + size: usize, + str_len: usize, + start_half_way: bool, + force_view_types: bool, +) -> Vec { + let start_array = Arc::new(Int64Array::from( + (0..size) + .map(|_| { + if start_half_way { + (str_len / 2) as i64 + } else { + 1i64 + } + }) + .collect::>(), + )); + + if force_view_types { + let string_array = + Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false)); + vec![ + ColumnarValue::Array(string_array), + ColumnarValue::Array(start_array), + ] + } else { + let string_array = + Arc::new(create_string_array_with_len::(size, 0.1, str_len)); + + vec![ + ColumnarValue::Array(string_array), + ColumnarValue::Array(Arc::clone(&start_array) as ArrayRef), + ] + } +} + +fn create_args_with_count( + size: usize, + str_len: usize, + count_max: usize, + force_view_types: bool, +) -> Vec { + let 
start_array = + Arc::new(Int64Array::from((0..size).map(|_| 1).collect::>())); + let count = count_max.min(str_len) as i64; + let count_array = Arc::new(Int64Array::from( + (0..size).map(|_| count).collect::>(), + )); + + if force_view_types { + let string_array = + Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false)); + vec![ + ColumnarValue::Array(string_array), + ColumnarValue::Array(start_array), + ColumnarValue::Array(count_array), + ] + } else { + let string_array = + Arc::new(create_string_array_with_len::(size, 0.1, str_len)); + + vec![ + ColumnarValue::Array(string_array), + ColumnarValue::Array(Arc::clone(&start_array) as ArrayRef), + ColumnarValue::Array(Arc::clone(&count_array) as ArrayRef), + ] + } +} + +#[expect(clippy::needless_pass_by_value)] +fn invoke_substr_with_args( + args: Vec, + number_rows: usize, +) -> Result { + let arg_fields = args + .iter() + .enumerate() + .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into()) + .collect::>(); + let config_options = Arc::new(ConfigOptions::default()); + + substring().invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields, + number_rows, + return_field: Field::new("f", DataType::Utf8View, true).into(), + config_options: Arc::clone(&config_options), + }) +} + +fn criterion_benchmark(c: &mut Criterion) { + for size in [1024, 4096] { + // string_len = 12, substring_len=6 (see `create_args_without_count`) + let len = 12; + let mut group = c.benchmark_group("SHORTER THAN 12"); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + + let args = create_args_without_count::(size, len, true, true); + group.bench_function( + format!("substr_string_view [size={size}, strlen={len}]"), + |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), + ); + + let args = create_args_without_count::(size, len, false, false); + group.bench_function(format!("substr_string [size={size}, strlen={len}]"), |b| { + b.iter(|| 
black_box(invoke_substr_with_args(args.clone(), size))) + }); + + let args = create_args_without_count::(size, len, true, false); + group.bench_function( + format!("substr_large_string [size={size}, strlen={len}]"), + |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), + ); + + group.finish(); + + // string_len = 128, start=1, count=64, substring_len=64 + let len = 128; + let count = 64; + let mut group = c.benchmark_group("LONGER THAN 12"); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + + let args = create_args_with_count::(size, len, count, true); + group.bench_function( + format!("substr_string_view [size={size}, count={count}, strlen={len}]",), + |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), + ); + + let args = create_args_with_count::(size, len, count, false); + group.bench_function( + format!("substr_string [size={size}, count={count}, strlen={len}]",), + |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), + ); + + let args = create_args_with_count::(size, len, count, false); + group.bench_function( + format!("substr_large_string [size={size}, count={count}, strlen={len}]",), + |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), + ); + + group.finish(); + + // string_len = 128, start=1, count=6, substring_len=6 + let len = 128; + let count = 6; + let mut group = c.benchmark_group("SRC_LEN > 12, SUB_LEN < 12"); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + + let args = create_args_with_count::(size, len, count, true); + group.bench_function( + format!("substr_string_view [size={size}, count={count}, strlen={len}]",), + |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), + ); + + let args = create_args_with_count::(size, len, count, false); + group.bench_function( + format!("substr_string [size={size}, count={count}, strlen={len}]",), + |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), + ); + + 
let args = create_args_with_count::(size, len, count, false); + group.bench_function( + format!("substr_large_string [size={size}, count={count}, strlen={len}]",), + |b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))), + ); + + group.finish(); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/spark/src/function/string/mod.rs b/datafusion/spark/src/function/string/mod.rs index 369d381a9c35b..1f0108cf509c7 100644 --- a/datafusion/spark/src/function/string/mod.rs +++ b/datafusion/spark/src/function/string/mod.rs @@ -25,6 +25,7 @@ pub mod length; pub mod like; pub mod luhn_check; pub mod space; +pub mod substring; use datafusion_expr::ScalarUDF; use datafusion_functions::make_udf_function; @@ -40,6 +41,7 @@ make_udf_function!(like::SparkLike, like); make_udf_function!(luhn_check::SparkLuhnCheck, luhn_check); make_udf_function!(format_string::FormatStringFunc, format_string); make_udf_function!(space::SparkSpace, space); +make_udf_function!(substring::SparkSubstring, substring); pub mod expr_fn { use datafusion_functions::export_functions; @@ -90,6 +92,11 @@ pub mod expr_fn { strfmt args )); export_functions!((space, "Returns a string consisting of n spaces.", arg1)); + export_functions!(( + substring, + "Returns the substring from string `str` starting at position `pos` with length `length`.", + str pos length + )); } pub fn functions() -> Vec> { @@ -104,5 +111,6 @@ pub fn functions() -> Vec> { luhn_check(), format_string(), space(), + substring(), ] } diff --git a/datafusion/spark/src/function/string/substring.rs b/datafusion/spark/src/function/string/substring.rs new file mode 100644 index 0000000000000..524262b12f193 --- /dev/null +++ b/datafusion/spark/src/function/string/substring.rs @@ -0,0 +1,258 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ + Array, ArrayBuilder, ArrayRef, AsArray, GenericStringBuilder, Int64Array, + OffsetSizeTrait, StringArrayType, StringViewBuilder, +}; +use arrow::datatypes::DataType; +use datafusion_common::arrow::datatypes::{Field, FieldRef}; +use datafusion_common::cast::as_int64_array; +use datafusion_common::types::{ + NativeType, logical_int32, logical_int64, logical_string, +}; +use datafusion_common::{Result, exec_err}; +use datafusion_expr::{Coercion, ReturnFieldArgs, TypeSignatureClass}; +use datafusion_expr::{ + ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, + Volatility, +}; +use datafusion_functions::unicode::substr::{enable_ascii_fast_path, get_true_start_end}; +use datafusion_functions::utils::make_scalar_function; +use std::any::Any; +use std::sync::Arc; + +/// Spark-compatible `substring` expression +/// +/// +/// Returns the substring from string starting at position pos with length len. +/// Position is 1-indexed. If pos is negative, it counts from the end of the string. +/// Returns NULL if any input is NULL. 
+#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkSubstring { + signature: Signature, + aliases: Vec, +} + +impl Default for SparkSubstring { + fn default() -> Self { + Self::new() + } +} + +impl SparkSubstring { + pub fn new() -> Self { + let string = Coercion::new_exact(TypeSignatureClass::Native(logical_string())); + let int64 = Coercion::new_implicit( + TypeSignatureClass::Native(logical_int64()), + vec![TypeSignatureClass::Native(logical_int32())], + NativeType::Int64, + ); + Self { + signature: Signature::one_of( + vec![ + TypeSignature::Coercible(vec![string.clone(), int64.clone()]), + TypeSignature::Coercible(vec![ + string.clone(), + int64.clone(), + int64.clone(), + ]), + ], + Volatility::Immutable, + ) + .with_parameter_names(vec![ + "str".to_string(), + "pos".to_string(), + "length".to_string(), + ]) + .expect("valid parameter names"), + aliases: vec![String::from("substr")], + } + } +} + +impl ScalarUDFImpl for SparkSubstring { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "substring" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn aliases(&self) -> &[String] { + &self.aliases + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + make_scalar_function(spark_substring, vec![])(&args.args) + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + datafusion_common::internal_err!( + "return_type should not be called for Spark substring" + ) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs<'_>) -> Result { + // Spark semantics: substring returns NULL if ANY input is NULL + let nullable = args.arg_fields.iter().any(|f| f.is_nullable()); + + Ok(Arc::new(Field::new( + "substring", + args.arg_fields[0].data_type().clone(), + nullable, + ))) + } +} + +fn spark_substring(args: &[ArrayRef]) -> Result { + let start_array = as_int64_array(&args[1])?; + let length_array = if args.len() > 2 { + Some(as_int64_array(&args[2])?) 
+ } else { + None + }; + + match args[0].data_type() { + DataType::Utf8 => spark_substring_impl( + &args[0].as_string::(), + start_array, + length_array, + GenericStringBuilder::::new(), + ), + DataType::LargeUtf8 => spark_substring_impl( + &args[0].as_string::(), + start_array, + length_array, + GenericStringBuilder::::new(), + ), + DataType::Utf8View => spark_substring_impl( + &args[0].as_string_view(), + start_array, + length_array, + StringViewBuilder::new(), + ), + other => exec_err!( + "Unsupported data type {other:?} for function spark_substring, expected Utf8View, Utf8 or LargeUtf8." + ), + } +} + +/// Convert Spark's start position to DataFusion's 1-based start position. +/// +/// Spark semantics: +/// - Positive start: 1-based index from beginning +/// - Zero start: treated as 1 +/// - Negative start: counts from end of string +/// +/// Returns the converted 1-based start position for use with `get_true_start_end`. +#[inline] +fn spark_start_to_datafusion_start(start: i64, len: usize) -> i64 { + if start >= 0 { + start.max(1) + } else { + let len_i64 = i64::try_from(len).unwrap_or(i64::MAX); + let start = start.saturating_add(len_i64).saturating_add(1); + start.max(1) + } +} + +trait StringArrayBuilder: ArrayBuilder { + fn append_value(&mut self, val: &str); + fn append_null(&mut self); +} + +impl StringArrayBuilder for GenericStringBuilder { + fn append_value(&mut self, val: &str) { + GenericStringBuilder::append_value(self, val); + } + fn append_null(&mut self) { + GenericStringBuilder::append_null(self); + } +} + +impl StringArrayBuilder for StringViewBuilder { + fn append_value(&mut self, val: &str) { + StringViewBuilder::append_value(self, val); + } + fn append_null(&mut self) { + StringViewBuilder::append_null(self); + } +} + +fn spark_substring_impl<'a, V, B>( + string_array: &V, + start_array: &Int64Array, + length_array: Option<&Int64Array>, + mut builder: B, +) -> Result +where + V: StringArrayType<'a>, + B: StringArrayBuilder, +{ + let is_ascii 
= enable_ascii_fast_path(string_array, start_array, length_array); + + for i in 0..string_array.len() { + if string_array.is_null(i) || start_array.is_null(i) { + builder.append_null(); + continue; + } + + if let Some(len_arr) = length_array + && len_arr.is_null(i) + { + builder.append_null(); + continue; + } + + let string = string_array.value(i); + let start = start_array.value(i); + let len_opt = length_array.map(|arr| arr.value(i)); + + // Spark: negative length returns empty string + if let Some(len) = len_opt + && len < 0 + { + builder.append_value(""); + continue; + } + + let string_len = if is_ascii { + string.len() + } else { + string.chars().count() + }; + + let adjusted_start = spark_start_to_datafusion_start(start, string_len); + + let (byte_start, byte_end) = get_true_start_end( + string, + adjusted_start, + len_opt.map(|l| l as u64), + is_ascii, + ); + let substr = &string[byte_start..byte_end]; + builder.append_value(substr); + } + + Ok(builder.finish()) +} diff --git a/datafusion/spark/src/lib.rs b/datafusion/spark/src/lib.rs index aad3ceed68ce3..f67367734cf93 100644 --- a/datafusion/spark/src/lib.rs +++ b/datafusion/spark/src/lib.rs @@ -95,6 +95,7 @@ //![`Expr`]: datafusion_expr::Expr pub mod function; +pub mod planner; use datafusion_catalog::TableFunction; use datafusion_common::Result; diff --git a/datafusion/spark/src/planner.rs b/datafusion/spark/src/planner.rs new file mode 100644 index 0000000000000..8b68617828175 --- /dev/null +++ b/datafusion/spark/src/planner.rs @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datafusion_expr::Expr; +use datafusion_expr::expr::ScalarFunction; +use datafusion_expr::planner::{ExprPlanner, PlannerResult}; + +#[derive(Default, Debug)] +pub struct SparkFunctionPlanner; + +impl ExprPlanner for SparkFunctionPlanner { + fn plan_substring( + &self, + args: Vec, + ) -> datafusion_common::Result>> { + Ok(PlannerResult::Planned(Expr::ScalarFunction( + ScalarFunction::new_udf(crate::function::string::substring(), args), + ))) + } +} diff --git a/datafusion/sqllogictest/src/test_context.rs b/datafusion/sqllogictest/src/test_context.rs index d416dc1bcfbfc..19ec3e7613942 100644 --- a/datafusion/sqllogictest/src/test_context.rs +++ b/datafusion/sqllogictest/src/test_context.rs @@ -21,6 +21,7 @@ use std::fs::File; use std::io::Write; use std::path::Path; use std::sync::Arc; +use std::vec; use arrow::array::{ Array, ArrayRef, BinaryArray, Float64Array, Int32Array, LargeBinaryArray, @@ -80,11 +81,18 @@ impl TestContext { // hardcode target partitions so plans are deterministic .with_target_partitions(4); let runtime = Arc::new(RuntimeEnv::default()); - let mut state = SessionStateBuilder::new() + + let mut state_builder = SessionStateBuilder::new() .with_config(config) - .with_runtime_env(runtime) - .with_default_features() - .build(); + .with_runtime_env(runtime); + + if is_spark_path(relative_path) { + state_builder = state_builder.with_expr_planners(vec![Arc::new( + datafusion_spark::planner::SparkFunctionPlanner, + )]); + } + + let mut state = state_builder.with_default_features().build(); if is_spark_path(relative_path) { 
info!("Registering Spark functions"); diff --git a/datafusion/sqllogictest/test_files/spark/string/substr.slt b/datafusion/sqllogictest/test_files/spark/string/substr.slt deleted file mode 100644 index 0942bdd86a4ef..0000000000000 --- a/datafusion/sqllogictest/test_files/spark/string/substr.slt +++ /dev/null @@ -1,37 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# This file was originally created by a porting script from: -# https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function -# This file is part of the implementation of the datafusion-spark function library. 
-# For more information, please see: -# https://github.com/apache/datafusion/issues/15914 - -## Original Query: SELECT substr('Spark SQL', -3); -## PySpark 3.5.5 Result: {'substr(Spark SQL, -3, 2147483647)': 'SQL', 'typeof(substr(Spark SQL, -3, 2147483647))': 'string', 'typeof(Spark SQL)': 'string', 'typeof(-3)': 'int'} -#query -#SELECT substr('Spark SQL'::string, -3::int); - -## Original Query: SELECT substr('Spark SQL', 5); -## PySpark 3.5.5 Result: {'substr(Spark SQL, 5, 2147483647)': 'k SQL', 'typeof(substr(Spark SQL, 5, 2147483647))': 'string', 'typeof(Spark SQL)': 'string', 'typeof(5)': 'int'} -#query -#SELECT substr('Spark SQL'::string, 5::int); - -## Original Query: SELECT substr('Spark SQL', 5, 1); -## PySpark 3.5.5 Result: {'substr(Spark SQL, 5, 1)': 'k', 'typeof(substr(Spark SQL, 5, 1))': 'string', 'typeof(Spark SQL)': 'string', 'typeof(5)': 'int', 'typeof(1)': 'int'} -#query -#SELECT substr('Spark SQL'::string, 5::int, 1::int); diff --git a/datafusion/sqllogictest/test_files/spark/string/substring.slt b/datafusion/sqllogictest/test_files/spark/string/substring.slt index 847ce4b6d4739..5bf2fdf2fb954 100644 --- a/datafusion/sqllogictest/test_files/spark/string/substring.slt +++ b/datafusion/sqllogictest/test_files/spark/string/substring.slt @@ -15,23 +15,189 @@ # specific language governing permissions and limitations # under the License. -# This file was originally created by a porting script from: -# https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function -# This file is part of the implementation of the datafusion-spark function library. 
-# For more information, please see: -# https://github.com/apache/datafusion/issues/15914 - -## Original Query: SELECT substring('Spark SQL', -3); -## PySpark 3.5.5 Result: {'substring(Spark SQL, -3, 2147483647)': 'SQL', 'typeof(substring(Spark SQL, -3, 2147483647))': 'string', 'typeof(Spark SQL)': 'string', 'typeof(-3)': 'int'} -#query -#SELECT substring('Spark SQL'::string, -3::int); - -## Original Query: SELECT substring('Spark SQL', 5); -## PySpark 3.5.5 Result: {'substring(Spark SQL, 5, 2147483647)': 'k SQL', 'typeof(substring(Spark SQL, 5, 2147483647))': 'string', 'typeof(Spark SQL)': 'string', 'typeof(5)': 'int'} -#query -#SELECT substring('Spark SQL'::string, 5::int); - -## Original Query: SELECT substring('Spark SQL', 5, 1); -## PySpark 3.5.5 Result: {'substring(Spark SQL, 5, 1)': 'k', 'typeof(substring(Spark SQL, 5, 1))': 'string', 'typeof(Spark SQL)': 'string', 'typeof(5)': 'int', 'typeof(1)': 'int'} -#query -#SELECT substring('Spark SQL'::string, 5::int, 1::int); + +query T +SELECT substring('Spark SQL'::string, 0::int); +---- +Spark SQL + +query T +SELECT substring('Spark SQL'::string, 5::int); +---- +k SQL + +query T +SELECT substring('Spark SQL'::string, 3::int, 1::int); +---- +a + +# Test negative start +query T +SELECT substring('Spark SQL'::string, -3::int); +---- +SQL + +query T +SELECT substring('Spark SQL'::string, -3::int, 2::int); +---- +SQ + +# Test length exceeding string length +query T +SELECT substring('Spark SQL'::string, 2::int, 700::int); +---- +park SQL + +# Test start position beyond string length +query T +SELECT substring('Spark SQL'::string, 30::int); +---- +(empty) + +query T +SELECT substring('Spark SQL'::string, -30::int); +---- +Spark SQL + +# Test negative length +query T +SELECT substring('Spark SQL'::string, 3::int, -1::int); +---- +(empty) + +query T +SELECT substring('Spark SQL'::string, 3::int, 0::int); +---- +(empty) + +# Test unicode strings +query T +SELECT substring('joséésoj'::string, 5::int); +---- +ésoj + +query 
T +SELECT substring('joséésoj'::string, 5::int, 2::int); +---- +és + +# NULL handling +query T +SELECT substring('Spark SQL'::string, NULL::int); +---- +NULL + +query T +SELECT substring(NULL::string, 5::int); +---- +NULL + +query T +SELECT substring(NULL::string, 3::int, 1::int); +---- +NULL + +query T +SELECT substring('Spark SQL'::string, NULL::int, 1::int); +---- +NULL + +query T +SELECT substring('Spark SQL'::string, 3::int, NULL::int); +---- +NULL + +query T +SELECT substring(column1, column2) +FROM VALUES +('Spark SQL'::string, 0::int), +('Spark SQL'::string, 5::int), +('Spark SQL'::string, -3::int), +('Spark SQL'::string, 500::int), +('Spark SQL'::string, -300::int), +(NULL::string, 5::int), +('Spark SQL'::string, NULL::int); +---- +Spark SQL +k SQL +SQL +(empty) +Spark SQL +NULL +NULL + +query T +SELECT substring(column1, column2, column3) +FROM VALUES +('Spark SQL'::string, -3::int, 2::int), +('Spark SQL'::string, 3::int, 1::int), +('Spark SQL'::string, 3::int, 700::int), +('Spark SQL'::string, 3::int, -1::int), +('Spark SQL'::string, 3::int, 0::int), +('Spark SQL'::string, 300::int, 3::int), +('Spark SQL'::string, -300::int, 3::int), +(NULL::string, 3::int, 1::int), +('Spark SQL'::string, NULL::int, 1::int), +('Spark SQL'::string, 3::int, NULL::int); +---- +SQ +a +ark SQL +(empty) +(empty) +(empty) +Spa +NULL +NULL +NULL + +# alias substr + +query T +SELECT substr('Spark SQL'::string, 0::int); +---- +Spark SQL + +query T +SELECT substr(column1, column2) +FROM VALUES +('Spark SQL'::string, 0::int), +('Spark SQL'::string, 5::int), +('Spark SQL'::string, -3::int), +('Spark SQL'::string, 500::int), +('Spark SQL'::string, -300::int), +(NULL::string, 5::int), +('Spark SQL'::string, NULL::int); +---- +Spark SQL +k SQL +SQL +(empty) +Spark SQL +NULL +NULL + +query T +SELECT substr(column1, column2, column3) +FROM VALUES +('Spark SQL'::string, -3::int, 2::int), +('Spark SQL'::string, 3::int, 1::int), +('Spark SQL'::string, 3::int, 700::int), +('Spark 
SQL'::string, 3::int, -1::int), +('Spark SQL'::string, 3::int, 0::int), +('Spark SQL'::string, 300::int, 3::int), +('Spark SQL'::string, -300::int, 3::int), +(NULL::string, 3::int, 1::int), +('Spark SQL'::string, NULL::int, 1::int), +('Spark SQL'::string, 3::int, NULL::int); +---- +SQ +a +ark SQL +(empty) +(empty) +(empty) +Spa +NULL +NULL +NULL