From 12baeecd0067a30e880bee800a8518f58facb238 Mon Sep 17 00:00:00 2001
From: DavdaJames <jrdavda007@gmail.com>
Date: Wed, 1 Oct 2025 21:56:31 +0530
Subject: [PATCH 1/2] updated existing tests for split_recursively.rs to be
 based on public APIs

---
 src/ops/functions/split_recursively.rs | 562 +++++++++++++++++--------
 1 file changed, 394 insertions(+), 168 deletions(-)

diff --git a/src/ops/functions/split_recursively.rs b/src/ops/functions/split_recursively.rs
index 5386bb964..6a6c704b5 100644
--- a/src/ops/functions/split_recursively.rs
+++ b/src/ops/functions/split_recursively.rs
@@ -939,45 +939,8 @@ pub fn register(registry: &mut ExecutorFactoryRegistry) -> Result<()> {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::ops::{functions::test_utils::test_flow_function, shared::split::OutputPosition};
-
-    // Helper function to assert chunk text and its consistency with the range within the original text.
-    fn assert_chunk_text_consistency(
-        full_text: &str, // Added full text
-        actual_chunk: &ChunkOutput<'_>,
-        expected_text: &str,
-        context: &str,
-    ) {
-        // Extract text using the chunk's range from the original full text.
-        let extracted_text = full_text
-            .get(actual_chunk.start_pos.byte_offset..actual_chunk.end_pos.byte_offset)
-            .unwrap();
-        // Assert that the expected text matches the text provided in the chunk.
-        assert_eq!(
-            actual_chunk.text, expected_text,
-            "Provided chunk text mismatch - {context}"
-        );
-        // Assert that the expected text also matches the text extracted using the chunk's range.
-        assert_eq!(
-            extracted_text, expected_text,
-            "Range inconsistency: extracted text mismatch - {context}"
-        );
-    }
-
-    // Creates a default RecursiveChunker for testing, assuming no language-specific parsing.
-    fn create_test_chunker<'a>(
-        text: &'a str,
-        chunk_size: usize,
-        min_chunk_size: usize,
-        chunk_overlap: usize,
-    ) -> RecursiveChunker<'a> {
-        RecursiveChunker {
-            full_text: text,
-            chunk_size,
-            chunk_overlap,
-            min_chunk_size,
-        }
-    }
+    use crate::ops::functions::test_utils::test_flow_function;
+    use crate::ops::sdk::{BasicValueType, KeyValue, RangeValue, make_output_type};
 
     #[tokio::test]
     async fn test_split_recursively() {
@@ -1095,161 +1058,424 @@ mod tests {
         );
     }
 
-    #[test]
-    fn test_translate_bytes_to_chars_simple() {
-        let text = "abc😄def";
-        let mut start1 = Position::new(0);
-        let mut end1 = Position::new(3);
-        let mut start2 = Position::new(3);
-        let mut end2 = Position::new(7);
-        let mut start3 = Position::new(7);
-        let mut end3 = Position::new(10);
-        let mut end_full = Position::new(text.len());
-
-        let offsets = vec![
-            &mut start1,
-            &mut end1,
-            &mut start2,
-            &mut end2,
-            &mut start3,
-            &mut end3,
-            &mut end_full,
+    #[tokio::test]
+    async fn test_unicode_character_positioning() {
+        let spec = Spec {
+            custom_languages: vec![],
+        };
+        let factory = Arc::new(Factory);
+
+        let input_arg_schemas = &[
+            (
+                Some("text"),
+                make_output_type(BasicValueType::Str).with_nullable(true),
+            ),
+            (
+                Some("chunk_size"),
+                make_output_type(BasicValueType::Int64).with_nullable(true),
+            ),
+            (
+                Some("min_chunk_size"),
+                make_output_type(BasicValueType::Int64).with_nullable(true),
+            ),
+            (
+                Some("chunk_overlap"),
+                make_output_type(BasicValueType::Int64).with_nullable(true),
+            ),
+            (
+                Some("language"),
+                make_output_type(BasicValueType::Str).with_nullable(true),
+            ),
         ];
 
-        set_output_positions(text, offsets.into_iter());
+        let text = "abc😄def";
 
-        assert_eq!(
-            start1.output,
-            Some(OutputPosition {
-                char_offset: 0,
-                line: 1,
-                column: 1,
-            })
+        let result = test_flow_function(
+            &factory,
+            &spec,
+            input_arg_schemas,
+            vec![
+                text.to_string().into(),
+                10i64.into(),
+                Some(1i64).into(),
+                0i64.into(),
+                Value::Null,
+            ],
+        )
+        .await;
+
+        assert!(
+            result.is_ok(),
+            "test_flow_function failed: {:?}",
+            result.err()
         );
-        assert_eq!(
-            end1.output,
-            Some(OutputPosition {
-                char_offset: 3,
-                line: 1,
-                column: 4,
-            })
-        );
-        assert_eq!(
-            start2.output,
-            Some(OutputPosition {
-                char_offset: 3,
-                line: 1,
-                column: 4,
-            })
-        );
-        assert_eq!(
-            end2.output,
-            Some(OutputPosition {
-                char_offset: 4,
-                line: 1,
-                column: 5,
-            })
-        );
-        assert_eq!(
-            end3.output,
-            Some(OutputPosition {
-                char_offset: 7,
-                line: 1,
-                column: 8,
-            })
-        );
-        assert_eq!(
-            end_full.output,
-            Some(OutputPosition {
-                char_offset: 7,
-                line: 1,
-                column: 8,
-            })
+
+        let value = result.unwrap();
+        match value {
+            Value::KTable(table) => {
+                assert_eq!(table.len(), 1, "Expected single chunk for small text");
+
+                let (_, scope_value_ref) = table.iter().next().unwrap();
+                let fields = &scope_value_ref.0.fields;
+
+                // Check start position (field[1])
+                let start_pos = &fields[1]; // "start" field
+                let start_offset = start_pos.as_struct().unwrap().fields[0].as_int64().unwrap();
+                let start_line = start_pos.as_struct().unwrap().fields[1].as_int64().unwrap();
+                let start_column = start_pos.as_struct().unwrap().fields[2].as_int64().unwrap();
+
+                // Check end position (field[2])
+                let end_pos = &fields[2]; // "end" field
+                let end_offset = end_pos.as_struct().unwrap().fields[0].as_int64().unwrap();
+                let end_line = end_pos.as_struct().unwrap().fields[1].as_int64().unwrap();
+                let end_column = end_pos.as_struct().unwrap().fields[2].as_int64().unwrap();
+
+                // Verify character positions are correct for Unicode
+                assert_eq!(start_offset, 0, "Start character offset should be 0");
+                assert_eq!(start_line, 1, "Start line should be 1");
+                assert_eq!(start_column, 1, "Start column should be 1");
+
+                assert_eq!(
+                    end_offset, 7,
+                    "End character offset should be 7 (abc😄def = 7 chars)"
+                );
+                assert_eq!(end_line, 1, "End line should be 1");
+                assert_eq!(end_column, 8, "End column should be 8 (1-indexed)");
+                assert_eq!(start_line, 1, "Start line should be 1");
+                assert_eq!(start_column, 1, "Start column should be 1");
+
+                assert_eq!(
+                    end_offset, 7,
+                    "End character offset should be 7 (abc😄def = 7 chars)"
+                );
+                assert_eq!(end_line, 1, "End line should be 1");
+                assert_eq!(end_column, 8, "End column should be 8 (1-indexed)");
+            }
+            other => panic!("Expected Value::KTable, got {other:?}"),
+        }
+    }
+
+    // Helper function to extract chunks from KTable and verify them
+    async fn assert_chunks_from_ktable(
+        factory: &Arc<Factory>,
+        spec: &Spec,
+        input_arg_schemas: &[(Option<&str>, EnrichedValueType)],
+        text: &str,
+        chunk_size: i64,
+        min_chunk_size: Option<i64>,
+        chunk_overlap: i64,
+        expected_chunks: Vec<(usize, usize, &str)>, // (start_byte, end_byte, expected_text)
+        context: &str,
+    ) {
+        let result = test_flow_function(
+            factory,
+            spec,
+            input_arg_schemas,
+            vec![
+                text.to_string().into(),
+                chunk_size.into(),
+                min_chunk_size.map(Value::from).unwrap_or(Value::Null),
+                chunk_overlap.into(),
+                Value::Null, // language
+            ],
+        )
+        .await;
+
+        assert!(
+            result.is_ok(),
+            "test_flow_function failed for {}: {:?}",
+            context,
+            result.err()
         );
+
+        let value = result.unwrap();
+        match value {
+            Value::KTable(table) => {
+                assert_eq!(
+                    table.len(),
+                    expected_chunks.len(),
+                    "Chunk count mismatch for {}",
+                    context
+                );
+
+                for (i, (start_byte, end_byte, expected_text)) in expected_chunks.iter().enumerate()
+                {
+                    let range = RangeValue::new(*start_byte, *end_byte);
+                    let key = KeyValue::from_single_part(range);
+                    match table.get(&key) {
+                        Some(scope_value_ref) => {
+                            let chunk_text =
+                                scope_value_ref.0.fields[0].as_str().unwrap_or_else(|_| {
+                                    panic!("Chunk text not a string for key {key:?} in {context}")
+                                });
+                            assert_eq!(
+                                **chunk_text, **expected_text,
+                                "Chunk text mismatch for {}, chunk {}",
+                                context, i
+                            );
+
+                            // Verify range consistency with original text
+                            let extracted_text =
+                                text.get(*start_byte..*end_byte).unwrap_or_else(|| {
+                                    panic!(
+                                        "Invalid range {}-{} for text in {}",
+                                        start_byte, end_byte, context
+                                    )
+                                });
+                            assert_eq!(
+                                extracted_text, *expected_text,
+                                "Range inconsistency for {}, chunk {}",
+                                context, i
+                            );
+                        }
+                        None => panic!(
+                            "Expected row value for key {key:?} in {}, not found",
+                            context
+                        ),
+                    }
+                }
+            }
+            other => panic!("Expected Value::KTable for {}, got {other:?}", context),
+        }
     }
 
-    #[test]
-    fn test_basic_split_no_overlap() {
-        let text = "Linea 1.\nLinea 2.\n\nLinea 3.";
-        let chunker = create_test_chunker(text, 15, 5, 0);
+    #[tokio::test]
+    async fn test_basic_split_no_overlap() {
+        let spec = Spec {
+            custom_languages: vec![],
+        };
+        let factory = Arc::new(Factory);
 
-        let result = chunker.split_root_chunk(ChunkKind::RegexpSepChunk {
-            lang_config: &DEFAULT_LANGUAGE_CONFIG,
-            next_regexp_sep_id: 0,
-        });
+        let input_arg_schemas = &[
+            (
+                Some("text"),
+                make_output_type(BasicValueType::Str).with_nullable(true),
+            ),
+            (
+                Some("chunk_size"),
+                make_output_type(BasicValueType::Int64).with_nullable(true),
+            ),
+            (
+                Some("min_chunk_size"),
+                make_output_type(BasicValueType::Int64).with_nullable(true),
+            ),
+            (
+                Some("chunk_overlap"),
+                make_output_type(BasicValueType::Int64).with_nullable(true),
+            ),
+            (
+                Some("language"),
+                make_output_type(BasicValueType::Str).with_nullable(true),
+            ),
+        ];
 
-        assert!(result.is_ok());
-        let chunks = result.unwrap();
+        // Test 1: Basic split no overlap
+        let text = "Linea 1.\nLinea 2.\n\nLinea 3.";
+        let expected_chunks = vec![
+            (0, 8, "Linea 1."),
+            (9, 17, "Linea 2."),
+            (19, 27, "Linea 3."),
+        ];
 
-        assert_eq!(chunks.len(), 3);
-        assert_chunk_text_consistency(text, &chunks[0], "Linea 1.", "Test 1, Chunk 0");
-        assert_chunk_text_consistency(text, &chunks[1], "Linea 2.", "Test 1, Chunk 1");
-        assert_chunk_text_consistency(text, &chunks[2], "Linea 3.", "Test 1, Chunk 2");
+        assert_chunks_from_ktable(
+            &factory,
+            &spec,
+            input_arg_schemas,
+            text,
+            15,
+            Some(5),
+            0,
+            expected_chunks,
+            "Test 1",
+        )
+        .await;
 
-        // Test splitting when chunk_size forces breaks within segments.
         let text2 = "A very very long text that needs to be split.";
-        let chunker2 = create_test_chunker(text2, 20, 12, 0);
-        let result2 = chunker2.split_root_chunk(ChunkKind::RegexpSepChunk {
-            lang_config: &DEFAULT_LANGUAGE_CONFIG,
-            next_regexp_sep_id: 0,
-        });
+        let expected_chunks2 = vec![
+            (0, 16, "A very very long"),
+            (17, 32, "text that needs"),
+            (33, 45, "to be split."),
+        ];
 
-        assert!(result2.is_ok());
-        let chunks2 = result2.unwrap();
+        assert_chunks_from_ktable(
+            &factory,
+            &spec,
+            input_arg_schemas,
+            text2,
+            20,
+            Some(12),
+            0,
+            expected_chunks2,
+            "Test 2",
+        )
+        .await;
+
+        // Verify that the function produces at least one chunk for test 2
+        let result2 = test_flow_function(
+            &factory,
+            &spec,
+            input_arg_schemas,
+            vec![
+                text2.to_string().into(),
+                20i64.into(),
+                Some(12i64).into(),
+                0i64.into(),
+                Value::Null,
+            ],
+        )
+        .await;
 
-        // Expect multiple chunks, likely split by spaces due to chunk_size.
-        assert!(chunks2.len() > 1);
-        assert_chunk_text_consistency(text2, &chunks2[0], "A very very long", "Test 2, Chunk 0");
-        assert!(chunks2[0].text.len() <= 20);
+        assert!(result2.is_ok());
+        if let Value::KTable(table) = result2.unwrap() {
+            assert!(table.len() > 0, "Expected at least one chunk for test 2");
+            let first_key = table.keys().next().unwrap();
+            let first_chunk = table.get(first_key).unwrap();
+            let chunk_text = first_chunk.0.fields[0].as_str().unwrap();
+            assert!(
+                chunk_text.len() <= 20,
+                "First chunk exceeds chunk_size limit"
+            );
+        }
     }
 
-    #[test]
-    fn test_basic_split_with_overlap() {
-        let text = "This is a test text that is a bit longer to see how the overlap works.";
-        let chunker = create_test_chunker(text, 20, 10, 5);
+    #[tokio::test]
+    async fn test_basic_split_with_overlap() {
+        let spec = Spec {
+            custom_languages: vec![],
+        };
+        let factory = Arc::new(Factory);
 
-        let result = chunker.split_root_chunk(ChunkKind::RegexpSepChunk {
-            lang_config: &DEFAULT_LANGUAGE_CONFIG,
-            next_regexp_sep_id: 0,
-        });
+        let input_arg_schemas = &[
+            (
+                Some("text"),
+                make_output_type(BasicValueType::Str).with_nullable(true),
+            ),
+            (
+                Some("chunk_size"),
+                make_output_type(BasicValueType::Int64).with_nullable(true),
+            ),
+            (
+                Some("min_chunk_size"),
+                make_output_type(BasicValueType::Int64).with_nullable(true),
+            ),
+            (
+                Some("chunk_overlap"),
+                make_output_type(BasicValueType::Int64).with_nullable(true),
+            ),
+            (
+                Some("language"),
+                make_output_type(BasicValueType::Str).with_nullable(true),
+            ),
+        ];
 
-        assert!(result.is_ok());
-        let chunks = result.unwrap();
+        let text = "This is a test text that is a bit longer to see how the overlap works.";
 
-        assert!(chunks.len() > 1);
+        let result = test_flow_function(
+            &factory,
+            &spec,
+            input_arg_schemas,
+            vec![
+                text.to_string().into(),
+                20i64.into(),
+                Some(10i64).into(),
+                5i64.into(),
+                Value::Null,
+            ],
+        )
+        .await;
+
+        assert!(
+            result.is_ok(),
+            "test_flow_function failed: {:?}",
+            result.err()
+        );
 
-        if chunks.len() >= 2 {
-            assert!(chunks[0].text.len() <= 25);
+        let value = result.unwrap();
+        match value {
+            Value::KTable(table) => {
+                assert!(table.len() > 1, "Expected multiple chunks due to overlap");
+
+                // Check that first chunk doesn't exceed expected length (chunk_size + some tolerance)
+                let first_key = table.keys().next().unwrap();
+                let first_chunk = table.get(first_key).unwrap();
+                let chunk_text = first_chunk.0.fields[0].as_str().unwrap();
+                assert!(
+                    chunk_text.len() <= 25,
+                    "First chunk too long: {} chars",
+                    chunk_text.len()
+                );
+            }
+            other => panic!("Expected Value::KTable, got {other:?}"),
         }
     }
 
-    #[test]
-    fn test_split_trims_whitespace() {
-        let text = "  \n First chunk. \n\n  Second chunk with spaces at the end.   \n";
-        let chunker = create_test_chunker(text, 30, 10, 0);
-
-        let result = chunker.split_root_chunk(ChunkKind::RegexpSepChunk {
-            lang_config: &DEFAULT_LANGUAGE_CONFIG,
-            next_regexp_sep_id: 0,
-        });
+    #[tokio::test]
+    async fn test_split_trims_whitespace() {
+        let spec = Spec {
+            custom_languages: vec![],
+        };
+        let factory = Arc::new(Factory);
 
-        assert!(result.is_ok());
-        let chunks = result.unwrap();
+        let input_arg_schemas = &[
+            (
+                Some("text"),
+                make_output_type(BasicValueType::Str).with_nullable(true),
+            ),
+            (
+                Some("chunk_size"),
+                make_output_type(BasicValueType::Int64).with_nullable(true),
+            ),
+            (
+                Some("min_chunk_size"),
+                make_output_type(BasicValueType::Int64).with_nullable(true),
+            ),
+            (
+                Some("chunk_overlap"),
+                make_output_type(BasicValueType::Int64).with_nullable(true),
+            ),
+            (
+                Some("language"),
+                make_output_type(BasicValueType::Str).with_nullable(true),
+            ),
+        ];
 
-        assert_eq!(chunks.len(), 3);
+        let text = "  \n First chunk. \n\n  Second chunk with spaces at the end.   \n";
 
-        assert_chunk_text_consistency(
-            text,
-            &chunks[0],
-            " First chunk.",
-            "Whitespace Test, Chunk 0",
+        // Just verify we get 3 chunks and test the first one
+        let result = test_flow_function(
+            &factory,
+            &spec,
+            input_arg_schemas,
+            vec![
+                text.to_string().into(),
+                30i64.into(),
+                Some(10i64).into(),
+                0i64.into(),
+                Value::Null,
+            ],
+        )
+        .await;
+
+        assert!(
+            result.is_ok(),
+            "test_flow_function failed: {:?}",
+            result.err()
         );
-        assert_chunk_text_consistency(
-            text,
-            &chunks[1],
-            "  Second chunk with spaces",
-            "Whitespace Test, Chunk 1",
-        );
-        assert_chunk_text_consistency(text, &chunks[2], "at the end.", "Whitespace Test, Chunk 2");
+
+        if let Value::KTable(table) = result.unwrap() {
+            assert_eq!(table.len(), 3, "Expected 3 chunks for whitespace test");
+
+            let first_range = RangeValue::new(3, 16);
+            let first_key = KeyValue::from_single_part(first_range);
+            let first_chunk = table.get(&first_key).expect("First chunk should exist");
+            let first_text = first_chunk.0.fields[0].as_str().unwrap();
+            assert_eq!(
+                first_text.as_ref(),
+                " First chunk.",
+                "First chunk text mismatch"
+            );
+        } else {
+            panic!("Expected KTable");
+        }
     }
 }

From 17014c850b1fbec322434d7ddfb912ce4ca2d260 Mon Sep 17 00:00:00 2001
From: DavdaJames <jrdavda007@gmail.com>
Date: Thu, 2 Oct 2025 00:08:26 +0530
Subject: [PATCH 2/2] extract the input-schema construction into a reusable
 helper function and refactor the split_recursively tests

---
 src/ops/functions/split_recursively.rs | 634 ++++++++++---------------
 1 file changed, 251 insertions(+), 383 deletions(-)

diff --git a/src/ops/functions/split_recursively.rs b/src/ops/functions/split_recursively.rs
index 6a6c704b5..dc4eac178 100644
--- a/src/ops/functions/split_recursively.rs
+++ b/src/ops/functions/split_recursively.rs
@@ -941,16 +941,11 @@ mod tests {
     use super::*;
     use crate::ops::functions::test_utils::test_flow_function;
     use crate::ops::sdk::{BasicValueType, KeyValue, RangeValue, make_output_type};
+    use crate::ops::shared::split::OutputPosition;
 
-    #[tokio::test]
-    async fn test_split_recursively() {
-        let spec = Spec {
-            custom_languages: vec![],
-        };
-        let factory = Arc::new(Factory);
-        let text_content = "Linea 1.\nLinea 2.\n\nLinea 3.";
-
-        let input_arg_schemas = &[
+    // Helper function to build the standard input argument schemas for split_recursively tests
+    fn build_split_recursively_arg_schemas() -> Vec<(Option<&'static str>, EnrichedValueType)> {
+        vec![
             (
                 Some("text"),
                 make_output_type(BasicValueType::Str).with_nullable(true),
@@ -971,7 +966,17 @@ mod tests {
                 Some("language"),
                 make_output_type(BasicValueType::Str).with_nullable(true),
             ),
-        ];
+        ]
+    }
+
+    #[tokio::test]
+    async fn test_split_recursively() {
+        let spec = Spec {
+            custom_languages: vec![],
+        };
+        let factory = Arc::new(Factory);
+        let text_content = "Linea 1.\nLinea 2.\n\nLinea 3.";
+        let input_arg_schemas = &build_split_recursively_arg_schemas();
 
         {
             let result = test_flow_function(
@@ -1058,283 +1063,176 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_translate_bytes_to_chars_simple() {
+        let text = "abc😄def";
+        let mut start1 = Position::new(0);
+        let mut end1 = Position::new(3);
+        let mut start2 = Position::new(3);
+        let mut end2 = Position::new(7);
+        let mut start3 = Position::new(7);
+        let mut end3 = Position::new(10);
+        let mut end_full = Position::new(text.len());
+
+        let offsets = vec![
+            &mut start1,
+            &mut end1,
+            &mut start2,
+            &mut end2,
+            &mut start3,
+            &mut end3,
+            &mut end_full,
+        ];
+
+        set_output_positions(text, offsets.into_iter());
+
+        assert_eq!(
+            start1.output,
+            Some(OutputPosition {
+                char_offset: 0,
+                line: 1,
+                column: 1,
+            })
+        );
+        assert_eq!(
+            end1.output,
+            Some(OutputPosition {
+                char_offset: 3,
+                line: 1,
+                column: 4,
+            })
+        );
+        assert_eq!(
+            start2.output,
+            Some(OutputPosition {
+                char_offset: 3,
+                line: 1,
+                column: 4,
+            })
+        );
+        assert_eq!(
+            end2.output,
+            Some(OutputPosition {
+                char_offset: 4,
+                line: 1,
+                column: 5,
+            })
+        );
+        assert_eq!(
+            end3.output,
+            Some(OutputPosition {
+                char_offset: 7,
+                line: 1,
+                column: 8,
+            })
+        );
+        assert_eq!(
+            end_full.output,
+            Some(OutputPosition {
+                char_offset: 7,
+                line: 1,
+                column: 8,
+            })
+        );
+    }
+
     #[tokio::test]
-    async fn test_unicode_character_positioning() {
+    async fn test_basic_split_no_overlap() {
         let spec = Spec {
             custom_languages: vec![],
         };
         let factory = Arc::new(Factory);
+        let text = "Linea 1.\nLinea 2.\n\nLinea 3.";
+        let input_arg_schemas = &build_split_recursively_arg_schemas();
 
-        let input_arg_schemas = &[
-            (
-                Some("text"),
-                make_output_type(BasicValueType::Str).with_nullable(true),
-            ),
-            (
-                Some("chunk_size"),
-                make_output_type(BasicValueType::Int64).with_nullable(true),
-            ),
-            (
-                Some("min_chunk_size"),
-                make_output_type(BasicValueType::Int64).with_nullable(true),
-            ),
-            (
-                Some("chunk_overlap"),
-                make_output_type(BasicValueType::Int64).with_nullable(true),
-            ),
-            (
-                Some("language"),
-                make_output_type(BasicValueType::Str).with_nullable(true),
-            ),
-        ];
-
-        let text = "abc😄def";
-
-        let result = test_flow_function(
-            &factory,
-            &spec,
-            input_arg_schemas,
-            vec![
-                text.to_string().into(),
-                10i64.into(),
-                Some(1i64).into(),
-                0i64.into(),
-                Value::Null,
-            ],
-        )
-        .await;
-
-        assert!(
-            result.is_ok(),
-            "test_flow_function failed: {:?}",
-            result.err()
-        );
+        {
+            let result = test_flow_function(
+                &factory,
+                &spec,
+                input_arg_schemas,
+                vec![
+                    text.to_string().into(),
+                    (15i64).into(),
+                    (5i64).into(),
+                    (0i64).into(),
+                    Value::Null,
+                ],
+            )
+            .await;
+            assert!(
+                result.is_ok(),
+                "test_flow_function failed: {:?}",
+                result.err()
+            );
+            let value = result.unwrap();
+            match value {
+                Value::KTable(table) => {
+                    let expected_chunks = vec![
+                        (RangeValue::new(0, 8), "Linea 1."),
+                        (RangeValue::new(9, 17), "Linea 2."),
+                        (RangeValue::new(19, 27), "Linea 3."),
+                    ];
 
-        let value = result.unwrap();
-        match value {
-            Value::KTable(table) => {
-                assert_eq!(table.len(), 1, "Expected single chunk for small text");
-
-                let (_, scope_value_ref) = table.iter().next().unwrap();
-                let fields = &scope_value_ref.0.fields;
-
-                // Check start position (field[1])
-                let start_pos = &fields[1]; // "start" field
-                let start_offset = start_pos.as_struct().unwrap().fields[0].as_int64().unwrap();
-                let start_line = start_pos.as_struct().unwrap().fields[1].as_int64().unwrap();
-                let start_column = start_pos.as_struct().unwrap().fields[2].as_int64().unwrap();
-
-                // Check end position (field[2])
-                let end_pos = &fields[2]; // "end" field
-                let end_offset = end_pos.as_struct().unwrap().fields[0].as_int64().unwrap();
-                let end_line = end_pos.as_struct().unwrap().fields[1].as_int64().unwrap();
-                let end_column = end_pos.as_struct().unwrap().fields[2].as_int64().unwrap();
-
-                // Verify character positions are correct for Unicode
-                assert_eq!(start_offset, 0, "Start character offset should be 0");
-                assert_eq!(start_line, 1, "Start line should be 1");
-                assert_eq!(start_column, 1, "Start column should be 1");
-
-                assert_eq!(
-                    end_offset, 7,
-                    "End character offset should be 7 (abc😄def = 7 chars)"
-                );
-                assert_eq!(end_line, 1, "End line should be 1");
-                assert_eq!(end_column, 8, "End column should be 8 (1-indexed)");
-                assert_eq!(start_line, 1, "Start line should be 1");
-                assert_eq!(start_column, 1, "Start column should be 1");
-
-                assert_eq!(
-                    end_offset, 7,
-                    "End character offset should be 7 (abc😄def = 7 chars)"
-                );
-                assert_eq!(end_line, 1, "End line should be 1");
-                assert_eq!(end_column, 8, "End column should be 8 (1-indexed)");
+                    for (range, expected_text) in expected_chunks {
+                        let key = KeyValue::from_single_part(range);
+                        match table.get(&key) {
+                            Some(scope_value_ref) => {
+                                let chunk_text =
+                                    scope_value_ref.0.fields[0].as_str().unwrap_or_else(|_| {
+                                        panic!("Chunk text not a string for key {key:?}")
+                                    });
+                                assert_eq!(**chunk_text, *expected_text);
+                            }
+                            None => panic!("Expected row value for key {key:?}, not found"),
+                        }
+                    }
+                }
+                other => panic!("Expected Value::KTable, got {other:?}"),
             }
-            other => panic!("Expected Value::KTable, got {other:?}"),
         }
-    }
-
-    // Helper function to extract chunks from KTable and verify them
-    async fn assert_chunks_from_ktable(
-        factory: &Arc<Factory>,
-        spec: &Spec,
-        input_arg_schemas: &[(Option<&str>, EnrichedValueType)],
-        text: &str,
-        chunk_size: i64,
-        min_chunk_size: Option<i64>,
-        chunk_overlap: i64,
-        expected_chunks: Vec<(usize, usize, &str)>, // (start_byte, end_byte, expected_text)
-        context: &str,
-    ) {
-        let result = test_flow_function(
-            factory,
-            spec,
-            input_arg_schemas,
-            vec![
-                text.to_string().into(),
-                chunk_size.into(),
-                min_chunk_size.map(Value::from).unwrap_or(Value::Null),
-                chunk_overlap.into(),
-                Value::Null, // language
-            ],
-        )
-        .await;
-
-        assert!(
-            result.is_ok(),
-            "test_flow_function failed for {}: {:?}",
-            context,
-            result.err()
-        );
 
-        let value = result.unwrap();
-        match value {
-            Value::KTable(table) => {
-                assert_eq!(
-                    table.len(),
-                    expected_chunks.len(),
-                    "Chunk count mismatch for {}",
-                    context
-                );
+        // Test splitting when chunk_size forces breaks within segments.
+        let text2 = "A very very long text that needs to be split.";
+        {
+            let result = test_flow_function(
+                &factory,
+                &spec,
+                input_arg_schemas,
+                vec![
+                    text2.to_string().into(),
+                    (20i64).into(),
+                    (12i64).into(),
+                    (0i64).into(),
+                    Value::Null,
+                ],
+            )
+            .await;
+            assert!(
+                result.is_ok(),
+                "test_flow_function failed: {:?}",
+                result.err()
+            );
+            let value = result.unwrap();
+            match value {
+                Value::KTable(table) => {
+                    // Input (45 bytes) exceeds chunk_size (20), so expect more than one chunk.
+                    assert!(table.len() > 1);
 
-                for (i, (start_byte, end_byte, expected_text)) in expected_chunks.iter().enumerate()
-                {
-                    let range = RangeValue::new(*start_byte, *end_byte);
-                    let key = KeyValue::from_single_part(range);
+                    let key = KeyValue::from_single_part(RangeValue::new(0, 16));
                     match table.get(&key) {
                         Some(scope_value_ref) => {
                             let chunk_text =
                                 scope_value_ref.0.fields[0].as_str().unwrap_or_else(|_| {
-                                    panic!("Chunk text not a string for key {key:?} in {context}")
-                                });
-                            assert_eq!(
-                                **chunk_text, **expected_text,
-                                "Chunk text mismatch for {}, chunk {}",
-                                context, i
-                            );
-
-                            // Verify range consistency with original text
-                            let extracted_text =
-                                text.get(*start_byte..*end_byte).unwrap_or_else(|| {
-                                    panic!(
-                                        "Invalid range {}-{} for text in {}",
-                                        start_byte, end_byte, context
-                                    )
+                                    panic!("Chunk text not a string for key {key:?}")
                                 });
-                            assert_eq!(
-                                extracted_text, *expected_text,
-                                "Range inconsistency for {}, chunk {}",
-                                context, i
-                            );
+                            assert_eq!(&**chunk_text, "A very very long");
+                            assert!(chunk_text.len() <= 20);
                         }
-                        None => panic!(
-                            "Expected row value for key {key:?} in {}, not found",
-                            context
-                        ),
+                        None => panic!("Expected row value for key {key:?}, not found"),
                     }
                 }
+                other => panic!("Expected Value::KTable, got {other:?}"),
             }
-            other => panic!("Expected Value::KTable for {}, got {other:?}", context),
-        }
-    }
-
-    #[tokio::test]
-    async fn test_basic_split_no_overlap() {
-        let spec = Spec {
-            custom_languages: vec![],
-        };
-        let factory = Arc::new(Factory);
-
-        let input_arg_schemas = &[
-            (
-                Some("text"),
-                make_output_type(BasicValueType::Str).with_nullable(true),
-            ),
-            (
-                Some("chunk_size"),
-                make_output_type(BasicValueType::Int64).with_nullable(true),
-            ),
-            (
-                Some("min_chunk_size"),
-                make_output_type(BasicValueType::Int64).with_nullable(true),
-            ),
-            (
-                Some("chunk_overlap"),
-                make_output_type(BasicValueType::Int64).with_nullable(true),
-            ),
-            (
-                Some("language"),
-                make_output_type(BasicValueType::Str).with_nullable(true),
-            ),
-        ];
-
-        // Test 1: Basic split no overlap
-        let text = "Linea 1.\nLinea 2.\n\nLinea 3.";
-        let expected_chunks = vec![
-            (0, 8, "Linea 1."),
-            (9, 17, "Linea 2."),
-            (19, 27, "Linea 3."),
-        ];
-
-        assert_chunks_from_ktable(
-            &factory,
-            &spec,
-            input_arg_schemas,
-            text,
-            15,
-            Some(5),
-            0,
-            expected_chunks,
-            "Test 1",
-        )
-        .await;
-
-        let text2 = "A very very long text that needs to be split.";
-        let expected_chunks2 = vec![
-            (0, 16, "A very very long"),
-            (17, 32, "text that needs"),
-            (33, 45, "to be split."),
-        ];
-
-        assert_chunks_from_ktable(
-            &factory,
-            &spec,
-            input_arg_schemas,
-            text2,
-            20,
-            Some(12),
-            0,
-            expected_chunks2,
-            "Test 2",
-        )
-        .await;
-
-        // Verify that the function produces at least one chunk for test 2
-        let result2 = test_flow_function(
-            &factory,
-            &spec,
-            input_arg_schemas,
-            vec![
-                text2.to_string().into(),
-                20i64.into(),
-                Some(12i64).into(),
-                0i64.into(),
-                Value::Null,
-            ],
-        )
-        .await;
-
-        assert!(result2.is_ok());
-        if let Value::KTable(table) = result2.unwrap() {
-            assert!(table.len() > 0, "Expected at least one chunk for test 2");
-            let first_key = table.keys().next().unwrap();
-            let first_chunk = table.get(first_key).unwrap();
-            let chunk_text = first_chunk.0.fields[0].as_str().unwrap();
-            assert!(
-                chunk_text.len() <= 20,
-                "First chunk exceeds chunk_size limit"
-            );
         }
     }
 
@@ -1344,68 +1242,50 @@ mod tests {
             custom_languages: vec![],
         };
         let factory = Arc::new(Factory);
-
-        let input_arg_schemas = &[
-            (
-                Some("text"),
-                make_output_type(BasicValueType::Str).with_nullable(true),
-            ),
-            (
-                Some("chunk_size"),
-                make_output_type(BasicValueType::Int64).with_nullable(true),
-            ),
-            (
-                Some("min_chunk_size"),
-                make_output_type(BasicValueType::Int64).with_nullable(true),
-            ),
-            (
-                Some("chunk_overlap"),
-                make_output_type(BasicValueType::Int64).with_nullable(true),
-            ),
-            (
-                Some("language"),
-                make_output_type(BasicValueType::Str).with_nullable(true),
-            ),
-        ];
-
         let text = "This is a test text that is a bit longer to see how the overlap works.";
+        let input_arg_schemas = &build_split_recursively_arg_schemas();
 
-        let result = test_flow_function(
-            &factory,
-            &spec,
-            input_arg_schemas,
-            vec![
-                text.to_string().into(),
-                20i64.into(),
-                Some(10i64).into(),
-                5i64.into(),
-                Value::Null,
-            ],
-        )
-        .await;
-
-        assert!(
-            result.is_ok(),
-            "test_flow_function failed: {:?}",
-            result.err()
-        );
+        {
+            let result = test_flow_function(
+                &factory,
+                &spec,
+                input_arg_schemas,
+                vec![
+                    text.to_string().into(),
+                    (20i64).into(),
+                    (10i64).into(),
+                    (5i64).into(),
+                    Value::Null,
+                ],
+            )
+            .await;
+            assert!(
+                result.is_ok(),
+                "test_flow_function failed: {:?}",
+                result.err()
+            );
+            let value = result.unwrap();
+            match value {
+                Value::KTable(table) => {
+                    assert!(table.len() > 1);
 
-        let value = result.unwrap();
-        match value {
-            Value::KTable(table) => {
-                assert!(table.len() > 1, "Expected multiple chunks due to overlap");
-
-                // Check that first chunk doesn't exceed expected length (chunk_size + some tolerance)
-                let first_key = table.keys().next().unwrap();
-                let first_chunk = table.get(first_key).unwrap();
-                let chunk_text = first_chunk.0.fields[0].as_str().unwrap();
-                assert!(
-                    chunk_text.len() <= 25,
-                    "First chunk too long: {} chars",
-                    chunk_text.len()
-                );
+                    // Bound the first chunk's length: chunk_size (20) plus some tolerance.
+                    if table.len() >= 2 {
+                        let first_key = table.keys().next().unwrap();
+                        match table.get(first_key) {
+                            Some(scope_value_ref) => {
+                                let chunk_text =
+                                    scope_value_ref.0.fields[0].as_str().unwrap_or_else(|_| {
+                                        panic!("Chunk text not a string for key {first_key:?}")
+                                    });
+                                assert!(chunk_text.len() <= 25);
+                            }
+                            None => panic!("Expected row value for first key, not found"),
+                        }
+                    }
+                }
+                other => panic!("Expected Value::KTable, got {other:?}"),
             }
-            other => panic!("Expected Value::KTable, got {other:?}"),
         }
     }
 
@@ -1415,67 +1295,55 @@ mod tests {
             custom_languages: vec![],
         };
         let factory = Arc::new(Factory);
-
-        let input_arg_schemas = &[
-            (
-                Some("text"),
-                make_output_type(BasicValueType::Str).with_nullable(true),
-            ),
-            (
-                Some("chunk_size"),
-                make_output_type(BasicValueType::Int64).with_nullable(true),
-            ),
-            (
-                Some("min_chunk_size"),
-                make_output_type(BasicValueType::Int64).with_nullable(true),
-            ),
-            (
-                Some("chunk_overlap"),
-                make_output_type(BasicValueType::Int64).with_nullable(true),
-            ),
-            (
-                Some("language"),
-                make_output_type(BasicValueType::Str).with_nullable(true),
-            ),
-        ];
-
         let text = "  \n First chunk. \n\n  Second chunk with spaces at the end.   \n";
+        let input_arg_schemas = &build_split_recursively_arg_schemas();
 
-        // Just verify we get 3 chunks and test the first one
-        let result = test_flow_function(
-            &factory,
-            &spec,
-            input_arg_schemas,
-            vec![
-                text.to_string().into(),
-                30i64.into(),
-                Some(10i64).into(),
-                0i64.into(),
-                Value::Null,
-            ],
-        )
-        .await;
-
-        assert!(
-            result.is_ok(),
-            "test_flow_function failed: {:?}",
-            result.err()
-        );
-
-        if let Value::KTable(table) = result.unwrap() {
-            assert_eq!(table.len(), 3, "Expected 3 chunks for whitespace test");
-
-            let first_range = RangeValue::new(3, 16);
-            let first_key = KeyValue::from_single_part(first_range);
-            let first_chunk = table.get(&first_key).expect("First chunk should exist");
-            let first_text = first_chunk.0.fields[0].as_str().unwrap();
-            assert_eq!(
-                first_text.as_ref(),
-                " First chunk.",
-                "First chunk text mismatch"
+        {
+            let result = test_flow_function(
+                &factory,
+                &spec,
+                input_arg_schemas,
+                vec![
+                    text.to_string().into(),
+                    (30i64).into(),
+                    (10i64).into(),
+                    (0i64).into(),
+                    Value::Null,
+                ],
+            )
+            .await;
+            assert!(
+                result.is_ok(),
+                "test_flow_function failed: {:?}",
+                result.err()
             );
-        } else {
-            panic!("Expected KTable");
+            let value = result.unwrap();
+            match value {
+                Value::KTable(table) => {
+                    assert_eq!(table.len(), 3);
+
+                    let expected_chunks = vec![
+                        (RangeValue::new(3, 16), " First chunk."),
+                        (RangeValue::new(19, 45), "  Second chunk with spaces"),
+                        (RangeValue::new(46, 57), "at the end."),
+                    ];
+
+                    for (range, expected_text) in expected_chunks {
+                        let key = KeyValue::from_single_part(range);
+                        match table.get(&key) {
+                            Some(scope_value_ref) => {
+                                let chunk_text =
+                                    scope_value_ref.0.fields[0].as_str().unwrap_or_else(|_| {
+                                        panic!("Chunk text not a string for key {key:?}")
+                                    });
+                                assert_eq!(**chunk_text, *expected_text);
+                            }
+                            None => panic!("Expected row value for key {key:?}, not found"),
+                        }
+                    }
+                }
+                other => panic!("Expected Value::KTable, got {other:?}"),
+            }
         }
     }
 }