Add unit tests

NathalieCharbel · NathalieCharbel · commit 2d3b4fd6f191 · 2025-01-13T21:31:57.000+02:00
diff --git a/tests/unit/experimental/components/text_splitters/test_fixed_size_splitter.py b/tests/unit/experimental/components/text_splitters/test_fixed_size_splitter.py
@@ -16,7 +16,7 @@
 
 import pytest
 from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import (
-    FixedSizeSplitter,
+    FixedSizeSplitter, _adjust_chunk_start, _adjust_chunk_end,
 )
 from neo4j_graphrag.experimental.components.types import TextChunk
 
@@ -78,3 +78,139 @@ def test_invalid_chunk_overlap() -> None:
     with pytest.raises(ValueError) as excinfo:
         FixedSizeSplitter(5, 5)
     assert "chunk_overlap must be strictly less than chunk_size" in str(excinfo)
+
+
+def test_invalid_chunk_size() -> None:
+    with pytest.raises(ValueError) as excinfo:
+        FixedSizeSplitter(0, 0)
+    assert "chunk_size must be strictly greater than 0" in str(excinfo)
+
+
+@pytest.mark.parametrize(
+    "text, approximate_start, expected_start",
+    [
+        # Case: approximate_start is at word boundary already
+        ("Hello World", 6, 6),
+        # Case: approximate_start is at a whitespace already
+        ("Hello World", 5, 5),
+        # Case: approximate_start is at the middle of word and no whitespace is found
+        ("Hello World", 2, 2),
+        # Case: approximate_start is at the middle of a word
+        ("Hello World", 8, 6),
+        # Case: approximate_start = 0
+        ("Hello World", 0, 0),
+    ],
+)
+def test_adjust_chunk_start(text, approximate_start, expected_start):
+    """
+    Test that the _adjust_chunk_start function correctly shifts
+    the start index to avoid breaking words, unless no whitespace is found.
+    """
+    result = _adjust_chunk_start(text, approximate_start)
+    assert result == expected_start
+
+
+@pytest.mark.parametrize(
+    "text, start, approximate_end, expected_end",
+    [
+        # Case: approximate_end is at word boundary already
+        ("Hello World", 0, 5, 5),
+        # Case: approximate_end is at the middle of a word
+        ("Hello World", 0, 8, 6),
+        # Case: approximate_end is at the middle of word and no whitespace is found
+        ("Hello World", 0, 3, 3),
+        # Case: adjusted_end == start => fallback to approximate_end
+        ("Hello World", 6, 7, 7),
+        # Case: end>=len(text)
+        ("Hello World", 6, 15, 15),
+    ],
+)
+def test_adjust_chunk_end(text, start, approximate_end, expected_end):
+    """
+    Test that the _adjust_chunk_end function correctly shifts
+    the end index to avoid breaking words, unless no whitespace is found.
+    """
+    result = _adjust_chunk_end(text, start, approximate_end)
+    assert result == expected_end
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "text, chunk_size, chunk_overlap, approximate, expected_chunks",
+    [
+        # Case: approximate fixed size splitting
+        (
+            "Hello World, this is a test message.",
+            10,
+            2,
+            True,
+            [
+                "Hello ",
+                "World, ",
+                "this is a ",
+                "a test ",
+                "message."
+            ],
+        ),
+        # Case: fixed size splitting
+        (
+            "Hello World, this is a test message.",
+            10,
+            2,
+            False,
+            [
+                "Hello Worl",
+                "rld, this ",
+                "s is a tes",
+                "est messag",
+                "age."
+            ],
+        ),
+        # Case: short text => only one chunk
+        (
+            "Short text",
+            20,
+            5,
+            True,
+            ["Short text"],
+        ),
+        # Case: short text => only one chunk
+        (
+            "Short text",
+            12,
+            4,
+            True,
+            ["Short text"],
+        ),
+        # Case: text with no spaces
+        (
+            "1234567890",
+            5,
+            1,
+            True,
+            ["12345", "56789", "90"],
+        ),
+    ],
+)
+async def test_fixed_size_splitter_run(
+    text, chunk_size, chunk_overlap, approximate, expected_chunks
+):
+    """
+    Test that 'FixedSizeSplitter.run' returns the expected chunks
+    for different configurations.
+    """
+    splitter = FixedSizeSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        approximate=approximate,
+    )
+    text_chunks = await splitter.run(text)
+
+    # Verify number of chunks
+    assert len(text_chunks.chunks) == len(expected_chunks)
+
+    # Verify content of each chunk
+    for i, expected_text in enumerate(expected_chunks):
+        assert text_chunks.chunks[i].text == expected_text
+        assert isinstance(text_chunks.chunks[i], TextChunk)
+        assert text_chunks.chunks[i].index == i