Skip to content

Commit 012b3e7

Browse files
author
Daniele Briggi
committed
refactor(tests): simplified sentence splitter
1 parent fa06d14 commit 012b3e7

File tree

4 files changed

+52
-27
lines changed

4 files changed

+52
-27
lines changed

src/sqlite_rag/sentence_splitter.py

Lines changed: 36 additions & 15 deletions

@@ -10,27 +10,48 @@ class SentenceSplitter:

     def split(self, chunk: Chunk) -> List[Sentence]:
         """Split chunk into sentences."""
-        sentences = []
-
-        sentences_text = self._split_into_sentences(chunk.content)
-        start_offset = 0
-        end_offset = 0
-        for sentence_text in sentences_text:
-            start_offset = chunk.content.index(sentence_text, end_offset)
-            end_offset = start_offset + len(sentence_text)
-
-            sentence = Sentence(
-                content=sentence_text,
-                start_offset=start_offset,
-                end_offset=end_offset,
-            )
-            sentences.append(sentence)
+        # Split on: sentence endings, semicolons, or paragraph breaks
+        sentence_regex = re.compile(r'(?<=[.!?;])(?:"|\')?\s+(?=[A-Z])|[\n]{2,}')
+
+        sentences = []
+        last_end = 0
+        text = chunk.content
+
+        for match in sentence_regex.finditer(text):
+            segment = text[last_end : match.end()]
+
+            segment = segment.strip()
+            if len(segment) > self.MIN_CHARS_PER_SENTENCE:
+                sentences.append(
+                    Sentence(
+                        content=segment,
+                        start_offset=last_end,
+                        end_offset=last_end + len(segment),
+                    )
+                )
+
+            # Position after the current match
+            last_end = match.end()
+
+        # Last segment
+        if last_end < len(text):
+            segment = text[last_end:]
+
+            segment = segment.strip()
+            if len(segment) > self.MIN_CHARS_PER_SENTENCE:
+                sentences.append(
+                    Sentence(
+                        content=segment,
+                        start_offset=last_end,
+                        end_offset=last_end + len(segment),
+                    )
+                )

         return sentences

     def _split_into_sentences(self, text: str) -> List[str]:
         """Split into focused segments for semantic matching."""
-        # Split on: sentence endings, semicolons, or paragraph breaks
+
         sentence_endings = re.compile(r'(?<=[.!?;])(?:"|\')?\s+(?=[A-Z])|[\n]{2,}')
         sentences = sentence_endings.split(text)

tests/integration/test_engine.py

Lines changed: 4 additions & 4 deletions

@@ -13,7 +13,7 @@

 class TestEngine:
     @pytest.mark.slow
-    def test_stress_embedding_generation(self, engine):
+    def test_stress_embedding_generation(self, engine: Engine):
         """Test embedding generation with a large number of chunks
         to not fail and to never generate duplicated embeddings."""

@@ -36,7 +36,7 @@ def random_string(length=30):


 class TestEngineQuantization:
-    def test_quantize_embedding(self, engine):
+    def test_quantize_embedding(self, engine: Engine):
         """Test quantize called for chunks and sentences embeddings."""
         engine.quantize()

@@ -54,8 +54,8 @@ def test_quantize_cleanup(self, engine):


 class TestEngineSearch:
-    def test_search_with_empty_database(self, engine):
-        results = engine.search("nonexistent query", top_k=5)
+    def test_search_with_empty_database(self, engine: Engine):
+        results = engine.search("nonexistent query", "nonexistent query", top_k=5)

         assert len(results) == 0

tests/test_sentence_splitter.py

Lines changed: 0 additions & 4 deletions

@@ -20,17 +20,14 @@ def test_split(self):

         assert len(sentences) == 3
         assert sentences[0].content == "This is the first sentence."
-        assert sentences[0].sequence == 0
         assert sentences[0].start_offset == 0
         assert sentences[0].end_offset == 27

         assert sentences[1].content == "Here is the second sentence!"
-        assert sentences[1].sequence == 1
         assert sentences[1].start_offset == 28
         assert sentences[1].end_offset == 28 + 28

         assert sentences[2].content == "And what about the third?"
-        assert sentences[2].sequence == 2
         assert sentences[2].start_offset == 57
         assert sentences[2].end_offset == 57 + 25

@@ -66,6 +63,5 @@ def test_split_no_punctuation(self):

         assert len(sentences) == 1
         assert sentences[0].content == chunk.content
-        assert sentences[0].sequence == 0
         assert sentences[0].start_offset == 0
         assert sentences[0].end_offset == len(chunk.content)

tests/test_sqlite_rag.py

Lines changed: 12 additions & 4 deletions

@@ -838,10 +838,14 @@ def test_search_uses_retrieval_query_template(self, mocker):
         rag.search(query)

         # Assert that engine.search was called with the formatted template
-        expected_query = rag._settings.prompt_template_retrieval_query.format(
+        expected_semantic_query = rag._settings.prompt_template_retrieval_query.format(
             content=query
         )
-        mock_engine.search.assert_called_once_with(expected_query, top_k=10)
+        expected_fts_query = query + "*"
+
+        mock_engine.search.assert_called_once_with(
+            expected_semantic_query, expected_fts_query, top_k=10
+        )

     @pytest.mark.parametrize("use_prompt_templates", [True, False])
     def test_search_with_prompt_template(self, mocker, use_prompt_templates):
@@ -865,9 +869,13 @@ def test_search_with_prompt_template(self, mocker, use_prompt_templates):
         rag.search("test query", new_context=False)

         # Assert - verify engine.search was called with correct formatted query
-        expected_query = (
+        expected_semantic_query = (
             "task: search result | query: test query"
             if use_prompt_templates
             else "test query"
         )
-        mock_engine.search.assert_called_once_with(expected_query, top_k=10)
+        expected_fts_query = "test query*"
+
+        mock_engine.search.assert_called_once_with(
+            expected_semantic_query, expected_fts_query, top_k=10
+        )

0 commit comments

Comments (0)