Skip to content

Commit 012b3e7

Browse files
author
Daniele Briggi
committed
refactor(tests): simplified sentence splitter
1 parent fa06d14 commit 012b3e7

File tree

4 files changed

+52
-27
lines changed

4 files changed

+52
-27
lines changed

src/sqlite_rag/sentence_splitter.py

Lines changed: 36 additions & 15 deletions

@@ -10,27 +10,48 @@ class SentenceSplitter:

     def split(self, chunk: Chunk) -> List[Sentence]:
         """Split chunk into sentences."""
-        sentences = []
-
-        sentences_text = self._split_into_sentences(chunk.content)
-        start_offset = 0
-        end_offset = 0
-        for sentence_text in sentences_text:
-            start_offset = chunk.content.index(sentence_text, end_offset)
-            end_offset = start_offset + len(sentence_text)
-
-            sentence = Sentence(
-                content=sentence_text,
-                start_offset=start_offset,
-                end_offset=end_offset,
-            )
-            sentences.append(sentence)
+        # Split on: sentence endings, semicolons, or paragraph breaks
+        sentence_regex = re.compile(r'(?<=[.!?;])(?:"|\')?\s+(?=[A-Z])|[\n]{2,}')
+
+        sentences = []
+        last_end = 0
+        text = chunk.content
+
+        for match in sentence_regex.finditer(text):
+            segment = text[last_end : match.end()]
+
+            segment = segment.strip()
+            if len(segment) > self.MIN_CHARS_PER_SENTENCE:
+                sentences.append(
+                    Sentence(
+                        content=segment,
+                        start_offset=last_end,
+                        end_offset=last_end + len(segment),
+                    )
+                )
+
+            # Position after the current match
+            last_end = match.end()
+
+        # Last segment
+        if last_end < len(text):
+            segment = text[last_end:]
+
+            segment = segment.strip()
+            if len(segment) > self.MIN_CHARS_PER_SENTENCE:
+                sentences.append(
+                    Sentence(
+                        content=segment,
+                        start_offset=last_end,
+                        end_offset=last_end + len(segment),
+                    )
+                )

         return sentences

     def _split_into_sentences(self, text: str) -> List[str]:
         """Split into focused segments for semantic matching."""
-        # Split on: sentence endings, semicolons, or paragraph breaks
+
         sentence_endings = re.compile(r'(?<=[.!?;])(?:"|\')?\s+(?=[A-Z])|[\n]{2,}')
         sentences = sentence_endings.split(text)

tests/integration/test_engine.py

Lines changed: 4 additions & 4 deletions

@@ -13,7 +13,7 @@

 class TestEngine:
     @pytest.mark.slow
-    def test_stress_embedding_generation(self, engine):
+    def test_stress_embedding_generation(self, engine: Engine):
         """Test embedding generation with a large number of chunks
         to not fail and to never generate duplicated embeddings."""

@@ -36,7 +36,7 @@ def random_string(length=30):


 class TestEngineQuantization:
-    def test_quantize_embedding(self, engine):
+    def test_quantize_embedding(self, engine: Engine):
         """Test quantize called for chunks and sentences embeddings."""
         engine.quantize()

@@ -54,8 +54,8 @@ def test_quantize_cleanup(self, engine):


 class TestEngineSearch:
-    def test_search_with_empty_database(self, engine):
-        results = engine.search("nonexistent query", top_k=5)
+    def test_search_with_empty_database(self, engine: Engine):
+        results = engine.search("nonexistent query", "nonexistent query", top_k=5)

         assert len(results) == 0

tests/test_sentence_splitter.py

Lines changed: 0 additions & 4 deletions

@@ -20,17 +20,14 @@ def test_split(self):

         assert len(sentences) == 3
         assert sentences[0].content == "This is the first sentence."
-        assert sentences[0].sequence == 0
         assert sentences[0].start_offset == 0
         assert sentences[0].end_offset == 27

         assert sentences[1].content == "Here is the second sentence!"
-        assert sentences[1].sequence == 1
         assert sentences[1].start_offset == 28
         assert sentences[1].end_offset == 28 + 28

         assert sentences[2].content == "And what about the third?"
-        assert sentences[2].sequence == 2
         assert sentences[2].start_offset == 57
         assert sentences[2].end_offset == 57 + 25

@@ -66,6 +63,5 @@ def test_split_no_punctuation(self):

         assert len(sentences) == 1
         assert sentences[0].content == chunk.content
-        assert sentences[0].sequence == 0
         assert sentences[0].start_offset == 0
         assert sentences[0].end_offset == len(chunk.content)

tests/test_sqlite_rag.py

Lines changed: 12 additions & 4 deletions

@@ -838,10 +838,14 @@ def test_search_uses_retrieval_query_template(self, mocker):
         rag.search(query)

         # Assert that engine.search was called with the formatted template
-        expected_query = rag._settings.prompt_template_retrieval_query.format(
+        expected_semantic_query = rag._settings.prompt_template_retrieval_query.format(
             content=query
         )
-        mock_engine.search.assert_called_once_with(expected_query, top_k=10)
+        expected_fts_query = query + "*"
+
+        mock_engine.search.assert_called_once_with(
+            expected_semantic_query, expected_fts_query, top_k=10
+        )

     @pytest.mark.parametrize("use_prompt_templates", [True, False])
     def test_search_with_prompt_template(self, mocker, use_prompt_templates):
@@ -865,9 +869,13 @@ def test_search_with_prompt_template(self, mocker, use_prompt_templates):
         rag.search("test query", new_context=False)

         # Assert - verify engine.search was called with correct formatted query
-        expected_query = (
+        expected_semantic_query = (
             "task: search result | query: test query"
             if use_prompt_templates
             else "test query"
         )
-        mock_engine.search.assert_called_once_with(expected_query, top_k=10)
+        expected_fts_query = "test query*"
+
+        mock_engine.search.assert_called_once_with(
+            expected_semantic_query, expected_fts_query, top_k=10
+        )

0 commit comments

Comments (0)