
Commit 43cfe1a

fix(chunker): overlap applied multiple times

Author: Daniele Briggi
Parent: b087356

File tree: 3 files changed (+103 additions, -4 deletions)


src/sqlite_rag/chunker.py

Lines changed: 6 additions & 4 deletions
@@ -47,7 +47,8 @@ def _recursive_split(self, text: str) -> List[Chunk]:
             "",  # Character level (fallback)
         ]
 
-        return self._split_text_with_separators(text, separators)
+        chunks = self._split_text_with_separators(text, separators)
+        return self._apply_overlap(chunks)
 
     def _split_text_with_separators(
         self, text: str, separators: List[str]
@@ -100,7 +101,7 @@ def _split_text_with_separators(
         if current_chunk:
             chunks.append(Chunk(content=current_chunk.strip()))
 
-        return self._apply_overlap(chunks)
+        return chunks
 
     def _split_by_characters(self, text: str) -> List[Chunk]:
         """Split text at character level when no separators work."""
@@ -133,15 +134,16 @@ def _split_by_characters(self, text: str) -> List[Chunk]:
                 self._get_token_count(chunk_text) > effective_chunk_size
                 and end > start + 1
             ):
-                end = int(end * 0.9)  # Reduce by 10%
+                attempt_chunk_size = int((end - start) * 0.9)  # Reduce by 10%
+                end = start + attempt_chunk_size
                 chunk_text = text[start:end]
 
             if chunk_text.strip():
                 chunks.append(Chunk(content=chunk_text.strip()))
 
             start = end
 
-        return self._apply_overlap(chunks)
+        return chunks
 
     def _apply_overlap(self, chunks: List[Chunk]) -> List[Chunk]:
         """Apply overlap between consecutive chunks."""

tests/assets/doc-base64-images.md

Lines changed: 52 additions & 0 deletions
Large diffs are not rendered by default.

tests/test_chunker.py

Lines changed: 45 additions & 0 deletions
@@ -1,3 +1,5 @@
+from pathlib import Path
+
 import pytest
 
 from sqlite_rag.chunker import Chunker
@@ -195,6 +197,31 @@ def test_no_overlap_setting(self, mock_conn):
 class TestEdgeCases:
     """Test edge cases and error conditions."""
 
+    def test_overlap_applied_only_once(self, mock_conn):
+        """Test that overlap is applied only once, even when text goes through multiple separator levels."""
+        settings = Settings("test-model")
+        settings.chunk_size = 30  # Small chunk size to force splitting
+        settings.chunk_overlap = 8  # Significant overlap
+
+        chunker = Chunker(mock_conn, settings)
+
+        # Create text that will be split by multiple separators:
+        # 1. First by paragraphs (\n\n)
+        # 2. Then by sentences (.)
+        # 3. Finally by words ( )
+        text = "This is the first paragraph with multiple sentences. This should be split across separators.\n\nThis is the second paragraph with more content. This will also be split by multiple separators and should trigger the overlap bug."
+
+        chunks = chunker.chunk(text)
+
+        # Verify that no chunk exceeds the chunk_size limit
+        # If overlap is applied multiple times, chunks will be longer than chunk_size
+        for i, chunk in enumerate(chunks):
+            token_count = chunker._get_token_count(chunk.content)
+            assert token_count <= settings.chunk_size, (
+                f"Chunk {i} exceeds size limit: {token_count} tokens > {settings.chunk_size} tokens. "
+                f"Content: '{chunk.content[:100]}...'"
+            )
+
     def test_chunk_size_equals_overlap(self, mock_conn):
         """Test when chunk_size equals chunk_overlap."""
         settings = Settings("test-model")
@@ -219,3 +246,21 @@ def test_very_small_chunk_size(self, mock_conn):
 
         chunks = chunker.chunk(text)
         assert len(chunks) >= 1
+
+    def test_split_by_character_with_long_string(self, chunker_large):
+        """Long string to be split by characters should be split in chunks
+        with similar size, not become zero-length and loop forever."""
+        with open(Path(__file__).parent / "assets" / "doc-base64-images.md", "r") as f:
+            text = f.read()
+
+        chunks = chunker_large.chunk(text)
+
+        assert len(chunks) > 0
+        for chunk in chunks:
+            assert len(chunk.content)
+            assert (
+                chunker_large._get_token_count(chunk.content)
+                <= chunker_large._settings.chunk_size
+            ), pytest.fail(
+                f"Chunk exceeds size limit: {chunker_large._get_token_count(chunk.content)} tokens > {chunker_large._settings.chunk_size} tokens"
+            )
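
chunker_large is a fixture defined elsewhere in the test suite and not shown in this diff. The sketch below is only a guess at its shape, mirroring how the other tests build a Chunker from Settings; the chunk size, overlap value, and the Settings import path are all assumptions:

import pytest

from sqlite_rag.chunker import Chunker
from sqlite_rag.settings import Settings  # assumed import path

@pytest.fixture
def chunker_large(mock_conn):
    # Illustrative values only: large enough limits that the base64-heavy asset
    # is forced down to the character-level fallback without producing an
    # unmanageable number of chunks.
    settings = Settings("test-model")
    settings.chunk_size = 512
    settings.chunk_overlap = 32
    return Chunker(mock_conn, settings)

With fixtures like these in place, the two new tests can be run directly, e.g. with pytest tests/test_chunker.py -k "overlap_applied_only_once or split_by_character".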
