Skip to content

Commit 50c55aa

Browse files
author
Daniele Briggi
committed
feat(chunks): set limit per document
1 parent 471d28e commit 50c55aa

File tree

4 files changed

+84
-29
lines changed

4 files changed

+84
-29
lines changed

src/sqlite_rag/cli.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,10 @@ def configure_settings(
162162
None,
163163
help="Template for retrieval query prompts, use `{content}` as placeholder",
164164
),
165+
max_chunks_per_document: Optional[int] = typer.Option(
166+
None,
167+
help="Maximum number of chunks to generate per document (0 for no limit)",
168+
),
165169
):
166170
"""Configure settings for the RAG system.
167171
@@ -190,6 +194,7 @@ def configure_settings(
190194
), # Set only if True
191195
"prompt_template_retrieval_document": prompt_template_retrieval_document,
192196
"prompt_template_retrieval_query": prompt_template_retrieval_query,
197+
"max_chunks_per_document": max_chunks_per_document,
193198
}
194199
print(updates)
195200
# Filter out None values (unset options)

src/sqlite_rag/engine.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@ def load_model(self):
3636

3737
def process(self, document: Document) -> Document:
    """Chunk a document, apply the per-document chunk cap, and embed the result.

    The document's content and metadata are handed to the configured chunker.
    When ``max_chunks_per_document`` is greater than zero only the first that
    many chunks are kept; zero or a negative value leaves the list untouched.
    The remaining chunks are passed to ``generate_embedding`` and its return
    value is stored on ``document.chunks``.

    Returns the same ``document`` instance that was passed in.
    """
    pieces = self._chunker.chunk(document.content, document.metadata)

    # Truncate before embedding so no embeddings are generated for chunks
    # that are going to be discarded anyway.
    limit = self._settings.max_chunks_per_document
    if limit > 0:
        pieces = pieces[:limit]

    document.chunks = self.generate_embedding(pieces)
    return document

src/sqlite_rag/settings.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,13 @@ class Settings:
6363
prompt_template_retrieval_document: str = "title: {title} | text: {content}"
6464
prompt_template_retrieval_query: str = "task: search result | query: {content}"
6565

66+
#
67+
# Index settings
68+
#
69+
70+
# Maximum number of chunks kept per document; zero (or any non-positive value) means no limit
71+
max_chunks_per_document: int = 1000
72+
6673

6774
class SettingsManager:
6875
def __init__(self, connection: sqlite3.Connection):

tests/test_engine.py

Lines changed: 68 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,74 @@ def test_generate_embedding_with_prompt_template(
5353
"SELECT llm_embed_generate(?) AS embedding", (expected_content,)
5454
)
5555

56+
def test_extract_document_title(self):
# Checks that a text whose first line is a Markdown-style "# " heading yields
# that heading, without the "# " prefix, as the extracted title.
57+
text = """# This is the Title
58+
This is the content of the document.
59+
It has multiple lines.
60+
"""
61+
62+
# The first and third Engine arguments are passed as None here; the test only
# calls extract_document_title on the resulting engine.
engine = Engine(None, Settings(), None) # type: ignore
63+
64+
title = engine.extract_document_title(text)
65+
assert title == "This is the Title"
66+
67+
# Two cases for a text that has no "# " heading: with the fallback flag set to
# True the first non-blank line is expected as the title; with False the
# expected title is None.
@pytest.mark.parametrize(
68+
"fallback, expected_title",
69+
[
70+
(True, "This is the first line of the document without a title."),
71+
(False, None),
72+
],
73+
)
74+
def test_extract_document_title_from_first_line(self, fallback, expected_title):
75+
text = """
76+
This is the first line of the document without a title.
77+
It has multiple lines.
78+
"""
79+
80+
# The first and third Engine arguments are passed as None here; the test only
# calls extract_document_title on the resulting engine.
engine = Engine(None, Settings(), None) # type: ignore
81+
82+
# The fallback flag is passed positionally as the second argument.
title = engine.extract_document_title(text, fallback)
83+
assert title == expected_title
84+
85+
@pytest.mark.parametrize(
    "max_chunks_per_document, expected_chunk_count",
    # The stubbed chunker always yields three chunks, so "no limit" (0) and a
    # limit larger than the chunk count (4) must both keep all three.
    [(0, 3), (1, 1), (2, 2), (4, 3)],
)
def test_process_with_max_chunks_per_document(
    self, mocker, max_chunks_per_document, expected_chunk_count
):
    """Check that ``Engine.process`` caps the chunk list before embedding it.

    The chunker is stubbed to return three chunks and ``generate_embedding``
    is replaced by a mock that echoes its input, so the test can inspect
    both what was sent for embedding and what ends up on the document.
    """
    # Arrange
    chunks = [
        Chunk(content="Chunk 1"),
        Chunk(content="Chunk 2"),
        Chunk(content="Chunk 3"),
    ]

    mock_conn = mocker.Mock()
    settings = Settings(max_chunks_per_document=max_chunks_per_document)
    mock_chunker = mocker.Mock()
    mock_chunker.chunk.return_value = chunks

    engine = Engine(mock_conn, settings, mock_chunker)

    # Patch the method on the engine itself and assert on that mock. Spying
    # on an attribute of the patch mock (as done previously) records nothing,
    # because ``process`` never calls that child attribute.
    mock_generate_embedding = mocker.patch.object(
        engine, "generate_embedding", side_effect=lambda received: received
    )

    document = Document(content="Test document content")

    # Act
    processed = engine.process(document)

    # Assert
    # Guard against a vacuous pass: the embedding step must have run once.
    mock_generate_embedding.assert_called_once()
    embedded_chunks = mock_generate_embedding.call_args[0][0]
    assert len(embedded_chunks) == expected_chunk_count
    assert len(processed.chunks) == expected_chunk_count
121+
122+
123+
class TestEngineSearch:
56124
def test_search_with_empty_database(self, engine):
57125
results = engine.search("nonexistent query", top_k=5)
58126

@@ -230,32 +298,3 @@ def test_search_exact_match(self, db_conn):
230298
assert len(results) > 0
231299
assert doc1_id == results[0].document.id
232300
assert 0.0 == results[0].vec_distance
233-
234-
def test_extract_document_title(self):
235-
text = """# This is the Title
236-
This is the content of the document.
237-
It has multiple lines.
238-
"""
239-
240-
engine = Engine(None, Settings(), None) # type: ignore
241-
242-
title = engine.extract_document_title(text)
243-
assert title == "This is the Title"
244-
245-
@pytest.mark.parametrize(
246-
"fallback, expected_title",
247-
[
248-
(True, "This is the first line of the document without a title."),
249-
(False, None),
250-
],
251-
)
252-
def test_extract_document_title_from_first_line(self, fallback, expected_title):
253-
text = """
254-
This is the first line of the document without a title.
255-
It has multiple lines.
256-
"""
257-
258-
engine = Engine(None, Settings(), None) # type: ignore
259-
260-
title = engine.extract_document_title(text, fallback)
261-
assert title == expected_title

0 commit comments

Comments
 (0)