Commit 7348c58
Author: Daniele Briggi
feat(limits): add file size limit and max chunks per document
1 parent 50c55aa, commit 7348c58

File tree: 7 files changed, +57 -6 lines changed

.gitignore
Lines changed: 1 addition & 0 deletions

@@ -45,6 +45,7 @@ test-results/
 .coverage
 .coverage.*
 htmlcov/
+coverage.xml

 # Jupyter Notebook
 .ipynb_checkpoints

src/sqlite_rag/chunker.py
Lines changed: 11 additions & 2 deletions

@@ -7,13 +7,16 @@


 class Chunker:
+    ESTIMATE_CHARS_PER_TOKEN = 4
+
     def __init__(self, conn: sqlite3.Connection, settings: Settings):
         self._conn = conn
         self._settings = settings

     def chunk(self, text: str, metadata: dict = {}) -> list[Chunk]:
         """Chunk text using Recursive Character Text Splitter."""
         chunks = []
+
         if self._get_token_count(text) <= self._settings.chunk_size:
             chunks = [Chunk(content=text)]
         else:
@@ -25,13 +28,19 @@ def _get_token_count(self, text: str) -> int:
         """Get token count using SQLite AI extension."""
         if text == "":
             return 0
+
+        # Fallback to estimated token count for very large texts
+        # to avoid performance issues
+        if len(text) > self._settings.chunk_size * self.ESTIMATE_CHARS_PER_TOKEN * 2:
+            return self._estimate_tokens_count(text)
+
         cursor = self._conn.execute("SELECT llm_token_count(?) AS count", (text,))
         return cursor.fetchone()["count"]

     def _estimate_tokens_count(self, text: str) -> int:
         """Estimate token count more conservatively."""
         # This is a simple heuristic; adjust as needed
-        return (len(text) + 3) // 4
+        return (len(text) + 3) // self.ESTIMATE_CHARS_PER_TOKEN

     def _recursive_split(self, text: str) -> List[Chunk]:
         """Recursively split text into chunks with overlap."""
@@ -119,7 +128,7 @@ def _split_by_characters(self, text: str) -> List[Chunk]:
         chars_per_token = (
             math.ceil(len(text) / total_tokens)
             if total_tokens > 0
-            else 4  # Assume 4 chars per token if no tokens found
+            else self.ESTIMATE_CHARS_PER_TOKEN  # Assume chars per token if no tokens found
         )

         # Estimate characters that fit the chunk size
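
The new fast path exists because llm_token_count runs a full tokenizer pass, which gets slow on multi-megabyte inputs, while any text longer than roughly twice the chunk budget in characters will need splitting regardless of the exact count. A minimal standalone sketch of the same heuristic, where exact_counter is a stand-in for the llm_token_count SQL call and not part of this commit:

    from typing import Callable

    ESTIMATE_CHARS_PER_TOKEN = 4

    def estimate_tokens(text: str) -> int:
        # Ceiling division, matching _estimate_tokens_count above
        return (len(text) + 3) // ESTIMATE_CHARS_PER_TOKEN

    def token_count(text: str, chunk_size: int,
                    exact_counter: Callable[[str], int]) -> int:
        """Use the exact tokenizer only when the text is small enough."""
        if not text:
            return 0
        # Texts over ~2x the chunk budget (in characters) get the cheap
        # estimate; they are going to be split either way.
        if len(text) > chunk_size * ESTIMATE_CHARS_PER_TOKEN * 2:
            return estimate_tokens(text)
        return exact_counter(text)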

src/sqlite_rag/cli.py
Lines changed: 5 additions & 0 deletions

@@ -162,6 +162,10 @@ def configure_settings(
         None,
         help="Template for retrieval query prompts, use `{content}` as placeholder",
     ),
+    max_document_size_bytes: Optional[int] = typer.Option(
+        None,
+        help="Maximum size of a document to process (in bytes) before being truncated",
+    ),
     max_chunks_per_document: Optional[int] = typer.Option(
         None,
         help="Maximum number of chunks to generate per document (0 for no limit)",
@@ -194,6 +198,7 @@ def configure_settings(
         ),  # Set only if True
         "prompt_template_retrieval_document": prompt_template_retrieval_document,
         "prompt_template_retrieval_query": prompt_template_retrieval_query,
+        "max_document_size_bytes": max_document_size_bytes,
         "max_chunks_per_document": max_chunks_per_document,
     }
     print(updates)
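
With typer's default parameter-to-flag conversion, the new option is exposed as --max-document-size-bytes. A hypothetical invocation (the executable and subcommand names are assumptions; only the option definition appears in this diff):

    sqlite-rag configure --max-document-size-bytes 1048576

Each option defaults to None, which presumably signals "leave the stored setting unchanged" when the updates dict is applied.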

src/sqlite_rag/reader.py
Lines changed: 11 additions & 2 deletions

@@ -1,4 +1,5 @@
 from pathlib import Path
+from typing import Optional

 from markitdown import MarkItDown, StreamInfo

@@ -45,12 +46,20 @@ def is_supported(path: Path) -> bool:
         return path.suffix.lower() in FileReader.extensions

     @staticmethod
-    def parse_file(path: Path) -> str:
+    def parse_file(path: Path, max_document_size_bytes: Optional[int] = None) -> str:
         try:
             converter = MarkItDown()
-            return converter.convert(
+            text = converter.convert(
                 path, stream_info=StreamInfo(charset="utf8")
             ).text_content
+
+            # Truncate text characters to max size if needed
+            text = text.encode("utf-8", errors="ignore")
+            if max_document_size_bytes:
+                text = text[:max_document_size_bytes]
+
+            return text.decode("utf-8", errors="ignore")
+
         except Exception as exc:
             raise ValueError(f"Failed to parse file {path}") from exc
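
The encode/slice/decode round-trip is what keeps the truncation safe for non-ASCII documents: a byte-level slice can cut a multi-byte UTF-8 sequence in half, and a plain .decode("utf-8") on such a prefix raises UnicodeDecodeError, whereas errors="ignore" drops the dangling partial character. A small illustration of that edge case (the sample string is invented for the demo):

    text = "size limit: ±±±"       # each '±' encodes to 2 bytes in UTF-8
    raw = text.encode("utf-8")     # 18 bytes: 12 ASCII + 3 * 2 bytes
    cut = raw[:13]                 # ends one byte into the first '±'

    # cut.decode("utf-8")          # would raise UnicodeDecodeError
    print(cut.decode("utf-8", errors="ignore"))  # -> "size limit: "

Note that the truthiness check (if max_document_size_bytes:) also means a limit of 0 behaves like "no limit" rather than "empty document".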

src/sqlite_rag/settings.py
Lines changed: 2 additions & 0 deletions

@@ -67,6 +67,8 @@ class Settings:
     # Index settings
     #

+    # Maximum size of a document to process (in bytes)
+    max_document_size_bytes: int = 5 * 1024 * 1024  # 5 MB
     # Zero means no limit
     max_chunks_per_document: int = 1000

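Together with the existing chunk cap, the new default bounds the indexing work per document. A back-of-the-envelope check, assuming the 4-chars-per-token estimate from chunker.py and a hypothetical chunk_size of 512 tokens (the real value is configured elsewhere and not shown in this diff):

    max_document_size_bytes = 5 * 1024 * 1024   # 5 MB default
    ESTIMATE_CHARS_PER_TOKEN = 4                # from chunker.py
    chunk_size = 512                            # hypothetical

    approx_tokens = max_document_size_bytes // ESTIMATE_CHARS_PER_TOKEN
    approx_chunks = approx_tokens // chunk_size
    print(approx_tokens, approx_chunks)         # 1310720 2560

Under those assumptions a document at the size limit would still yield roughly 2560 chunks, so the max_chunks_per_document = 1000 cap would cut it down further.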

src/sqlite_rag/sqliterag.py
Lines changed: 6 additions & 2 deletions

@@ -88,7 +88,9 @@ def add(
         self._logger.info(f"Processing {total_to_process} files...")
         try:
             for i, file_path in enumerate(files_to_process):
-                content = FileReader.parse_file(file_path)
+                content = FileReader.parse_file(
+                    file_path, self._settings.max_document_size_bytes
+                )

                 if not content:
                     self._logger.warning(
@@ -189,7 +191,9 @@ def rebuild(self, remove_missing: bool = False) -> dict:
             if doc.uri and Path(doc.uri).exists():
                 # File still exists, recreate embeddings
                 try:
-                    content = FileReader.parse_file(Path(doc.uri))
+                    content = FileReader.parse_file(
+                        Path(doc.uri), self._settings.max_document_size_bytes
+                    )
                     doc.content = content

                     self._repository.remove_document(doc_id)

tests/test_reader.py
Lines changed: 21 additions & 0 deletions

@@ -90,3 +90,24 @@ def test_markItDown_file_with_unicode_content(self):
         # is trying to decode as ASCII instead of UTF-8
         content = FileReader.parse_file(Path(f.name))
         assert "# This is a document with a Unicode character: ±" in content
+
+    def test_parse_file_with_max_document_size_bytes(self):
+        """Test that FileReader truncates content when max_document_size_bytes is specified"""
+        long_content = "This is a very long document." * 100  # ~3000 chars
+        with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as f:
+            f.write(long_content.encode("utf-8"))
+            temp_file_path = f.name
+
+        max_size_bytes = 50
+        content = FileReader.parse_file(
+            Path(temp_file_path), max_document_size_bytes=max_size_bytes
+        )
+
+        # Content should be truncated to max_size bytes
+        assert len(content.encode("utf-8")) <= max_size_bytes
+        assert content.startswith("This is a very long document.")
+
+        # Test without size limit
+        full_content = FileReader.parse_file(Path(temp_file_path))
+        assert len(full_content) == len(long_content)
+        assert full_content == long_content
