Commit 7348c58
Author: Daniele Briggi
feat(limits): add file size limit and max chunks per document
1 parent 50c55aa, commit 7348c58

File tree: 7 files changed, +57 -6 lines changed

.gitignore
Lines changed: 1 addition & 0 deletions

@@ -45,6 +45,7 @@ test-results/
 .coverage
 .coverage.*
 htmlcov/
+coverage.xml

 # Jupyter Notebook
 .ipynb_checkpoints

src/sqlite_rag/chunker.py
Lines changed: 11 additions & 2 deletions

@@ -7,13 +7,16 @@


 class Chunker:
+    ESTIMATE_CHARS_PER_TOKEN = 4
+
     def __init__(self, conn: sqlite3.Connection, settings: Settings):
         self._conn = conn
         self._settings = settings

     def chunk(self, text: str, metadata: dict = {}) -> list[Chunk]:
         """Chunk text using Recursive Character Text Splitter."""
         chunks = []
+
         if self._get_token_count(text) <= self._settings.chunk_size:
             chunks = [Chunk(content=text)]
         else:
@@ -25,13 +28,19 @@ def _get_token_count(self, text: str) -> int:
         """Get token count using SQLite AI extension."""
         if text == "":
             return 0
+
+        # Fallback to estimated token count for very large texts
+        # to avoid performance issues
+        if len(text) > self._settings.chunk_size * self.ESTIMATE_CHARS_PER_TOKEN * 2:
+            return self._estimate_tokens_count(text)
+
         cursor = self._conn.execute("SELECT llm_token_count(?) AS count", (text,))
         return cursor.fetchone()["count"]

     def _estimate_tokens_count(self, text: str) -> int:
         """Estimate token count more conservatively."""
         # This is a simple heuristic; adjust as needed
-        return (len(text) + 3) // 4
+        return (len(text) + 3) // self.ESTIMATE_CHARS_PER_TOKEN

     def _recursive_split(self, text: str) -> List[Chunk]:
         """Recursively split text into chunks with overlap."""
@@ -119,7 +128,7 @@ def _split_by_characters(self, text: str) -> List[Chunk]:
         chars_per_token = (
             math.ceil(len(text) / total_tokens)
             if total_tokens > 0
-            else 4  # Assume 4 chars per token if no tokens found
+            else self.ESTIMATE_CHARS_PER_TOKEN  # Assume chars per token if no tokens found
         )

         # Estimate characters that fit the chunk size
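
The new fast path exists because llm_token_count runs a full tokenizer pass, which gets slow on multi-megabyte inputs, while any text longer than roughly twice the chunk budget in characters will need splitting regardless of the exact count. A minimal standalone sketch of the same heuristic, where exact_counter is a stand-in for the llm_token_count SQL call and not part of this commit:

    from typing import Callable

    ESTIMATE_CHARS_PER_TOKEN = 4

    def estimate_tokens(text: str) -> int:
        # Ceiling division, matching _estimate_tokens_count above
        return (len(text) + 3) // ESTIMATE_CHARS_PER_TOKEN

    def token_count(text: str, chunk_size: int,
                    exact_counter: Callable[[str], int]) -> int:
        """Use the exact tokenizer only when the text is small enough."""
        if not text:
            return 0
        # Texts over ~2x the chunk budget (in characters) get the cheap
        # estimate; they are going to be split either way.
        if len(text) > chunk_size * ESTIMATE_CHARS_PER_TOKEN * 2:
            return estimate_tokens(text)
        return exact_counter(text)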

src/sqlite_rag/cli.py
Lines changed: 5 additions & 0 deletions

@@ -162,6 +162,10 @@ def configure_settings(
         None,
         help="Template for retrieval query prompts, use `{content}` as placeholder",
     ),
+    max_document_size_bytes: Optional[int] = typer.Option(
+        None,
+        help="Maximum size of a document to process (in bytes) before being truncated",
+    ),
     max_chunks_per_document: Optional[int] = typer.Option(
         None,
         help="Maximum number of chunks to generate per document (0 for no limit)",
@@ -194,6 +198,7 @@ def configure_settings(
         ),  # Set only if True
         "prompt_template_retrieval_document": prompt_template_retrieval_document,
         "prompt_template_retrieval_query": prompt_template_retrieval_query,
+        "max_document_size_bytes": max_document_size_bytes,
         "max_chunks_per_document": max_chunks_per_document,
     }
     print(updates)
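
With typer's default parameter-to-flag conversion, the new option is exposed as --max-document-size-bytes. A hypothetical invocation (the executable and subcommand names are assumptions; only the option definition appears in this diff):

    sqlite-rag configure --max-document-size-bytes 1048576

Each option defaults to None, which presumably signals "leave the stored setting unchanged" when the updates dict is applied.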

src/sqlite_rag/reader.py
Lines changed: 11 additions & 2 deletions

@@ -1,4 +1,5 @@
 from pathlib import Path
+from typing import Optional

 from markitdown import MarkItDown, StreamInfo

@@ -45,12 +46,20 @@ def is_supported(path: Path) -> bool:
         return path.suffix.lower() in FileReader.extensions

     @staticmethod
-    def parse_file(path: Path) -> str:
+    def parse_file(path: Path, max_document_size_bytes: Optional[int] = None) -> str:
         try:
             converter = MarkItDown()
-            return converter.convert(
+            text = converter.convert(
                 path, stream_info=StreamInfo(charset="utf8")
             ).text_content
+
+            # Truncate text characters to max size if needed
+            text = text.encode("utf-8", errors="ignore")
+            if max_document_size_bytes:
+                text = text[:max_document_size_bytes]
+
+            return text.decode("utf-8", errors="ignore")
+
         except Exception as exc:
             raise ValueError(f"Failed to parse file {path}") from exc
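
The encode/slice/decode round-trip is what keeps the truncation safe for non-ASCII documents: a byte-level slice can cut a multi-byte UTF-8 sequence in half, and a plain .decode("utf-8") on such a prefix raises UnicodeDecodeError, whereas errors="ignore" drops the dangling partial character. A small illustration of that edge case (the sample string is invented for the demo):

    text = "size limit: ±±±"       # each '±' encodes to 2 bytes in UTF-8
    raw = text.encode("utf-8")     # 18 bytes: 12 ASCII + 3 * 2 bytes
    cut = raw[:13]                 # ends one byte into the first '±'

    # cut.decode("utf-8")          # would raise UnicodeDecodeError
    print(cut.decode("utf-8", errors="ignore"))  # -> "size limit: "

Note that the truthiness check (if max_document_size_bytes:) also means a limit of 0 behaves like "no limit" rather than "empty document".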

src/sqlite_rag/settings.py
Lines changed: 2 additions & 0 deletions

@@ -67,6 +67,8 @@ class Settings:
     # Index settings
     #

+    # Maximum size of a document to process (in bytes)
+    max_document_size_bytes: int = 5 * 1024 * 1024  # 5 MB
     # Zero means no limit
     max_chunks_per_document: int = 1000

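Together with the existing chunk cap, the new default bounds the indexing work per document. A back-of-the-envelope check, assuming the 4-chars-per-token estimate from chunker.py and a hypothetical chunk_size of 512 tokens (the real value is configured elsewhere and not shown in this diff):

    max_document_size_bytes = 5 * 1024 * 1024   # 5 MB default
    ESTIMATE_CHARS_PER_TOKEN = 4                # from chunker.py
    chunk_size = 512                            # hypothetical

    approx_tokens = max_document_size_bytes // ESTIMATE_CHARS_PER_TOKEN
    approx_chunks = approx_tokens // chunk_size
    print(approx_tokens, approx_chunks)         # 1310720 2560

Under those assumptions a document at the size limit would still yield roughly 2560 chunks, so the max_chunks_per_document = 1000 cap would cut it down further.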

src/sqlite_rag/sqliterag.py
Lines changed: 6 additions & 2 deletions

@@ -88,7 +88,9 @@ def add(
         self._logger.info(f"Processing {total_to_process} files...")
         try:
             for i, file_path in enumerate(files_to_process):
-                content = FileReader.parse_file(file_path)
+                content = FileReader.parse_file(
+                    file_path, self._settings.max_document_size_bytes
+                )

                 if not content:
                     self._logger.warning(
@@ -189,7 +191,9 @@ def rebuild(self, remove_missing: bool = False) -> dict:
             if doc.uri and Path(doc.uri).exists():
                 # File still exists, recreate embeddings
                 try:
-                    content = FileReader.parse_file(Path(doc.uri))
+                    content = FileReader.parse_file(
+                        Path(doc.uri), self._settings.max_document_size_bytes
+                    )
                     doc.content = content

                     self._repository.remove_document(doc_id)

tests/test_reader.py
Lines changed: 21 additions & 0 deletions

@@ -90,3 +90,24 @@ def test_markItDown_file_with_unicode_content(self):
         # is trying to decode as ASCII instead of UTF-8
         content = FileReader.parse_file(Path(f.name))
         assert "# This is a document with a Unicode character: ±" in content
+
+    def test_parse_file_with_max_document_size_bytes(self):
+        """Test that FileReader truncates content when max_document_size_bytes is specified"""
+        long_content = "This is a very long document." * 100  # ~3000 chars
+        with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as f:
+            f.write(long_content.encode("utf-8"))
+            temp_file_path = f.name
+
+        max_size_bytes = 50
+        content = FileReader.parse_file(
+            Path(temp_file_path), max_document_size_bytes=max_size_bytes
+        )
+
+        # Content should be truncated to max_size bytes
+        assert len(content.encode("utf-8")) <= max_size_bytes
+        assert content.startswith("This is a very long document.")
+
+        # Test without size limit
+        full_content = FileReader.parse_file(Path(temp_file_path))
+        assert len(full_content) == len(long_content)
+        assert full_content == long_content
