sqliteai
diff --git a/‎src/sqlite_rag/chunker.py‎
Lines changed: 80 additions & 58 deletions b/‎src/sqlite_rag/chunker.py‎
Lines changed: 80 additions & 58 deletions
diff --git a/‎src/sqlite_rag/database.py‎
Lines changed: 0 additions & 1 deletion b/‎src/sqlite_rag/database.py‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/sqlite_rag/engine.py‎
Lines changed: 22 additions & 50 deletions b/‎src/sqlite_rag/engine.py‎
Lines changed: 22 additions & 50 deletions
diff --git a/‎src/sqlite_rag/models/chunk.py‎
Lines changed: 16 additions & 1 deletion b/‎src/sqlite_rag/models/chunk.py‎
Lines changed: 16 additions & 1 deletion
@@ -1,6 +1,8 @@
 import math
 import sqlite3
-from typing import List
+from typing import List, Optional
+
+from sqlite_rag.models.document import Document
 
 from .models.chunk import Chunk
 from .settings import Settings
@@ -13,16 +15,47 @@ def __init__(self, conn: sqlite3.Connection, settings: Settings):
         self._conn = conn
         self._settings = settings
 
-    def chunk(self, text: str, metadata: dict = {}) -> list[Chunk]:
+    def chunk(self, document: Document) -> list[Chunk]:
         """Chunk text using Recursive Character Text Splitter."""
-        chunks = []
+        chunk = self._create_chunk(document.content, title=document.get_title())
+
+        if (
+            self._get_token_count(chunk.get_embedding_text())
+            <= self._settings.chunk_size
+        ):
+            return [chunk]
+
+        return self._recursive_split(document)
+
+    def _create_chunk(
+        self,
+        content: str,
+        head_overlap_text: str = "",
+        title: Optional[str] = None,
+    ) -> Chunk:
+        prompt = None
+        if self._settings.use_prompt_templates:
+            prompt = self._settings.prompt_template_retrieval_document
+
+        return Chunk(
+            content=content,
+            head_overlap_text=head_overlap_text,
+            prompt=prompt,
+            title=title,
+        )
 
-        if self._get_token_count(text) <= self._settings.chunk_size:
-            chunks = [Chunk(content=text)]
-        else:
-            chunks = self._recursive_split(text)
+    def _get_effective_chunk_size(self, prompt: str) -> int:
+        """Calculate effective chunk size considering overlap and other
+        prompt data useful to the model.
 
-        return self._enrich_chunk(chunks, metadata)
+        Args:
+            prompt: The prompt template without content.
+        """
+        if self._settings.chunk_size <= self._settings.chunk_overlap:
+            raise ValueError("Chunk size must be greater than chunk overlap.")
+
+        prompt_size = self._get_token_count(prompt)
+        return self._settings.chunk_size - self._settings.chunk_overlap - prompt_size
 
     def _get_token_count(self, text: str) -> int:
         """Get token count using SQLite AI extension."""
@@ -42,7 +75,7 @@ def _estimate_tokens_count(self, text: str) -> int:
         # This is a simple heuristic; adjust as needed
         return (len(text) + 3) // self.ESTIMATE_CHARS_PER_TOKEN
 
-    def _recursive_split(self, text: str) -> List[Chunk]:
+    def _recursive_split(self, document: Document) -> List[Chunk]:
         """Recursively split text into chunks with overlap."""
         separators = [
             "\n\n",  # Double newlines (paragraphs)
@@ -59,32 +92,47 @@ def _recursive_split(self, text: str) -> List[Chunk]:
             "",  # Character level (fallback)
         ]
 
-        chunks = self._split_text_with_separators(text, separators)
-        return self._apply_overlap(chunks)
+        empty_chunk = self._create_chunk("", title=document.get_title())
+        effective_chunk_size = max(
+            1, self._get_effective_chunk_size(empty_chunk.get_embedding_text())
+        )
+
+        chunks_content = self._split_text_with_separators(
+            document.content, separators, effective_chunk_size
+        )
+        overlaps = self._create_overlaps(chunks_content)
+
+        assert len(chunks_content) == len(overlaps), "Mismatch in chunks and overlaps"
+        return [
+            self._create_chunk(
+                content=chunk, head_overlap_text=overlap, title=document.get_title()
+            )
+            for chunk, overlap in zip(chunks_content, overlaps)
+        ]
 
     def _split_text_with_separators(
-        self, text: str, separators: List[str]
-    ) -> List[Chunk]:
-        """Split text using hierarchical separators."""
+        self, text: str, separators: List[str], effective_chunk_size: int
+    ) -> List[str]:
+        """Split text using hierarchical separators.
+        Args:
+            text: The text to split.
+            separators: List of separators to use in order.
+            effective_chunk_size: Reserved space for actual chunk content.
+        """
         chunks = []
 
         if self._settings.chunk_size <= self._settings.chunk_overlap:
             raise ValueError("Chunk size must be greater than chunk overlap.")
 
         if not separators:
             # Fallback: character-level splitting
-            return self._split_by_characters(text)
+            return self._split_by_characters(text, effective_chunk_size)
 
         separator = separators[0]
         remaining_separators = separators[1:]
 
         if separator == "":
-            return self._split_by_characters(text)
-
-        # Reserve space for overlap
-        effective_chunk_size = max(
-            1, self._settings.chunk_size - self._settings.chunk_overlap
-        )
+            return self._split_by_characters(text, effective_chunk_size)
 
         splits = text.split(separator)
         current_chunk = ""
@@ -97,12 +145,12 @@ def _split_text_with_separators(
             else:
                 # Save current chunk if it exists
                 if current_chunk:
-                    chunks.append(Chunk(content=current_chunk.strip()))
+                    chunks.append(current_chunk)
 
                 # If single split is too large, recursively split it
                 if self._get_token_count(split) > effective_chunk_size:
                     sub_chunks = self._split_text_with_separators(
-                        split, remaining_separators
+                        split, remaining_separators, effective_chunk_size
                     )
                     chunks.extend(sub_chunks)
                     current_chunk = ""
@@ -111,19 +159,14 @@ def _split_text_with_separators(
 
         # Add final chunk
         if current_chunk:
-            chunks.append(Chunk(content=current_chunk.strip()))
+            chunks.append(current_chunk)
 
         return chunks
 
-    def _split_by_characters(self, text: str) -> List[Chunk]:
+    def _split_by_characters(self, text: str, effective_chunk_size: int) -> List[str]:
         """Split text at character level when no separators work."""
         chunks = []
 
-        # Reserve space for overlap
-        effective_chunk_size = max(
-            1, self._settings.chunk_size - self._settings.chunk_overlap
-        )
-
         total_tokens = self._get_token_count(text)
         chars_per_token = (
             math.ceil(len(text) / total_tokens)
@@ -151,40 +194,29 @@ def _split_by_characters(self, text: str) -> List[Chunk]:
                 chunk_text = text[start:end]
 
             if chunk_text.strip():
-                chunks.append(Chunk(content=chunk_text.strip()))
+                chunks.append(chunk_text)
 
             start = end
 
         return chunks
 
-    def _apply_overlap(self, chunks: List[Chunk]) -> List[Chunk]:
+    def _create_overlaps(self, chunks: List[str]) -> List[str]:
         """Apply overlap between consecutive chunks."""
         if len(chunks) <= 1 or self._settings.chunk_overlap <= 0:
-            return chunks
+            # Empty overlap for each chunk
+            return [""] * len(chunks)
 
-        overlapped_chunks = [chunks[0]]  # First chunk has no overlap
+        overlapped_chunks = [""]  # First chunk has no overlap
 
         for i in range(1, len(chunks)):
-            current_content = chunks[i].content
-            prev_content = chunks[i - 1].content
+            prev_content = chunks[i - 1]
 
             # Get overlap text from end of previous chunk
             overlap_text = self._get_overlap_text(
                 prev_content, self._settings.chunk_overlap
             )
 
-            if overlap_text:
-                combined_content = overlap_text + " " + current_content
-                # Core content starts after overlap and separator
-                core_start_pos = len(overlap_text) + 1
-            else:
-                combined_content = current_content
-                # No overlap, core starts at beginning
-                core_start_pos = 0
-
-            overlapped_chunks.append(
-                Chunk(content=combined_content, core_start_pos=core_start_pos)
-            )
+            overlapped_chunks.append(overlap_text)
 
         return overlapped_chunks
 
@@ -202,13 +234,3 @@ def _get_overlap_text(self, text: str, max_overlap_tokens: int) -> str:
 
         # If even single word is too large, return empty
         return ""
-
-    def _enrich_chunk(self, chunks: List[Chunk], metadata: dict) -> List[Chunk]:
-        """Add extra information to chunk which may improve the model embeddings."""
-        for chunk in chunks:
-            if "title" in metadata:
-                chunk.title = metadata["title"]
-            elif "title" in metadata.get("generated", {}):
-                chunk.title = metadata["generated"]["title"]
-
-        return chunks
 
@@ -83,7 +83,6 @@ def _create_schema(conn: sqlite3.Connection, settings: Settings):
                 document_id TEXT,
                 content TEXT,
                 embedding BLOB,
-                core_start_pos INTEGER DEFAULT 0,
                 FOREIGN KEY (document_id) REFERENCES documents (id) ON DELETE CASCADE
             );
         """
 
@@ -7,7 +7,6 @@
 from sqlite_rag.models.document_result import DocumentResult
 
 from .chunker import Chunker
-from .models.chunk import Chunk
 from .models.document import Document
 from .settings import Settings
 
@@ -16,8 +15,6 @@ class Engine:
     # Considered a good default to normalize the score for RRF
     DEFAULT_RRF_K = 60
 
-    GENERATED_TITLE_MAX_CHARS = 100
-
     def __init__(self, conn: sqlite3.Connection, settings: Settings, chunker: Chunker):
         self._conn = conn
         self._settings = settings
@@ -37,43 +34,38 @@ def load_model(self):
         )
 
     def process(self, document: Document) -> Document:
-        chunks = self._chunker.chunk(document.content, document.metadata)
+        if not document.get_title():
+            document.set_generated_title()
+
+        chunks = self._chunker.chunk(document)
 
         if self._settings.max_chunks_per_document > 0:
             chunks = chunks[: self._settings.max_chunks_per_document]
 
-        chunks = self.generate_embedding(chunks)
+        for chunk in chunks:
+            chunk.title = document.get_title()
+            chunk.embedding = self.generate_embedding(chunk.get_embedding_text())
+
         document.chunks = chunks
+
         return document
 
-    def generate_embedding(self, chunks: list[Chunk]) -> list[Chunk]:
+    def generate_embedding(self, text: str) -> bytes:
         """Generate embedding for the given text."""
+        cursor = self._conn.cursor()
 
-        for chunk in chunks:
-            cursor = self._conn.cursor()
-
-            # Format using the prompt template if available
-            content = chunk.content
-            if self._settings.use_prompt_templates:
-                title = chunk.title if chunk.title else "none"
-                content = self._settings.prompt_template_retrieval_document.format(
-                    title=title, content=chunk.content
-                )
-
-            try:
-                cursor.execute("SELECT llm_embed_generate(?) AS embedding", (content,))
-            except sqlite3.Error as e:
-                print(f"Error generating embedding for chunk\n: ```{content}```")
-                raise e
-
-            result = cursor.fetchone()
+        try:
+            cursor.execute("SELECT llm_embed_generate(?) AS embedding", (text,))
+        except sqlite3.Error as e:
+            print(f"Error generating embedding for text\n: ```{text}```")
+            raise e
 
-            if result is None:
-                raise RuntimeError("Failed to generate embedding.")
+        result = cursor.fetchone()
 
-            chunk.embedding = result["embedding"]
+        if result is None:
+            raise RuntimeError("Failed to generate embedding.")
 
-        return chunks
+        return result["embedding"]
 
     def quantize(self) -> None:
         """Quantize stored vector for faster search via quantized scan."""
@@ -114,7 +106,7 @@ def free_context(self) -> None:
 
     def search(self, query: str, top_k: int = 10) -> list[DocumentResult]:
         """Semantic search and full-text search sorted with Reciprocal Rank Fusion."""
-        query_embedding = self.generate_embedding([Chunk(content=query)])[0].embedding
+        query_embedding = self.generate_embedding(query)
 
         # Clean up and split into words
         # '*' is used to match while typing
@@ -172,7 +164,6 @@ def search(self, query: str, top_k: int = 10) -> list[DocumentResult]:
                 documents.content as document_content,
                 documents.metadata,
                 chunks.content AS snippet,
-                chunks.core_start_pos,
                 vec_rank,
                 fts_rank,
                 combined_rank,
@@ -203,8 +194,7 @@ def search(self, query: str, top_k: int = 10) -> list[DocumentResult]:
                     content=row["document_content"],
                     metadata=json.loads(row["metadata"]) if row["metadata"] else {},
                 ),
-                # remove overlapping text from the snippet
-                snippet=row["snippet"][row["core_start_pos"] :],
+                snippet=row["snippet"],
                 vec_rank=row["vec_rank"],
                 fts_rank=row["fts_rank"],
                 combined_rank=row["combined_rank"],
@@ -227,24 +217,6 @@ def versions(self) -> dict:
             "vector_version": row["vector_version"],
         }
 
-    def extract_document_title(
-        self, text: str, fallback_first_line: bool = False
-    ) -> str | None:
-        """Extract title from markdown content."""
-        # Look for first level-1 heading
-        match = re.search(r"^# (.+)$", text, re.MULTILINE)
-        if match:
-            return match.group(1).strip()
-
-        # Fallback: first non-empty line
-        if fallback_first_line:
-            for line in text.splitlines():
-                line = line.strip()
-                if line:
-                    return line[:self.GENERATED_TITLE_MAX_CHARS]
-
-        return None
-
     def close(self):
         """Close the database connection."""
         if self._conn:
 
@@ -5,8 +5,23 @@
 class Chunk:
     id: int | None = None
     document_id: int | None = None
+    # The human readable content of the chunk
+    # (not the representation of the embedding vector)
     content: str = ""
     embedding: str | bytes = b""
-    core_start_pos: int = 0
 
+    prompt: str | None = None
+    head_overlap_text: str = ""
     title: str | None = None
+
+    def get_embedding_text(self) -> str:
+        """Return the text the embedding is generated from.
+        The head overlap text is always prepended to the content; the result
+        is wrapped in the prompt template (title, content) when one is set.
+        """
+        content = self.head_overlap_text + self.content
+
+        if self.prompt:
+            return self.prompt.format(title=self.title or "none", content=content)
+        # No prompt template: still keep the overlap in the embedded text.
+        return content