Commit 148b6e8

Author: Daniele Briggi (committed)
feat(cli): add files or text with metadata
1 parent fc108c3 commit 148b6e8

File tree

9 files changed: +261 additions, -54 deletions

src/sqlite_rag/cli.py

Lines changed: 90 additions & 11 deletions
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+import json
 import shlex
 import sys
 from typing import Optional
@@ -31,17 +32,47 @@ def add(
     recursive: bool = typer.Option(
         False, "-r", "--recursive", help="Recursively add all files in directories"
     ),
+    absolute_paths: bool = typer.Option(
+        False,
+        "--absolute-paths",
+        help="Store absolute paths instead of relative paths",
+        is_flag=True,
+    ),
+    metadata: Optional[str] = typer.Option(
+        None,
+        "--metadata",
+        help="Optional metadata in JSON format to associate with the document",
+        metavar="JSON",
+        show_default=False,
+        prompt="Metadata (JSON format, e.g. {'author': 'John Doe', 'date': '2023-10-01'}'",
+    ),
 ):
     """Add a file path to the database"""
     rag = SQLiteRag()
-    rag.add(path, recursive=recursive)
+    rag.add(
+        path,
+        recursive=recursive,
+        absolute_paths=absolute_paths,
+        metadata=json.loads(metadata or "{}"),
+    )


 @app.command()
-def add_text(text: str, uri: Optional[str] = None):
+def add_text(
+    text: str,
+    uri: Optional[str] = None,
+    metadata: Optional[str] = typer.Option(
+        None,
+        "--metadata",
+        help="Optional metadata in JSON format to associate with the document",
+        metavar="JSON",
+        show_default=False,
+        prompt="Metadata (JSON format, e.g. {'author': 'John Doe', 'date': '2023-10-01'}'",
+    ),
+):
     """Add a text to the database"""
     rag = SQLiteRag()
-    rag.add_text(text, uri=uri, metadata={})
+    rag.add_text(text, uri=uri, metadata=json.loads(metadata or "{}"))


 @app.command("list")
@@ -86,7 +117,7 @@ def remove(
         raise typer.Exit(1)

     # Show document details
-    typer.echo(f"Found document:")
+    typer.echo("Found document:")
     typer.echo(f"ID: {document.id}")
     typer.echo(f"URI: {document.uri or 'N/A'}")
     typer.echo(
@@ -165,7 +196,11 @@ def reset(

 @app.command()
 def search(
-    query: str, limit: int = typer.Option(10, help="Number of results to return")
+    query: str,
+    limit: int = typer.Option(10, help="Number of results to return"),
+    debug: bool = typer.Option(
+        False, "-d", "--debug", help="Print extra debug information"
+    ),
 ):
     """Search for documents using hybrid vector + full-text search"""
     rag = SQLiteRag()
@@ -176,12 +211,56 @@ def search(
         return

     typer.echo(f"Found {len(results)} documents:")
-    typer.echo(f"{'Pos':<4} {'Preview':<60} {'URI':<50}")
-    typer.echo("-" * 116)
-    for idx, doc in enumerate(results, 1):
-        snippet = f"{doc.snippet[:57]!r}" + "..." if len(doc.snippet) > 60 else f"{doc.snippet!r}"
-        uri = doc.document.uri or "N/A"
-        typer.echo(f"{idx:<4} {snippet:<60} {uri:<50}")
+
+    if debug:
+        # Enhanced debug table with better formatting
+        typer.echo(
+            f"{'#':<3} {'Preview':<55} {'URI':<35} {'C.Rank':<33} {'V.Rank':<8} {'FTS.Rank':<9} {'V.Dist':<18} {'FTS.Score':<18}"
+        )
+        typer.echo("─" * 180)
+
+        for idx, doc in enumerate(results, 1):
+            # Clean snippet display
+            snippet = doc.snippet.replace("\n", " ").replace("\r", "")
+            if len(snippet) > 52:
+                snippet = snippet[:49] + "..."
+
+            # Clean URI display
+            uri = doc.document.uri or "N/A"
+            if len(uri) > 32:
+                uri = "..." + uri[-29:]
+
+            # Format debug values with proper precision
+            c_rank = (
+                f"{doc.combined_rank:.17f}" if doc.combined_rank is not None else "N/A"
+            )
+            v_rank = str(doc.vec_rank) if doc.vec_rank is not None else "N/A"
+            fts_rank = str(doc.fts_rank) if doc.fts_rank is not None else "N/A"
+            v_dist = (
+                f"{doc.vec_distance:.6f}" if doc.vec_distance is not None else "N/A"
+            )
+            fts_score = f"{doc.fts_score:.6f}" if doc.fts_score is not None else "N/A"
+
+            typer.echo(
+                f"{idx:<3} {snippet:<55} {uri:<35} {c_rank:<33} {v_rank:<8} {fts_rank:<9} {v_dist:<18} {fts_score:<18}"
+            )
+    else:
+        # Clean simple table for normal view
+        typer.echo(f"{'#':<3} {'Preview':<60} {'URI':<40}")
+        typer.echo("─" * 105)
+
+        for idx, doc in enumerate(results, 1):
+            # Clean snippet display
+            snippet = doc.snippet.replace("\n", " ").replace("\r", "")
+            if len(snippet) > 57:
+                snippet = snippet[:54] + "..."
+
+            # Clean URI display
+            uri = doc.document.uri or "N/A"
+            if len(uri) > 37:
+                uri = "..." + uri[-34:]
+
+            typer.echo(f"{idx:<3} {snippet:<60} {uri:<40}")


 def repl_mode():
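
Note on the parsing behavior: the new --metadata option in both add and add_text is decoded with json.loads(metadata or "{}"), so leaving the option empty yields an empty dict, while malformed JSON raises json.JSONDecodeError. A minimal standalone sketch (not part of the commit; parse_metadata is a hypothetical helper used only for illustration):

import json

def parse_metadata(raw: str | None) -> dict:
    # Mirrors the CLI behavior: a missing value becomes an empty metadata dict.
    return json.loads(raw or "{}")

print(parse_metadata(None))                      # {}
print(parse_metadata('{"author": "John Doe"}'))  # {'author': 'John Doe'}
# parse_metadata("{'author': 'John Doe'}") raises json.JSONDecodeError:
# JSON requires double quotes, unlike the single-quoted example in the prompt text above.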

src/sqlite_rag/engine.py

Lines changed: 5 additions & 4 deletions
@@ -43,24 +43,25 @@ def process(self, document: Document) -> Document:
         document.chunks = chunks
         return document

-    # TODO: better to get a list of str and return a list of embeddings?
     def generate_embedding(self, chunks: list[Chunk]) -> list[Chunk]:
         """Generate embedding for the given text."""
         cursor = self._conn.cursor()

         for chunk in chunks:
             try:
-                cursor.execute("SELECT llm_embed_generate(?) AS embedding", (chunk.content,))
+                cursor.execute(
+                    "SELECT llm_embed_generate(?) AS embedding", (chunk.content,)
+                )
             except sqlite3.Error as e:
                 print(f"Error generating embedding for chunk\n: ```{chunk.content}```")
                 raise e
-
+
             result = cursor.fetchone()

             if result is None:
                 raise RuntimeError("Failed to generate embedding.")

-            chunk.embedding = result['embedding']
+            chunk.embedding = result["embedding"]

         return chunks
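
The lookup result["embedding"] relies on the connection's sqlite3.Row row factory (set in _create_db_connection and in the test fixtures), which makes rows addressable by column name. A minimal sketch of that access pattern, using a literal blob in place of the llm_embed_generate SQL function provided by the loaded extension:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.row_factory = sqlite3.Row  # rows become addressable by column name

cursor = conn.cursor()
cursor.execute("SELECT x'00010203' AS embedding")  # stand-in for llm_embed_generate(?)
row = cursor.fetchone()

assert row["embedding"] == bytes([0, 1, 2, 3])  # same lookup as in generate_embedding()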

src/sqlite_rag/models/document.py

Lines changed: 0 additions & 6 deletions
@@ -17,12 +17,6 @@ class Document:

     chunks: list["Chunk"] = []

-    vec_rank: float | None = None
-    fts_rank: float | None = None
-    combined_rank: float | None = None
-    vec_distance: float | None = None
-    fts_score: float | None = None
-
     def hash(self) -> str:
         """Generate a hash for the document content"""
         return hashlib.blake2b(self.content.encode()).hexdigest()

src/sqlite_rag/sqliterag.py

Lines changed: 19 additions & 4 deletions
@@ -33,7 +33,7 @@ def __init__(self, settings: Optional[Settings] = None):
         self._engine = Engine(self._conn, settings, chunker=self._chunker)

         self.ready = False
-
+
     def _create_db_connection(self) -> sqlite3.Connection:
         conn = sqlite3.connect(self.settings.db_path)
         conn.row_factory = sqlite3.Row
@@ -47,21 +47,35 @@ def _ensure_initialized(self):

         self.ready = True

-    def add(self, path: str, recursive: bool = False) -> int:
+    def add(
+        self,
+        path: str,
+        recursive: bool = False,
+        absolute_paths: bool = True,
+        metadata: dict = {},
+    ) -> int:
         """Add the file content into the database"""
         self._ensure_initialized()

         if not Path(path).exists():
             raise FileNotFoundError(f"{path} does not exist.")

+        parent = Path(path).parent
+
         files_to_process = FileReader.collect_files(Path(path), recursive=recursive)

         self._logger.info(f"Processing {len(files_to_process)} files...")
         for file_path in files_to_process:
             # TODO: include metadata extraction and mdx options (see our docsearch)
             content = FileReader.parse_file(file_path)
-            document = Document(content=content, uri=str(file_path.absolute()))
-
+
+            uri = (
+                str(file_path.absolute())
+                if absolute_paths
+                else str(file_path.relative_to(parent))
+            )
+            document = Document(content=content, uri=uri, metadata=metadata)
+
             exists = self._repository.document_exists_by_hash(document.hash())
             if exists:
                 self._logger.info(f"Unchanged: {file_path}")
@@ -72,6 +86,7 @@ def add(self, path: str, recursive: bool = False) -> int:

             self._repository.add_document(document)

+            # TODO: when is it better to quantize? after each document?
             if self.settings.quantize_scan:
                 self._engine.quantize()
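
Note on the new absolute_paths flag: when it is false, each stored URI is made relative to the parent of the path given to add(), via the parent = Path(path).parent / relative_to(parent) logic above. A small standalone sketch of that pathlib behavior, assuming a hypothetical layout where docs/guide/intro.md was collected from add("docs", recursive=True):

from pathlib import Path

path = "docs"                             # argument passed to add()
file_path = Path("docs/guide/intro.md")   # one file collected by FileReader.collect_files
parent = Path(path).parent                # Path(".")

absolute_paths = False
uri = (
    str(file_path.absolute())
    if absolute_paths
    else str(file_path.relative_to(parent))
)
print(uri)  # docs/guide/intro.md -- relative to the parent of the path that was added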

tests/conftest.py

Lines changed: 14 additions & 1 deletion
@@ -3,7 +3,9 @@

 import pytest

+from sqlite_rag.chunker import Chunker
 from sqlite_rag.database import Database
+from sqlite_rag.engine import Engine
 from sqlite_rag.settings import Settings


@@ -17,7 +19,7 @@ def db_conn():

     conn = sqlite3.connect(settings.db_path)
     conn.row_factory = sqlite3.Row
-
+
     Database.initialize(conn, settings)

     yield conn, settings
@@ -33,3 +35,14 @@ def db_settings() -> Settings:
         db_path=tmp_db.name,
     )
     return settings
+
+
+@pytest.fixture
+def engine(db_conn):
+    conn, settings = db_conn
+
+    engine = Engine(conn, settings, chunker=Chunker(conn, settings))
+    engine.load_model()
+    engine.quantize()
+
+    return engine

tests/integration/test_engine.py

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+import random
+import string
+
+import pytest
+
+from sqlite_rag.models.chunk import Chunk
+
+
+class TestEngine:
+    def test_stress_embedding_generation(self, engine):
+        """Test embedding generation with a large number of chunks
+        to not fail and to never generate duplicated embeddings."""
+
+        def random_string(length=30):
+            return "".join(
+                random.choices(string.ascii_letters + string.digits + " ", k=length)
+            )
+
+        result_chunks = {}
+        for i in range(1000):
+            try:
+                chunk = engine.generate_embedding([Chunk(content=random_string())])
+                result_chunks[chunk[0].embedding.hex()] = chunk[0]
+                assert len(result_chunks) == i + 1
+            except Exception as e:
+                pytest.fail(f"Embedding generation failed on chunk {i}: {e}")
+
+        # Assert
+        assert len(result_chunks) == 1000

tests/test_engine.py

Lines changed: 0 additions & 13 deletions
@@ -1,23 +1,10 @@
-import pytest
-
 from sqlite_rag.chunker import Chunker
 from sqlite_rag.engine import Engine
 from sqlite_rag.models.chunk import Chunk
 from sqlite_rag.models.document import Document
 from sqlite_rag.repository import Repository


-@pytest.fixture
-def engine(db_conn):
-    conn, settings = db_conn
-
-    engine = Engine(conn, settings, chunker=Chunker(conn, settings))
-    engine.load_model()
-    engine.quantize()
-
-    return engine
-
-
 class TestEngine:
     def test_generate_embedding(self, engine):
         chunk = Chunk(content="This is a test chunk for embedding generation.")

tests/test_repository.py

Lines changed: 10 additions & 15 deletions
@@ -1,13 +1,8 @@
 import sqlite3
-import tempfile

-from h11 import Data
-
-from sqlite_rag.database import Database
 from sqlite_rag.models.chunk import Chunk
 from sqlite_rag.models.document import Document
 from sqlite_rag.repository import Repository
-from sqlite_rag.settings import Settings


 class TestRepository:
@@ -120,9 +115,9 @@ def test_find_document_by_id_or_uri_by_id(self, db_conn):

         # Add a document
         doc = Document(
-            content="Test document content.",
-            uri="test.txt",
-            metadata={"author": "test"}
+            content="Test document content.",
+            uri="test.txt",
+            metadata={"author": "test"},
         )
         doc_id = repo.add_document(doc)

@@ -141,9 +136,9 @@ def test_find_document_by_id_or_uri_by_uri(self, db_conn):

         # Add a document
         doc = Document(
-            content="Test document content.",
-            uri="test.txt",
-            metadata={"author": "test"}
+            content="Test document content.",
+            uri="test.txt",
+            metadata={"author": "test"},
         )
         repo.add_document(doc)

@@ -170,9 +165,9 @@ def test_remove_document_success(self, db_conn):

         # Add a document with chunks
         doc = Document(
-            content="Test document content.",
-            uri="test.txt",
-            metadata={"author": "test"}
+            content="Test document content.",
+            uri="test.txt",
+            metadata={"author": "test"},
         )
         doc.chunks = [
             Chunk(content="Chunk 1", embedding=b"\x00" * 384),
@@ -214,7 +209,7 @@ def test_document_exists_by_hash_exists(self, db_conn):
         doc = Document(
             content="Test document content.",
             uri="test.txt",
-            metadata={"author": "test"}
+            metadata={"author": "test"},
         )
         repo.add_document(doc)