Commit 148b6e8

Author: Daniele Briggi (committed)
feat(cli): add files or text with metadata
1 parent fc108c3 commit 148b6e8

File tree

9 files changed: +261 additions, -54 deletions

src/sqlite_rag/cli.py

Lines changed: 90 additions & 11 deletions
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+import json
 import shlex
 import sys
 from typing import Optional
@@ -31,17 +32,47 @@ def add(
     recursive: bool = typer.Option(
         False, "-r", "--recursive", help="Recursively add all files in directories"
     ),
+    absolute_paths: bool = typer.Option(
+        False,
+        "--absolute-paths",
+        help="Store absolute paths instead of relative paths",
+        is_flag=True,
+    ),
+    metadata: Optional[str] = typer.Option(
+        None,
+        "--metadata",
+        help="Optional metadata in JSON format to associate with the document",
+        metavar="JSON",
+        show_default=False,
+        prompt="Metadata (JSON format, e.g. {'author': 'John Doe', 'date': '2023-10-01'}'",
+    ),
 ):
     """Add a file path to the database"""
     rag = SQLiteRag()
-    rag.add(path, recursive=recursive)
+    rag.add(
+        path,
+        recursive=recursive,
+        absolute_paths=absolute_paths,
+        metadata=json.loads(metadata or "{}"),
+    )


 @app.command()
-def add_text(text: str, uri: Optional[str] = None):
+def add_text(
+    text: str,
+    uri: Optional[str] = None,
+    metadata: Optional[str] = typer.Option(
+        None,
+        "--metadata",
+        help="Optional metadata in JSON format to associate with the document",
+        metavar="JSON",
+        show_default=False,
+        prompt="Metadata (JSON format, e.g. {'author': 'John Doe', 'date': '2023-10-01'}'",
+    ),
+):
     """Add a text to the database"""
     rag = SQLiteRag()
-    rag.add_text(text, uri=uri, metadata={})
+    rag.add_text(text, uri=uri, metadata=json.loads(metadata or "{}"))


 @app.command("list")
@@ -86,7 +117,7 @@ def remove(
         raise typer.Exit(1)

     # Show document details
-    typer.echo(f"Found document:")
+    typer.echo("Found document:")
     typer.echo(f"ID: {document.id}")
     typer.echo(f"URI: {document.uri or 'N/A'}")
     typer.echo(
@@ -165,7 +196,11 @@ def reset(

 @app.command()
 def search(
-    query: str, limit: int = typer.Option(10, help="Number of results to return")
+    query: str,
+    limit: int = typer.Option(10, help="Number of results to return"),
+    debug: bool = typer.Option(
+        False, "-d", "--debug", help="Print extra debug information"
+    ),
 ):
     """Search for documents using hybrid vector + full-text search"""
     rag = SQLiteRag()
@@ -176,12 +211,56 @@ def search(
         return

     typer.echo(f"Found {len(results)} documents:")
-    typer.echo(f"{'Pos':<4} {'Preview':<60} {'URI':<50}")
-    typer.echo("-" * 116)
-    for idx, doc in enumerate(results, 1):
-        snippet = f"{doc.snippet[:57]!r}" + "..." if len(doc.snippet) > 60 else f"{doc.snippet!r}"
-        uri = doc.document.uri or "N/A"
-        typer.echo(f"{idx:<4} {snippet:<60} {uri:<50}")
+
+    if debug:
+        # Enhanced debug table with better formatting
+        typer.echo(
+            f"{'#':<3} {'Preview':<55} {'URI':<35} {'C.Rank':<33} {'V.Rank':<8} {'FTS.Rank':<9} {'V.Dist':<18} {'FTS.Score':<18}"
+        )
+        typer.echo("─" * 180)
+
+        for idx, doc in enumerate(results, 1):
+            # Clean snippet display
+            snippet = doc.snippet.replace("\n", " ").replace("\r", "")
+            if len(snippet) > 52:
+                snippet = snippet[:49] + "..."
+
+            # Clean URI display
+            uri = doc.document.uri or "N/A"
+            if len(uri) > 32:
+                uri = "..." + uri[-29:]
+
+            # Format debug values with proper precision
+            c_rank = (
+                f"{doc.combined_rank:.17f}" if doc.combined_rank is not None else "N/A"
+            )
+            v_rank = str(doc.vec_rank) if doc.vec_rank is not None else "N/A"
+            fts_rank = str(doc.fts_rank) if doc.fts_rank is not None else "N/A"
+            v_dist = (
+                f"{doc.vec_distance:.6f}" if doc.vec_distance is not None else "N/A"
+            )
+            fts_score = f"{doc.fts_score:.6f}" if doc.fts_score is not None else "N/A"
+
+            typer.echo(
+                f"{idx:<3} {snippet:<55} {uri:<35} {c_rank:<33} {v_rank:<8} {fts_rank:<9} {v_dist:<18} {fts_score:<18}"
+            )
+    else:
+        # Clean simple table for normal view
+        typer.echo(f"{'#':<3} {'Preview':<60} {'URI':<40}")
+        typer.echo("─" * 105)
+
+        for idx, doc in enumerate(results, 1):
+            # Clean snippet display
+            snippet = doc.snippet.replace("\n", " ").replace("\r", "")
+            if len(snippet) > 57:
+                snippet = snippet[:54] + "..."
+
+            # Clean URI display
+            uri = doc.document.uri or "N/A"
+            if len(uri) > 37:
+                uri = "..." + uri[-34:]
+
+            typer.echo(f"{idx:<3} {snippet:<60} {uri:<40}")


 def repl_mode():
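
Note on the parsing behavior: the new --metadata option in both add and add_text is decoded with json.loads(metadata or "{}"), so leaving the option empty yields an empty dict, while malformed JSON raises json.JSONDecodeError. A minimal standalone sketch (not part of the commit; parse_metadata is a hypothetical helper used only for illustration):

import json

def parse_metadata(raw: str | None) -> dict:
    # Mirrors the CLI behavior: a missing value becomes an empty metadata dict.
    return json.loads(raw or "{}")

print(parse_metadata(None))                      # {}
print(parse_metadata('{"author": "John Doe"}'))  # {'author': 'John Doe'}
# parse_metadata("{'author': 'John Doe'}") raises json.JSONDecodeError:
# JSON requires double quotes, unlike the single-quoted example in the prompt text above.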

src/sqlite_rag/engine.py

Lines changed: 5 additions & 4 deletions
@@ -43,24 +43,25 @@ def process(self, document: Document) -> Document:
         document.chunks = chunks
         return document

-    # TODO: better to get a list of str and return a list of embeddings?
     def generate_embedding(self, chunks: list[Chunk]) -> list[Chunk]:
         """Generate embedding for the given text."""
         cursor = self._conn.cursor()

         for chunk in chunks:
             try:
-                cursor.execute("SELECT llm_embed_generate(?) AS embedding", (chunk.content,))
+                cursor.execute(
+                    "SELECT llm_embed_generate(?) AS embedding", (chunk.content,)
+                )
             except sqlite3.Error as e:
                 print(f"Error generating embedding for chunk\n: ```{chunk.content}```")
                 raise e
-
+
             result = cursor.fetchone()

             if result is None:
                 raise RuntimeError("Failed to generate embedding.")

-            chunk.embedding = result['embedding']
+            chunk.embedding = result["embedding"]

         return chunks
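
The lookup result["embedding"] relies on the connection's sqlite3.Row row factory (set in _create_db_connection and in the test fixtures), which makes rows addressable by column name. A minimal sketch of that access pattern, using a literal blob in place of the llm_embed_generate SQL function provided by the loaded extension:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.row_factory = sqlite3.Row  # rows become addressable by column name

cursor = conn.cursor()
cursor.execute("SELECT x'00010203' AS embedding")  # stand-in for llm_embed_generate(?)
row = cursor.fetchone()

assert row["embedding"] == bytes([0, 1, 2, 3])  # same lookup as in generate_embedding()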

src/sqlite_rag/models/document.py

Lines changed: 0 additions & 6 deletions
@@ -17,12 +17,6 @@ class Document:

     chunks: list["Chunk"] = []

-    vec_rank: float | None = None
-    fts_rank: float | None = None
-    combined_rank: float | None = None
-    vec_distance: float | None = None
-    fts_score: float | None = None
-
     def hash(self) -> str:
         """Generate a hash for the document content"""
         return hashlib.blake2b(self.content.encode()).hexdigest()

src/sqlite_rag/sqliterag.py

Lines changed: 19 additions & 4 deletions
@@ -33,7 +33,7 @@ def __init__(self, settings: Optional[Settings] = None):
         self._engine = Engine(self._conn, settings, chunker=self._chunker)

         self.ready = False
-
+
     def _create_db_connection(self) -> sqlite3.Connection:
         conn = sqlite3.connect(self.settings.db_path)
         conn.row_factory = sqlite3.Row
@@ -47,21 +47,35 @@ def _ensure_initialized(self):

         self.ready = True

-    def add(self, path: str, recursive: bool = False) -> int:
+    def add(
+        self,
+        path: str,
+        recursive: bool = False,
+        absolute_paths: bool = True,
+        metadata: dict = {},
+    ) -> int:
         """Add the file content into the database"""
         self._ensure_initialized()

         if not Path(path).exists():
             raise FileNotFoundError(f"{path} does not exist.")

+        parent = Path(path).parent
+
         files_to_process = FileReader.collect_files(Path(path), recursive=recursive)

         self._logger.info(f"Processing {len(files_to_process)} files...")
         for file_path in files_to_process:
             # TODO: include metadata extraction and mdx options (see our docsearch)
             content = FileReader.parse_file(file_path)
-            document = Document(content=content, uri=str(file_path.absolute()))
-
+
+            uri = (
+                str(file_path.absolute())
+                if absolute_paths
+                else str(file_path.relative_to(parent))
+            )
+            document = Document(content=content, uri=uri, metadata=metadata)
+
             exists = self._repository.document_exists_by_hash(document.hash())
             if exists:
                 self._logger.info(f"Unchanged: {file_path}")
@@ -72,6 +86,7 @@ def add(self, path: str, recursive: bool = False) -> int:

             self._repository.add_document(document)

+            # TODO: when is it better to quantize? after each document?
             if self.settings.quantize_scan:
                 self._engine.quantize()
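
Note on the new absolute_paths flag: when it is false, each stored URI is made relative to the parent of the path given to add(), via the parent = Path(path).parent / relative_to(parent) logic above. A small standalone sketch of that pathlib behavior, assuming a hypothetical layout where docs/guide/intro.md was collected from add("docs", recursive=True):

from pathlib import Path

path = "docs"                             # argument passed to add()
file_path = Path("docs/guide/intro.md")   # one file collected by FileReader.collect_files
parent = Path(path).parent                # Path(".")

absolute_paths = False
uri = (
    str(file_path.absolute())
    if absolute_paths
    else str(file_path.relative_to(parent))
)
print(uri)  # docs/guide/intro.md -- relative to the parent of the path that was added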

tests/conftest.py

Lines changed: 14 additions & 1 deletion
@@ -3,7 +3,9 @@

 import pytest

+from sqlite_rag.chunker import Chunker
 from sqlite_rag.database import Database
+from sqlite_rag.engine import Engine
 from sqlite_rag.settings import Settings


@@ -17,7 +19,7 @@ def db_conn():

     conn = sqlite3.connect(settings.db_path)
     conn.row_factory = sqlite3.Row
-
+
     Database.initialize(conn, settings)

     yield conn, settings
@@ -33,3 +35,14 @@ def db_settings() -> Settings:
         db_path=tmp_db.name,
     )
     return settings
+
+
+@pytest.fixture
+def engine(db_conn):
+    conn, settings = db_conn
+
+    engine = Engine(conn, settings, chunker=Chunker(conn, settings))
+    engine.load_model()
+    engine.quantize()
+
+    return engine

tests/integration/test_engine.py

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+import random
+import string
+
+import pytest
+
+from sqlite_rag.models.chunk import Chunk
+
+
+class TestEngine:
+    def test_stress_embedding_generation(self, engine):
+        """Test embedding generation with a large number of chunks
+        to not fail and to never generate duplicated embeddings."""
+
+        def random_string(length=30):
+            return "".join(
+                random.choices(string.ascii_letters + string.digits + " ", k=length)
+            )
+
+        result_chunks = {}
+        for i in range(1000):
+            try:
+                chunk = engine.generate_embedding([Chunk(content=random_string())])
+                result_chunks[chunk[0].embedding.hex()] = chunk[0]
+                assert len(result_chunks) == i + 1
+            except Exception as e:
+                pytest.fail(f"Embedding generation failed on chunk {i}: {e}")
+
+        # Assert
+        assert len(result_chunks) == 1000

tests/test_engine.py

Lines changed: 0 additions & 13 deletions
@@ -1,23 +1,10 @@
-import pytest
-
 from sqlite_rag.chunker import Chunker
 from sqlite_rag.engine import Engine
 from sqlite_rag.models.chunk import Chunk
 from sqlite_rag.models.document import Document
 from sqlite_rag.repository import Repository


-@pytest.fixture
-def engine(db_conn):
-    conn, settings = db_conn
-
-    engine = Engine(conn, settings, chunker=Chunker(conn, settings))
-    engine.load_model()
-    engine.quantize()
-
-    return engine
-
-
 class TestEngine:
     def test_generate_embedding(self, engine):
         chunk = Chunk(content="This is a test chunk for embedding generation.")

tests/test_repository.py

Lines changed: 10 additions & 15 deletions
@@ -1,13 +1,8 @@
 import sqlite3
-import tempfile

-from h11 import Data
-
-from sqlite_rag.database import Database
 from sqlite_rag.models.chunk import Chunk
 from sqlite_rag.models.document import Document
 from sqlite_rag.repository import Repository
-from sqlite_rag.settings import Settings


 class TestRepository:
@@ -120,9 +115,9 @@ def test_find_document_by_id_or_uri_by_id(self, db_conn):

         # Add a document
         doc = Document(
-            content="Test document content.",
-            uri="test.txt",
-            metadata={"author": "test"}
+            content="Test document content.",
+            uri="test.txt",
+            metadata={"author": "test"},
         )
         doc_id = repo.add_document(doc)

@@ -141,9 +136,9 @@ def test_find_document_by_id_or_uri_by_uri(self, db_conn):

         # Add a document
         doc = Document(
-            content="Test document content.",
-            uri="test.txt",
-            metadata={"author": "test"}
+            content="Test document content.",
+            uri="test.txt",
+            metadata={"author": "test"},
         )
         repo.add_document(doc)

@@ -170,9 +165,9 @@ def test_remove_document_success(self, db_conn):

         # Add a document with chunks
         doc = Document(
-            content="Test document content.",
-            uri="test.txt",
-            metadata={"author": "test"}
+            content="Test document content.",
+            uri="test.txt",
+            metadata={"author": "test"},
         )
         doc.chunks = [
             Chunk(content="Chunk 1", embedding=b"\x00" * 384),
@@ -214,7 +209,7 @@ def test_document_exists_by_hash_exists(self, db_conn):
         doc = Document(
             content="Test document content.",
             uri="test.txt",
-            metadata={"author": "test"}
+            metadata={"author": "test"},
         )
         repo.add_document(doc)