Skip to content

Commit 1e24f17

Browse files
Merge pull request #5 from sqliteai/general-fixes
General fixes
2 parents 71bed6b + 90dff87 commit 1e24f17

File tree

10 files changed

+76
-20
lines changed

10 files changed

+76
-20
lines changed

.github/workflows/pypi-package.yaml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@ on:
77
description: "Version to use for the Python package (e.g. 0.1.0)"
88
required: true
99
type: string
10+
test-pypi:
11+
description: "Publish to Test PyPI"
12+
required: false
13+
type: boolean
14+
default: false
1015
release:
1116
types: [published]
1217

@@ -55,4 +60,4 @@ jobs:
5560
# Prevent the workflow from failing if the version has already been published
5661
skip-existing: true
5762
# Upload to Test PyPI for testing
58-
#repository-url: https://test.pypi.org/legacy/
63+
repository-url: ${{ github.event.inputs.test-pypi == 'true' && 'https://test.pypi.org/legacy/' || '' }}

README.md

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,13 @@
22

33
# SQLite RAG
44

5-
[![Run Tests](https://github.com/sqliteai/sqlite-rag/actions/workflows/test.yaml/badge.svg?branch=main&event=release)](https://github.com/sqliteai/sqlite-rag/actions/workflows/test.yaml)
5+
[![Run Tests](https://github.com/sqliteai/sqlite-rag/actions/workflows/test.yaml/badge.svg)](https://github.com/sqliteai/sqlite-rag/actions/workflows/test.yaml)
66
[![codecov](https://codecov.io/github/sqliteai/sqlite-rag/graph/badge.svg?token=30KYPY7864)](https://codecov.io/github/sqliteai/sqlite-rag)
77
![PyPI - Version](https://img.shields.io/pypi/v/sqlite-rag?link=https%3A%2F%2Fpypi.org%2Fproject%2Fsqlite-rag%2F)
88
![PyPI - Python Version](https://img.shields.io/pypi/pyversions/sqlite-rag?link=https%3A%2F%2Fpypi.org%2Fproject%2Fsqlite-rag)
99

10-
A hybrid search engine built on SQLite with [SQLite AI](https://github.com/sqliteai/sqlite-ai) and [SQLite Vector](https://github.com/sqliteai/sqlite-vector) extensions. SQLite RAG combines vector similarity search with full-text search ([FTS5](https://www.sqlite.org/fts5.html) extension) using Reciprocal Rank Fusion (RRF) for enhanced document retrieval.
10+
A hybrid search engine built on SQLite with [SQLite AI](https://github.com/sqliteai/sqlite-ai) and [SQLite Vector](https://github.com/sqliteai/sqlite-vector) extensions.
11+
SQLite RAG combines vector similarity search with full-text search ([FTS5](https://www.sqlite.org/fts5.html) extension) using Reciprocal Rank Fusion (RRF) for enhanced document retrieval.
1112

1213
## Features
1314

@@ -20,6 +21,13 @@ A hybrid search engine built on SQLite with [SQLite AI](https://github.com/sqlit
2021

2122
## Installation
2223

24+
### Prerequisites
25+
26+
SQLite RAG requires SQLite with _extension loading_ support.
27+
If you encounter extension loading issues (e.g., `'sqlite3.Connection' object has no attribute 'enable_load_extension'`), follow the setup guides for [macOS](https://github.com/sqliteai/sqlite-extensions-guide/blob/main/platforms/macos.md#python-on-macos) or [Windows](https://github.com/sqliteai/sqlite-extensions-guide/blob/main/platforms/windows.md#using-sqlite-with-python).
28+
29+
### Install SQLite RAG
30+
2331
```bash
2432
python3 -m venv .venv
2533
source .venv/bin/activate # On Windows: .venv\Scripts\activate

pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ classifiers = [
1717
"Operating System :: OS Independent",
1818
]
1919
dependencies = [
20-
"attrs",
2120
"typer",
2221
"huggingface_hub[hf_transfer]",
2322
"markitdown[docx]",

src/sqlite_rag/models/chunk.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from attr import dataclass
1+
from dataclasses import dataclass
22

33

44
@dataclass

src/sqlite_rag/models/document.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
import hashlib
22
import re
3+
from dataclasses import dataclass, field
34
from datetime import datetime
45
from typing import Optional
56

6-
from attr import dataclass
7-
87
from .chunk import Chunk
98

109

@@ -15,11 +14,11 @@ class Document:
1514
id: str | None = None
1615
content: str = ""
1716
uri: str | None = None
18-
metadata: dict = {}
17+
metadata: dict = field(default_factory=dict)
1918
created_at: datetime | None = None
2019
updated_at: datetime | None = None
2120

22-
chunks: list["Chunk"] = []
21+
chunks: list["Chunk"] = field(default_factory=list)
2322

2423
def hash(self) -> str:
2524
"""Generate a hash for the document content using SHA-3 for maximum collision resistance"""
@@ -55,11 +54,11 @@ def extract_document_title(self, fallback_first_line: bool = False) -> str | Non
5554
if match:
5655
return match.group(1).strip()
5756

58-
# Fallback: first non-empty line
57+
# Fallback: first non-empty line containing at least one word character
5958
if fallback_first_line:
6059
for line in self.content.splitlines():
6160
line = line.strip()
62-
if line:
61+
if line and re.search(r"\w", line):
6362
return line[: self.GENERATED_TITLE_MAX_CHARS]
6463

6564
return None

src/sqlite_rag/models/document_result.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from attr import dataclass
1+
from dataclasses import dataclass
22

33
from .document import Document
44

src/sqlite_rag/sqliterag.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ def add(
103103
if use_relative_paths
104104
else str(file_path.absolute())
105105
)
106-
document = Document(content=content, uri=uri, metadata=metadata)
106+
document = Document(content=content, uri=uri, metadata=metadata.copy())
107107

108108
exists = self._repository.document_exists_by_hash(document.hash())
109109
if exists:
@@ -132,7 +132,7 @@ def add_text(
132132
"""Add a text content into the database"""
133133
self._ensure_initialized()
134134

135-
document = Document(content=text, uri=uri, metadata=metadata)
135+
document = Document(content=text, uri=uri, metadata=metadata.copy())
136136

137137
self._engine.create_new_context()
138138
document = self._engine.process(document)

tests/integration/test_engine.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33

44
import pytest
55

6-
from sqlite_rag.models.chunk import Chunk
7-
86

97
class TestEngine:
108
@pytest.mark.slow
@@ -20,8 +18,8 @@ def random_string(length=30):
2018
result_chunks = {}
2119
for i in range(1000):
2220
try:
23-
chunk = engine.generate_embeddings([Chunk(content=random_string())])
24-
result_chunks[chunk[0].embedding.hex()] = chunk[0]
21+
embedding = engine.generate_embedding(random_string())
22+
result_chunks[embedding.hex()] = embedding
2523
assert len(result_chunks) == i + 1
2624
except Exception as e:
2725
pytest.fail(f"Embedding generation failed on chunk {i}: {e}")

tests/models/test_document.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,11 @@ def test_extract_document_title_without_heading(
4545
assert (
4646
doc.extract_document_title(fallback_first_line=fallback) == expected_title
4747
)
48+
49+
def test_extract_document_title_with_a_word(self):
50+
content = "---\n \n Leading spaces line with a word."
51+
doc = Document(content=content, metadata={})
52+
assert (
53+
doc.extract_document_title(fallback_first_line=True)
54+
== "Leading spaces line with a word."
55+
)

tests/test_sqlite_rag.py

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,39 @@ def test_add_file_with_metadata(self):
139139
doc = cursor.fetchone()
140140
assert doc
141141
assert doc[0] == "This is a test document with metadata."
142-
assert doc[1] == json.dumps(metadata)
142+
assert doc[1] == json.dumps(
143+
{
144+
**metadata,
145+
"generated": {"title": "This is a test document with metadata."},
146+
}
147+
)
148+
149+
def test_add_documents_with_generated_title(self):
150+
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as doc1:
151+
doc1.write("# Title 1\nThis is the first test document.")
152+
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as doc2:
153+
doc2.write("# Title 2\nThis is the second test document.")
154+
155+
doc3 = "# Title 3\nThis is the third test document."
156+
doc4 = "# Title 4\nThis is the fourth test document."
157+
158+
rag = SQLiteRag.create(db_path=":memory:")
159+
160+
rag.add(doc1.name)
161+
rag.add(doc2.name)
162+
rag.add_text(doc3)
163+
rag.add_text(doc4)
164+
165+
conn = rag._conn
166+
cursor = conn.execute("SELECT metadata FROM documents")
167+
docs = cursor.fetchall()
168+
assert len(docs) == 4
169+
170+
titles = [json.loads(doc[0]).get("generated", {}).get("title") for doc in docs]
171+
assert "Title 1" in titles
172+
assert "Title 2" in titles
173+
assert "Title 3" in titles
174+
assert "Title 4" in titles
143175

144176
def test_add_empty_file(self):
145177
with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
@@ -229,7 +261,14 @@ def test_add_text_with_metadata(self):
229261
assert doc
230262
assert doc[0] == "This is a test document content with metadata."
231263
assert doc[1] == "test_doc_with_metadata.txt"
232-
assert doc[2] == json.dumps(metadata)
264+
assert doc[2] == json.dumps(
265+
{
266+
**metadata,
267+
"generated": {
268+
"title": "This is a test document content with metadata."
269+
},
270+
}
271+
)
233272

234273
def test_list_documents(self):
235274
rag = SQLiteRag.create(":memory:")

0 commit comments

Comments
 (0)