Merge pull request #5 from sqliteai/general-fixes

danielebriggi · web-flow · commit 1e24f173a694 · 2025-09-27T17:02:42.000+02:00
General fixes
diff --git a/.github/workflows/pypi-package.yaml b/.github/workflows/pypi-package.yaml
@@ -7,6 +7,11 @@ on:
         description: "Version to use for the Python package (e.g. 0.1.0)"
         required: true
         type: string
+      test-pypi:
+        description: "Publish to Test PyPI"
+        required: false
+        type: boolean
+        default: false
   release:
     types: [published]
 
@@ -55,4 +60,4 @@ jobs:
           # Prevent the workflow from failing if the version has already been published
           skip-existing: true
           # Upload to Test PyPI for testing
-          #repository-url: https://test.pypi.org/legacy/
+          repository-url: ${{ github.event.inputs.test-pypi == 'true' && 'https://test.pypi.org/legacy/' || '' }}
diff --git a/README.md b/README.md
@@ -2,12 +2,13 @@
 
 # SQLite RAG
 
-[![Run Tests](https://github.com/sqliteai/sqlite-rag/actions/workflows/test.yaml/badge.svg?branch=main&event=release)](https://github.com/sqliteai/sqlite-rag/actions/workflows/test.yaml)
+[![Run Tests](https://github.com/sqliteai/sqlite-rag/actions/workflows/test.yaml/badge.svg)](https://github.com/sqliteai/sqlite-rag/actions/workflows/test.yaml)
 [![codecov](https://codecov.io/github/sqliteai/sqlite-rag/graph/badge.svg?token=30KYPY7864)](https://codecov.io/github/sqliteai/sqlite-rag)
 ![PyPI - Version](https://img.shields.io/pypi/v/sqlite-rag?link=https%3A%2F%2Fpypi.org%2Fproject%2Fsqlite-rag%2F)
 ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/sqlite-rag?link=https%3A%2F%2Fpypi.org%2Fproject%2Fsqlite-rag)
 
-A hybrid search engine built on SQLite with [SQLite AI](https://github.com/sqliteai/sqlite-ai) and [SQLite Vector](https://github.com/sqliteai/sqlite-vector) extensions. SQLite RAG combines vector similarity search with full-text search ([FTS5](https://www.sqlite.org/fts5.html) extension) using Reciprocal Rank Fusion (RRF) for enhanced document retrieval.
+A hybrid search engine built on SQLite with [SQLite AI](https://github.com/sqliteai/sqlite-ai) and [SQLite Vector](https://github.com/sqliteai/sqlite-vector) extensions.
+SQLite RAG combines vector similarity search with full-text search ([FTS5](https://www.sqlite.org/fts5.html) extension) using Reciprocal Rank Fusion (RRF) for enhanced document retrieval.
 
 ## Features
 
@@ -20,6 +21,13 @@ A hybrid search engine built on SQLite with [SQLite AI](https://github.com/sqlit
 
 ## Installation
 
+### Prerequisites
+
+SQLite RAG requires SQLite with _extension loading_ support.
+If you encounter extension loading issues (e.g., `'sqlite3.Connection' object has no attribute 'enable_load_extension'`), follow the setup guides for [macOS](https://github.com/sqliteai/sqlite-extensions-guide/blob/main/platforms/macos.md#python-on-macos) or [Windows](https://github.com/sqliteai/sqlite-extensions-guide/blob/main/platforms/windows.md#using-sqlite-with-python).
+
+### Install SQLite RAG
+
 ```bash
 python3 -m venv .venv
 source .venv/bin/activate  # On Windows: .venv\Scripts\activate
diff --git a/pyproject.toml b/pyproject.toml
@@ -17,7 +17,6 @@ classifiers = [
     "Operating System :: OS Independent",
 ]
 dependencies = [
-    "attrs",
     "typer",
     "huggingface_hub[hf_transfer]",
     "markitdown[docx]",
diff --git a/src/sqlite_rag/models/chunk.py b/src/sqlite_rag/models/chunk.py
@@ -1,4 +1,4 @@
-from attr import dataclass
+from dataclasses import dataclass
 
 
 @dataclass
diff --git a/src/sqlite_rag/models/document.py b/src/sqlite_rag/models/document.py
@@ -1,10 +1,9 @@
 import hashlib
 import re
+from dataclasses import dataclass, field
 from datetime import datetime
 from typing import Optional
 
-from attr import dataclass
-
 from .chunk import Chunk
 
 
@@ -15,11 +14,11 @@ class Document:
     id: str | None = None
     content: str = ""
     uri: str | None = None
-    metadata: dict = {}
+    metadata: dict = field(default_factory=dict)
     created_at: datetime | None = None
     updated_at: datetime | None = None
 
-    chunks: list["Chunk"] = []
+    chunks: list["Chunk"] = field(default_factory=list)
 
     def hash(self) -> str:
         """Generate a hash for the document content using SHA-3 for maximum collision resistance"""
@@ -55,11 +54,11 @@ def extract_document_title(self, fallback_first_line: bool = False) -> str | Non
         if match:
             return match.group(1).strip()
 
-        # Fallback: first non-empty line
+        # Fallback: first non-empty line containing at least one word character
         if fallback_first_line:
             for line in self.content.splitlines():
                 line = line.strip()
-                if line:
+                if line and re.search(r"\w", line):
                     return line[: self.GENERATED_TITLE_MAX_CHARS]
 
         return None
diff --git a/src/sqlite_rag/models/document_result.py b/src/sqlite_rag/models/document_result.py
@@ -1,4 +1,4 @@
-from attr import dataclass
+from dataclasses import dataclass
 
 from .document import Document
 
diff --git a/src/sqlite_rag/sqliterag.py b/src/sqlite_rag/sqliterag.py
@@ -103,7 +103,7 @@ def add(
                     if use_relative_paths
                     else str(file_path.absolute())
                 )
-                document = Document(content=content, uri=uri, metadata=metadata)
+                document = Document(content=content, uri=uri, metadata=metadata.copy())
 
                 exists = self._repository.document_exists_by_hash(document.hash())
                 if exists:
@@ -132,7 +132,7 @@ def add_text(
         """Add a text content into the database"""
         self._ensure_initialized()
 
-        document = Document(content=text, uri=uri, metadata=metadata)
+        document = Document(content=text, uri=uri, metadata=metadata.copy())
 
         self._engine.create_new_context()
         document = self._engine.process(document)
diff --git a/tests/integration/test_engine.py b/tests/integration/test_engine.py
@@ -3,8 +3,6 @@
 
 import pytest
 
-from sqlite_rag.models.chunk import Chunk
-
 
 class TestEngine:
     @pytest.mark.slow
@@ -20,8 +18,8 @@ def random_string(length=30):
         result_chunks = {}
         for i in range(1000):
             try:
-                chunk = engine.generate_embeddings([Chunk(content=random_string())])
-                result_chunks[chunk[0].embedding.hex()] = chunk[0]
+                embedding = engine.generate_embedding(random_string())
+                result_chunks[embedding.hex()] = embedding
                 assert len(result_chunks) == i + 1
             except Exception as e:
                 pytest.fail(f"Embedding generation failed on chunk {i}: {e}")
diff --git a/tests/models/test_document.py b/tests/models/test_document.py
@@ -45,3 +45,11 @@ def test_extract_document_title_without_heading(
         assert (
             doc.extract_document_title(fallback_first_line=fallback) == expected_title
         )
+
+    def test_extract_document_title_with_a_word(self):
+        content = "---\n    \n  Leading spaces line with a word."
+        doc = Document(content=content, metadata={})
+        assert (
+            doc.extract_document_title(fallback_first_line=True)
+            == "Leading spaces line with a word."
+        )
diff --git a/tests/test_sqlite_rag.py b/tests/test_sqlite_rag.py
@@ -139,7 +139,39 @@ def test_add_file_with_metadata(self):
         doc = cursor.fetchone()
         assert doc
         assert doc[0] == "This is a test document with metadata."
-        assert doc[1] == json.dumps(metadata)
+        assert doc[1] == json.dumps(
+            {
+                **metadata,
+                "generated": {"title": "This is a test document with metadata."},
+            }
+        )
+
+    def test_add_documents_with_generated_title(self):
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as doc1:
+            doc1.write("# Title 1\nThis is the first test document.")
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as doc2:
+            doc2.write("# Title 2\nThis is the second test document.")
+
+        doc3 = "# Title 3\nThis is the third test document."
+        doc4 = "# Title 4\nThis is the fourth test document."
+
+        rag = SQLiteRag.create(db_path=":memory:")
+
+        rag.add(doc1.name)
+        rag.add(doc2.name)
+        rag.add_text(doc3)
+        rag.add_text(doc4)
+
+        conn = rag._conn
+        cursor = conn.execute("SELECT metadata FROM documents")
+        docs = cursor.fetchall()
+        assert len(docs) == 4
+
+        titles = [json.loads(doc[0]).get("generated", {}).get("title") for doc in docs]
+        assert "Title 1" in titles
+        assert "Title 2" in titles
+        assert "Title 3" in titles
+        assert "Title 4" in titles
 
     def test_add_empty_file(self):
         with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
@@ -229,7 +261,14 @@ def test_add_text_with_metadata(self):
         assert doc
         assert doc[0] == "This is a test document content with metadata."
         assert doc[1] == "test_doc_with_metadata.txt"
-        assert doc[2] == json.dumps(metadata)
+        assert doc[2] == json.dumps(
+            {
+                **metadata,
+                "generated": {
+                    "title": "This is a test document content with metadata."
+                },
+            }
+        )
 
     def test_list_documents(self):
         rag = SQLiteRag.create(":memory:")

Original file line number	Diff line number	Diff line change
`@@ -17,7 +17,6 @@ classifiers = [`
`17`	`17`	`"Operating System :: OS Independent",`
`18`	`18`	`]`
`19`	`19`	`dependencies = [`
`20`		`- "attrs",`
`21`	`20`	`"typer",`
`22`	`21`	`"huggingface_hub[hf_transfer]",`
`23`	`22`	`"markitdown[docx]",`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-from attr import dataclass`
	`1`	`+from dataclasses import dataclass`
`2`	`2`
`3`	`3`
`4`	`4`	`@dataclass`