Skip to content

Commit 2b120a1

Browse files
author
Daniele Briggi
committed
feat(chat): experiments
1 parent 8bd55db commit 2b120a1

File tree

7 files changed

+159
-9
lines changed

7 files changed

+159
-9
lines changed

documentation_ai.sqlite

15.6 MB
Binary file not shown.

src/sqlite_rag/cli.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -468,6 +468,24 @@ def search(
468468
typer.echo(f"{search_time:.3f} seconds")
469469

470470

471+
@app.command()
def ask(
    ctx: typer.Context,
    question: str,
):
    """Ask a question and get an answer using the LLM"""
    # The shared RAG context is stashed on the typer context by the app callback.
    rag_context = ctx.obj["rag_context"]

    # Time the whole round trip, including opening the existing RAG store.
    started = time.time()
    rag = rag_context.get_rag(require_existing=True)
    answer = rag.ask(question)
    elapsed = time.time() - started

    typer.echo(answer)
    typer.echo(f"{elapsed:.3f} seconds")
487+
488+
471489
@app.command()
472490
def quantize(
473491
ctx: typer.Context,

src/sqlite_rag/engine.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,79 @@ def search_sentences(
316316

317317
return sentences[:top_k]
318318

319+
def create_new_chat(self) -> None:
    """Create a new LLM chat context with empty history.

    NOTE(review): currently a no-op stub — the context/chat creation calls
    below are commented out, and `ask()` performs the equivalent
    `llm_context_create` / `llm_chat_create` calls inline instead. Confirm
    whether this method should own that lifecycle before relying on it.
    """
    # self._conn.execute(
    #     "SELECT llm_context_create(?);", (self._settings.other_gen_context_options,)
    # )
    # self._conn.execute("SELECT llm_chat_create();")
325+
326+
def ask(self, query: str) -> str:
    """Generate an answer to the query using the LLM.

    Retrieves the top matching documents, builds a grounded prompt from
    their content, loads the generation model, context, chat and sampler
    chain via the sqlite-ai extension, and returns the chat response.

    Args:
        query: The user question.

    Returns:
        The LLM response text.

    NOTE(review): the `print(...)` calls are debug output from the
    experimental phase; consider replacing them with `logging` later.
    """
    # Retrieve a wider candidate set, then keep only the best 3 for the prompt.
    results = self.search(query, top_k=10)
    results = results[:3]

    context = ""
    for result in results:
        # if result.combined_rank < 0.3:
        # Debug: show ranking details for each document used as context.
        print(
            f"doc uri: {result.document.uri}, vector: {result.vec_distance}, fts: {result.fts_score}, score: {result.combined_rank}"
        )
        # Truncate each document to its first 5000 chars and escape newlines
        # so the prompt stays compact.
        preview = result.document.content[:5000].replace("\n", "\\n")
        context += f"{preview}\n\n"

    # Fall back to the raw query when retrieval produced no context.
    prompt = query
    if context != "":
        # prompt = f"""You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
        prompt = f"""Answer the question based only on the following documents.
Answer with the summary of the documents provided.
Do **NOT** include any introductory phrases, titles, or prefixes such as "Answer:", "The answer is:", "Final Answer:", or "Based on the context,". Start your response with the answer itself:

{context}

{query}
"""

    # Debug: dump the final prompt and its token count.
    print("---\n", prompt)
    print(
        "token count:",
        self._conn.execute(
            "SELECT llm_token_count(?) AS token_count;", (prompt,)
        ).fetchone()["token_count"],
    )

    # Load the generation model and create a fresh context + chat for this
    # call (see also the currently stubbed create_new_chat()).
    self._conn.execute(
        "SELECT llm_model_load(?, ?);",
        (self._settings.gen_model_path, self._settings.other_gen_model_options),
    )
    self._conn.execute(
        "SELECT llm_context_create(?);", (self._settings.other_gen_context_options,)
    )
    self._conn.execute("SELECT llm_chat_create();")

    # Configure the sampler chain. executescript() discards result rows;
    # only the extension-side side effects matter here.
    self._conn.executescript(
        """
        SELECT llm_sampler_init_temp(1.0);
        SELECT llm_sampler_init_top_k(64);
        SELECT llm_sampler_init_top_p(0.95, 1);
        SELECT llm_sampler_init_min_p(0.0, 1);
        SELECT llm_sampler_init_dist(-1);
        SELECT llm_sampler_init_penalties(1024, 1.1, 0.0, 0.0);
        """
    )

    r = self._conn.execute("SELECT llm_chat_respond(?) AS response;", (prompt,))

    response = r.fetchone()[0]
    # Debug: report the response token count.
    print(
        "token count:",
        self._conn.execute(
            "SELECT llm_token_count(?) AS token_count;", (response,)
        ).fetchone()["token_count"],
    )

    return response
391+
319392
def versions(self) -> dict:
320393
"""Get versions of the loaded extensions."""
321394
cursor = self._conn.cursor()

src/sqlite_rag/settings.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,29 @@ class Settings:
7676
# Zero means no limit
max_chunks_per_document: int = 1000
# Number of top sentences to return per document
top_k_sentences: int = 10

#
# Text generation
#

# Smaller alternative model kept for quick experiments:
# gen_model_path: str = (
#     "./models/unsloth/gemma-3-270m-it-GGUF/gemma-3-270m-it-Q8_0.gguf"
# )
# Path to the GGUF model used for answer generation.
gen_model_path: str = "./models/unsloth/gemma-3-1b-it-GGUF/gemma-3-1b-it-Q8_0.gguf"

# Extra options passed to llm_model_load().
# See: https://github.com/sqliteai/sqlite-ai/blob/main/API.md#llm_model_loadpath-text-options-text
other_gen_model_options: str = ""
# Extra options passed to llm_context_create().
# NOTE(review): this string repeats context_size/max_tokens/n_predict with
# values that differ from the fields below, so
# get_generation_context_options() emits each key twice — confirm which set
# the extension honors and drop the other.
# See: https://github.com/sqliteai/sqlite-ai/blob/main/API.md#llm_context_createoptions-text
other_gen_context_options: str = (
    "n_ctx=6000,context_size=6000,max_tokens=3000,n_threads=8,n_predict=800"
)

# Context window size for generation — presumably in tokens; TODO confirm.
context_size: int = 2048
# Max input tokens to the model for generation
max_tokens: int = 2048
# Max number of tokens to generate per response.
n_predict: int = 400
80102

81103
def get_embeddings_context_options(self) -> str:
82104
"""Get the context options for embeddings generation."""
@@ -94,6 +116,20 @@ def get_embeddings_context_options(self) -> str:
94116
else ""
95117
)
96118

119+
def get_generation_context_options(self) -> str:
    """Build the comma-separated option string for text-generation contexts.

    Combines the dedicated generation fields with any free-form extras from
    ``other_gen_context_options`` (appended last, when non-empty).
    """
    parts = [
        f"context_size={self.context_size}",
        f"max_tokens={self.max_tokens}",
        f"n_predict={self.n_predict}",
    ]
    if self.other_gen_context_options:
        parts.append(self.other_gen_context_options)
    return ",".join(parts)
132+
97133
def get_vector_init_options(self) -> str:
98134
"""Get the vector init options for the vector store."""
99135
options = {"type": self.vector_type, "dimension": self.embedding_dim}

src/sqlite_rag/sqliterag.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,21 @@ def search(
318318

319319
return self._engine.search(query, top_k=top_k)
320320

321+
def ask(self, question: str, new_context: bool = True) -> str:
    """Generate an answer to the question using the LLM.

    Args:
        question: The question string
        new_context: Whether to create a new LLM context for this question

    Returns:
        The generated answer text from the engine.

    NOTE(review): `create_new_context` is not visible on the engine in this
    change (only `create_new_chat` is, and it is currently a stub) — confirm
    the method exists, otherwise this raises AttributeError whenever
    `new_context` is True (the default).
    """
    self._ensure_initialized()
    if new_context:
        self._engine.create_new_context()

    self._engine.create_new_chat()

    return self._engine.ask(question)
335+
321336
def get_settings(self) -> dict:
322337
"""Get settings and more useful information"""
323338
versions = self._engine.versions()

tests/conftest.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import sqlite3
2-
import tempfile
32
from collections.abc import Generator
43

54
import pytest
@@ -13,17 +12,17 @@
1312

1413
@pytest.fixture
def db_conn():
    """Yield an initialized (connection, settings) pair on a temporary database.

    Uses a throwaway temp file instead of the checked-in
    ``./documentation_ai.sqlite`` so tests stay isolated, are repeatable, and
    never mutate a committed fixture artifact; the file is deleted when the
    context manager exits.
    """
    # Local import so this fixture is self-contained.
    import tempfile

    with tempfile.NamedTemporaryFile(suffix=".db") as tmp_db:
        settings = Settings()

        conn = sqlite3.connect(tmp_db.name)
        # Row factory lets tests access columns by name.
        conn.row_factory = sqlite3.Row

        Database.initialize(conn, settings)

        yield conn, settings

        conn.close()
2726

2827

2928
@pytest.fixture

tests/integration/test_engine.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,3 +320,12 @@ def test_search_sentences(self, db_conn):
320320
assert len(results) > 0
321321
assert results[0].start_offset == 61 # it's the second sentence
322322
assert results[0].end_offset == 89
323+
324+
325+
class TestEngineAsk:
    """Integration tests for Engine.ask (end-to-end LLM answer generation)."""

    def test_ask(self, engine: Engine):
        """Smoke-test that ask() produces a string answer.

        NOTE(review): depends on the `engine` fixture pointing at a database
        that already contains relevant documents and on the configured GGUF
        model being available locally — this is an integration/experiment
        test, not a unit test.
        """
        engine.create_new_chat()

        result = engine.ask("what's the difference between offsync and sqlite sync?")
        assert isinstance(result, str)
        # Manual inspection aid while the feature is experimental.
        print(result)

0 commit comments

Comments
 (0)