feat(prompts): add support for model-specific prompt templates

Daniele Briggi · Daniele Briggi · commit 10f9d7e86b56 · 2025-09-17T14:15:59.000Z
diff --git a/model_evaluation/configs/gemma_300M_Q8_650rows.json b/model_evaluation/configs/gemma_300M_Q8_650rows.json
@@ -3,7 +3,7 @@
   "rag_settings": {
     "chunk_size": 1000,
     "chunk_overlap": 0,
-    "model_path_or_name": "./../models/unsloth/embeddinggemma-300m-GGUF/embeddinggemma-300M-Q8_0.gguf",
+    "model_path": "./../models/unsloth/embeddinggemma-300m-GGUF/embeddinggemma-300M-Q8_0.gguf",
     "model_options": "",
     "model_context_options": "generate_embedding=1,normalize_embedding=1,pooling_type=mean,embedding_type=INT8",
     "vector_type": "INT8",
diff --git a/model_evaluation/configs/qwen3_Q8_650rows.json b/model_evaluation/configs/qwen3_Q8_650rows.json
@@ -3,7 +3,7 @@
   "rag_settings": {
     "chunk_size": 1000,
     "chunk_overlap": 0,
-    "model_path_or_name": "./../models/Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf",
+    "model_path": "./../models/Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf",
     "model_options": "",
     "model_context_options": "generate_embedding=1,normalize_embedding=1,pooling_type=last,embedding_type=INT8",
     "vector_type": "INT8",
diff --git a/model_evaluation/ms_marco.py b/model_evaluation/ms_marco.py
@@ -212,7 +212,10 @@ def test_ms_marco_processing(
 
 
 def evaluate_search_quality(
-    limit_rows=None, database_path="ms_marco_test.sqlite", output_file=None
+    limit_rows=None,
+    database_path="ms_marco_test.sqlite",
+    output_file=None,
+    rag_settings=None,
 ):
     """Evaluate search quality using proper metrics"""
 
@@ -243,7 +246,7 @@ def output(text):
         output(f"Evaluating on {len(df)} queries")
 
     # Create RAG instance
-    rag = SQLiteRag.create(database_path)
+    rag = SQLiteRag.create(database_path, settings=rag_settings)
     memory_monitor.record()  # After RAG initialization
 
     # Metrics for different top-k values
@@ -275,6 +278,8 @@ def output(text):
         total_queries += 1
 
         # Perform search
+        # EmbeddingGemma works better with a task-specific prefix
+        # query_text = f"task: search result | query: {query_text}"
         search_results = rag.search(query_text, top_k=10)
 
         # Check results for each k value
@@ -562,6 +567,7 @@ def main():
             limit_rows=args.limit_rows,
             database_path=database_path,
             output_file=output_file,
+            rag_settings=rag_settings,
         )
 
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -34,6 +34,7 @@ dependencies = [
 [project.optional-dependencies]
 dev = [
     "pytest",
+    "pytest-mock",
     "pytest-cov",
     "black",
     "flake8",
diff --git a/src/sqlite_rag/cli.py b/src/sqlite_rag/cli.py
@@ -148,6 +148,10 @@ def configure_settings(
     use_gpu: Optional[bool] = typer.Option(
         None, help="Whether to allow sqlite-ai extension to use the GPU"
     ),
+    prompt_template_retrieval_query: Optional[str] = typer.Option(
+        None,
+        help="Template for retrieval query prompts, use {content} as placeholder",
+    ),
 ):
     """Configure settings for the RAG system.
 
@@ -171,6 +175,7 @@ def configure_settings(
         "weight_fts": weight_fts,
         "weight_vec": weight_vec,
         "use_gpu": use_gpu,
+        "prompt_template_retrieval_query": prompt_template_retrieval_query,
     }
 
     # Filter out None values (unset options)
@@ -404,6 +409,11 @@ def search(
 @app.command()
 def quantize(
     ctx: typer.Context,
+    preload: bool = typer.Option(
+        False,
+        "--preload",
+        help="Preload quantized vectors into memory for faster search",
+    ),
     cleanup: bool = typer.Option(
         False,
         "--cleanup",
@@ -420,9 +430,14 @@ def quantize(
         typer.echo("Quantization cleanup completed.")
     else:
         typer.echo("Starting vector quantization...")
+
         rag.quantize_vectors()
+        if preload:
+            typer.echo("Preloading quantized vectors into memory...")
+            rag.quantize_preload()
+
         typer.echo(
-            "Vector quantization completed. Now you can search with `--quantize-scan` and `--quantize-preload` enabled."
+            "Vector quantization completed. Now you can search with `--quantize-scan` enabled."
         )
 
 
diff --git a/src/sqlite_rag/engine.py b/src/sqlite_rag/engine.py
@@ -25,7 +25,7 @@ def __init__(self, conn: sqlite3.Connection, settings: Settings, chunker: Chunke
     def load_model(self):
         """Load the model model from the specified path."""
 
-        model_path = Path(self._settings.model_path)
+        model_path = Path(self._settings.model_path).resolve()
         if not model_path.exists():
             raise FileNotFoundError(f"Model file not found at {model_path}")
 
diff --git a/src/sqlite_rag/settings.py b/src/sqlite_rag/settings.py
@@ -21,6 +21,10 @@ class Settings:
         "generate_embedding=1,normalize_embedding=1,pooling_type=mean,embedding_type=INT8"
     )
 
+    # Allow the sqlite-ai extension to use the GPU
+    # See: https://github.com/sqliteai/sqlite-ai
+    use_gpu = False
+
     vector_type: str = "INT8"
     embedding_dim: int = 768
     other_vector_options: str = (
@@ -44,9 +48,14 @@ class Settings:
     weight_fts: float = 1.0
     weight_vec: float = 1.0
 
-    # Allow the sqlite-ai extension to use the GPU
-    # See: https://github.com/sqliteai/sqlite-ai
-    use_gpu = False
+    #
+    # Prompt templates
+    # Some models are trained to work better with specific prompts
+    # depending on the task. For example, Gemma models work better
+    # when the prompt includes a task description.
+    #
+
+    prompt_template_retrieval_query: str = "task: search result | query: {content}"
 
 
 class SettingsManager:
@@ -92,7 +101,7 @@ def configure(
                         )
                     else:
                         raise ValueError(
-                            "Critical settings changes detected. Please reset the database."
+                            "Critical settings changes detected. Please force the settings update or reset the database."
                         )
                 # Update new settings
                 current_settings = self.store(new_settings)
diff --git a/src/sqlite_rag/sqliterag.py b/src/sqlite_rag/sqliterag.py
@@ -265,8 +265,8 @@ def search(
         if new_context:
             self._engine.create_new_context()
 
-        if self._settings.quantize_scan and self._settings.quantize_preload:
-            self._engine.quantize_preload()
+        if self._settings.prompt_template_retrieval_query:
+            query = self._settings.prompt_template_retrieval_query.format(content=query)
 
         return self._engine.search(query, top_k=top_k)
 
diff --git a/tests/test_sqlite_rag.py b/tests/test_sqlite_rag.py
@@ -580,3 +580,24 @@ def test_search_samples_exact_match_by_scan_type(self, quantize_scan: bool):
             # Second result should have distance > 0
             second_result = results[1]
             assert second_result.vec_distance and second_result.vec_distance > 0.0
+
+    def test_search_uses_retrieval_query_template(self, mocker):
+        template = "task: search | Do something with {content}"
+
+        settings = {"prompt_template_retrieval_query": template}
+
+        rag = SQLiteRag.create(":memory:", settings=settings)
+
+        mock_engine = mocker.Mock()
+        mock_engine.search.return_value = []
+
+        rag._engine = mock_engine
+
+        query = "test query"
+        rag.search(query)
+
+        # Assert that engine.search was called with the formatted template
+        expected_query = rag._settings.prompt_template_retrieval_query.format(
+            content=query
+        )
+        mock_engine.search.assert_called_once_with(expected_query, top_k=10)