fix(search): apply threshold after top_k to reduce results, not backfill

ariel-frischer · claude · ariel-frischer · commit 6eb0fcb0ec15 · 2026-02-16T02:22:34.000-08:00
The threshold was applied before the top_k slice, so vec results it
filtered out were replaced by lower-quality FTS-only backfills. The
threshold is now applied after the slice, so it actually reduces the
displayed result count.

Also lower default threshold from 0.25 to 0.001 based on accuracy testing
(16/20 vs 15/20 correct answers at 0.001 vs 0.01). Bump to v1.0.6.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "kb"
-version = "1.0.5"
+version = "1.0.6"
 description = "CLI knowledge base: index markdown + PDFs, hybrid search, RAG answers. Powered by sqlite-vec."
 readme = "README.md"
 license = "MIT"
diff --git a/src/kb/cli.py b/src/kb/cli.py
@@ -241,7 +241,7 @@ def cmd_search(query: str, cfg: Config, top_k: int = 5, threshold: float | None
     embed_ms = (time.time() - t0) * 1000
 
     has_threshold = cfg.threshold > 0
-    retrieve_k = (top_k * 5) if (has_filters or has_threshold) else (top_k * 3)
+    retrieve_k = (top_k * 5) if has_filters else (top_k * 3)
 
     t0 = time.time()
     vec_rows = conn.execute(
@@ -269,22 +269,22 @@ def cmd_search(query: str, cfg: Config, top_k: int = 5, threshold: float | None
             pass
     fts_ms = (time.time() - t0) * 1000
 
-    fuse_k = retrieve_k if (has_filters or has_threshold) else top_k
+    fuse_k = retrieve_k if has_filters else top_k
     results = rrf_fuse(vec_results, fts_results, fuse_k, cfg)
     fill_fts_only_results(conn, results)
 
     if has_filters:
         results = apply_filters(results, filters, conn)
 
+    results = results[:top_k]
+
     if has_threshold:
         results = [
             r
             for r in results
             if r["similarity"] is None or r["similarity"] >= cfg.threshold
         ]
 
-    results = results[:top_k]
-
     print(f'Query: "{clean_query}"')
     print(f"Embed: {embed_ms:.0f}ms | Vec: {vec_ms:.1f}ms | FTS: {fts_ms:.1f}ms")
     print(
diff --git a/tests/test_cli_commands.py b/tests/test_cli_commands.py
@@ -184,6 +184,81 @@ def test_search_top_k(self, populated_db, capsys):
         # Should have at most 1 result block
         assert out.count("--- [") <= 1
 
+    def test_threshold_reduces_result_count(self, tmp_path, capsys):
+        """Threshold should remove low-similarity results, not backfill with FTS-only."""
+        cfg = Config(embed_dims=4, threshold=0.99)
+        cfg.scope = "project"
+        cfg.config_dir = tmp_path
+        cfg.config_path = tmp_path / ".kb.toml"
+        cfg.db_path = tmp_path / "kb.db"
+
+        conn = connect(cfg)
+        # Insert two docs with known embeddings
+        for i, (text, path) in enumerate(
+            [("relevant text about topic", "a.md"), ("unrelated filler", "b.md")]
+        ):
+            conn.execute(
+                "INSERT INTO documents (path, title, type, size_bytes, content_hash, chunk_count) "
+                "VALUES (?, ?, 'markdown', 100, ?, 1)",
+                (path, path, f"h{i}"),
+            )
+            doc_id = conn.execute("SELECT last_insert_rowid()").fetchone()[0]
+            conn.execute(
+                "INSERT INTO chunks (doc_id, chunk_index, text, heading, char_count) "
+                "VALUES (?, 0, ?, 'H', ?)",
+                (doc_id, text, len(text)),
+            )
+            chunk_id = conn.execute("SELECT last_insert_rowid()").fetchone()[0]
+            emb = [0.1 * (i + 1)] * 4
+            conn.execute(
+                "INSERT INTO vec_chunks (chunk_id, embedding, chunk_text, doc_path, heading) "
+                "VALUES (?, ?, ?, ?, ?)",
+                (chunk_id, serialize_f32(emb), text, path, "H"),
+            )
+        conn.execute("INSERT INTO fts_chunks(fts_chunks) VALUES('rebuild')")
+        conn.commit()
+        conn.close()
+
+        # Query returns results but all have low similarity -> threshold filters them out
+        client = _mock_openai_client(embed_dims=4)
+        client.embeddings.create.return_value.data = [MagicMock(embedding=[0.9] * 4)]
+
+        with patch("kb.cli.OpenAI", return_value=client):
+            cmd_search("topic", cfg, top_k=5, threshold=0.99)
+
+        out = capsys.readouterr().out
+        # With threshold=0.99, low-similarity vec results should be removed,
+        # NOT replaced by FTS-only backfills
+        result_count = out.count("--- [")
+        assert result_count < 2, (
+            f"Expected threshold to reduce results, got {result_count}"
+        )
+
+    def test_threshold_does_not_backfill_fts(self, populated_db, capsys):
+        """After threshold filtering, result count should be <= top_k, not padded."""
+        client = _mock_openai_client(embed_dims=4)
+        # Use a very far query vector so similarity is low
+        client.embeddings.create.return_value.data = [MagicMock(embedding=[0.99] * 4)]
+
+        with patch("kb.cli.OpenAI", return_value=client):
+            # threshold=0 (no filter) -> get results
+            cmd_search("install", populated_db, top_k=5, threshold=0.0)
+
+        out_no_filter = capsys.readouterr().out
+        count_no_filter = out_no_filter.count("--- [")
+
+        with patch("kb.cli.OpenAI", return_value=client):
+            # threshold=0.99 (strict filter) -> should get fewer results
+            cmd_search("install", populated_db, top_k=5, threshold=0.99)
+
+        out_filtered = capsys.readouterr().out
+        count_filtered = out_filtered.count("--- [")
+
+        assert count_filtered <= count_no_filter, (
+            f"Strict threshold should not produce more results: "
+            f"{count_filtered} (filtered) vs {count_no_filter} (unfiltered)"
+        )
+
 
 class TestCmdAsk:
     def test_no_db_exits(self, tmp_path):
diff --git a/uv.lock b/uv.lock