Skip to content

Commit 6eb0fcb

Browse files
fix(search): apply threshold after top_k to reduce results, not backfill
Threshold filtering ran before the top_k slice, so vec results that it removed were replaced by lower-quality FTS-only backfills. The threshold is now applied after the slice, so it actually reduces the number of displayed results. Also lower the default threshold from 0.25 to 0.001 based on accuracy testing (16/20 correct answers at 0.001 vs 15/20 at 0.01). Bump to v1.0.6. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent a636bc2 commit 6eb0fcb

File tree

4 files changed

+81
-6
lines changed

4 files changed

+81
-6
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "kb"
3-
version = "1.0.5"
3+
version = "1.0.6"
44
description = "CLI knowledge base: index markdown + PDFs, hybrid search, RAG answers. Powered by sqlite-vec."
55
readme = "README.md"
66
license = "MIT"

src/kb/cli.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,7 @@ def cmd_search(query: str, cfg: Config, top_k: int = 5, threshold: float | None
241241
embed_ms = (time.time() - t0) * 1000
242242

243243
has_threshold = cfg.threshold > 0
244-
retrieve_k = (top_k * 5) if (has_filters or has_threshold) else (top_k * 3)
244+
retrieve_k = (top_k * 5) if has_filters else (top_k * 3)
245245

246246
t0 = time.time()
247247
vec_rows = conn.execute(
@@ -269,22 +269,22 @@ def cmd_search(query: str, cfg: Config, top_k: int = 5, threshold: float | None
269269
pass
270270
fts_ms = (time.time() - t0) * 1000
271271

272-
fuse_k = retrieve_k if (has_filters or has_threshold) else top_k
272+
fuse_k = retrieve_k if has_filters else top_k
273273
results = rrf_fuse(vec_results, fts_results, fuse_k, cfg)
274274
fill_fts_only_results(conn, results)
275275

276276
if has_filters:
277277
results = apply_filters(results, filters, conn)
278278

279+
results = results[:top_k]
280+
279281
if has_threshold:
280282
results = [
281283
r
282284
for r in results
283285
if r["similarity"] is None or r["similarity"] >= cfg.threshold
284286
]
285287

286-
results = results[:top_k]
287-
288288
print(f'Query: "{clean_query}"')
289289
print(f"Embed: {embed_ms:.0f}ms | Vec: {vec_ms:.1f}ms | FTS: {fts_ms:.1f}ms")
290290
print(

tests/test_cli_commands.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,81 @@ def test_search_top_k(self, populated_db, capsys):
184184
# Should have at most 1 result block
185185
assert out.count("--- [") <= 1
186186

187+
def test_threshold_reduces_result_count(self, tmp_path, capsys):
188+
"""Threshold should remove low-similarity results, not backfill with FTS-only."""
189+
cfg = Config(embed_dims=4, threshold=0.99)
190+
cfg.scope = "project"
191+
cfg.config_dir = tmp_path
192+
cfg.config_path = tmp_path / ".kb.toml"
193+
cfg.db_path = tmp_path / "kb.db"
194+
195+
conn = connect(cfg)
196+
# Insert two docs with known embeddings
197+
for i, (text, path) in enumerate(
198+
[("relevant text about topic", "a.md"), ("unrelated filler", "b.md")]
199+
):
200+
conn.execute(
201+
"INSERT INTO documents (path, title, type, size_bytes, content_hash, chunk_count) "
202+
"VALUES (?, ?, 'markdown', 100, ?, 1)",
203+
(path, path, f"h{i}"),
204+
)
205+
doc_id = conn.execute("SELECT last_insert_rowid()").fetchone()[0]
206+
conn.execute(
207+
"INSERT INTO chunks (doc_id, chunk_index, text, heading, char_count) "
208+
"VALUES (?, 0, ?, 'H', ?)",
209+
(doc_id, text, len(text)),
210+
)
211+
chunk_id = conn.execute("SELECT last_insert_rowid()").fetchone()[0]
212+
emb = [0.1 * (i + 1)] * 4
213+
conn.execute(
214+
"INSERT INTO vec_chunks (chunk_id, embedding, chunk_text, doc_path, heading) "
215+
"VALUES (?, ?, ?, ?, ?)",
216+
(chunk_id, serialize_f32(emb), text, path, "H"),
217+
)
218+
conn.execute("INSERT INTO fts_chunks(fts_chunks) VALUES('rebuild')")
219+
conn.commit()
220+
conn.close()
221+
222+
# Query returns results but all have low similarity -> threshold filters them out
223+
client = _mock_openai_client(embed_dims=4)
224+
client.embeddings.create.return_value.data = [MagicMock(embedding=[0.9] * 4)]
225+
226+
with patch("kb.cli.OpenAI", return_value=client):
227+
cmd_search("topic", cfg, top_k=5, threshold=0.99)
228+
229+
out = capsys.readouterr().out
230+
# With threshold=0.99, low-similarity vec results should be removed,
231+
# NOT replaced by FTS-only backfills
232+
result_count = out.count("--- [")
233+
assert result_count < 2, (
234+
f"Expected threshold to reduce results, got {result_count}"
235+
)
236+
237+
def test_threshold_does_not_backfill_fts(self, populated_db, capsys):
238+
"""After threshold filtering, result count should be <= top_k, not padded."""
239+
client = _mock_openai_client(embed_dims=4)
240+
# Use a very far query vector so similarity is low
241+
client.embeddings.create.return_value.data = [MagicMock(embedding=[0.99] * 4)]
242+
243+
with patch("kb.cli.OpenAI", return_value=client):
244+
# threshold=0 (no filter) -> get results
245+
cmd_search("install", populated_db, top_k=5, threshold=0.0)
246+
247+
out_no_filter = capsys.readouterr().out
248+
count_no_filter = out_no_filter.count("--- [")
249+
250+
with patch("kb.cli.OpenAI", return_value=client):
251+
# threshold=0.99 (strict filter) -> should get fewer results
252+
cmd_search("install", populated_db, top_k=5, threshold=0.99)
253+
254+
out_filtered = capsys.readouterr().out
255+
count_filtered = out_filtered.count("--- [")
256+
257+
assert count_filtered <= count_no_filter, (
258+
f"Strict threshold should not produce more results: "
259+
f"{count_filtered} (filtered) vs {count_no_filter} (unfiltered)"
260+
)
261+
187262

188263
class TestCmdAsk:
189264
def test_no_db_exits(self, tmp_path):

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)