Skip to content

Commit 6ca7705

Browse files
Heng XiaHeng Xia
authored and committed
feat: replace PR 292 retrieval query expansion
1 parent 65d912c commit 6ca7705

File tree

4 files changed

+1558
-183
lines changed

4 files changed

+1558
-183
lines changed

cli.ts

Lines changed: 140 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -409,6 +409,71 @@ function formatJson(obj: any): string {
409409
return JSON.stringify(obj, null, 2);
410410
}
411411

412+
/**
 * Renders a retrieval diagnostics snapshot as human-readable lines for the
 * CLI `--debug` output.
 *
 * The result always starts with a heading followed by six bullet lines
 * (original query, BM25 query, expansion flag, result counts, per-stage
 * counts, drop summary). "Failure stage" and "Error" bullets are appended only
 * when the corresponding fields are non-empty.
 *
 * @param diagnostics Snapshot to render. Only the first three entries of
 *   `dropSummary` are shown, in the order given.
 * @returns One string per output line, without trailing newlines.
 */
function formatRetrievalDiagnosticsLines(diagnostics: {
  originalQuery: string;
  bm25Query: string | null;
  queryExpanded: boolean;
  vectorResultCount: number;
  bm25ResultCount: number;
  fusedResultCount: number;
  finalResultCount: number;
  stageCounts: {
    afterMinScore: number;
    rerankInput: number;
    afterRerank: number;
    afterHardMinScore: number;
    afterNoiseFilter: number;
    afterDiversity: number;
  };
  dropSummary: Array<{ stage: string; dropped: number; before: number; after: number }>;
  failureStage?: string;
  errorMessage?: string;
}): string[] {
  const { stageCounts, dropSummary, failureStage, errorMessage } = diagnostics;

  // Summarise at most three drops; an empty summary is reported as "none".
  let dropsText = "none";
  if (dropSummary.length > 0) {
    const rendered: string[] = [];
    for (const { stage, dropped, before, after } of dropSummary.slice(0, 3)) {
      rendered.push(`${stage} -${dropped} (${before}->${after})`);
    }
    dropsText = rendered.join(", ");
  }

  const countsText = [
    `vector=${diagnostics.vectorResultCount}`,
    `bm25=${diagnostics.bm25ResultCount}`,
    `fused=${diagnostics.fusedResultCount}`,
    `final=${diagnostics.finalResultCount}`,
  ].join(", ");

  const stagesText = [
    `min=${stageCounts.afterMinScore}`,
    `rerankIn=${stageCounts.rerankInput}`,
    `rerank=${stageCounts.afterRerank}`,
    `hard=${stageCounts.afterHardMinScore}`,
    `noise=${stageCounts.afterNoiseFilter}`,
    `diversity=${stageCounts.afterDiversity}`,
  ].join(", ");

  // A null BM25 query is rendered as "(disabled)".
  const bm25Text = diagnostics.bm25Query ?? "(disabled)";
  const expandedText = diagnostics.queryExpanded ? "Yes" : "No";

  const output: string[] = ["Retrieval diagnostics:"];
  output.push(` • Original query: ${diagnostics.originalQuery}`);
  output.push(` • BM25 query: ${bm25Text}`);
  output.push(` • Query expanded: ${expandedText}`);
  output.push(` • Counts: ${countsText}`);
  output.push(` • Stages: ${stagesText}`);
  output.push(` • Drops: ${dropsText}`);

  if (failureStage) {
    output.push(` • Failure stage: ${failureStage}`);
  }
  if (errorMessage) {
    output.push(` • Error: ${errorMessage}`);
  }

  return output;
}
461+
462+
/**
 * Extracts a human-readable message from an arbitrary thrown value.
 *
 * `Error` instances contribute their `message`. Primitives are stringified.
 * Other objects contribute a non-empty string `message` property when they
 * have one, otherwise their JSON form, so that a rejected plain object does
 * not collapse into the useless "[object Object]".
 */
function describeSearchError(error: unknown): string {
  if (error instanceof Error) return error.message;
  if (typeof error !== "object" || error === null) return String(error);

  const message: unknown = Reflect.get(error, "message");
  if (typeof message === "string" && message.length > 0) return message;

  try {
    const serialized: unknown = JSON.stringify(error);
    if (typeof serialized === "string") return serialized;
  } catch {
    // Circular structures or a throwing toJSON: fall through to the type tag.
  }
  // Never throws, unlike String() on an object with a null prototype.
  return Object.prototype.toString.call(error);
}

/**
 * Builds the JSON payload printed when a search fails.
 *
 * @param error Whatever was thrown by the search; it need not be an `Error`.
 * @param diagnostics Diagnostics captured for the failed search, if any.
 * @param includeDiagnostics When true (and `diagnostics` is truthy) the
 *   diagnostics are attached under the `diagnostics` key.
 * @returns An object with an `error` member of shape
 *   `{ code: "search_failed", message }`, plus `diagnostics` when requested
 *   and available.
 */
function buildSearchErrorPayload(
  error: unknown,
  diagnostics: unknown,
  includeDiagnostics: boolean,
): Record<string, unknown> {
  return {
    error: {
      code: "search_failed",
      message: describeSearchError(error),
    },
    ...(includeDiagnostics && diagnostics ? { diagnostics } : {}),
  };
}
476+
412477
/**
 * Returns a promise that resolves (with no value) once a `setTimeout` of
 * `ms` milliseconds fires.
 */
async function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
@@ -418,6 +483,18 @@ async function sleep(ms: number): Promise<void> {
418483
// ============================================================================
419484

420485
export function registerMemoryCLI(program: Command, context: CLIContext): void {
486+
let lastSearchDiagnostics: ReturnType<MemoryRetriever["getLastDiagnostics"]> =
487+
null;
488+
489+
const captureSearchDiagnostics = (
490+
retriever: Pick<MemoryRetriever, "getLastDiagnostics">,
491+
) => {
492+
lastSearchDiagnostics =
493+
typeof retriever.getLastDiagnostics === "function"
494+
? retriever.getLastDiagnostics()
495+
: null;
496+
};
497+
421498
const getSearchRetriever = (): MemoryRetriever => {
422499
if (!context.embedder) {
423500
return context.retriever;
@@ -431,26 +508,49 @@ export function registerMemoryCLI(program: Command, context: CLIContext): void {
431508
scopeFilter?: string[],
432509
category?: string,
433510
) => {
434-
let results = await getSearchRetriever().retrieve({
435-
query,
436-
limit,
437-
scopeFilter,
438-
category,
439-
source: "cli",
440-
});
441-
442-
if (results.length === 0 && context.embedder) {
443-
await sleep(75);
444-
results = await getSearchRetriever().retrieve({
511+
lastSearchDiagnostics = null;
512+
const retriever = getSearchRetriever();
513+
let results;
514+
try {
515+
results = await retriever.retrieve({
445516
query,
446517
limit,
447518
scopeFilter,
448519
category,
449520
source: "cli",
450521
});
522+
captureSearchDiagnostics(retriever);
523+
} catch (error) {
524+
captureSearchDiagnostics(retriever);
525+
throw error;
451526
}
452527

453-
return results;
528+
if (results.length === 0 && context.embedder) {
529+
await sleep(75);
530+
const retryRetriever = getSearchRetriever();
531+
try {
532+
results = await retryRetriever.retrieve({
533+
query,
534+
limit,
535+
scopeFilter,
536+
category,
537+
source: "cli",
538+
});
539+
captureSearchDiagnostics(retryRetriever);
540+
} catch (error) {
541+
captureSearchDiagnostics(retryRetriever);
542+
throw error;
543+
}
544+
return {
545+
results,
546+
diagnostics: lastSearchDiagnostics,
547+
};
548+
}
549+
550+
return {
551+
results,
552+
diagnostics: lastSearchDiagnostics,
553+
};
454554
};
455555

456556
const memory = program
@@ -697,6 +797,7 @@ export function registerMemoryCLI(program: Command, context: CLIContext): void {
697797
.option("--scope <scope>", "Search within specific scope")
698798
.option("--category <category>", "Filter by category")
699799
.option("--limit <n>", "Maximum number of results", "10")
800+
.option("--debug", "Show retrieval diagnostics")
700801
.option("--json", "Output as JSON")
701802
.action(async (query, options) => {
702803
try {
@@ -707,11 +808,24 @@ export function registerMemoryCLI(program: Command, context: CLIContext): void {
707808
scopeFilter = [options.scope];
708809
}
709810

710-
const results = await runSearch(query, limit, scopeFilter, options.category);
811+
const { results, diagnostics } = await runSearch(
812+
query,
813+
limit,
814+
scopeFilter,
815+
options.category,
816+
);
711817

712818
if (options.json) {
713-
console.log(formatJson(results));
819+
console.log(
820+
formatJson(options.debug ? { diagnostics, results } : results),
821+
);
714822
} else {
823+
if (options.debug && diagnostics) {
824+
for (const line of formatRetrievalDiagnosticsLines(diagnostics)) {
825+
console.log(line);
826+
}
827+
console.log();
828+
}
715829
if (results.length === 0) {
716830
console.log("No relevant memories found.");
717831
} else {
@@ -730,6 +844,18 @@ export function registerMemoryCLI(program: Command, context: CLIContext): void {
730844
}
731845
}
732846
} catch (error) {
847+
const diagnostics = options.debug ? lastSearchDiagnostics : null;
848+
if (options.json) {
849+
console.log(
850+
formatJson(buildSearchErrorPayload(error, diagnostics, options.debug)),
851+
);
852+
process.exit(1);
853+
}
854+
if (diagnostics) {
855+
for (const line of formatRetrievalDiagnosticsLines(diagnostics)) {
856+
console.error(line);
857+
}
858+
}
733859
console.error("Search failed:", error);
734860
process.exit(1);
735861
}

src/query-expander.ts

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
/**
2+
* Lightweight Chinese query expansion for BM25.
3+
* Keeps the vector query untouched and only appends a few high-signal synonyms.
4+
*/
5+
6+
/** Upper bound on the number of synonym terms appended to a single query. */
const MAX_EXPANSION_TERMS = 5;

/** One synonym group: trigger terms plus the terms appended when a trigger matches. */
interface SynonymEntry {
  /** Chinese triggers, matched as substrings of the lowercased query. */
  cn: string[];
  /** English triggers, matched case-insensitively as whole words or phrases. */
  en: string[];
  /** Candidate terms to append, in priority order. */
  expansions: string[];
}

/**
 * Synonym groups, scanned in order. Because expansion stops after
 * MAX_EXPANSION_TERMS terms, earlier groups and earlier expansions win.
 */
const SYNONYM_MAP: SynonymEntry[] = [
  {
    cn: ["挂了", "挂掉", "宕机"],
    en: ["shutdown", "crashed"],
    expansions: ["崩溃", "crash", "error", "报错", "宕机", "失败"],
  },
  {
    cn: ["卡住", "卡死", "没反应"],
    en: ["hung", "frozen"],
    expansions: ["hang", "timeout", "超时", "无响应", "stuck"],
  },
  {
    cn: ["炸了", "爆了"],
    en: ["oom"],
    expansions: ["崩溃", "crash", "OOM", "内存溢出", "error"],
  },
  {
    cn: ["配置", "设置"],
    en: ["config", "configuration"],
    expansions: ["配置", "config", "configuration", "settings", "设置"],
  },
  {
    cn: ["部署", "上线"],
    en: ["deploy", "deployment"],
    expansions: ["deploy", "部署", "上线", "发布", "release"],
  },
  {
    cn: ["容器"],
    en: ["docker", "container"],
    expansions: ["Docker", "容器", "container", "docker-compose"],
  },
  {
    cn: ["报错", "出错", "错误"],
    en: ["error", "exception"],
    expansions: ["error", "报错", "exception", "错误", "失败", "bug"],
  },
  {
    cn: ["修复", "修了", "修好"],
    en: ["bugfix", "hotfix"],
    expansions: ["fix", "修复", "patch", "解决"],
  },
  {
    cn: ["踩坑"],
    en: ["troubleshoot"],
    expansions: ["踩坑", "bug", "问题", "教训", "排查", "troubleshoot"],
  },
  {
    cn: ["记忆", "记忆系统"],
    en: ["memory"],
    expansions: ["记忆", "memory", "记忆系统", "LanceDB", "索引"],
  },
  {
    cn: ["搜索", "查找", "找不到"],
    en: ["search", "retrieval"],
    expansions: ["搜索", "search", "retrieval", "检索", "查找"],
  },
  {
    cn: ["推送"],
    en: ["git push"],
    expansions: ["push", "推送", "git push", "commit"],
  },
  {
    cn: ["日志"],
    en: ["logfile", "logging"],
    expansions: ["日志", "log", "logging", "输出", "打印"],
  },
  {
    cn: ["权限"],
    en: ["permission", "authorization"],
    expansions: ["权限", "permission", "access", "授权", "认证"],
  },
];

/**
 * Builds a case-insensitive matcher that finds `term` only when it is not
 * glued to an adjacent ASCII word character.
 *
 * Lookarounds are used instead of `\b` so that a term whose first or last
 * character is not a word character (for example "c++") can still match;
 * `\b` would demand a word character right next to such an edge. For terms
 * that start and end with a word character the two forms are equivalent.
 *
 * @param term Literal text to match; regex metacharacters are escaped.
 */
function buildWordBoundaryRegex(term: string): RegExp {
  const escaped = term.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  return new RegExp(`(?<!\\w)${escaped}(?!\\w)`, "i");
}

/** A synonym group with its matchers prepared ahead of time. */
interface CompiledSynonymEntry {
  /** Lowercased Chinese triggers. */
  readonly cnTriggers: readonly string[];
  /** One stateless (non-global) matcher per English trigger. */
  readonly enTriggers: readonly RegExp[];
  /** Expansions paired with their lowercased form for the "already present" check. */
  readonly expansions: ReadonlyArray<{ readonly term: string; readonly lowered: string }>;
}

/**
 * SYNONYM_MAP compiled once at module load, so expandQuery does not rebuild a
 * RegExp for every English trigger (or re-lowercase every term) on each call.
 */
const COMPILED_SYNONYMS: readonly CompiledSynonymEntry[] = SYNONYM_MAP.map((entry) => ({
  cnTriggers: entry.cn.map((term) => term.toLowerCase()),
  enTriggers: entry.en.map(buildWordBoundaryRegex),
  expansions: entry.expansions.map((term) => ({ term, lowered: term.toLowerCase() })),
}));

/**
 * Appends up to MAX_EXPANSION_TERMS synonym terms to `query`.
 *
 * A synonym group fires when the query contains one of its Chinese triggers
 * (substring match) or one of its English triggers (whole-word match). From
 * each fired group, expansions that are not already a substring of the
 * lowercased query are appended in table order, without duplicates, until the
 * cap is reached.
 *
 * @param query Raw user query.
 * @returns `query` followed by the space-separated additions, or `query`
 *   unchanged when it is empty, shorter than two characters after trimming,
 *   or nothing new applies.
 */
export function expandQuery(query: string): string {
  if (!query || query.trim().length < 2) return query;

  const lower = query.toLowerCase();
  const additions = new Set<string>();

  for (const entry of COMPILED_SYNONYMS) {
    const triggered =
      entry.cnTriggers.some((term) => lower.includes(term)) ||
      entry.enTriggers.some((matcher) => matcher.test(query));
    if (!triggered) continue;

    for (const { term, lowered } of entry.expansions) {
      if (!lower.includes(lowered)) {
        additions.add(term);
      }
      if (additions.size >= MAX_EXPANSION_TERMS) break;
    }

    if (additions.size >= MAX_EXPANSION_TERMS) break;
  }

  if (additions.size === 0) return query;
  return `${query} ${[...additions].join(" ")}`;
}

0 commit comments

Comments
 (0)