Skip to content

Commit 29ad1d1

Browse files
committed
chore: optimizations for large corpora and recommendations on embedding
1 parent 19c9ce8 commit 29ad1d1

File tree

8 files changed

+814
-51
lines changed

8 files changed

+814
-51
lines changed

README.md

Lines changed: 26 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -58,56 +58,38 @@ Ancestor headings (breadcrumbs like `Auth SDK > AcmeAuthClientV2 > Initializatio
5858

5959
## Benchmarks
6060

61-
On a realistic 28.8MB multi-language SDK corpus (38 eval cases across 9 categories), benchmarked with [`docs-mcp-eval benchmark`](docs/eval.md):
61+
On a realistic ~300-operation API with hand-written guides (~28.8MB corpus, 5 eval categories), benchmarked with [`docs-mcp-eval benchmark`](docs/eval.md):
6262

6363
### Summary
6464

65-
| Metric | none | openai/text-embedding-3-large |
66-
| --- | ---: | ---: |
67-
| MRR@5 | 0.1803 | 0.2320 |
68-
| NDCG@5 | 0.2136 | 0.2657 |
69-
| Facet Precision | 0.3158 | 0.3684 |
70-
| Search p50 (ms) | 5.2 | 242.6 |
71-
| Search p95 (ms) | 6.6 | 5914.1 |
72-
| Build Time (ms) | 6989 | 20448 |
73-
| Peak RSS (MB) | 247.6 | 313.6 |
74-
| Index Size (corpus 28.8MB) | 104.9MB | 356.9MB |
75-
| Embed Cost (est.) | $0 | $0.9825 |
76-
| Query Cost (est.) | $0 | $0.000003 |
77-
78-
### Per-Category Facet Precision
79-
80-
| Category | none | openai/text-embedding-3-large |
81-
| --- | ---: | ---: |
82-
| api-discovery | 0.0000 | 0.0000 |
83-
| cross-service | 0.3333 | 0.3333 |
84-
| distractor | 0.4000 | 0.4000 |
85-
| error-handling | 0.0000 | 0.0000 |
86-
| intent | 0.4000 | 0.4000 |
87-
| lexical | 0.8000 | 0.8000 |
88-
| multi-hop | 0.3333 | 0.3333 |
89-
| paraphrased | 0.1250 | 0.2500 |
90-
| sdk-reference | 0.3333 | 0.6667 |
65+
| Metric | none | openai | Takeaway |
66+
| --- | ---: | ---: | --- |
67+
| MRR@5 | 0.2141 | 0.2833 | Embeddings lift relevant-result ranking by 32% |
68+
| NDCG@5 | 0.2536 | 0.3218 | Graded relevance improves 27% with embeddings |
69+
| Facet Precision | 0.3750 | 0.4375 | Embeddings improve filter accuracy by 17% |
70+
| Search p50 (ms) | 5.2 | 258.4 | FTS-only is ~50x faster at median |
71+
| Search p95 (ms) | 6.5 | 11101.1 | Tail latency dominated by embedding API |
72+
| Build Time (ms) | 6022 | 1569703 | Embedding uses batch API for large corpora |
73+
| Peak RSS (MB) | 221.1 | 283.8 | Modest memory overhead |
74+
| Index Size (corpus 28.8MB) | 104.9MB | 356.9MB | Vectors ~3.4x the FTS-only index |
75+
| Embed Cost (est.) | $0 | $0.9825 | ~$1 one-time cost per corpus |
76+
| Query Cost (est.) | $0 | $0.000003 | Negligible per-query cost |
9177

9278
### Per-Category MRR@5
9379

94-
| Category | none | openai/text-embedding-3-large |
95-
| --- | ---: | ---: |
96-
| api-discovery | 0.0000 | 0.0000 |
97-
| cross-service | 0.1667 | 0.3333 |
98-
| distractor | 0.3000 | 0.3000 |
99-
| error-handling | 0.0000 | 0.0000 |
100-
| intent | 0.0900 | 0.2667 |
101-
| lexical | 0.4800 | 0.5067 |
102-
| multi-hop | 0.3333 | 0.3333 |
103-
| paraphrased | 0.0625 | 0.0938 |
104-
| sdk-reference | 0.1667 | 0.2333 |
105-
106-
**Key takeaways:**
107-
- Embeddings double facet precision on `paraphrased` and `sdk-reference` categories
108-
- Embeddings triple MRR on `intent` queries (0.09 → 0.27)
109-
- `lexical`, `distractor`, `cross-service`, `multi-hop` — FTS alone matches embedding performance
110-
- FTS-only search: 5ms p50 latency, zero embedding cost
80+
> MRR@5 (Mean Reciprocal Rank at 5) measures how high the first relevant result appears in the top 5. 1.0 = always ranked first; 0.0 = never appears in top 5.
81+
82+
| Category | none | openai | Takeaway |
83+
| --- | ---: | ---: | --- |
84+
| clarification | 0.3000 | 0.3000 | FTS matches embeddings |
85+
| cross-service | 0.1667 | 0.3333 | Embeddings double rank |
86+
| exact-name | 0.3625 | 0.3792 | FTS nearly matches embeddings |
87+
| natural-language | 0.0731 | 0.1692 | Embeddings lift 130% |
88+
| workflow | 0.3333 | 0.4444 | Embeddings lift 33% |
89+
90+
### Recommendation
91+
92+
We recommend starting with FTS-only search. While embeddings improve relevance for conceptual and paraphrased queries, they also add roughly 50× higher median query latency and substantial build overhead. For agents that iterate through multiple searches, the faster cycle time of pure FTS has anecdotally proven more valuable than the per-query relevance lift — particularly with modern models capable of query refinement.
11193

11294
## Graceful Fallback
11395

packages/cli/src/git.ts

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { execFile } from "node:child_process";
2+
import path from "node:path";
23
import { promisify } from "node:util";
34

45
const execFileAsync = promisify(execFile);
@@ -15,3 +16,34 @@ export async function resolveSourceCommit(targetDir: string): Promise<string | n
1516
return null;
1617
}
1718
}
19+
20+
/**
21+
* Derive a corpus identifier from the git repo name and the docs directory's
22+
* path relative to the repo root. Falls back to the directory basename.
23+
*/
24+
export async function resolveCorpusLabel(docsDir: string): Promise<string> {
25+
try {
26+
const opts = { timeout: 5_000, windowsHide: true } as const;
27+
28+
const { stdout: rootOut } = await execFileAsync(
29+
"git", ["-C", docsDir, "rev-parse", "--show-toplevel"], opts
30+
);
31+
const repoRoot = rootOut.trim();
32+
33+
let repoName: string;
34+
try {
35+
const { stdout: remoteOut } = await execFileAsync(
36+
"git", ["-C", docsDir, "remote", "get-url", "origin"], opts
37+
);
38+
const url = remoteOut.trim();
39+
repoName = url.replace(/\.git$/, "").split(/[/:]/).pop() ?? path.basename(repoRoot);
40+
} catch {
41+
repoName = path.basename(repoRoot);
42+
}
43+
44+
const relPath = path.relative(repoRoot, docsDir);
45+
return relPath ? `${repoName}/${relPath}` : repoName;
46+
} catch {
47+
return path.basename(docsDir);
48+
}
49+
}

packages/cli/src/index.ts

Lines changed: 127 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,14 @@ import {
1414
computeChunkFingerprint,
1515
createEmbeddingProvider,
1616
embedChunksIncremental,
17+
formatDuration,
1718
loadCache,
1819
loadChunksFromPreviousIndex,
1920
mergeTaxonomyConfigs,
2021
parseManifestJson,
2122
resolveFileConfig,
2223
saveCache,
24+
type BatchProgressEvent,
2325
type Chunk,
2426
type EmbedProgressEvent,
2527
type EmbeddingMetadata,
@@ -29,13 +31,120 @@ import {
2931
type PreviousIndexReader
3032
} from "@speakeasy-api/docs-mcp-core";
3133
import { buildHeuristicManifest } from "./fix.js";
32-
import { resolveSourceCommit } from "./git.js";
34+
import { resolveCorpusLabel, resolveSourceCommit } from "./git.js";
3335

3436
const program = new Command();
3537

3638
const isTTY = process.stderr.isTTY ?? false;
37-
function writeProgress(msg: string) { if (isTTY) process.stderr.write(`\r\x1b[K${msg}`); }
38-
function clearProgress() { if (isTTY) process.stderr.write("\r\x1b[K"); }
39+
let progressLineCount = 0;
40+
41+
function writeProgress(msg: string) {
  // Single-line convenience wrapper around the multi-line block renderer.
  writeProgressBlock([msg]);
}
42+
43+
/**
 * Redraw a multi-line progress block in place on stderr (TTY only; no-op
 * otherwise).
 *
 * Invariant: after each call the cursor sits at the end of the block's last
 * written line. The cursor-up math below (`progressLineCount - 1`) relies on
 * the previous call having left the cursor in exactly that position.
 */
function writeProgressBlock(lines: string[]) {
  if (!isTTY) return;
  // Move cursor up to start of previous block (cursor is on its last line).
  if (progressLineCount > 1) {
    process.stderr.write(`\x1b[${progressLineCount - 1}A`);
  }
  // Write new lines, clearing each (`\r\x1b[K` = column 0 + erase to end of line).
  for (let i = 0; i < lines.length; i++) {
    process.stderr.write(`\r\x1b[K${lines[i]}`);
    if (i < lines.length - 1) process.stderr.write("\n");
  }
  // Clear any leftover lines from a previously taller block
  for (let i = lines.length; i < progressLineCount; i++) {
    process.stderr.write("\n\x1b[K");
  }
  // Clearing leftovers moved the cursor down; hop back up to the last freshly
  // written line so the invariant above holds for the next call.
  const extra = progressLineCount - lines.length;
  if (extra > 0) {
    process.stderr.write(`\x1b[${extra}A`);
  }
  progressLineCount = lines.length;
}
64+
65+
/**
 * Erase the progress block drawn by writeProgressBlock and reset its state,
 * leaving the cursor at column 0 of the block's first (now blank) line so
 * normal output can resume there. TTY only; no-op when nothing was drawn.
 */
function clearProgress() {
  if (!isTTY || progressLineCount === 0) return;
  // Cursor sits on the block's last line; move up to its first line.
  if (progressLineCount > 1) {
    process.stderr.write(`\x1b[${progressLineCount - 1}A`);
  }
  process.stderr.write("\r\x1b[K");
  // Erase each remaining line of the block, stepping down one line at a time.
  for (let i = 1; i < progressLineCount; i++) {
    process.stderr.write("\n\x1b[K");
  }
  // Move back up to the first line so later writes start at the block's top.
  if (progressLineCount > 1) {
    process.stderr.write(`\x1b[${progressLineCount - 1}A\r`);
  }
  progressLineCount = 0;
}
79+
80+
// Timestamp (ms since epoch) of the last non-TTY progress line; used by
// writeBatchProgress to throttle log output to roughly one line per 10s.
let lastNonTtyWrite = 0;
81+
82+
/**
83+
* Pack segments into lines that fit within `cols`, separating with `sep`.
84+
* Any single segment wider than `cols` is hard-truncated.
85+
*/
86+
function packLines(segments: string[], cols: number, sep = " "): string[] {
87+
const lines: string[] = [];
88+
let cur = "";
89+
for (const seg of segments) {
90+
const truncated = seg.length > cols ? seg.slice(0, cols) : seg;
91+
if (cur.length === 0) {
92+
cur = truncated;
93+
} else if (cur.length + sep.length + truncated.length <= cols) {
94+
cur += sep + truncated;
95+
} else {
96+
lines.push(cur);
97+
cur = truncated;
98+
}
99+
}
100+
if (cur.length > 0) lines.push(cur);
101+
return lines;
102+
}
103+
104+
function writeBatchProgress(event: BatchProgressEvent) {
105+
// Non-polling phases: always emit
106+
if (event.phase !== "batch-polling") {
107+
if (isTTY) {
108+
writeProgress(event.message);
109+
} else {
110+
console.warn(event.message);
111+
}
112+
return;
113+
}
114+
115+
// Non-TTY: throttle to one line per ~10s
116+
if (!isTTY) {
117+
const now = Date.now();
118+
if (now - lastNonTtyWrite >= 10_000) {
119+
lastNonTtyWrite = now;
120+
console.warn(event.message);
121+
}
122+
return;
123+
}
124+
125+
// TTY: read current width every call (terminal can be resized)
126+
const cols = process.stderr.columns || 80;
127+
128+
// If the flat message fits, use it as-is
129+
if (event.message.length <= cols) {
130+
writeProgress(event.message);
131+
return;
132+
}
133+
134+
// Build discrete segments and pack into as many lines as needed
135+
if (event.counts) {
136+
const { completed, total, failed } = event.counts;
137+
const pct = ((completed / total) * 100).toFixed(1);
138+
const segments = [`Batch: ${completed}/${total} (${pct}%)`];
139+
if (failed > 0) segments.push(`${failed} failed`);
140+
if (event.etaSec != null) segments.push(`ETA ~${formatDuration(event.etaSec)}`);
141+
if (event.elapsedSec != null) segments.push(`Elapsed: ${formatDuration(event.elapsedSec)}`);
142+
if (event.pollRemainingSec != null) segments.push(`Next poll: ${event.pollRemainingSec}s`);
143+
writeProgressBlock(packLines(segments, cols));
144+
} else {
145+
writeProgress(event.message);
146+
}
147+
}
39148

40149
program
41150
.name("docs-mcp")
@@ -219,17 +328,26 @@ program
219328

220329
const taxonomyConfig = mergeTaxonomyConfigs(manifestCache.values());
221330

331+
const onBatchProgress = (event: BatchProgressEvent) => {
332+
writeBatchProgress(event);
333+
};
222334
const providerInput: {
223335
provider: "none" | "hash" | "openai";
224336
model?: string;
225337
dimensions?: number;
226338
apiKey?: string;
227339
baseUrl?: string;
228340
batchSize?: number;
341+
batchApiThreshold?: number;
342+
batchName?: string;
229343
concurrency?: number;
230344
maxRetries?: number;
345+
onBatchProgress?: (event: BatchProgressEvent) => void;
231346
} = {
232-
provider: normalizeProvider(options.embeddingProvider)
347+
provider: normalizeProvider(options.embeddingProvider),
348+
batchApiThreshold: 2500,
349+
batchName: `docs-mcp:${await resolveCorpusLabel(docsDir)}`,
350+
onBatchProgress,
233351
};
234352
if (options.embeddingModel !== undefined) {
235353
providerInput.model = options.embeddingModel;
@@ -280,7 +398,11 @@ program
280398
chunks,
281399
{ ...config, embed: (texts) => embeddingProvider.embed(texts) },
282400
cache,
283-
{ ...(embeddingProvider.batchSize !== undefined ? { batchSize: embeddingProvider.batchSize } : {}), onProgress },
401+
{
402+
...(embeddingProvider.batchSize !== undefined ? { batchSize: embeddingProvider.batchSize } : {}),
403+
...(embeddingProvider.batchApiThreshold !== undefined ? { batchApiThreshold: embeddingProvider.batchApiThreshold } : {}),
404+
onProgress,
405+
},
284406
);
285407
clearProgress();
286408
const embedMs = ((Date.now() - embedStart) / 1000).toFixed(1);

packages/core/src/embedding-cache.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,11 @@ export async function embedChunksIncremental(
253253
const missVectors: number[][] = [];
254254

255255
const batchSize = options?.batchSize;
256-
if (missTexts.length > 0 && batchSize && batchSize > 0) {
256+
const batchApiThreshold = options?.batchApiThreshold;
257+
if (missTexts.length > 0 && batchApiThreshold && missTexts.length >= batchApiThreshold) {
258+
// Send all at once so provider can use Batch API
259+
missVectors.push(...await provider.embed(missTexts));
260+
} else if (missTexts.length > 0 && batchSize && batchSize > 0) {
257261
let embeddedSoFar = 0;
258262
for (let offset = 0; offset < missTexts.length; offset += batchSize) {
259263
const batch = missTexts.slice(offset, offset + batchSize);

0 commit comments

Comments
 (0)