Skip to content

Commit 29ad1d1

Browse files
committed
chore: optimizations for large corpora and recommendations on embedding
1 parent 19c9ce8 commit 29ad1d1

File tree

8 files changed

+814
-51
lines changed

8 files changed

+814
-51
lines changed

README.md

Lines changed: 26 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -58,56 +58,38 @@ Ancestor headings (breadcrumbs like `Auth SDK > AcmeAuthClientV2 > Initializatio
5858

5959
## Benchmarks
6060

61-
On a realistic 28.8MB multi-language SDK corpus (38 eval cases across 9 categories), benchmarked with [`docs-mcp-eval benchmark`](docs/eval.md):
61+
On a realistic ~300-operation API with hand-written guides (~28.8MB corpus, 5 eval categories), benchmarked with [`docs-mcp-eval benchmark`](docs/eval.md):
6262

6363
### Summary
6464

65-
| Metric | none | openai/text-embedding-3-large |
66-
| --- | ---: | ---: |
67-
| MRR@5 | 0.1803 | 0.2320 |
68-
| NDCG@5 | 0.2136 | 0.2657 |
69-
| Facet Precision | 0.3158 | 0.3684 |
70-
| Search p50 (ms) | 5.2 | 242.6 |
71-
| Search p95 (ms) | 6.6 | 5914.1 |
72-
| Build Time (ms) | 6989 | 20448 |
73-
| Peak RSS (MB) | 247.6 | 313.6 |
74-
| Index Size (corpus 28.8MB) | 104.9MB | 356.9MB |
75-
| Embed Cost (est.) | $0 | $0.9825 |
76-
| Query Cost (est.) | $0 | $0.000003 |
77-
78-
### Per-Category Facet Precision
79-
80-
| Category | none | openai/text-embedding-3-large |
81-
| --- | ---: | ---: |
82-
| api-discovery | 0.0000 | 0.0000 |
83-
| cross-service | 0.3333 | 0.3333 |
84-
| distractor | 0.4000 | 0.4000 |
85-
| error-handling | 0.0000 | 0.0000 |
86-
| intent | 0.4000 | 0.4000 |
87-
| lexical | 0.8000 | 0.8000 |
88-
| multi-hop | 0.3333 | 0.3333 |
89-
| paraphrased | 0.1250 | 0.2500 |
90-
| sdk-reference | 0.3333 | 0.6667 |
65+
| Metric | none | openai | Takeaway |
66+
| --- | ---: | ---: | --- |
67+
| MRR@5 | 0.2141 | 0.2833 | Embeddings lift relevant-result ranking by 32% |
68+
| NDCG@5 | 0.2536 | 0.3218 | Graded relevance improves 27% with embeddings |
69+
| Facet Precision | 0.3750 | 0.4375 | Embeddings improve filter accuracy by 17% |
70+
| Search p50 (ms) | 5.2 | 258.4 | FTS-only is ~50x faster at median |
71+
| Search p95 (ms) | 6.5 | 11101.1 | Tail latency dominated by embedding API |
72+
| Build Time (ms) | 6022 | 1569703 | Embedding uses batch API for large corpora |
73+
| Peak RSS (MB) | 221.1 | 283.8 | Modest memory overhead |
74+
| Index Size (corpus 28.8MB) | 104.9MB | 356.9MB | Vectors ~3.4x the FTS-only index |
75+
| Embed Cost (est.) | $0 | $0.9825 | ~$1 one-time cost per corpus |
76+
| Query Cost (est.) | $0 | $0.000003 | Negligible per-query cost |
9177

9278
### Per-Category MRR@5
9379

94-
| Category | none | openai/text-embedding-3-large |
95-
| --- | ---: | ---: |
96-
| api-discovery | 0.0000 | 0.0000 |
97-
| cross-service | 0.1667 | 0.3333 |
98-
| distractor | 0.3000 | 0.3000 |
99-
| error-handling | 0.0000 | 0.0000 |
100-
| intent | 0.0900 | 0.2667 |
101-
| lexical | 0.4800 | 0.5067 |
102-
| multi-hop | 0.3333 | 0.3333 |
103-
| paraphrased | 0.0625 | 0.0938 |
104-
| sdk-reference | 0.1667 | 0.2333 |
105-
106-
**Key takeaways:**
107-
- Embeddings double facet precision on `paraphrased` and `sdk-reference` categories
108-
- Embeddings triple MRR on `intent` queries (0.09 → 0.27)
109-
- `lexical`, `distractor`, `cross-service`, `multi-hop` — FTS alone matches embedding performance
110-
- FTS-only search: 5ms p50 latency, zero embedding cost
80+
> MRR@5 (Mean Reciprocal Rank at 5) measures how high the first relevant result appears in the top 5. 1.0 = always ranked first; 0.0 = never appears in top 5.
81+
82+
| Category | none | openai | Takeaway |
83+
| --- | ---: | ---: | --- |
84+
| clarification | 0.3000 | 0.3000 | FTS matches embeddings |
85+
| cross-service | 0.1667 | 0.3333 | Embeddings double rank |
86+
| exact-name | 0.3625 | 0.3792 | FTS nearly matches embeddings |
87+
| natural-language | 0.0731 | 0.1692 | Embeddings lift 130% |
88+
| workflow | 0.3333 | 0.4444 | Embeddings lift 33% |
89+
90+
### Recommendation
91+
92+
We recommend starting with FTS-only search. While embeddings improve relevance for conceptual and paraphrased queries, they also add roughly 50× higher median query latency and substantial build overhead. For agents that iterate through multiple searches, the faster cycle time of pure FTS has anecdotally proven more valuable than the per-query relevance lift — particularly with modern models capable of query refinement.
11193

11294
## Graceful Fallback
11395

packages/cli/src/git.ts

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { execFile } from "node:child_process";
2+
import path from "node:path";
23
import { promisify } from "node:util";
34

45
const execFileAsync = promisify(execFile);
@@ -15,3 +16,34 @@ export async function resolveSourceCommit(targetDir: string): Promise<string | n
1516
return null;
1617
}
1718
}
19+
20+
/**
21+
* Derive a corpus identifier from the git repo name and the docs directory's
22+
* path relative to the repo root. Falls back to the directory basename.
23+
*/
24+
export async function resolveCorpusLabel(docsDir: string): Promise<string> {
25+
try {
26+
const opts = { timeout: 5_000, windowsHide: true } as const;
27+
28+
const { stdout: rootOut } = await execFileAsync(
29+
"git", ["-C", docsDir, "rev-parse", "--show-toplevel"], opts
30+
);
31+
const repoRoot = rootOut.trim();
32+
33+
let repoName: string;
34+
try {
35+
const { stdout: remoteOut } = await execFileAsync(
36+
"git", ["-C", docsDir, "remote", "get-url", "origin"], opts
37+
);
38+
const url = remoteOut.trim();
39+
repoName = url.replace(/\.git$/, "").split(/[/:]/).pop() ?? path.basename(repoRoot);
40+
} catch {
41+
repoName = path.basename(repoRoot);
42+
}
43+
44+
const relPath = path.relative(repoRoot, docsDir);
45+
return relPath ? `${repoName}/${relPath}` : repoName;
46+
} catch {
47+
return path.basename(docsDir);
48+
}
49+
}

packages/cli/src/index.ts

Lines changed: 127 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,14 @@ import {
1414
computeChunkFingerprint,
1515
createEmbeddingProvider,
1616
embedChunksIncremental,
17+
formatDuration,
1718
loadCache,
1819
loadChunksFromPreviousIndex,
1920
mergeTaxonomyConfigs,
2021
parseManifestJson,
2122
resolveFileConfig,
2223
saveCache,
24+
type BatchProgressEvent,
2325
type Chunk,
2426
type EmbedProgressEvent,
2527
type EmbeddingMetadata,
@@ -29,13 +31,120 @@ import {
2931
type PreviousIndexReader
3032
} from "@speakeasy-api/docs-mcp-core";
3133
import { buildHeuristicManifest } from "./fix.js";
32-
import { resolveSourceCommit } from "./git.js";
34+
import { resolveCorpusLabel, resolveSourceCommit } from "./git.js";
3335

3436
const program = new Command();
3537

3638
const isTTY = process.stderr.isTTY ?? false;
37-
function writeProgress(msg: string) { if (isTTY) process.stderr.write(`\r\x1b[K${msg}`); }
38-
function clearProgress() { if (isTTY) process.stderr.write("\r\x1b[K"); }
39+
let progressLineCount = 0;
40+
41+
function writeProgress(msg: string) {
  // Single-line convenience wrapper around the multi-line block renderer.
  writeProgressBlock([msg]);
}
42+
43+
/**
 * Redraw a multi-line progress block in place on stderr (TTY only; no-op
 * otherwise).
 *
 * Invariant: after each call the cursor sits at the end of the block's last
 * written line. The cursor-up math below (`progressLineCount - 1`) relies on
 * the previous call having left the cursor in exactly that position.
 */
function writeProgressBlock(lines: string[]) {
  if (!isTTY) return;
  // Move cursor up to start of previous block (cursor is on its last line).
  if (progressLineCount > 1) {
    process.stderr.write(`\x1b[${progressLineCount - 1}A`);
  }
  // Write new lines, clearing each (`\r\x1b[K` = column 0 + erase to end of line).
  for (let i = 0; i < lines.length; i++) {
    process.stderr.write(`\r\x1b[K${lines[i]}`);
    if (i < lines.length - 1) process.stderr.write("\n");
  }
  // Clear any leftover lines from a previously taller block
  for (let i = lines.length; i < progressLineCount; i++) {
    process.stderr.write("\n\x1b[K");
  }
  // Clearing leftovers moved the cursor down; hop back up to the last freshly
  // written line so the invariant above holds for the next call.
  const extra = progressLineCount - lines.length;
  if (extra > 0) {
    process.stderr.write(`\x1b[${extra}A`);
  }
  progressLineCount = lines.length;
}
64+
65+
/**
 * Erase the progress block drawn by writeProgressBlock and reset its state,
 * leaving the cursor at column 0 of the block's first (now blank) line so
 * normal output can resume there. TTY only; no-op when nothing was drawn.
 */
function clearProgress() {
  if (!isTTY || progressLineCount === 0) return;
  // Cursor sits on the block's last line; move up to its first line.
  if (progressLineCount > 1) {
    process.stderr.write(`\x1b[${progressLineCount - 1}A`);
  }
  process.stderr.write("\r\x1b[K");
  // Erase each remaining line of the block, stepping down one line at a time.
  for (let i = 1; i < progressLineCount; i++) {
    process.stderr.write("\n\x1b[K");
  }
  // Move back up to the first line so later writes start at the block's top.
  if (progressLineCount > 1) {
    process.stderr.write(`\x1b[${progressLineCount - 1}A\r`);
  }
  progressLineCount = 0;
}
79+
80+
// Timestamp (ms since epoch) of the last non-TTY progress line; used by
// writeBatchProgress to throttle log output to roughly one line per 10s.
let lastNonTtyWrite = 0;
81+
82+
/**
83+
* Pack segments into lines that fit within `cols`, separating with `sep`.
84+
* Any single segment wider than `cols` is hard-truncated.
85+
*/
86+
function packLines(segments: string[], cols: number, sep = " "): string[] {
87+
const lines: string[] = [];
88+
let cur = "";
89+
for (const seg of segments) {
90+
const truncated = seg.length > cols ? seg.slice(0, cols) : seg;
91+
if (cur.length === 0) {
92+
cur = truncated;
93+
} else if (cur.length + sep.length + truncated.length <= cols) {
94+
cur += sep + truncated;
95+
} else {
96+
lines.push(cur);
97+
cur = truncated;
98+
}
99+
}
100+
if (cur.length > 0) lines.push(cur);
101+
return lines;
102+
}
103+
104+
function writeBatchProgress(event: BatchProgressEvent) {
105+
// Non-polling phases: always emit
106+
if (event.phase !== "batch-polling") {
107+
if (isTTY) {
108+
writeProgress(event.message);
109+
} else {
110+
console.warn(event.message);
111+
}
112+
return;
113+
}
114+
115+
// Non-TTY: throttle to one line per ~10s
116+
if (!isTTY) {
117+
const now = Date.now();
118+
if (now - lastNonTtyWrite >= 10_000) {
119+
lastNonTtyWrite = now;
120+
console.warn(event.message);
121+
}
122+
return;
123+
}
124+
125+
// TTY: read current width every call (terminal can be resized)
126+
const cols = process.stderr.columns || 80;
127+
128+
// If the flat message fits, use it as-is
129+
if (event.message.length <= cols) {
130+
writeProgress(event.message);
131+
return;
132+
}
133+
134+
// Build discrete segments and pack into as many lines as needed
135+
if (event.counts) {
136+
const { completed, total, failed } = event.counts;
137+
const pct = ((completed / total) * 100).toFixed(1);
138+
const segments = [`Batch: ${completed}/${total} (${pct}%)`];
139+
if (failed > 0) segments.push(`${failed} failed`);
140+
if (event.etaSec != null) segments.push(`ETA ~${formatDuration(event.etaSec)}`);
141+
if (event.elapsedSec != null) segments.push(`Elapsed: ${formatDuration(event.elapsedSec)}`);
142+
if (event.pollRemainingSec != null) segments.push(`Next poll: ${event.pollRemainingSec}s`);
143+
writeProgressBlock(packLines(segments, cols));
144+
} else {
145+
writeProgress(event.message);
146+
}
147+
}
39148

40149
program
41150
.name("docs-mcp")
@@ -219,17 +328,26 @@ program
219328

220329
const taxonomyConfig = mergeTaxonomyConfigs(manifestCache.values());
221330

331+
const onBatchProgress = (event: BatchProgressEvent) => {
332+
writeBatchProgress(event);
333+
};
222334
const providerInput: {
223335
provider: "none" | "hash" | "openai";
224336
model?: string;
225337
dimensions?: number;
226338
apiKey?: string;
227339
baseUrl?: string;
228340
batchSize?: number;
341+
batchApiThreshold?: number;
342+
batchName?: string;
229343
concurrency?: number;
230344
maxRetries?: number;
345+
onBatchProgress?: (event: BatchProgressEvent) => void;
231346
} = {
232-
provider: normalizeProvider(options.embeddingProvider)
347+
provider: normalizeProvider(options.embeddingProvider),
348+
batchApiThreshold: 2500,
349+
batchName: `docs-mcp:${await resolveCorpusLabel(docsDir)}`,
350+
onBatchProgress,
233351
};
234352
if (options.embeddingModel !== undefined) {
235353
providerInput.model = options.embeddingModel;
@@ -280,7 +398,11 @@ program
280398
chunks,
281399
{ ...config, embed: (texts) => embeddingProvider.embed(texts) },
282400
cache,
283-
{ ...(embeddingProvider.batchSize !== undefined ? { batchSize: embeddingProvider.batchSize } : {}), onProgress },
401+
{
402+
...(embeddingProvider.batchSize !== undefined ? { batchSize: embeddingProvider.batchSize } : {}),
403+
...(embeddingProvider.batchApiThreshold !== undefined ? { batchApiThreshold: embeddingProvider.batchApiThreshold } : {}),
404+
onProgress,
405+
},
284406
);
285407
clearProgress();
286408
const embedMs = ((Date.now() - embedStart) / 1000).toFixed(1);

packages/core/src/embedding-cache.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,11 @@ export async function embedChunksIncremental(
253253
const missVectors: number[][] = [];
254254

255255
const batchSize = options?.batchSize;
256-
if (missTexts.length > 0 && batchSize && batchSize > 0) {
256+
const batchApiThreshold = options?.batchApiThreshold;
257+
if (missTexts.length > 0 && batchApiThreshold && missTexts.length >= batchApiThreshold) {
258+
// Send all at once so provider can use Batch API
259+
missVectors.push(...await provider.embed(missTexts));
260+
} else if (missTexts.length > 0 && batchSize && batchSize > 0) {
257261
let embeddedSoFar = 0;
258262
for (let offset = 0; offset < missTexts.length; offset += batchSize) {
259263
const batch = missTexts.slice(offset, offset + batchSize);

0 commit comments

Comments
 (0)