Skip to content

Commit e4b58d0

Browse files
committed
chore: iterate on evals
1 parent 2dd2b70 commit e4b58d0

File tree

12 files changed

+556
-67
lines changed

12 files changed

+556
-67
lines changed

mise.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ run = "cd packages/playground && npx vite"
4343

4444
[tasks."bench:realistic"]
4545
description = "Benchmark embedding providers against realistic test fixture (none, hash)"
46-
run = "node packages/eval/dist/bin.js benchmark --cases packages/eval/fixtures/realistic/cases.json --docs-dir packages/eval/fixtures/realistic --work-dir /tmp/bench-realistic --build-command packages/cli/dist/index.js --server-command packages/server/dist/bin.js --providers none,hash"
46+
run = "node packages/eval/dist/bin.js benchmark --cases packages/eval/fixtures/realistic/cases.json --docs-dir packages/eval/fixtures/realistic --work-dir /tmp/bench-realistic --build-command packages/cli/dist/index.js --server-command packages/server/dist/bin.js --embeddings none,hash"
4747
depends = ["build"]
4848

4949
[tasks.nuke]

packages/cli/src/index.ts

Lines changed: 84 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env node
22

3-
import { mkdir, readFile, stat, writeFile } from "node:fs/promises";
3+
import { mkdir, readFile, rename, rm, stat, writeFile } from "node:fs/promises";
44
import { createRequire } from "node:module";
55
import path from "node:path";
66
import fg from "fast-glob";
@@ -11,17 +11,20 @@ const { version: CLI_VERSION } = require("../package.json") as { version: string
1111
import {
1212
buildLanceDbIndex,
1313
buildChunks,
14+
computeChunkFingerprint,
1415
createEmbeddingProvider,
1516
embedChunksIncremental,
1617
loadCache,
18+
loadChunksFromPreviousIndex,
1719
parseManifestJson,
1820
resolveFileConfig,
1921
saveCache,
2022
type Chunk,
2123
type EmbedProgressEvent,
2224
type EmbeddingMetadata,
2325
type IndexBuildStep,
24-
type Manifest
26+
type Manifest,
27+
type PreviousIndexReader
2528
} from "@speakeasy-api/docs-mcp-core";
2629
import { buildHeuristicManifest } from "./fix.js";
2730
import { resolveSourceCommit } from "./git.js";
@@ -116,8 +119,63 @@ program
116119
const outDir = path.resolve(options.out);
117120
const files = await listMarkdownFiles(docsDir);
118121
const manifestCache = new Map<string, Manifest>();
122+
const lanceDbPath = path.join(outDir, ".lancedb");
123+
const lanceDbTmpPath = path.join(outDir, ".lancedb.tmp");
124+
const lanceDbOldPath = path.join(outDir, ".lancedb.old");
125+
126+
// Clean up stale tmp/old dirs from interrupted builds
127+
await rm(lanceDbTmpPath, { recursive: true, force: true });
128+
await rm(lanceDbOldPath, { recursive: true, force: true });
129+
130+
// Load previous index for chunk caching (old .lancedb/ stays readable during build)
131+
let previousIndex: PreviousIndexReader | null = options.rebuildCache
132+
? null
133+
: await loadChunksFromPreviousIndex(lanceDbPath);
134+
135+
// Canary validation: re-chunk the first 10 fingerprint-matching files to
136+
// detect chunking logic changes without maintaining a version number.
137+
if (previousIndex) {
138+
let validated = 0;
139+
for (const file of files) {
140+
if (validated >= 10) break;
141+
const markdown = await readFile(file, "utf8");
142+
const relative = toPosix(path.relative(docsDir, file));
143+
const manifestContext = await loadNearestManifest(file, docsDir, manifestCache);
144+
const resolved = resolveFileConfig({
145+
relativeFilePath: relative,
146+
markdown,
147+
...(manifestContext
148+
? {
149+
manifest: manifestContext.manifest,
150+
manifestBaseDir: manifestContext.manifestBaseDir
151+
}
152+
: {})
153+
});
154+
155+
const fingerprint = computeChunkFingerprint(markdown, resolved.strategy, resolved.metadata);
156+
if (previousIndex.fingerprints.get(relative) !== fingerprint) continue;
157+
158+
const freshChunks = buildChunks({
159+
filepath: relative,
160+
markdown,
161+
strategy: resolved.strategy,
162+
metadata: resolved.metadata
163+
});
164+
const cachedChunks = await previousIndex.getChunks(relative);
165+
166+
if (JSON.stringify(freshChunks) !== JSON.stringify(cachedChunks)) {
167+
console.warn(`warn: chunk cache canary mismatch for ${relative}; discarding cache`);
168+
previousIndex.close();
169+
previousIndex = null;
170+
break;
171+
}
172+
validated++;
173+
}
174+
}
119175

120176
const chunks: Chunk[] = [];
177+
const newFileFingerprints: Record<string, string> = {};
178+
let chunkCacheHits = 0;
121179
for (let fi = 0; fi < files.length; fi++) {
122180
writeProgress(`Chunking [${fi + 1}/${files.length}]...`);
123181
const file = files[fi]!;
@@ -135,6 +193,16 @@ program
135193
: {})
136194
});
137195

196+
const fingerprint = computeChunkFingerprint(markdown, resolved.strategy, resolved.metadata);
197+
newFileFingerprints[relative] = fingerprint;
198+
199+
if (previousIndex?.fingerprints.get(relative) === fingerprint) {
200+
const cachedChunks = await previousIndex.getChunks(relative);
201+
chunks.push(...cachedChunks);
202+
chunkCacheHits++;
203+
continue;
204+
}
205+
138206
const fileChunks = buildChunks({
139207
filepath: relative,
140208
markdown,
@@ -144,7 +212,8 @@ program
144212
chunks.push(...fileChunks);
145213
}
146214
clearProgress();
147-
console.warn(`Chunked ${files.length} files into ${chunks.length.toLocaleString()} chunks`);
215+
const cacheSuffix = chunkCacheHits > 0 ? ` (${chunkCacheHits} cached)` : "";
216+
console.warn(`Chunked ${files.length} files into ${chunks.length.toLocaleString()} chunks${cacheSuffix}`);
148217

149218
const providerInput: {
150219
provider: "none" | "hash" | "openai";
@@ -252,7 +321,9 @@ program
252321
sourceCommit
253322
);
254323
const metadataKeys = Object.keys(metadata.taxonomy);
255-
const lanceDbPath = path.join(outDir, ".lancedb");
324+
325+
// Close previous index before writing the new one
326+
previousIndex?.close();
256327

257328
const indexStepLabels: Record<IndexBuildStep, string> = {
258329
"writing-table": "Building search index: writing table...",
@@ -265,11 +336,13 @@ program
265336
chunks: Chunk[];
266337
metadataKeys: string[];
267338
vectorsByChunkId?: Map<string, number[]>;
339+
fileFingerprints?: Record<string, string>;
268340
onProgress?: (step: IndexBuildStep) => void;
269341
} = {
270-
dbPath: lanceDbPath,
342+
dbPath: lanceDbTmpPath,
271343
chunks,
272344
metadataKeys,
345+
fileFingerprints: newFileFingerprints,
273346
onProgress: (step) => writeProgress(indexStepLabels[step]),
274347
};
275348
if (vectorsByChunkId) {
@@ -295,6 +368,12 @@ program
295368
)
296369
);
297370

371+
// Atomic swap: .lancedb.tmp → .lancedb
372+
await rm(lanceDbOldPath, { recursive: true, force: true });
373+
try { await rename(lanceDbPath, lanceDbOldPath); } catch {}
374+
await rename(lanceDbTmpPath, lanceDbPath);
375+
await rm(lanceDbOldPath, { recursive: true, force: true }).catch(() => {});
376+
298377
console.log(`wrote ${chunks.length} chunks and .lancedb index to ${outDir}`);
299378
});
300379

packages/core/src/chunk-cache.ts

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
import { connect } from "@lancedb/lancedb";
2+
import { sha256hex } from "./embedding.js";
3+
import type { Chunk, ChunkingStrategy } from "./types.js";
4+
import type { ChunkRow } from "./lancedb.js";
5+
6+
// Table name used when callers of loadChunksFromPreviousIndex don't pass one.
const DEFAULT_TABLE_NAME = "chunks";
7+
8+
/**
9+
* Compute a fingerprint for a file's chunking inputs.
10+
* Changes when the markdown, strategy, or metadata change.
11+
*/
12+
export function computeChunkFingerprint(
13+
markdown: string,
14+
strategy: ChunkingStrategy,
15+
metadata: Record<string, string>
16+
): string {
17+
return sha256hex(
18+
[JSON.stringify(strategy), JSON.stringify(metadata), markdown].join("\0")
19+
);
20+
}
21+
22+
export interface PreviousIndexReader {
23+
fingerprints: Map<string, string>; // filepath → fingerprint
24+
getChunks(filepath: string): Promise<Chunk[]>;
25+
close(): void;
26+
}
27+
28+
/**
29+
* Open the previous `.lancedb/` index and extract per-file fingerprints.
30+
* Returns null if the index doesn't exist or lacks the `file_fingerprint` column.
31+
*/
32+
export async function loadChunksFromPreviousIndex(
33+
dbPath: string,
34+
tableName?: string
35+
): Promise<PreviousIndexReader | null> {
36+
const table = tableName ?? DEFAULT_TABLE_NAME;
37+
38+
let db;
39+
try {
40+
db = await connect(dbPath);
41+
} catch {
42+
return null;
43+
}
44+
45+
try {
46+
const tableNames = await db.tableNames();
47+
if (!tableNames.includes(table)) {
48+
db.close();
49+
return null;
50+
}
51+
52+
const tbl = await db.openTable(table);
53+
54+
// Probe for file_fingerprint column by fetching a single row
55+
const probeRows = await tbl
56+
.query()
57+
.select(["filepath", "file_fingerprint"])
58+
.limit(1)
59+
.toArray();
60+
61+
if (probeRows.length === 0) {
62+
// Empty table — nothing to cache from
63+
tbl.close();
64+
db.close();
65+
return null;
66+
}
67+
68+
const probeRow = probeRows[0]!;
69+
if (
70+
!("file_fingerprint" in probeRow) ||
71+
typeof probeRow.file_fingerprint !== "string"
72+
) {
73+
// Old-format index without fingerprints
74+
tbl.close();
75+
db.close();
76+
return null;
77+
}
78+
79+
// Load all fingerprints (lightweight: only two string columns)
80+
const fpRows = await tbl
81+
.query()
82+
.select(["filepath", "file_fingerprint"])
83+
.toArray();
84+
85+
const fingerprints = new Map<string, string>();
86+
for (const row of fpRows) {
87+
const filepath = row.filepath as string;
88+
const fp = row.file_fingerprint as string;
89+
if (fp) {
90+
fingerprints.set(filepath, fp);
91+
}
92+
}
93+
94+
return {
95+
fingerprints,
96+
async getChunks(filepath: string): Promise<Chunk[]> {
97+
const escaped = filepath.replace(/'/g, "''");
98+
const rows = await tbl
99+
.query()
100+
.where(`filepath = '${escaped}'`)
101+
.toArray();
102+
103+
return rows
104+
.map((row) => row as ChunkRow)
105+
.sort((a, b) => toNumber(a.chunk_index) - toNumber(b.chunk_index))
106+
.map((row) => rowToChunk(row));
107+
},
108+
close() {
109+
tbl.close();
110+
db.close();
111+
},
112+
};
113+
} catch {
114+
db.close();
115+
return null;
116+
}
117+
}
118+
119+
function toNumber(value: unknown): number {
120+
if (typeof value === "number") return value;
121+
if (typeof value === "bigint") return Number(value);
122+
const parsed = Number(value);
123+
return Number.isFinite(parsed) ? parsed : 0;
124+
}
125+
126+
function rowToChunk(row: ChunkRow): Chunk {
127+
let metadata: Record<string, string> = {};
128+
if (typeof row.metadata_json === "string" && row.metadata_json.trim()) {
129+
try {
130+
const parsed = JSON.parse(row.metadata_json) as Record<string, unknown>;
131+
for (const [key, value] of Object.entries(parsed)) {
132+
if (typeof value === "string") {
133+
metadata[key] = value;
134+
}
135+
}
136+
} catch {
137+
metadata = {};
138+
}
139+
}
140+
141+
return {
142+
chunk_id: String(row.chunk_id ?? ""),
143+
filepath: String(row.filepath ?? ""),
144+
heading: String(row.heading ?? ""),
145+
heading_level: toNumber(row.heading_level),
146+
content: String(row.content ?? ""),
147+
content_text: String(row.content_text ?? ""),
148+
breadcrumb: String(row.breadcrumb ?? ""),
149+
chunk_index: toNumber(row.chunk_index),
150+
metadata,
151+
};
152+
}

packages/core/src/embedding.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ export class NoopEmbeddingProvider implements EmbeddingProvider {
3737
readonly name = "none";
3838
readonly model = "none";
3939
readonly dimensions = 0;
40+
readonly costPerMillionTokens = 0;
4041
readonly configFingerprint: string;
4142

4243
constructor() {
@@ -56,6 +57,7 @@ export class HashEmbeddingProvider implements EmbeddingProvider {
5657
readonly name = "hash";
5758
readonly model: string;
5859
readonly dimensions: number;
60+
readonly costPerMillionTokens = 0;
5961
readonly configFingerprint: string;
6062

6163
constructor(options: { dimensions?: number; model?: string } = {}) {
@@ -80,10 +82,18 @@ export class HashEmbeddingProvider implements EmbeddingProvider {
8082
*/
8183
const DEFAULT_MAX_INPUT_CHARS = 24_000;
8284

85+
/** Known pricing in USD per 1M input tokens for OpenAI embedding models. */
86+
const OPENAI_COST_PER_M_TOKENS: Record<string, number> = {
87+
"text-embedding-3-large": 0.13,
88+
"text-embedding-3-small": 0.02,
89+
"text-embedding-ada-002": 0.10,
90+
};
91+
8392
export class OpenAIEmbeddingProvider implements EmbeddingProvider {
8493
readonly name = "openai";
8594
readonly model: string;
8695
readonly dimensions: number;
96+
readonly costPerMillionTokens: number;
8797
readonly configFingerprint: string;
8898

8999
private readonly apiKey: string;
@@ -101,6 +111,7 @@ export class OpenAIEmbeddingProvider implements EmbeddingProvider {
101111

102112
this.apiKey = options.apiKey;
103113
this.model = options.model ?? "text-embedding-3-large";
114+
this.costPerMillionTokens = OPENAI_COST_PER_M_TOKENS[this.model] ?? 0;
104115
this.dimensions = options.dimensions ?? 3072;
105116
this.batchSize = options.batchSize ?? 128;
106117
this.baseUrl = (options.baseUrl ?? "https://api.openai.com/v1").replace(/\/$/, "");

packages/core/src/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,4 @@ export * from "./search.js";
99
export * from "./embedding.js";
1010
export * from "./embedding-cache.js";
1111
export * from "./lancedb.js";
12+
export * from "./chunk-cache.js";

0 commit comments

Comments
 (0)