11#!/usr/bin/env node
22
3- import { mkdir , readFile , stat , writeFile } from "node:fs/promises" ;
3+ import { mkdir , readFile , rename , rm , stat , writeFile } from "node:fs/promises" ;
44import { createRequire } from "node:module" ;
55import path from "node:path" ;
66import fg from "fast-glob" ;
@@ -11,17 +11,20 @@ const { version: CLI_VERSION } = require("../package.json") as { version: string
1111import {
1212 buildLanceDbIndex ,
1313 buildChunks ,
14+ computeChunkFingerprint ,
1415 createEmbeddingProvider ,
1516 embedChunksIncremental ,
1617 loadCache ,
18+ loadChunksFromPreviousIndex ,
1719 parseManifestJson ,
1820 resolveFileConfig ,
1921 saveCache ,
2022 type Chunk ,
2123 type EmbedProgressEvent ,
2224 type EmbeddingMetadata ,
2325 type IndexBuildStep ,
24- type Manifest
26+ type Manifest ,
27+ type PreviousIndexReader
2528} from "@speakeasy-api/docs-mcp-core" ;
2629import { buildHeuristicManifest } from "./fix.js" ;
2730import { resolveSourceCommit } from "./git.js" ;
@@ -116,8 +119,63 @@ program
116119 const outDir = path . resolve ( options . out ) ;
117120 const files = await listMarkdownFiles ( docsDir ) ;
118121 const manifestCache = new Map < string , Manifest > ( ) ;
122+ const lanceDbPath = path . join ( outDir , ".lancedb" ) ;
123+ const lanceDbTmpPath = path . join ( outDir , ".lancedb.tmp" ) ;
124+ const lanceDbOldPath = path . join ( outDir , ".lancedb.old" ) ;
125+
126+ // Clean up stale tmp/old dirs from interrupted builds
127+ await rm ( lanceDbTmpPath , { recursive : true , force : true } ) ;
128+ await rm ( lanceDbOldPath , { recursive : true , force : true } ) ;
129+
130+ // Load previous index for chunk caching (old .lancedb/ stays readable during build)
131+ let previousIndex : PreviousIndexReader | null = options . rebuildCache
132+ ? null
133+ : await loadChunksFromPreviousIndex ( lanceDbPath ) ;
134+
135+ // Canary validation: re-chunk the first 10 fingerprint-matching files to
136+ // detect chunking logic changes without maintaining a version number.
137+ if ( previousIndex ) {
138+ let validated = 0 ;
139+ for ( const file of files ) {
140+ if ( validated >= 10 ) break ;
141+ const markdown = await readFile ( file , "utf8" ) ;
142+ const relative = toPosix ( path . relative ( docsDir , file ) ) ;
143+ const manifestContext = await loadNearestManifest ( file , docsDir , manifestCache ) ;
144+ const resolved = resolveFileConfig ( {
145+ relativeFilePath : relative ,
146+ markdown,
147+ ...( manifestContext
148+ ? {
149+ manifest : manifestContext . manifest ,
150+ manifestBaseDir : manifestContext . manifestBaseDir
151+ }
152+ : { } )
153+ } ) ;
154+
155+ const fingerprint = computeChunkFingerprint ( markdown , resolved . strategy , resolved . metadata ) ;
156+ if ( previousIndex . fingerprints . get ( relative ) !== fingerprint ) continue ;
157+
158+ const freshChunks = buildChunks ( {
159+ filepath : relative ,
160+ markdown,
161+ strategy : resolved . strategy ,
162+ metadata : resolved . metadata
163+ } ) ;
164+ const cachedChunks = await previousIndex . getChunks ( relative ) ;
165+
166+ if ( JSON . stringify ( freshChunks ) !== JSON . stringify ( cachedChunks ) ) {
167+ console . warn ( `warn: chunk cache canary mismatch for ${ relative } ; discarding cache` ) ;
168+ previousIndex . close ( ) ;
169+ previousIndex = null ;
170+ break ;
171+ }
172+ validated ++ ;
173+ }
174+ }
119175
120176 const chunks : Chunk [ ] = [ ] ;
177+ const newFileFingerprints : Record < string , string > = { } ;
178+ let chunkCacheHits = 0 ;
121179 for ( let fi = 0 ; fi < files . length ; fi ++ ) {
122180 writeProgress ( `Chunking [${ fi + 1 } /${ files . length } ]...` ) ;
123181 const file = files [ fi ] ! ;
@@ -135,6 +193,16 @@ program
135193 : { } )
136194 } ) ;
137195
196+ const fingerprint = computeChunkFingerprint ( markdown , resolved . strategy , resolved . metadata ) ;
197+ newFileFingerprints [ relative ] = fingerprint ;
198+
199+ if ( previousIndex ?. fingerprints . get ( relative ) === fingerprint ) {
200+ const cachedChunks = await previousIndex . getChunks ( relative ) ;
201+ chunks . push ( ...cachedChunks ) ;
202+ chunkCacheHits ++ ;
203+ continue ;
204+ }
205+
138206 const fileChunks = buildChunks ( {
139207 filepath : relative ,
140208 markdown,
@@ -144,7 +212,8 @@ program
144212 chunks . push ( ...fileChunks ) ;
145213 }
146214 clearProgress ( ) ;
147- console . warn ( `Chunked ${ files . length } files into ${ chunks . length . toLocaleString ( ) } chunks` ) ;
215+ const cacheSuffix = chunkCacheHits > 0 ? ` (${ chunkCacheHits } cached)` : "" ;
216+ console . warn ( `Chunked ${ files . length } files into ${ chunks . length . toLocaleString ( ) } chunks${ cacheSuffix } ` ) ;
148217
149218 const providerInput : {
150219 provider : "none" | "hash" | "openai" ;
@@ -252,7 +321,9 @@ program
252321 sourceCommit
253322 ) ;
254323 const metadataKeys = Object . keys ( metadata . taxonomy ) ;
255- const lanceDbPath = path . join ( outDir , ".lancedb" ) ;
324+
325+ // Close previous index before writing the new one
326+ previousIndex ?. close ( ) ;
256327
257328 const indexStepLabels : Record < IndexBuildStep , string > = {
258329 "writing-table" : "Building search index: writing table..." ,
@@ -265,11 +336,13 @@ program
265336 chunks : Chunk [ ] ;
266337 metadataKeys : string [ ] ;
267338 vectorsByChunkId ? : Map < string , number [ ] > ;
339+ fileFingerprints ? : Record < string , string > ;
268340 onProgress ?: ( step : IndexBuildStep ) => void ;
269341 } = {
270- dbPath : lanceDbPath ,
342+ dbPath : lanceDbTmpPath ,
271343 chunks,
272344 metadataKeys,
345+ fileFingerprints : newFileFingerprints ,
273346 onProgress : ( step ) => writeProgress ( indexStepLabels [ step ] ) ,
274347 } ;
275348 if ( vectorsByChunkId ) {
@@ -295,6 +368,12 @@ program
295368 )
296369 ) ;
297370
371+ // Swap .lancedb.tmp → .lancedb via two sequential renames (best-effort, not truly atomic: a crash between renames can leave only .lancedb.old)
372+ await rm ( lanceDbOldPath , { recursive : true , force : true } ) ;
373+ try { await rename ( lanceDbPath , lanceDbOldPath ) ; } catch { }
374+ await rename ( lanceDbTmpPath , lanceDbPath ) ;
375+ await rm ( lanceDbOldPath , { recursive : true , force : true } ) . catch ( ( ) => { } ) ;
376+
298377 console . log ( `wrote ${ chunks . length } chunks and .lancedb index to ${ outDir } ` ) ;
299378 } ) ;
300379
0 commit comments