@@ -11,7 +11,6 @@ import {
1111 mkdirSync ,
1212 realpathSync ,
1313 lstatSync ,
14- rmSync ,
1514 statSync ,
1615 unlinkSync ,
1716} from "node:fs" ;
@@ -67,6 +66,11 @@ async function loadLockfile(): Promise<any> {
6766 return lockfileModule ;
6867}
6968
/**
 * Test-only hook that swaps in a replacement lockfile implementation.
 *
 * Stores the given object in the module-level `lockfileModule` variable,
 * which is the value `loadLockfile()` returns.
 *
 * @param module - Stand-in (typically a mock) to use as the lockfile module.
 */
export function __setLockfileModuleForTests(module: any): void {
  lockfileModule = module;
}
73+
7074export const loadLanceDB = async ( ) : Promise <
7175 typeof import ( "@lancedb/lancedb" )
7276> => {
@@ -157,7 +161,7 @@ export function validateStoragePath(dbPath: string): string {
157161 ) {
158162 throw err ;
159163 } else {
160- // Other lstat failures ?? continue with original path
164+ // Other lstat failures — continue with original path
161165 }
162166 }
163167
@@ -201,23 +205,27 @@ export class MemoryStore {
201205 private table : LanceDB . Table | null = null ;
202206 private initPromise : Promise < void > | null = null ;
203207 private ftsIndexCreated = false ;
204- // Tail-reset serialization: replaces unbounded promise chain with a boolean flag + FIFO queue.
205- private _updating = false ;
206- private _waitQueue : Array < ( ) => void > = [ ] ;
208+ private updateQueue : Promise < void > = Promise . resolve ( ) ;
207209
208210 constructor ( private readonly config : StoreConfig ) { }
209211
210212 private async runWithFileLock < T > ( fn : ( ) => Promise < T > ) : Promise < T > {
211213 const lockfile = await loadLockfile ( ) ;
212214 const lockPath = join ( this . config . dbPath , ".memory-write.lock" ) ;
213-
214- // Ensure lock file exists before locking (proper-lockfile requires it)
215215 if ( ! existsSync ( lockPath ) ) {
216216 try { mkdirSync ( dirname ( lockPath ) , { recursive : true } ) ; } catch { }
217217 try { const { writeFileSync } = await import ( "node:fs" ) ; writeFileSync ( lockPath , "" , { flag : "wx" } ) ; } catch { }
218218 }
219-
220- // Proactive cleanup of stale lock artifacts (fixes stale-lock ECOMPROMISED)
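219+ // [Fix #415] Retry tuning: max wait raised from ~3100ms to ~181 s.
220+ // Exponential backoff: 1s, 2s, 4s, 8s, 16s, then 30s x5 (capped by maxTimeout), about 181 s in total.
221+ // ECOMPROMISED is delivered through the onCompromised callback (not thrown), so it is tracked with flags.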
222+ let isCompromised = false ;
223+ let compromisedErr : unknown = null ;
224+ let fnSucceeded = false ;
225+ let fnError : unknown = null ;
226+
227+ // Proactive cleanup of stale lock artifacts (from PR #626).
228+ // Prevents lock artifacts older than 5 minutes from causing ECOMPROMISED in the first place.
221229 if ( existsSync ( lockPath ) ) {
222230 try {
223231 const stat = statSync ( lockPath ) ;
@@ -231,10 +239,61 @@ export class MemoryStore {
231239 }
232240
233241 const release = await lockfile . lock ( lockPath , {
234- retries : { retries : 10 , factor : 2 , minTimeout : 200 , maxTimeout : 5000 } ,
235- stale : 10000 ,
242+ retries : {
243+ retries : 10 ,
244+ factor : 2 ,
245+ minTimeout : 1000 , // James 保守設定:避免高負載下過度密集重試
246+ maxTimeout : 30000 , // James 保守設定:支撐更久的 event loop 阻塞
247+ } ,
248+ stale : 10000 , // 10 秒後視為 stale,觸發 ECOMPROMISED callback
249+ // Note: ECOMPROMISED is an ambiguous degradation signal; mtime cannot distinguish
250+ // "holder crashed" from "holder event loop blocked", so no attempt is made to tell them apart.
251+ onCompromised : ( err : unknown ) => {
252+ // [Fix #415, key point] This callback must be synchronous.
253+ // setLockAsCompromised() does not await a Promise, so an async throw cannot propagate back to the caller.
254+ isCompromised = true ;
255+ compromisedErr = err ;
256+ } ,
236257 } ) ;
237- try { return await fn ( ) ; } finally { await release ( ) ; }
258+
259+ try {
260+ const result = await fn ( ) ;
261+ fnSucceeded = true ;
262+ return result ;
263+ } catch ( e : unknown ) {
264+ fnError = e ;
265+ throw e ;
266+ } finally {
267+ if ( isCompromised ) {
268+ // fnError takes priority: when fn() failed, its error matters more than the compromised signal.
269+ if ( fnError !== null ) {
270+ throw fnError ;
271+ }
272+ // The lock was compromised before fn() completed: throw so the caller knows to retry.
273+ if ( ! fnSucceeded ) {
274+ throw compromisedErr as Error ;
275+ }
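276+ // fn() succeeded, but the lock was marked compromised while it was running.
277+ // Correct behavior: return the successful result (data is already written) and tell the caller explicitly not to retry.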
278+ console . warn (
279+ `[memory-lancedb-pro] Returning successful result despite compromised lock at "${ lockPath } ". ` +
280+ `Callers must not retry this operation automatically.` ,
281+ ) ;
282+ // [Fix #415] After a compromise, release() fails with ERELEASED; that error is safe to ignore.
283+ // Important: do not `return` here! A return inside finally would override the value returned from try.
284+ try {
285+ await release ( ) ;
286+ } catch ( e : unknown ) {
287+ if ( ( e as NodeJS . ErrnoException ) . code === 'ERELEASED' ) {
288+ // ERELEASED is expected here; do nothing so the value returned from try passes through.
289+ } else {
290+ throw e ; // 其他錯誤照拋
291+ }
292+ }
293+ } else {
294+ await release ( ) ;
295+ }
296+ }
238297 }
239298
240299 get dbPath ( ) : string {
@@ -297,24 +356,24 @@ export class MemoryStore {
297356
298357 if ( missingColumns . length > 0 ) {
299358 console . warn (
300- `memory-lancedb-pro: migrating legacy table ?? adding columns: ${ missingColumns . map ( ( c ) => c . name ) . join ( ", " ) } ` ,
359+ `memory-lancedb-pro: migrating legacy table — adding columns: ${ missingColumns . map ( ( c ) => c . name ) . join ( ", " ) } ` ,
301360 ) ;
302361 await table . addColumns ( missingColumns ) ;
303362 console . log (
304- `memory-lancedb-pro: migration complete ?? ${ missingColumns . length } column(s) added` ,
363+ `memory-lancedb-pro: migration complete — ${ missingColumns . length } column(s) added` ,
305364 ) ;
306365 }
307366 } catch ( err ) {
308367 const msg = String ( err ) ;
309368 if ( msg . includes ( "already exists" ) ) {
310- // Concurrent initialization race ?? another process already added the columns
369+ // Concurrent initialization race — another process already added the columns
311370 console . log ( "memory-lancedb-pro: migration columns already exist (concurrent init)" ) ;
312371 } else {
313372 console . warn ( "memory-lancedb-pro: could not check/migrate table schema:" , err ) ;
314373 }
315374 }
316375 } catch ( _openErr ) {
317- // Table doesn't exist yet ?? create it
376+ // Table doesn't exist yet — create it
318377 const schemaEntry : MemoryEntry = {
319378 id : "__schema__" ,
320379 text : "" ,
@@ -333,7 +392,7 @@ export class MemoryStore {
333392 await table . delete ( 'id = "__schema__"' ) ;
334393 } catch ( createErr ) {
335394 // Race: another caller (or eventual consistency) created the table
336- // between our failed openTable and this createTable ?? just open it.
395+ // between our failed openTable and this createTable — just open it.
337396 if ( String ( createErr ) . includes ( "already exists" ) ) {
338397 table = await db . openTable ( TABLE_NAME ) ;
339398 } else {
@@ -408,9 +467,10 @@ export class MemoryStore {
408467 return this . runWithFileLock ( async ( ) => {
409468 try {
410469 await this . table ! . add ( [ fullEntry ] ) ;
411- } catch ( err : any ) {
412- const code = err . code || "" ;
413- const message = err . message || String ( err ) ;
470+ } catch ( err : unknown ) {
471+ const e = err as { code ?: string ; message ?: string } ;
472+ const code = e . code || "" ;
473+ const message = e . message || String ( err ) ;
414474 throw new Error (
415475 `Failed to store memory in "${ this . config . dbPath } ": ${ code } ${ message } ` ,
416476 ) ;
@@ -465,12 +525,6 @@ export class MemoryStore {
465525 return res . length > 0 ;
466526 }
467527
468- /** Lightweight total row count via LanceDB countRows(). */
469- async count ( ) : Promise < number > {
470- await this . ensureInitialized ( ) ;
471- return await this . table ! . countRows ( ) ;
472- }
473-
474528 async getById ( id : string , scopeFilter ?: string [ ] ) : Promise < MemoryEntry | null > {
475529 await this . ensureInitialized ( ) ;
476530
@@ -901,7 +955,7 @@ export class MemoryStore {
901955 throw new Error ( `Memory ${ id } is outside accessible scopes` ) ;
902956 }
903957
904- return this . runWithFileLock ( async ( ) => {
958+ return this . runWithFileLock ( ( ) => this . runSerializedUpdate ( async ( ) => {
905959 // Support both full UUID and short prefix (8+ hex chars), same as delete()
906960 const uuidRegex =
907961 / ^ [ 0 - 9 a - f ] { 8 } - [ 0 - 9 a - f ] { 4 } - [ 0 - 9 a - f ] { 4 } - [ 0 - 9 a - f ] { 4 } - [ 0 - 9 a - f ] { 12 } $ / i;
@@ -1016,25 +1070,22 @@ export class MemoryStore {
10161070 }
10171071
10181072 return updated ;
1019- } ) ;
1073+ } ) ) ;
10201074 }
10211075
/**
 * Runs `action` only after every update queued before it has finished, so
 * work submitted through this method executes one at a time, in call order.
 *
 * Each call appends its own "turn" promise to `updateQueue`. The turn is
 * resolved in `finally`, so a rejected `action` still unblocks later callers.
 *
 * @param action - Async work to run once all earlier queued work is done.
 * @returns The value `action` resolves to; rejections from `action` propagate.
 */
private async runSerializedUpdate<T>(action: () => Promise<T>): Promise<T> {
  const predecessor = this.updateQueue;

  // The Promise executor runs synchronously, so `finishTurn` is always
  // swapped for the real resolver before it can be invoked below.
  let finishTurn: () => void = () => {};
  const turn = new Promise<void>((resolve) => {
    finishTurn = resolve;
  });

  // Later callers wait for the predecessor and then for this call's turn.
  this.updateQueue = predecessor.then(() => turn);

  await predecessor;
  try {
    return await action();
  } finally {
    finishTurn();
  }
}
10401091
0 commit comments