From 9fd3d27dbedba3f5c7d4624323fdc50614479bbb Mon Sep 17 00:00:00 2001 From: RooikeCAO <24and8@gmail.com> Date: Sun, 15 Mar 2026 03:58:19 +0800 Subject: [PATCH] fix: prevent infinite recursion in embedSingle() for CJK text MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a large CJK text (14KB+ Chinese .md file) is processed by auto-recall, embedSingle() enters an infinite recursion loop because: 1. smartChunk() treats token limits as character limits, but CJK characters use 2-3x more tokens than ASCII characters 2. Chunks of 5740 chars (70% of 8192 token limit) still exceed the model's token context for CJK text 3. smartChunk() returns 1 chunk identical to input → embedSingle() recurses with the same text → infinite loop This produced ~50,000 embedding errors in 12 minutes, blocking the entire Node.js event loop and making all agents unresponsive. Fixes: - Add recursion depth limit (max 3) to embedSingle() with forced truncation as fallback - Detect single-chunk output (same size as input) and truncate instead of recursing - Add CJK-aware chunk sizing in smartChunk() (divide char limit by 2.5 when CJK ratio > 30%) - Truncate auto-recall query to 1000 chars before embedding - Add 10s global timeout on embedPassage()/embedQuery() Closes #214 Co-Authored-By: Claude Opus 4.6 (1M context) --- index.ts | 14 +++++++- src/chunker.ts | 37 +++++++++++++++++++-- src/embedder.ts | 85 +++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 125 insertions(+), 11 deletions(-) diff --git a/index.ts b/index.ts index 32f5e778..396517a9 100644 --- a/index.ts +++ b/index.ts @@ -2002,8 +2002,20 @@ const memoryLanceDBProPlugin = { const agentId = resolveHookAgentId(ctx?.agentId, (event as any).sessionKey); const accessibleScopes = scopeManager.getAccessibleScopes(agentId); + // FR-04: Truncate long prompts (e.g. file attachments) before embedding. + // Auto-recall only needs the user's intent, not full attachment text. + const MAX_RECALL_QUERY_LENGTH = 1_000; + let recallQuery = event.prompt; + if (recallQuery.length > MAX_RECALL_QUERY_LENGTH) { + const originalLength = recallQuery.length; + recallQuery = recallQuery.slice(0, MAX_RECALL_QUERY_LENGTH); + api.logger.info( + `memory-lancedb-pro: auto-recall query truncated from ${originalLength} to ${MAX_RECALL_QUERY_LENGTH} chars` + ); + } + const results = await retrieveWithRetry({ - query: event.prompt, + query: recallQuery, limit: 3, scopeFilter: accessibleScopes, source: "auto-recall", diff --git a/src/chunker.ts b/src/chunker.ts index d1581237..2ac5955b 100644 --- a/src/chunker.ts +++ b/src/chunker.ts @@ -162,6 +162,32 @@ function sliceTrimWithIndices(text: string, start: number, end: number): { chunk }; } +// ============================================================================ +// CJK Detection +// ============================================================================ + +// CJK Unicode ranges: Unified Ideographs, Extension A, Compatibility, +// Hangul Syllables, Katakana, Hiragana +const CJK_RE = + /[\u3040-\u309F\u30A0-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\uF900-\uFAFF]/; + +/** Ratio of CJK characters to total non-whitespace characters. */ +function getCjkRatio(text: string): number { + let cjk = 0; + let total = 0; + for (const ch of text) { + if (/\s/.test(ch)) continue; + total++; + if (CJK_RE.test(ch)) cjk++; + } + return total === 0 ? 0 : cjk / total; +} + +// CJK chars are ~2-3 tokens each. When text is predominantly CJK, we divide +// char limits by this factor to stay within the model's token budget. +const CJK_CHAR_TOKEN_DIVISOR = 2.5; +const CJK_RATIO_THRESHOLD = 0.3; + // ============================================================================ // Chunking Core // ============================================================================ @@ -239,10 +265,15 @@ export function smartChunk(text: string, embedderModel?: string): ChunkResult { const limit = embedderModel ? EMBEDDING_CONTEXT_LIMITS[embedderModel] : undefined; const base = limit ?? 8192; + // CJK characters consume ~2-3 tokens each, so a char-based limit that works + // for Latin text will vastly overshoot the token budget for CJK-heavy text. + const cjkHeavy = getCjkRatio(text) > CJK_RATIO_THRESHOLD; + const divisor = cjkHeavy ? CJK_CHAR_TOKEN_DIVISOR : 1; + const config: ChunkerConfig = { - maxChunkSize: Math.max(1000, Math.floor(base * 0.7)), - overlapSize: Math.max(0, Math.floor(base * 0.05)), - minChunkSize: Math.max(100, Math.floor(base * 0.1)), + maxChunkSize: Math.max(1000, Math.floor(base * 0.7 / divisor)), + overlapSize: Math.max(0, Math.floor(base * 0.05 / divisor)), + minChunkSize: Math.max(100, Math.floor(base * 0.1 / divisor)), semanticSplit: true, maxLinesPerChunk: 50, }; diff --git a/src/embedder.ts b/src/embedder.ts index 5009425e..368970f5 100644 --- a/src/embedder.ts +++ b/src/embedder.ts @@ -248,6 +248,35 @@ export function formatEmbeddingProviderError( return `${genericPrefix}${detailText}`; } +// ============================================================================ +// Safety Constants +// ============================================================================ + +/** Maximum recursion depth for embedSingle chunking retries. */ +const MAX_EMBED_DEPTH = 3; + +/** Global timeout for a single embedding operation (ms). */ +const EMBED_TIMEOUT_MS = 10_000; + +/** + * Safe character limits per model for forced truncation. + * CJK characters typically consume ~3 tokens each, so the char limit is + * conservative compared to the token limit. + */ +const SAFE_CHAR_LIMITS: Record = { + "nomic-embed-text": 2300, + "mxbai-embed-large": 2300, + "all-MiniLM-L6-v2": 1000, + "all-mpnet-base-v2": 1500, +}; + +const DEFAULT_SAFE_CHAR_LIMIT = 2000; + +/** Return a safe character count for forced truncation given a model name. */ +function getSafeCharLimit(model: string): number { + return SAFE_CHAR_LIMITS[model] ?? DEFAULT_SAFE_CHAR_LIMIT; +} + export function getVectorDimensions(model: string, overrideDims?: number): number { if (overrideDims && overrideDims > 0) { return overrideDims; @@ -391,6 +420,21 @@ export class Embedder { return this.clients.length; } + /** FR-05: Wrap a promise with a global timeout to prevent indefinite hangs. */ + private withTimeout(promise: Promise, label: string): Promise { + return Promise.race([ + promise, + new Promise((_, reject) => { + setTimeout( + () => reject(new Error( + `[memory-lancedb-pro] ${label} timed out after ${EMBED_TIMEOUT_MS}ms` + )), + EMBED_TIMEOUT_MS, + ); + }), + ]); + } + // -------------------------------------------------------------------------- // Backward-compatible API // -------------------------------------------------------------------------- @@ -415,11 +459,11 @@ export class Embedder { // -------------------------------------------------------------------------- async embedQuery(text: string): Promise { - return this.embedSingle(text, this._taskQuery); + return this.withTimeout(this.embedSingle(text, this._taskQuery), "embedQuery"); } async embedPassage(text: string): Promise { - return this.embedSingle(text, this._taskPassage); + return this.withTimeout(this.embedSingle(text, this._taskPassage), "embedPassage"); } async embedBatchQuery(texts: string[]): Promise { @@ -466,11 +510,21 @@ export class Embedder { return payload; } - private async embedSingle(text: string, task?: string): Promise { + private async embedSingle(text: string, task?: string, depth: number = 0): Promise { if (!text || text.trim().length === 0) { throw new Error("Cannot embed empty text"); } + // FR-01: Recursion depth limit — force truncate when too deep + if (depth >= MAX_EMBED_DEPTH) { + const safeLimit = getSafeCharLimit(this._model); + console.warn( + `[memory-lancedb-pro] Recursion depth ${depth} reached MAX_EMBED_DEPTH (${MAX_EMBED_DEPTH}), ` + + `force-truncating ${text.length} chars → ${safeLimit} chars` + ); + text = text.slice(0, safeLimit); + } + // Check cache first const cached = this._cache.get(text, task); if (cached) return cached; @@ -494,17 +548,34 @@ export class Embedder { try { console.log(`Document exceeded context limit (${errorMsg}), attempting chunking...`); const chunkResult = smartChunk(text, this._model); - + if (chunkResult.chunks.length === 0) { throw new Error(`Failed to chunk document: ${errorMsg}`); } + // FR-03: Single chunk output detection — if smartChunk produced only + // one chunk that is nearly the same size as the original text, chunking + // did not actually reduce the problem. Force-truncate instead of + // recursing (which would loop forever). + if ( + chunkResult.chunks.length === 1 && + chunkResult.chunks[0].length > text.length * 0.9 + ) { + const safeLimit = getSafeCharLimit(this._model); + console.warn( + `[memory-lancedb-pro] smartChunk produced 1 chunk (${chunkResult.chunks[0].length} chars) ≈ original (${text.length} chars). ` + + `Force-truncating to ${safeLimit} chars to avoid infinite recursion.` + ); + const truncated = text.slice(0, safeLimit); + return this.embedSingle(truncated, task, depth + 1); + } + // Embed all chunks in parallel console.log(`Split document into ${chunkResult.chunkCount} chunks for embedding`); const chunkEmbeddings = await Promise.all( chunkResult.chunks.map(async (chunk, idx) => { try { - const embedding = await this.embedSingle(chunk, task); + const embedding = await this.embedSingle(chunk, task, depth + 1); return { embedding }; } catch (chunkError) { console.warn(`Failed to embed chunk ${idx}:`, chunkError); @@ -525,11 +596,11 @@ export class Embedder { ); const finalEmbedding = avgEmbedding.map(v => v / chunkEmbeddings.length); - + // Cache the result for the original text (using its hash) this._cache.set(text, task, finalEmbedding); console.log(`Successfully embedded long document as ${chunkEmbeddings.length} averaged chunks`); - + return finalEmbedding; } catch (chunkError) { // If chunking fails, throw the original error