From 9fd3d27dbedba3f5c7d4624323fdc50614479bbb Mon Sep 17 00:00:00 2001
From: RooikeCAO <24and8@gmail.com>
Date: Sun, 15 Mar 2026 03:58:19 +0800
Subject: [PATCH] fix: prevent infinite recursion in embedSingle() for CJK text
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a large CJK text (14KB+ Chinese .md file) is processed by
auto-recall, embedSingle() enters an infinite recursion loop because:

1. smartChunk() treats token limits as character limits, but CJK
   characters use 2-3x more tokens than ASCII characters
2. Chunks of 5734 chars (70% of the 8192-token limit, applied as a
   character count) still exceed the model's token context for CJK text
3. smartChunk() returns 1 chunk identical to input → embedSingle()
   recurses with the same text → infinite loop

This produced ~50,000 embedding errors in 12 minutes, blocking
the entire Node.js event loop and making all agents unresponsive.

Fixes:
- Add recursion depth limit (max 3) to embedSingle() with forced
  truncation as fallback
- Detect single-chunk output (one chunk longer than 90% of the input)
  and retry with force-truncated text instead of recursing on the
  unchanged input
- Add CJK-aware chunk sizing in smartChunk() (divide char limit
  by 2.5 when CJK ratio > 30%)
- Truncate auto-recall query to 1000 chars before embedding
- Add 10s global timeout on embedPassage()/embedQuery()

Closes #214

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 index.ts        | 14 +++++++-
 src/chunker.ts  | 37 +++++++++++++++++++--
 src/embedder.ts | 85 +++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 125 insertions(+), 11 deletions(-)

diff --git a/index.ts b/index.ts
index 32f5e778..396517a9 100644
--- a/index.ts
+++ b/index.ts
@@ -2002,8 +2002,20 @@ const memoryLanceDBProPlugin = {
           const agentId = resolveHookAgentId(ctx?.agentId, (event as any).sessionKey);
           const accessibleScopes = scopeManager.getAccessibleScopes(agentId);
 
+          // FR-04: Truncate long prompts (e.g. file attachments) before embedding.
+          // Auto-recall only needs the user's intent, not full attachment text.
+          const MAX_RECALL_QUERY_LENGTH = 1_000;
+          let recallQuery = event.prompt;
+          if (recallQuery.length > MAX_RECALL_QUERY_LENGTH) {
+            const originalLength = recallQuery.length;
+            recallQuery = recallQuery.slice(0, MAX_RECALL_QUERY_LENGTH);
+            api.logger.info(
+              `memory-lancedb-pro: auto-recall query truncated from ${originalLength} to ${MAX_RECALL_QUERY_LENGTH} chars`
+            );
+          }
+
           const results = await retrieveWithRetry({
-            query: event.prompt,
+            query: recallQuery,
             limit: 3,
             scopeFilter: accessibleScopes,
             source: "auto-recall",
diff --git a/src/chunker.ts b/src/chunker.ts
index d1581237..2ac5955b 100644
--- a/src/chunker.ts
+++ b/src/chunker.ts
@@ -162,6 +162,32 @@ function sliceTrimWithIndices(text: string, start: number, end: number): { chunk
   };
 }
 
+// ============================================================================
+// CJK Detection
+// ============================================================================
+
+// CJK Unicode ranges: Unified Ideographs, Extension A, Compatibility,
+// Hangul Syllables, Katakana, Hiragana
+const CJK_RE =
+  /[\u3040-\u309F\u30A0-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\uF900-\uFAFF]/;
+
+/** Fraction of the non-whitespace characters in `text` that match CJK_RE (0 if there are none). */
+function getCjkRatio(text: string): number {
+  let cjkCount = 0;
+  let visibleCount = 0;
+  for (const char of text) {
+    if (char.trim().length === 0) continue; // whitespace counts toward neither total
+    visibleCount += 1;
+    if (CJK_RE.test(char)) cjkCount += 1;
+  }
+  return visibleCount > 0 ? cjkCount / visibleCount : 0;
+}
+
+// CJK chars are ~2-3 tokens each. When text is predominantly CJK, we divide
+// char limits by this factor to stay within the model's token budget.
+const CJK_CHAR_TOKEN_DIVISOR = 2.5;
+const CJK_RATIO_THRESHOLD = 0.3;
+
 // ============================================================================
 // Chunking Core
 // ============================================================================
@@ -239,10 +265,15 @@ export function smartChunk(text: string, embedderModel?: string): ChunkResult {
   const limit = embedderModel ? EMBEDDING_CONTEXT_LIMITS[embedderModel] : undefined;
   const base = limit ?? 8192;
 
+  // CJK characters consume ~2-3 tokens each, so a char-based limit that works
+  // for Latin text will vastly overshoot the token budget for CJK-heavy text.
+  const cjkHeavy = getCjkRatio(text) > CJK_RATIO_THRESHOLD;
+  const divisor = cjkHeavy ? CJK_CHAR_TOKEN_DIVISOR : 1;
+
   const config: ChunkerConfig = {
-    maxChunkSize: Math.max(1000, Math.floor(base * 0.7)),
-    overlapSize: Math.max(0, Math.floor(base * 0.05)),
-    minChunkSize: Math.max(100, Math.floor(base * 0.1)),
+    maxChunkSize: Math.max(1000, Math.floor(base * 0.7 / divisor)),
+    overlapSize: Math.max(0, Math.floor(base * 0.05 / divisor)),
+    minChunkSize: Math.max(100, Math.floor(base * 0.1 / divisor)),
     semanticSplit: true,
     maxLinesPerChunk: 50,
   };
diff --git a/src/embedder.ts b/src/embedder.ts
index 5009425e..368970f5 100644
--- a/src/embedder.ts
+++ b/src/embedder.ts
@@ -248,6 +248,35 @@ export function formatEmbeddingProviderError(
   return `${genericPrefix}${detailText}`;
 }
 
+// ============================================================================
+// Safety Constants
+// ============================================================================
+
+/** Maximum recursion depth for embedSingle chunking retries. */
+const MAX_EMBED_DEPTH = 3;
+
+/** Global timeout for a single embedding operation (ms). */
+const EMBED_TIMEOUT_MS = 10_000;
+
+/**
+ * Safe character limits per model for forced truncation.
+ * CJK characters typically consume ~3 tokens each, so the char limit is
+ * conservative compared to the token limit.
+ */
+const SAFE_CHAR_LIMITS: Record<string, number> = {
+  "nomic-embed-text": 2300,
+  "mxbai-embed-large": 2300,
+  "all-MiniLM-L6-v2": 1000,
+  "all-mpnet-base-v2": 1500,
+};
+
+const DEFAULT_SAFE_CHAR_LIMIT = 2000;
+
+/** Return a safe character count for forced truncation given a model name. */
+function getSafeCharLimit(model: string): number {
+  return SAFE_CHAR_LIMITS[model] ?? DEFAULT_SAFE_CHAR_LIMIT;
+}
+
 export function getVectorDimensions(model: string, overrideDims?: number): number {
   if (overrideDims && overrideDims > 0) {
     return overrideDims;
@@ -391,6 +420,21 @@ export class Embedder {
     return this.clients.length;
   }
 
+  /** FR-05: Reject after EMBED_TIMEOUT_MS if `promise` has not settled (the work itself is not cancelled). */
+  private withTimeout<T>(promise: Promise<T>, label: string): Promise<T> {
+    let timer: ReturnType<typeof setTimeout> | undefined;
+    const timeout = new Promise<never>((_, reject) => {
+      timer = setTimeout(
+        () => reject(new Error(
+          `[memory-lancedb-pro] ${label} timed out after ${EMBED_TIMEOUT_MS}ms`
+        )),
+        EMBED_TIMEOUT_MS,
+      );
+    });
+    // Clear the timer once either side settles so it cannot keep the event loop alive.
+    return Promise.race([promise, timeout]).finally(() => clearTimeout(timer));
+  }
+
   // --------------------------------------------------------------------------
   // Backward-compatible API
   // --------------------------------------------------------------------------
@@ -415,11 +459,11 @@ export class Embedder {
   // --------------------------------------------------------------------------
 
   async embedQuery(text: string): Promise<number[]> {
-    return this.embedSingle(text, this._taskQuery);
+    return this.withTimeout(this.embedSingle(text, this._taskQuery), "embedQuery");
   }
 
   async embedPassage(text: string): Promise<number[]> {
-    return this.embedSingle(text, this._taskPassage);
+    return this.withTimeout(this.embedSingle(text, this._taskPassage), "embedPassage");
   }
 
   async embedBatchQuery(texts: string[]): Promise<number[][]> {
@@ -466,11 +510,21 @@ export class Embedder {
     return payload;
   }
 
-  private async embedSingle(text: string, task?: string): Promise<number[]> {
+  private async embedSingle(text: string, task?: string, depth: number = 0): Promise<number[]> {
     if (!text || text.trim().length === 0) {
       throw new Error("Cannot embed empty text");
     }
 
+    // FR-01: Recursion depth limit — force truncate when too deep
+    if (depth >= MAX_EMBED_DEPTH) {
+      const safeLimit = getSafeCharLimit(this._model);
+      console.warn(
+        `[memory-lancedb-pro] Recursion depth ${depth} reached MAX_EMBED_DEPTH (${MAX_EMBED_DEPTH}), ` +
+        `force-truncating ${text.length} chars → ${safeLimit} chars`
+      );
+      text = text.slice(0, safeLimit);
+    }
+
     // Check cache first
     const cached = this._cache.get(text, task);
     if (cached) return cached;
@@ -494,17 +548,34 @@ export class Embedder {
         try {
           console.log(`Document exceeded context limit (${errorMsg}), attempting chunking...`);
           const chunkResult = smartChunk(text, this._model);
-          
+
           if (chunkResult.chunks.length === 0) {
             throw new Error(`Failed to chunk document: ${errorMsg}`);
           }
 
+          // FR-03: Single chunk output detection — if smartChunk produced only
+          // one chunk that is nearly the same size as the original text, chunking
+          // did not actually reduce the problem.  Force-truncate instead of
+          // recursing (which would loop forever).
+          if (
+            chunkResult.chunks.length === 1 &&
+            chunkResult.chunks[0].length > text.length * 0.9
+          ) {
+            const safeLimit = getSafeCharLimit(this._model);
+            console.warn(
+              `[memory-lancedb-pro] smartChunk produced 1 chunk (${chunkResult.chunks[0].length} chars) ≈ original (${text.length} chars). ` +
+              `Force-truncating to ${safeLimit} chars to avoid infinite recursion.`
+            );
+            const truncated = text.slice(0, safeLimit);
+            return this.embedSingle(truncated, task, depth + 1);
+          }
+
           // Embed all chunks in parallel
           console.log(`Split document into ${chunkResult.chunkCount} chunks for embedding`);
           const chunkEmbeddings = await Promise.all(
             chunkResult.chunks.map(async (chunk, idx) => {
               try {
-                const embedding = await this.embedSingle(chunk, task);
+                const embedding = await this.embedSingle(chunk, task, depth + 1);
                 return { embedding };
               } catch (chunkError) {
                 console.warn(`Failed to embed chunk ${idx}:`, chunkError);
@@ -525,11 +596,11 @@ export class Embedder {
           );
 
           const finalEmbedding = avgEmbedding.map(v => v / chunkEmbeddings.length);
-          
+
           // Cache the result for the original text (using its hash)
           this._cache.set(text, task, finalEmbedding);
           console.log(`Successfully embedded long document as ${chunkEmbeddings.length} averaged chunks`);
-          
+
           return finalEmbedding;
         } catch (chunkError) {
           // If chunking fails, throw the original error