Merge pull request #238 from Hi-Jiajun/fix-reviewer-concerns

rwmjhb · web-flow · commit 2b0174e9ab1c · 2026-03-18T16:15:32.000+08:00
fix: prevent infinite recursion in embedSingle() for CJK text (replaces PR #215)
diff --git a/index.ts b/index.ts
@@ -2093,8 +2093,20 @@ const memoryLanceDBProPlugin = {
           const agentId = resolveHookAgentId(ctx?.agentId, (event as any).sessionKey);
           const accessibleScopes = resolveScopeFilter(scopeManager, agentId);
 
+          // FR-04: Truncate long prompts (e.g. file attachments) before embedding.
+          // Auto-recall only needs the user's intent, not full attachment text.
+          const MAX_RECALL_QUERY_LENGTH = 1_000;
+          let recallQuery = event.prompt;
+          if (recallQuery.length > MAX_RECALL_QUERY_LENGTH) {
+            const originalLength = recallQuery.length;
+            recallQuery = recallQuery.slice(0, MAX_RECALL_QUERY_LENGTH);
+            api.logger.info(
+              `memory-lancedb-pro: auto-recall query truncated from ${originalLength} to ${MAX_RECALL_QUERY_LENGTH} chars`
+            );
+          }
+
           const results = filterUserMdExclusiveRecallResults(await retrieveWithRetry({
-            query: event.prompt,
+            query: recallQuery,
             limit: 3,
             scopeFilter: accessibleScopes,
             source: "auto-recall",
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -28,7 +28,6 @@
     "@lancedb/lancedb": "^0.26.2",
     "@sinclair/typebox": "0.34.48",
     "apache-arrow": "18.1.0",
-    "json5": "^2.2.3",
     "openai": "^6.21.0"
   },
   "openclaw": {
@@ -37,7 +36,7 @@
     ]
   },
   "scripts": {
-    "test": "node test/embedder-error-hints.test.mjs && node test/migrate-legacy-schema.test.mjs && node --test test/config-session-strategy-migration.test.mjs && node --test test/scope-access-undefined.test.mjs && node --test test/reflection-bypass-hook.test.mjs && node --test test/smart-extractor-scope-filter.test.mjs && node --test test/store-empty-scope-filter.test.mjs && node --test test/recall-text-cleanup.test.mjs && node test/update-consistency-lancedb.test.mjs && node test/cli-smoke.mjs && node test/functional-e2e.mjs && node test/retriever-rerank-regression.mjs && node test/smart-memory-lifecycle.mjs && node test/smart-extractor-branches.mjs && node test/plugin-manifest-regression.mjs && node --test test/sync-plugin-version.test.mjs && node test/smart-metadata-v2.mjs && node test/vector-search-cosine.test.mjs && node test/context-support-e2e.mjs && node test/temporal-facts.test.mjs && node test/memory-update-supersede.test.mjs && node test/memory-upgrader-diagnostics.test.mjs && node --test test/llm-api-key-client.test.mjs && node --test test/llm-oauth-client.test.mjs && node --test test/cli-oauth-login.test.mjs && node --test test/workflow-fork-guards.test.mjs",
+    "test": "node test/embedder-error-hints.test.mjs && node test/cjk-recursion-regression.test.mjs && node test/migrate-legacy-schema.test.mjs && node --test test/config-session-strategy-migration.test.mjs && node --test test/scope-access-undefined.test.mjs && node --test test/reflection-bypass-hook.test.mjs && node --test test/smart-extractor-scope-filter.test.mjs && node --test test/store-empty-scope-filter.test.mjs && node --test test/recall-text-cleanup.test.mjs && node test/update-consistency-lancedb.test.mjs && node test/cli-smoke.mjs && node test/functional-e2e.mjs && node test/retriever-rerank-regression.mjs && node test/smart-memory-lifecycle.mjs && node test/smart-extractor-branches.mjs && node test/plugin-manifest-regression.mjs && node --test test/sync-plugin-version.test.mjs && node test/smart-metadata-v2.mjs && node test/vector-search-cosine.test.mjs && node test/context-support-e2e.mjs && node test/temporal-facts.test.mjs && node test/memory-update-supersede.test.mjs && node test/memory-upgrader-diagnostics.test.mjs && node --test test/llm-api-key-client.test.mjs && node --test test/llm-oauth-client.test.mjs && node --test test/cli-oauth-login.test.mjs && node --test test/workflow-fork-guards.test.mjs",
     "test:openclaw-host": "node test/openclaw-host-functional.mjs",
     "version": "node scripts/sync-plugin-version.mjs openclaw.plugin.json package.json && git add openclaw.plugin.json"
   },
diff --git a/src/chunker.ts b/src/chunker.ts
@@ -162,6 +162,32 @@ function sliceTrimWithIndices(text: string, start: number, end: number): { chunk
   };
 }
 
+// ============================================================================
+// CJK Detection
+// ============================================================================
+
+// CJK Unicode ranges (in regex order): Hiragana, Katakana, Unified Ideographs
+// Extension A, Unified Ideographs, Hangul Syllables, Compatibility Ideographs
+const CJK_RE =
+  /[\u3040-\u309F\u30A0-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\uF900-\uFAFF]/;
+
+/**
+ * Fraction of the non-whitespace characters in `text` that match CJK_RE.
+ * Characters are counted per Unicode code point; returns 0 when none remain.
+ */
+function getCjkRatio(text: string): number {
+  // The `u` flag yields one match per code point, like iterating the string.
+  const visible: string[] = text.match(/\S/gu) ?? [];
+  if (visible.length === 0) return 0;
+  const cjkCount = visible.reduce((n, ch) => (CJK_RE.test(ch) ? n + 1 : n), 0);
+  return cjkCount / visible.length;
+}
+
+// CJK chars are ~2-3 tokens each. When a text's CJK ratio exceeds
+// CJK_RATIO_THRESHOLD, we divide char limits by this divisor to stay within the model's token budget.
+const CJK_CHAR_TOKEN_DIVISOR = 2.5;
+const CJK_RATIO_THRESHOLD = 0.3;
+
 // ============================================================================
 // Chunking Core
 // ============================================================================
@@ -239,10 +265,15 @@ export function smartChunk(text: string, embedderModel?: string): ChunkResult {
   const limit = embedderModel ? EMBEDDING_CONTEXT_LIMITS[embedderModel] : undefined;
   const base = limit ?? 8192;
 
+  // CJK characters consume ~2-3 tokens each, so a char-based limit that works
+  // for Latin text will vastly overshoot the token budget for CJK-heavy text.
+  const cjkHeavy = getCjkRatio(text) > CJK_RATIO_THRESHOLD;
+  const divisor = cjkHeavy ? CJK_CHAR_TOKEN_DIVISOR : 1;
+
   const config: ChunkerConfig = {
-    maxChunkSize: Math.max(1000, Math.floor(base * 0.7)),
-    overlapSize: Math.max(0, Math.floor(base * 0.05)),
-    minChunkSize: Math.max(100, Math.floor(base * 0.1)),
+    maxChunkSize: Math.max(200, Math.floor(base * 0.7 / divisor)),
+    overlapSize: Math.max(0, Math.floor(base * 0.05 / divisor)),
+    minChunkSize: Math.max(100, Math.floor(base * 0.1 / divisor)),
     semanticSplit: true,
     maxLinesPerChunk: 50,
   };
diff --git a/src/embedder.ts b/src/embedder.ts
@@ -352,6 +352,22 @@ export function formatEmbeddingProviderError(
   return `${genericPrefix}${detailText}`;
 }
 
+// ============================================================================
+// Safety Constants
+// ============================================================================
+
+/** Maximum recursion depth for embedSingle chunking retries. */
+const MAX_EMBED_DEPTH = 3;
+
+/** Global timeout for a single embedding operation (ms). */
+const EMBED_TIMEOUT_MS = 10_000;
+
+/**
+ * Multiplicative factor applied to the input length on forced truncation.
+ * Each forced truncation keeps at most this fraction of the text, which guarantees progress.
+ */
+const STRICT_REDUCTION_FACTOR = 0.5; // Each retry must be at most 50% of previous
+
 export function getVectorDimensions(model: string, overrideDims?: number): number {
   if (overrideDims && overrideDims > 0) {
     return overrideDims;
@@ -472,16 +488,23 @@ export class Embedder {
   /**
    * Call embeddings.create with automatic key rotation on rate-limit errors.
    * Tries each key in the pool at most once before giving up.
+   * Accepts an optional AbortSignal to support true request cancellation.
    */
-  private async embedWithRetry(payload: any): Promise<any> {
+  private async embedWithRetry(payload: any, signal?: AbortSignal): Promise<any> {
     const maxAttempts = this.clients.length;
     let lastError: Error | undefined;
 
     for (let attempt = 0; attempt < maxAttempts; attempt++) {
       const client = this.nextClient();
       try {
-        return await client.embeddings.create(payload);
+        // Pass signal to OpenAI SDK if provided (SDK v6+ supports this)
+        return await client.embeddings.create(payload, signal ? { signal } : undefined);
       } catch (error) {
+        // If aborted, re-throw immediately
+        if (error instanceof Error && error.name === 'AbortError') {
+          throw error;
+        }
+        
         lastError = error instanceof Error ? error : new Error(String(error));
 
         if (this.isRateLimitError(error) && attempt < maxAttempts - 1) {
@@ -510,6 +533,13 @@ export class Embedder {
     return this.clients.length;
   }
 
+  /** Runs `promiseFactory` with an AbortSignal that is aborted once EMBED_TIMEOUT_MS elapses; the timer is cleared on settle. `_label` is currently unused. */
+  private withTimeout<T>(promiseFactory: (signal: AbortSignal) => Promise<T>, _label: string): Promise<T> {
+    const aborter = new AbortController();
+    const timer = setTimeout(() => aborter.abort(), EMBED_TIMEOUT_MS);
+    return promiseFactory(aborter.signal).finally(() => { clearTimeout(timer); });
+  }
+
   // --------------------------------------------------------------------------
   // Backward-compatible API
   // --------------------------------------------------------------------------
@@ -534,13 +564,17 @@ export class Embedder {
   // --------------------------------------------------------------------------
 
   async embedQuery(text: string): Promise<number[]> {
-    return this.embedSingle(text, this._taskQuery);
+    return this.withTimeout((signal) => this.embedSingle(text, this._taskQuery, 0, signal), "embedQuery");
   }
 
   async embedPassage(text: string): Promise<number[]> {
-    return this.embedSingle(text, this._taskPassage);
+    return this.withTimeout((signal) => this.embedSingle(text, this._taskPassage, 0, signal), "embedPassage");
   }
 
+  // Note: embedBatchQuery/embedBatchPassage are NOT wrapped with withTimeout because
+  // they embed multiple texts per call. A single EMBED_TIMEOUT_MS deadline would
+  // fire regardless of how many texts had already succeeded, so batch embedding
+  // relies on the SDK's own timeout handling instead.
   async embedBatchQuery(texts: string[]): Promise<number[][]> {
     return this.embedMany(texts, this._taskQuery);
   }
@@ -595,17 +629,32 @@ export class Embedder {
     return payload;
   }
 
-  private async embedSingle(text: string, task?: string): Promise<number[]> {
+  private async embedSingle(text: string, task?: string, depth: number = 0, signal?: AbortSignal): Promise<number[]> {
     if (!text || text.trim().length === 0) {
       throw new Error("Cannot embed empty text");
     }
 
+    // FR-01: Recursion depth limit — force truncate when too deep
+    if (depth >= MAX_EMBED_DEPTH) {
+      const safeLimit = Math.floor(text.length * STRICT_REDUCTION_FACTOR);
+      console.warn(
+        `[memory-lancedb-pro] Recursion depth ${depth} reached MAX_EMBED_DEPTH (${MAX_EMBED_DEPTH}), ` +
+        `force-truncating ${text.length} chars → ${safeLimit} chars (strict ${STRICT_REDUCTION_FACTOR * 100}% reduction)`
+      );
+      if (safeLimit < 100) {
+        throw new Error(
+          `[memory-lancedb-pro] Failed to embed: input too large for model context after ${MAX_EMBED_DEPTH} retries`
+        );
+      }
+      text = text.slice(0, safeLimit);
+    }
+
     // Check cache first
     const cached = this._cache.get(text, task);
     if (cached) return cached;
 
     try {
-      const response = await this.embedWithRetry(this.buildPayload(text, task));
+      const response = await this.embedWithRetry(this.buildPayload(text, task), signal);
       const embedding = response.data[0]?.embedding as number[] | undefined;
       if (!embedding) {
         throw new Error("No embedding returned from provider");
@@ -628,12 +677,35 @@ export class Embedder {
             throw new Error(`Failed to chunk document: ${errorMsg}`);
           }
 
+          // FR-03: Single chunk output detection — if smartChunk produced only
+          // one chunk that is nearly the same size as the original text, chunking
+          // did not actually reduce the problem. Force-truncate with STRICT
+          // reduction to guarantee progress.
+          if (
+            chunkResult.chunks.length === 1 &&
+            chunkResult.chunks[0].length > text.length * 0.9
+          ) {
+            // Use strict reduction factor to guarantee each retry makes progress
+            const safeLimit = Math.floor(text.length * STRICT_REDUCTION_FACTOR);
+            console.warn(
+              `[memory-lancedb-pro] smartChunk produced 1 chunk (${chunkResult.chunks[0].length} chars) ≈ original (${text.length} chars). ` +
+              `Force-truncating to ${safeLimit} chars (strict ${STRICT_REDUCTION_FACTOR * 100}% reduction) to avoid infinite recursion.`
+            );
+            if (safeLimit < 100) {
+              throw new Error(
+                `[memory-lancedb-pro] Failed to embed: chunking couldn't reduce input size enough for model context`
+              );
+            }
+            const truncated = text.slice(0, safeLimit);
+            return this.embedSingle(truncated, task, depth + 1, signal);
+          }
+
           // Embed all chunks in parallel
           console.log(`Split document into ${chunkResult.chunkCount} chunks for embedding`);
           const chunkEmbeddings = await Promise.all(
             chunkResult.chunks.map(async (chunk, idx) => {
               try {
-                const embedding = await this.embedSingle(chunk, task);
+                const embedding = await this.embedSingle(chunk, task, depth + 1, signal);
                 return { embedding };
               } catch (chunkError) {
                 console.warn(`Failed to embed chunk ${idx}:`, chunkError);
@@ -661,14 +733,9 @@ export class Embedder {
 
           return finalEmbedding;
         } catch (chunkError) {
-          // If chunking fails, throw the original error
-          console.warn(`Chunking failed, using original error:`, chunkError);
-          const friendly = formatEmbeddingProviderError(error, {
-            baseURL: this._baseURL,
-            model: this._model,
-            mode: "single",
-          });
-          throw new Error(friendly, { cause: error });
+          // Preserve and surface the more specific chunkError
+          console.warn(`Chunking failed:`, chunkError);
+          throw chunkError;
         }
       }
 
diff --git a/test/cjk-recursion-regression.test.mjs b/test/cjk-recursion-regression.test.mjs