Skip to content

Commit 2b0174e

Browse files
authored
Merge pull request #238 from Hi-Jiajun/fix-reviewer-concerns
fix: prevent infinite recursion in embedSingle() for CJK text (replaces PR #215)
2 parents 4daa1bc + b152482 commit 2b0174e

File tree

6 files changed

+384
-35
lines changed

6 files changed

+384
-35
lines changed

index.ts

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2093,8 +2093,20 @@ const memoryLanceDBProPlugin = {
20932093
const agentId = resolveHookAgentId(ctx?.agentId, (event as any).sessionKey);
20942094
const accessibleScopes = resolveScopeFilter(scopeManager, agentId);
20952095

2096+
// FR-04: Truncate long prompts (e.g. file attachments) before embedding.
2097+
// Auto-recall only needs the user's intent, not full attachment text.
2098+
const MAX_RECALL_QUERY_LENGTH = 1_000;
2099+
let recallQuery = event.prompt;
2100+
if (recallQuery.length > MAX_RECALL_QUERY_LENGTH) {
2101+
const originalLength = recallQuery.length;
2102+
recallQuery = recallQuery.slice(0, MAX_RECALL_QUERY_LENGTH);
2103+
api.logger.info(
2104+
`memory-lancedb-pro: auto-recall query truncated from ${originalLength} to ${MAX_RECALL_QUERY_LENGTH} chars`
2105+
);
2106+
}
2107+
20962108
const results = filterUserMdExclusiveRecallResults(await retrieveWithRetry({
2097-
query: event.prompt,
2109+
query: recallQuery,
20982110
limit: 3,
20992111
scopeFilter: accessibleScopes,
21002112
source: "auto-recall",

package-lock.json

Lines changed: 0 additions & 14 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
"@lancedb/lancedb": "^0.26.2",
2929
"@sinclair/typebox": "0.34.48",
3030
"apache-arrow": "18.1.0",
31-
"json5": "^2.2.3",
3231
"openai": "^6.21.0"
3332
},
3433
"openclaw": {
@@ -37,7 +36,7 @@
3736
]
3837
},
3938
"scripts": {
40-
"test": "node test/embedder-error-hints.test.mjs && node test/migrate-legacy-schema.test.mjs && node --test test/config-session-strategy-migration.test.mjs && node --test test/scope-access-undefined.test.mjs && node --test test/reflection-bypass-hook.test.mjs && node --test test/smart-extractor-scope-filter.test.mjs && node --test test/store-empty-scope-filter.test.mjs && node --test test/recall-text-cleanup.test.mjs && node test/update-consistency-lancedb.test.mjs && node test/cli-smoke.mjs && node test/functional-e2e.mjs && node test/retriever-rerank-regression.mjs && node test/smart-memory-lifecycle.mjs && node test/smart-extractor-branches.mjs && node test/plugin-manifest-regression.mjs && node --test test/sync-plugin-version.test.mjs && node test/smart-metadata-v2.mjs && node test/vector-search-cosine.test.mjs && node test/context-support-e2e.mjs && node test/temporal-facts.test.mjs && node test/memory-update-supersede.test.mjs && node test/memory-upgrader-diagnostics.test.mjs && node --test test/llm-api-key-client.test.mjs && node --test test/llm-oauth-client.test.mjs && node --test test/cli-oauth-login.test.mjs && node --test test/workflow-fork-guards.test.mjs",
39+
"test": "node test/embedder-error-hints.test.mjs && node test/cjk-recursion-regression.test.mjs && node test/migrate-legacy-schema.test.mjs && node --test test/config-session-strategy-migration.test.mjs && node --test test/scope-access-undefined.test.mjs && node --test test/reflection-bypass-hook.test.mjs && node --test test/smart-extractor-scope-filter.test.mjs && node --test test/store-empty-scope-filter.test.mjs && node --test test/recall-text-cleanup.test.mjs && node test/update-consistency-lancedb.test.mjs && node test/cli-smoke.mjs && node test/functional-e2e.mjs && node test/retriever-rerank-regression.mjs && node test/smart-memory-lifecycle.mjs && node test/smart-extractor-branches.mjs && node test/plugin-manifest-regression.mjs && node --test test/sync-plugin-version.test.mjs && node test/smart-metadata-v2.mjs && node test/vector-search-cosine.test.mjs && node test/context-support-e2e.mjs && node test/temporal-facts.test.mjs && node test/memory-update-supersede.test.mjs && node test/memory-upgrader-diagnostics.test.mjs && node --test test/llm-api-key-client.test.mjs && node --test test/llm-oauth-client.test.mjs && node --test test/cli-oauth-login.test.mjs && node --test test/workflow-fork-guards.test.mjs",
4140
"test:openclaw-host": "node test/openclaw-host-functional.mjs",
4241
"version": "node scripts/sync-plugin-version.mjs openclaw.plugin.json package.json && git add openclaw.plugin.json"
4342
},

src/chunker.ts

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,32 @@ function sliceTrimWithIndices(text: string, start: number, end: number): { chunk
162162
};
163163
}
164164

165+
// ============================================================================
166+
// CJK Detection
167+
// ============================================================================
168+
169+
// CJK Unicode ranges: Unified Ideographs, Extension A, Compatibility,
170+
// Hangul Syllables, Katakana, Hiragana
171+
const CJK_RE =
172+
/[\u3040-\u309F\u30A0-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\uF900-\uFAFF]/;
173+
174+
/** Ratio of CJK characters to total non-whitespace characters. */
175+
function getCjkRatio(text: string): number {
176+
let cjk = 0;
177+
let total = 0;
178+
for (const ch of text) {
179+
if (/\s/.test(ch)) continue;
180+
total++;
181+
if (CJK_RE.test(ch)) cjk++;
182+
}
183+
return total === 0 ? 0 : cjk / total;
184+
}
185+
186+
// CJK chars are ~2-3 tokens each. When text is predominantly CJK, we divide
187+
// char limits by this factor to stay within the model's token budget.
188+
const CJK_CHAR_TOKEN_DIVISOR = 2.5;
189+
const CJK_RATIO_THRESHOLD = 0.3;
190+
165191
// ============================================================================
166192
// Chunking Core
167193
// ============================================================================
@@ -239,10 +265,15 @@ export function smartChunk(text: string, embedderModel?: string): ChunkResult {
239265
const limit = embedderModel ? EMBEDDING_CONTEXT_LIMITS[embedderModel] : undefined;
240266
const base = limit ?? 8192;
241267

268+
// CJK characters consume ~2-3 tokens each, so a char-based limit that works
269+
// for Latin text will vastly overshoot the token budget for CJK-heavy text.
270+
const cjkHeavy = getCjkRatio(text) > CJK_RATIO_THRESHOLD;
271+
const divisor = cjkHeavy ? CJK_CHAR_TOKEN_DIVISOR : 1;
272+
242273
const config: ChunkerConfig = {
243-
maxChunkSize: Math.max(1000, Math.floor(base * 0.7)),
244-
overlapSize: Math.max(0, Math.floor(base * 0.05)),
245-
minChunkSize: Math.max(100, Math.floor(base * 0.1)),
274+
maxChunkSize: Math.max(200, Math.floor(base * 0.7 / divisor)),
275+
overlapSize: Math.max(0, Math.floor(base * 0.05 / divisor)),
276+
minChunkSize: Math.max(100, Math.floor(base * 0.1 / divisor)),
246277
semanticSplit: true,
247278
maxLinesPerChunk: 50,
248279
};

src/embedder.ts

Lines changed: 82 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,22 @@ export function formatEmbeddingProviderError(
352352
return `${genericPrefix}${detailText}`;
353353
}
354354

355+
// ============================================================================
356+
// Safety Constants
357+
// ============================================================================
358+
359+
/** Maximum recursion depth for embedSingle chunking retries. */
360+
const MAX_EMBED_DEPTH = 3;
361+
362+
/** Global timeout for a single embedding operation (ms). */
363+
const EMBED_TIMEOUT_MS = 10_000;
364+
365+
/**
366+
* Strictly decreasing character limit for forced truncation.
367+
* Each recursion level MUST reduce input by this factor to guarantee progress.
368+
*/
369+
const STRICT_REDUCTION_FACTOR = 0.5; // Each retry must be at most 50% of previous
370+
355371
export function getVectorDimensions(model: string, overrideDims?: number): number {
356372
if (overrideDims && overrideDims > 0) {
357373
return overrideDims;
@@ -472,16 +488,23 @@ export class Embedder {
472488
/**
473489
* Call embeddings.create with automatic key rotation on rate-limit errors.
474490
* Tries each key in the pool at most once before giving up.
491+
* Accepts an optional AbortSignal to support true request cancellation.
475492
*/
476-
private async embedWithRetry(payload: any): Promise<any> {
493+
private async embedWithRetry(payload: any, signal?: AbortSignal): Promise<any> {
477494
const maxAttempts = this.clients.length;
478495
let lastError: Error | undefined;
479496

480497
for (let attempt = 0; attempt < maxAttempts; attempt++) {
481498
const client = this.nextClient();
482499
try {
483-
return await client.embeddings.create(payload);
500+
// Pass signal to OpenAI SDK if provided (SDK v6+ supports this)
501+
return await client.embeddings.create(payload, signal ? { signal } : undefined);
484502
} catch (error) {
503+
// If aborted, re-throw immediately
504+
if (error instanceof Error && error.name === 'AbortError') {
505+
throw error;
506+
}
507+
485508
lastError = error instanceof Error ? error : new Error(String(error));
486509

487510
if (this.isRateLimitError(error) && attempt < maxAttempts - 1) {
@@ -510,6 +533,13 @@ export class Embedder {
510533
return this.clients.length;
511534
}
512535

536+
/** Wrap a single embedding operation with a global timeout via AbortSignal. */
537+
private withTimeout<T>(promiseFactory: (signal: AbortSignal) => Promise<T>, _label: string): Promise<T> {
538+
const controller = new AbortController();
539+
const timeoutId = setTimeout(() => controller.abort(), EMBED_TIMEOUT_MS);
540+
return promiseFactory(controller.signal).finally(() => clearTimeout(timeoutId));
541+
}
542+
513543
// --------------------------------------------------------------------------
514544
// Backward-compatible API
515545
// --------------------------------------------------------------------------
@@ -534,13 +564,17 @@ export class Embedder {
534564
// --------------------------------------------------------------------------
535565

536566
async embedQuery(text: string): Promise<number[]> {
537-
return this.embedSingle(text, this._taskQuery);
567+
return this.withTimeout((signal) => this.embedSingle(text, this._taskQuery, 0, signal), "embedQuery");
538568
}
539569

540570
async embedPassage(text: string): Promise<number[]> {
541-
return this.embedSingle(text, this._taskPassage);
571+
return this.withTimeout((signal) => this.embedSingle(text, this._taskPassage, 0, signal), "embedPassage");
542572
}
543573

574+
// Note: embedBatchQuery/embedBatchPassage are NOT wrapped with withTimeout because
575+
// they handle multiple texts in a single API call. The timeout would fire after
576+
// EMBED_TIMEOUT_MS regardless of how many texts succeed. Individual text embedding
577+
// within the batch is protected by the SDK's own timeout handling.
544578
async embedBatchQuery(texts: string[]): Promise<number[][]> {
545579
return this.embedMany(texts, this._taskQuery);
546580
}
@@ -595,17 +629,32 @@ export class Embedder {
595629
return payload;
596630
}
597631

598-
private async embedSingle(text: string, task?: string): Promise<number[]> {
632+
private async embedSingle(text: string, task?: string, depth: number = 0, signal?: AbortSignal): Promise<number[]> {
599633
if (!text || text.trim().length === 0) {
600634
throw new Error("Cannot embed empty text");
601635
}
602636

637+
// FR-01: Recursion depth limit — force truncate when too deep
638+
if (depth >= MAX_EMBED_DEPTH) {
639+
const safeLimit = Math.floor(text.length * STRICT_REDUCTION_FACTOR);
640+
console.warn(
641+
`[memory-lancedb-pro] Recursion depth ${depth} reached MAX_EMBED_DEPTH (${MAX_EMBED_DEPTH}), ` +
642+
`force-truncating ${text.length} chars → ${safeLimit} chars (strict ${STRICT_REDUCTION_FACTOR * 100}% reduction)`
643+
);
644+
if (safeLimit < 100) {
645+
throw new Error(
646+
`[memory-lancedb-pro] Failed to embed: input too large for model context after ${MAX_EMBED_DEPTH} retries`
647+
);
648+
}
649+
text = text.slice(0, safeLimit);
650+
}
651+
603652
// Check cache first
604653
const cached = this._cache.get(text, task);
605654
if (cached) return cached;
606655

607656
try {
608-
const response = await this.embedWithRetry(this.buildPayload(text, task));
657+
const response = await this.embedWithRetry(this.buildPayload(text, task), signal);
609658
const embedding = response.data[0]?.embedding as number[] | undefined;
610659
if (!embedding) {
611660
throw new Error("No embedding returned from provider");
@@ -628,12 +677,35 @@ export class Embedder {
628677
throw new Error(`Failed to chunk document: ${errorMsg}`);
629678
}
630679

680+
// FR-03: Single chunk output detection — if smartChunk produced only
681+
// one chunk that is nearly the same size as the original text, chunking
682+
// did not actually reduce the problem. Force-truncate with STRICT
683+
// reduction to guarantee progress.
684+
if (
685+
chunkResult.chunks.length === 1 &&
686+
chunkResult.chunks[0].length > text.length * 0.9
687+
) {
688+
// Use strict reduction factor to guarantee each retry makes progress
689+
const safeLimit = Math.floor(text.length * STRICT_REDUCTION_FACTOR);
690+
console.warn(
691+
`[memory-lancedb-pro] smartChunk produced 1 chunk (${chunkResult.chunks[0].length} chars) ≈ original (${text.length} chars). ` +
692+
`Force-truncating to ${safeLimit} chars (strict ${STRICT_REDUCTION_FACTOR * 100}% reduction) to avoid infinite recursion.`
693+
);
694+
if (safeLimit < 100) {
695+
throw new Error(
696+
`[memory-lancedb-pro] Failed to embed: chunking couldn't reduce input size enough for model context`
697+
);
698+
}
699+
const truncated = text.slice(0, safeLimit);
700+
return this.embedSingle(truncated, task, depth + 1, signal);
701+
}
702+
631703
// Embed all chunks in parallel
632704
console.log(`Split document into ${chunkResult.chunkCount} chunks for embedding`);
633705
const chunkEmbeddings = await Promise.all(
634706
chunkResult.chunks.map(async (chunk, idx) => {
635707
try {
636-
const embedding = await this.embedSingle(chunk, task);
708+
const embedding = await this.embedSingle(chunk, task, depth + 1, signal);
637709
return { embedding };
638710
} catch (chunkError) {
639711
console.warn(`Failed to embed chunk ${idx}:`, chunkError);
@@ -661,14 +733,9 @@ export class Embedder {
661733

662734
return finalEmbedding;
663735
} catch (chunkError) {
664-
// If chunking fails, throw the original error
665-
console.warn(`Chunking failed, using original error:`, chunkError);
666-
const friendly = formatEmbeddingProviderError(error, {
667-
baseURL: this._baseURL,
668-
model: this._model,
669-
mode: "single",
670-
});
671-
throw new Error(friendly, { cause: error });
736+
// Preserve and surface the more specific chunkError
737+
console.warn(`Chunking failed:`, chunkError);
738+
throw chunkError;
672739
}
673740
}
674741

0 commit comments

Comments
 (0)