feat: optimize memory search deduplication and fix parsing bugs

glin1993@outlook.com · glin1993@outlook.com · commit 787f6f03d59c · 2025-12-26T10:48:32.000+08:00
- Tune similarity threshold to 0.92 for 'dedup=sim' to preserve subtle semantic nuances.
- Implement recall expansion (5x Top-K) when deduplicating to ensure output diversity.
- Remove aggressive filling logic to strictly enforce the similarity threshold.
- Fix attribute error in MultiModalStructMemReader by correctly importing parse_json_result.
- Replace fragile eval() with robust parse_json_result in TaskGoalParser to handle JSON booleans.
diff --git a/src/memos/api/handlers/search_handler.py b/src/memos/api/handlers/search_handler.py
@@ -55,12 +55,19 @@ def handle_search_memories(self, search_req: APISearchRequest) -> SearchResponse
         """
         self.logger.info(f"[SearchHandler] Search Req is: {search_req}")
 
+        # Increase recall pool if deduplication is enabled to ensure diversity
+        original_top_k = search_req.top_k
+        if search_req.dedup == "sim":
+            search_req.top_k = original_top_k * 5
+
         cube_view = self._build_cube_view(search_req)
 
         results = cube_view.search_memories(search_req)
         if search_req.dedup == "sim":
-            results = self._dedup_text_memories(results, search_req.top_k)
+            results = self._dedup_text_memories(results, original_top_k)
             self._strip_embeddings(results)
+            # Restore original top_k for downstream logic or response metadata
+            search_req.top_k = original_top_k
 
         self.logger.info(
             f"[SearchHandler] Final search results: count={len(results)} results={results}"
@@ -104,35 +111,18 @@ def _dedup_text_memories(self, results: dict[str, Any], target_top_k: int) -> di
             bucket_idx = flat[idx][0]
             if len(selected_by_bucket[bucket_idx]) >= target_top_k:
                 continue
-            if self._is_unrelated(idx, selected_global, similarity_matrix, 0.85):
+            # Use 0.92 threshold strictly
+            if self._is_unrelated(idx, selected_global, similarity_matrix, 0.92):
                 selected_by_bucket[bucket_idx].append(idx)
                 selected_global.append(idx)
 
-        for bucket_idx in range(len(buckets)):
-            if len(selected_by_bucket[bucket_idx]) >= min(
-                target_top_k, len(indices_by_bucket[bucket_idx])
-            ):
-                continue
-            remaining_indices = [
-                idx
-                for idx in indices_by_bucket.get(bucket_idx, [])
-                if idx not in selected_by_bucket[bucket_idx]
-            ]
-            if not remaining_indices:
-                continue
-            # Fill to target_top_k with the least-similar candidates to preserve diversity.
-            remaining_indices.sort(
-                key=lambda idx: self._max_similarity(idx, selected_global, similarity_matrix)
-            )
-            for idx in remaining_indices:
-                if len(selected_by_bucket[bucket_idx]) >= target_top_k:
-                    break
-                selected_by_bucket[bucket_idx].append(idx)
-                selected_global.append(idx)
+        # Removed the 'filling' logic that was pulling back similar items.
+        # Now it will only return items that truly pass the 0.92 threshold,
+        # up to target_top_k.
 
         for bucket_idx, bucket in enumerate(buckets):
             selected_indices = selected_by_bucket.get(bucket_idx, [])
-            bucket["memories"] = [flat[i][1] for i in selected_indices[:target_top_k]]
+            bucket["memories"] = [flat[i][1] for i in selected_indices]
         return results
 
     @staticmethod
diff --git a/src/memos/mem_reader/multi_modal_struct.py b/src/memos/mem_reader/multi_modal_struct.py
@@ -10,6 +10,7 @@
 from memos.mem_reader.read_multi_modal import MultiModalParser, detect_lang
 from memos.mem_reader.read_multi_modal.base import _derive_key
 from memos.mem_reader.simple_struct import PROMPT_DICT, SimpleStructMemReader
+from memos.mem_reader.utils import parse_json_result
 from memos.memories.textual.item import TextualMemoryItem, TreeNodeTextualMemoryMetadata
 from memos.templates.tool_mem_prompts import TOOL_TRAJECTORY_PROMPT_EN, TOOL_TRAJECTORY_PROMPT_ZH
 from memos.types import MessagesType
@@ -377,7 +378,7 @@ def _get_llm_response(
         messages = [{"role": "user", "content": prompt}]
         try:
             response_text = self.llm.generate(messages)
-            response_json = self.parse_json_result(response_text)
+            response_json = parse_json_result(response_text)
         except Exception as e:
             logger.error(f"[LLM] Exception during chat generation: {e}")
             response_json = {
diff --git a/src/memos/memories/textual/tree_text_memory/retrieve/retrieve_utils.py b/src/memos/memories/textual/tree_text_memory/retrieve/retrieve_utils.py
@@ -4,7 +4,6 @@
 from pathlib import Path
 from typing import Any
 
-
 import numpy as np
 
 from memos.dependency import require_python_package
diff --git a/src/memos/memories/textual/tree_text_memory/retrieve/task_goal_parser.py b/src/memos/memories/textual/tree_text_memory/retrieve/task_goal_parser.py
@@ -5,7 +5,10 @@
 from memos.llms.base import BaseLLM
 from memos.log import get_logger
 from memos.memories.textual.tree_text_memory.retrieve.retrieval_mid_structs import ParsedTaskGoal
-from memos.memories.textual.tree_text_memory.retrieve.retrieve_utils import FastTokenizer
+from memos.memories.textual.tree_text_memory.retrieve.retrieve_utils import (
+    FastTokenizer,
+    parse_json_result,
+)
 from memos.memories.textual.tree_text_memory.retrieve.utils import TASK_PARSE_PROMPT
 
 
@@ -111,8 +114,10 @@ def _parse_response(self, response: str, **kwargs) -> ParsedTaskGoal:
         for attempt_times in range(attempts):
             try:
                 context = kwargs.get("context", "")
-                response = response.replace("```", "").replace("json", "").strip()
-                response_json = eval(response)
+                response_json = parse_json_result(response)
+                if not response_json:
+                    raise ValueError("Parsed JSON is empty")
+
                 return ParsedTaskGoal(
                     memories=response_json.get("memories", []),
                     keys=response_json.get("keys", []),
@@ -123,6 +128,8 @@ def _parse_response(self, response: str, **kwargs) -> ParsedTaskGoal:
                     context=context,
                 )
             except Exception as e:
-                raise ValueError(
-                    f"Failed to parse LLM output: {e}\nRaw response:\n{response} retried: {attempt_times + 1}/{attempts + 1}"
-                ) from e
+                if attempt_times == attempts - 1:
+                    raise ValueError(
+                        f"Failed to parse LLM output: {e}\nRaw response:\n{response} retried: {attempt_times + 1}/{attempts}"
+                    ) from e
+                continue
diff --git a/记忆检索接口去重策略.md b/记忆检索接口去重策略.md
@@ -0,0 +1,222 @@
+# 记忆检索接口去重策略
+
+## 1. 引言
+
+在LLM + 记忆的系统中，去重已成为降低推理成本、减少冗余的关键手段。然而，如果在检索阶段直接执行**重度语义级去重或事实合并**，可能会无意中**改变模型可见的证据结构**。通过对开源实现的分析，我们认为在 Search 接口层触发此类逻辑，**会导致时间趋势坍缩、用户偏好演化证据丢失及推理权重人为削弱等关键问题**。据此，本文讨论两个问题：检索接口是否应保持证据透明性、不承担复杂的解释性逻辑；以及在此前提下，应提供何种程度的去重能力。
+
+## 2. 相关系统与可验证的去重行为
+
+为避免“去重发生位置”被混淆，有必要从**系统pipeline分层**的角度，对不同系统在*写入–检索–注入 Prompt* 三个阶段的行为进行明确区分。
+
+### 2.1 不同系统去重行为的阶段对比
+
+下表总结了 SuperMemory 与 mem0 在不同阶段是否执行去重或语义合并，以及该行为对检索语义的影响：
+
+| 系统 | 写入阶段（Write） | 检索阶段（Search） | Prompt 阶段 | 对 Search 语义的影响 |
+| --- | --- | --- | --- | --- |
+| **SuperMemory** | 不去重 | 不去重 | 文本级去重（Set / Exact Match） | 无影响（仅减少展示冗余） |
+| **mem0** | 事实抽取与合并（可能覆盖/失效旧记忆） | 不去重 | 不去重 | 间接影响（历史事件可能已不存在） |
+
+该对比表明，两种系统虽然都具备“去重”行为，但其**发生位置与语义后果截然不同**：
+
+*   SuperMemory 的去重是 **Prompt 层面的优化**，不会改变检索结果的证据结构。
+    
+*   mem0的去重/合并是**记忆管理层面的设计取舍**，Search只是检索一个已被整理过的状态集合。
+    
+
+### 2.2 SuperMemory：Prompt 层驱动的轻量去重策略
+
+与 mem0 不同，SuperMemory 并不在存储或检索排序阶段进行语义合并，其去重逻辑主要发生在将记忆注入 LLM Prompt 之前。
+
+其可验证特征包括：
+
+*   基于文本层面的近似或完全匹配
+    
+*   目标是减少上下文冗余而非重塑知识结构
+    
+*   不修改底层向量或元数据
+    
+
+## 3. 在 Search 接口执行语义去重的系统性风险
+
+| 风险类型 | 具体表现 | 直接后果 | 典型场景 |
+| --- | --- | --- | --- |
+| **趋势信号坍缩** | 语义相似记录在检索阶段被合并 | 无法区分持续状态与瞬时事件 | 日志分析、系统监控、异常检测 |
+| **演化证据丢失** | 仅保留“最新事实”，历史状态不可见 | 无法进行时间回溯与变更解释 | 用户偏好、计划修订、决策追踪 |
+| **证据权重削弱** | 高频重复信号被压缩为单条 | 模型低估重要性与关注度 | RAG决策支持、用户反馈分析 |
+
+## 4. 为什么不应在Search接口引入重度逻辑
+
+从系统设计角度看，在 Search 接口引入复杂语义去重或解释性逻辑，会带来一系列结构性问题，见下表。
+
+| 设计维度 | 重度逻辑带来的问题 | 对系统的影响 |
+| --- | --- | --- |
+| **接口语义** | Search 不再只是证据召回，而是隐式裁决 | 破坏检索语义契约 |
+| **易用性** | 行为不可预测，调试成本上升 | 降低接口复用性 |
+| **关注点分离** | 检索与推理逻辑耦合 | 架构僵化、难以演进 |
+| **性能与扩展性** | 语义计算进入关键路径 | 延迟上升，吞吐受限 |
+
+因此，Search接口应保持规则化、轻量化，仅承担相关证据返回的职责。
+
+*   Search 是系统中被复用最频繁的接口之一。复杂、隐式的去重逻辑会使开发者难以预测检索结果，增加系统调试与使用成本。
+    
+*   解释、总结与推理应当发生在下游模块。将这些逻辑嵌入 Search 阶段，会导致存储、检索与推理层耦合，削弱系统灵活性与可审计性。
+    
+*   语义去重通常依赖向量相似度计算、成对比较甚至 LLM 判断。在 Search 阶段同步执行此类操作，会显著增加延迟，并限制系统在高并发或实时场景下的可扩展性。
+    
+
+## 5. 去重策略的最小化扩展设计
+
+基于前文分析，我们**不主张在检索阶段引入复杂或“智能化”的去重机制，至少不应作为默认模式**。相反，我们认为去重能力应当以**最小化改动、最大化可控性**为原则，提供有限但清晰的策略扩展。
+
+核心思想并非构建新的复杂模块，而是：**为 Search 接口增加一个显式的去重选项参数，使调用方能够决定是否去重以及去重到何种程度。**
+
+### 5.1 设计原则
+
+该扩展设计遵循以下原则：
+
+1.  **保持默认行为不变**：不影响现有的系统与调用方。
+    
+2.  **不引入重型推理工具**：不使用 LLM、不进行事实裁决或复杂冲突消解。
+    
+3.  **Search 阶段只做过滤，不做解释**：所有策略均为规则化、可预测操作。
+    
+4.  **显式而非隐式**：去重行为必须由接口参数明确指定。
+    
+
+### 5.2 接口级去重选项
+
+在现有全文字匹配去重的基础上，Search 接口支持三种明确选项：
+
+*   **NONE（不去重）**  
+        Search返回所有命中的原始记录，不做任何过滤。该模式适用于日志分析、监控、审计、时间序列推理等场景，其中重复本身即为信号。
+    
+*   **EXACT（全文字匹配去重，默认）**  
+        保持现有行为，仅对完全相同或规范化后完全一致的文本进行去重，用于移除系统性重复写入或抓取造成的硬冗余。
+    
+*   **SEMANTIC（轻量语义去重，可选）**  
+        仅基于向量相似度等轻量数值指标，在高阈值条件下合并高度相似的结果。该模式明确不使用 LLM，不进行事实更新或冲突裁决，仅用于明确以“摘要输出”“记忆展示”等为目标的调用场景。
+    
+为便于对接与沟通，下面给出**基于当前 server_api 搜索链路**的接口级去重选项流程图（SearchHandler → SingleCubeView/CompositeCubeView → text/pref 并行检索）。去重选项建议挂载在 **text_mem 结果后处理**阶段，pref_mem 仍按原流程返回。
+
+```mermaid
+flowchart TD
+    A["API search server_api"] --> B["SearchHandler"]
+    B --> C["SingleCubeView or CompositeCubeView"]
+    C --> D["Parallel: text_mem and pref_mem"]
+    D --> E{Search Mode}
+    E -->|FAST| F["Vector Search naive_mem_cube.text_mem.search"]
+    E -->|FINE| G["Retrieve + PostRetrieve + Enhance"]
+    E -->|MIXTURE| H["Scheduler Mix Search"]
+    F --> I["Text Results"]
+    G --> I
+    H --> I
+    D --> J["Preference Search optional"]
+    I --> K{Dedup Option}
+    K -->|NONE| L["No Filter"]
+    K -->|EXACT| M["Normalize + Exact Match"]
+    K -->|SEMANTIC| N["Similarity Filter\nHigh Threshold + Time Window"]
+    L --> O["post_process_textual_mem"]
+    M --> O
+    N --> O
+    J --> P["post_process_pref_mem"]
+    O --> Q["Assemble Response + Metadata"]
+    P --> Q
+```
+
+
+### 5.3 约束与安全边界
+
+即便在启用语义去重选项时，也必须遵守以下约束，以避免 Search 行为发生语义越界：
+
+1.  **不修改原始存储内容**：去重仅影响返回结果，不反向写入记忆层。
+    
+2.  **不跨时间窗口合并**：时间跨度超过设定阈值的记录视为独立事件。
+    
+3.  **不处理逻辑极性**：Search阶段不尝试判断“是否矛盾”，仅做相似度过滤。
+    
+4.  **结果可审计**：返回结果中应包含去重前后数量等基础元信息。
+    
+### 5.4 Auto 模式（0.6B 轻量模型）
+
+若希望在接口层提供“Auto”模式，可引入一个**0.6B 轻量模型**在**不改变默认行为**的前提下，基于检索上下文做出“是否去重”的判断。其定位是**策略建议器**而非裁决器：低置信度时回退到默认策略（EXACT），并始终遵循 5.3 的安全边界。
+
+```mermaid
+flowchart TD
+    A["API search server_api"] --> B["SearchHandler"]
+    B --> C["SingleCubeView or CompositeCubeView"]
+    C --> D["Parallel: text_mem and pref_mem"]
+    D --> E{Search Mode}
+    E -->|FAST/FINE/MIXTURE| F["Text Results"]
+    D --> G["Preference Search optional"]
+    F --> H{Dedup Option}
+    H -->|AUTO| I["Build Context Features\nquery type, time sensitivity, dup rate"]
+    I --> J["0.6B Model Predicts Mode\nNONE / EXACT / SEMANTIC + confidence"]
+    J -->|Low confidence| K["Fallback to EXACT"]
+    J -->|High confidence| L["Apply Suggested Mode"]
+    H -->|NONE/EXACT/SEMANTIC| M["Follow Explicit Option"]
+    K --> N["Exact Match Dedup"]
+    L --> O["Dedup if any\nrespect time window + no fact merge"]
+    M --> P["Apply Option"]
+    N --> Q["post_process_textual_mem"]
+    O --> Q
+    P --> Q
+    G --> R["post_process_pref_mem"]
+    Q --> S["Assemble Response + Metadata\nmode, confidence, counts"]
+    R --> S
+```
+
+
+## References
+
+\[1\] **Xu, W., Huang, L., Fox, A., Patterson, D., & Jordan, M. I.**  
+Detecting Large-Scale System Problems by Mining Console Logs.  
+_Proceedings of the 22nd ACM Symposium on Operating Systems Principles (SOSP)_, 2009.  
+[https://dl.acm.org/doi/10.1145/1629575.1629587](https://dl.acm.org/doi/10.1145/1629575.1629587)
+
+> （**趋势信号坍缩 / 日志频率与重复模式本身是异常信号**）
+
+---
+
+\[2\] **He, S., Zhu, J., He, P., & Lyu, M. R.**  
+A Survey on Automated Log Analysis for Reliability Engineering.  
+_ACM Computing Surveys_, 2021.  
+https://arxiv.org/abs/2009.08218
+
+> （**log rate anomaly / volume anomaly 是核心异常类型**）
+
+---
+
+\[3\] **Mem0 Team.**  
+Mem0: Building Long-Term Memory for LLM Agents.  
+_arXiv preprint arXiv:2504.19413_, 2025.  
+https://arxiv.org/abs/2504.19413
+
+> （**事实合并、冲突处理、invalidate vs overwrite、演化证据问题**）
+
+---
+
+\[4\] **mem0 Community Discussions.**  
+Handling contradictory or outdated facts in long-term memory.  
+GitHub Issues, mem0 repository.  
+[https://github.com/mem0ai/mem0/issues](https://github.com/mem0ai/mem0/issues)
+
+> （**覆盖旧记忆导致历史不可检索的工程争议**）
+
+---
+
+\[5\] **Hogarth, R. M., & Einhorn, H. J.**  
+Order Effects in Belief Updating: The Belief-Adjustment Model.  
+_Cognitive Psychology_, 24(1), 1–55, 1992.  
+https://doi.org/10.1016/0010-0285(92)90002-J
+
+> （**重复证据对置信度与判断权重的影响**）
+
+---
+
+\[6\] **Koriat, A.**  
+When Are Two Heads Better Than One and Why?  
+_Science_, 336(6079), 360–362, 2012.  
+https://doi.org/10.1126/science.1216549
+
+> （**多来源 / 多次证据提升判断可靠性**）