fix(memory): improve search relevance and configuration defaults

jinliyl · jinliyl · commit dfb402039cb0 · 2026-02-10T16:24:43.000+08:00
diff --git a/reme/__init__.py b/reme/__init__.py
@@ -18,7 +18,7 @@
     "ReMeFs",
 ]
 
-__version__ = "0.3.0.0a2"
+__version__ = "0.3.0.0a3"
 
 
 """
diff --git a/reme/agent/chat/fs_cli.yaml b/reme/agent/chat/fs_cli.yaml
@@ -97,7 +97,7 @@ system_prompt_zh: |
 
   ### 🔍 检索工具
   在回答关于过往工作、决策、日期、人员、偏好或待办事项的问题之前：
-  1. 对 MEMORY.md + memory/*.md 运行 `memory_search`
+  1. 对 MEMORY.md + memory/*.md 运行 `memory_search`，没有搜索结果可以从不同角度多次尝试
   2. 如果你需要阅读每日笔记 `memory/YYYY-MM-DD.md`，可以使用读取工具访问它。
 
   ### 🛠️ 其他工具
@@ -108,6 +108,7 @@ system_prompt_zh: |
   - **write_tool** — 创建新文件
   - **execute_code** — 运行 Python 代码
   - **dashscope_search** — 网络搜索
+  如果对于工具结果不满意，可以混合使用多种工具，或者单个工具不同的使用参数。
 
   ## 像人类一样回应 😊
   **何时使用表情回应：**
diff --git a/reme/core/memory_store/sqlite_memory_store.py b/reme/core/memory_store/sqlite_memory_store.py
@@ -159,12 +159,13 @@ async def _create_tables(self) -> None:
                     path UNINDEXED,
                     source UNINDEXED,
                     start_line UNINDEXED,
-                    end_line UNINDEXED
+                    end_line UNINDEXED,
+                    tokenize='trigram'
                 )
             """,
             )
             self.fts_available = True
-            logger.info("Created FTS5 table")
+            logger.info("Created FTS5 table with trigram tokenizer")
 
         self.conn.commit()
         cursor.close()
@@ -538,7 +539,10 @@ async def vector_search(
 
             results = []
             for _, path, start, end, src, text, dist in cursor.fetchall():
-                score = max(0.0, 1.0 - dist)
+                # Convert L2 distance to similarity score
+                # For normalized vectors, L2 distance range is [0, 2]
+                # Map to [1, 0] score range (higher score = more similar)
+                score = max(0.0, 1.0 - dist / 2.0)
                 snippet = text
                 results.append(
                     MemorySearchResult(
@@ -548,7 +552,7 @@ async def vector_search(
                         score=score,
                         snippet=snippet,
                         source=MemorySource(src),
-                        distance=dist,
+                        raw_metric=dist,
                     ),
                 )
 
@@ -568,7 +572,19 @@ def _sanitize_fts_query(self, query: str) -> str:
         - " (phrase search, needs escaping)
         - : (column filter)
         - ^ (start of line anchor, not standard FTS5)
-        - Other special chars that may interfere
+        - ' (single quote, causes syntax errors)
+        - ` (backtick, can cause issues)
+        - | (pipe, OR operator)
+        - + (plus, can be used for required terms)
+        - - (minus, NOT operator)
+        - = (equals, can cause issues)
+        - < > (angle brackets, comparison operators)
+        - ! (exclamation, NOT operator variant)
+        - @ # $ % & (other special chars)
+        - \\ (backslash)
+        - / (slash, can interfere)
+        - ; (semicolon, statement separator)
+        - , (comma, can interfere with phrase parsing)
 
         Args:
             query: Raw query string
@@ -580,8 +596,38 @@ def _sanitize_fts_query(self, query: str) -> str:
             return ""
 
         # Remove FTS5 special characters that we don't want users to use
-        # Keep only alphanumeric, spaces, and some safe punctuation
-        special_chars = ["*", "?", ":", "^", "(", ")", "[", "]", "{", "}"]
+        # Replace each character listed below with a space; anything not listed is left as-is
+        special_chars = [
+            "*",
+            "?",
+            ":",
+            "^",
+            "(",
+            ")",
+            "[",
+            "]",
+            "{",
+            "}",
+            "'",
+            '"',
+            "`",
+            "|",
+            "+",
+            "-",
+            "=",
+            "<",
+            ">",
+            "!",
+            "@",
+            "#",
+            "$",
+            "%",
+            "&",
+            "\\",
+            "/",
+            ";",
+            ",",
+        ]
         cleaned = query
         for char in special_chars:
             cleaned = cleaned.replace(char, " ")
@@ -650,6 +696,7 @@ async def keyword_search(
                         score=score,
                         snippet=snippet,
                         source=MemorySource(src),
+                        raw_metric=rank,
                     ),
                 )
 
diff --git a/reme/core/schema/memory_search_result.py b/reme/core/schema/memory_search_result.py
@@ -14,7 +14,7 @@ class MemorySearchResult(BaseModel):
     score: float = Field(..., description="Relevance score of the search result")
     snippet: str = Field(..., description="Text snippet from the matched content")
     source: MemorySource = Field(..., description="Source of the memory data")
-    distance: float | None = Field(None, description="Original distance value from vector search")
+    raw_metric: float | None = Field(None, description="Raw metric value from search (e.g., distance, rank)")
     metadata: dict = Field(default_factory=dict, description="Additional metadata")
 
     @property
diff --git a/reme/core/schema/service_config.py b/reme/core/schema/service_config.py
@@ -107,8 +107,8 @@ class FileWatcherConfig(BaseModel):
     suffix_filters: list[str] = Field(default_factory=list)
     recursive: bool = Field(default=False)
     debounce: int = Field(default=500)
-    chunk_tokens: int = Field(default=400)
-    chunk_overlap: int = Field(default=80)
+    chunk_tokens: int = Field(default=1000)
+    chunk_overlap: int = Field(default=100)
     memory_store: str = Field(default="default")
     scan_on_start: bool = Field(default=True)
 
diff --git a/reme/reme_fs.py b/reme/reme_fs.py
@@ -53,8 +53,8 @@ def __init__(
         suffix_filters: list[str] | None = None,
         recursive: bool = False,
         debounce: int = 500,
-        chunk_tokens: int = 400,
-        chunk_overlap: int = 80,
+        chunk_tokens: int = 1000,
+        chunk_overlap: int = 100,
         scan_on_start: bool = True,
         default_file_watcher_config: dict | None = None,
         context_window_tokens: int = 128000,
@@ -182,16 +182,16 @@ async def summary(self, messages: list[Message | dict], date: str, language: str
         )
         return await summarizer.call(messages=messages, date=date, service_context=self.service_context)
 
-    async def memory_search(self, query: str, max_results: int = 10, min_score: float = 0.3) -> str:
+    async def memory_search(self, query: str, max_results: int = 5, min_score: float = 0.1) -> str:
         """
         Mandatory recall step: semantically search MEMORY.md + memory/*.md (and optional session transcripts)
         before answering questions about prior work, decisions, dates, people, preferences, or todos;
         returns top snippets with path + lines.
 
         Args:
             query: The semantic search query to find relevant memory snippets
-            max_results: Maximum number of search results to return (optional), default is 10
-            min_score: Minimum similarity score threshold for results (optional), default is 0.3
+            max_results: Maximum number of search results to return (optional), default is 5
+            min_score: Minimum similarity score threshold for results (optional), default is 0.1
 
         Returns:
             Search results as formatted string
diff --git a/reme/tool/fs/fs_memory_search.py b/reme/tool/fs/fs_memory_search.py
@@ -16,7 +16,7 @@ def __init__(
         self,
         sources: list[MemorySource] | None = None,
         min_score: float = 0.1,
-        max_results: int = 20,
+        max_results: int = 5,
         hybrid_enabled: bool = True,
         hybrid_vector_weight: float = 0.7,
         hybrid_text_weight: float = 0.3,
@@ -52,11 +52,11 @@ def _build_tool_call(self) -> ToolCall:
                         },
                         "max_results": {
                             "type": "integer",
-                            "description": "Maximum number of search results to return (optional)",
+                            "description": "Maximum number of search results to return (optional), default 5",
                         },
                         "min_score": {
                             "type": "number",
-                            "description": "Minimum similarity score threshold for results (optional)",
+                            "description": "Minimum similarity score threshold for results (optional), default 0.1",
                         },
                     },
                     "required": ["query"],
@@ -79,16 +79,16 @@ async def execute(self) -> str:
             vector_results = await self._search_vector(query, candidates)
 
             # Log original vector results
-            logger.debug("\n=== Vector Search Results ===")
+            logger.info("\n=== Vector Search Results ===")
             for i, r in enumerate(vector_results[:10], 1):
                 snippet_preview = (r.snippet[:100] + "...") if len(r.snippet) > 100 else r.snippet
-                logger.debug(f"{i}. Score: {r.score:.4f} | Snippet: {snippet_preview}")
+                logger.info(f"{i}. Score: {r.score:.4f} | Snippet: {snippet_preview}")
 
             # Log original keyword results
-            logger.debug("\n=== Keyword Search Results ===")
+            logger.info("\n=== Keyword Search Results ===")
             for i, r in enumerate(keyword_results[:10], 1):
                 snippet_preview = (r.snippet[:100] + "...") if len(r.snippet) > 100 else r.snippet
-                logger.debug(f"{i}. Score: {r.score:.4f} | Snippet: {snippet_preview}")
+                logger.info(f"{i}. Score: {r.score:.4f} | Snippet: {snippet_preview}")
 
             if not keyword_results:
                 results = [r for r in vector_results if r.score >= min_score][:max_results]
@@ -103,10 +103,10 @@ async def execute(self) -> str:
                 )
 
                 # Log merged results
-                logger.debug("\n=== Merged Hybrid Results ===")
+                logger.info("\n=== Merged Hybrid Results ===")
                 for i, r in enumerate(merged[:10], 1):
                     snippet_preview = (r.snippet[:100] + "...") if len(r.snippet) > 100 else r.snippet
-                    logger.debug(f"{i}. Score: {r.score:.4f} | Snippet: {snippet_preview}")
+                    logger.info(f"{i}. Score: {r.score:.4f} | Snippet: {snippet_preview}")
 
                 results = [r for r in merged if r.score >= min_score][:max_results]
         else:

Original file line number	Diff line number	Diff line change
`@@ -18,7 +18,7 @@`
`18`	`18`	`"ReMeFs",`
`19`	`19`	`]`
`20`	`20`
`21`		`-__version__ = "0.3.0.0a2"`
	`21`	`+__version__ = "0.3.0.0a3"`
`22`	`22`
`23`	`23`
`24`	`24`	`"""`