
Commit aa4f1f9

Nyakult and CaralHsi authored
feat: update doc mem reader (#123)
* fix n4j cypher query
* feat: add llm extra body
* feat: update memory extraction prompt and result parser
* fix: evaluation locomo search
* ci: fix format and update test
* feat: update result json parser
* feat: recursively cluster nodes to max_cluster_size
* fix: fix template
* feat: keep default min-group-size 3
* feat: keep default min-group-size 3
* feat: update doc mem reader
* test: fix test

---------

Co-authored-by: CaralHsi <[email protected]>
1 parent 4572d30 commit aa4f1f9

File tree: 4 files changed, +82 −44 lines

src/memos/mem_reader/simple_struct.py

Lines changed: 3 additions & 3 deletions
@@ -208,15 +208,15 @@ def _process_doc_data(self, scene_data_info, info):
         for i, chunk_res in enumerate(processed_chunks):
             if chunk_res:
                 node_i = TextualMemoryItem(
-                    memory=chunk_res["summary"],
+                    memory=chunk_res["value"],
                     metadata=TreeNodeTextualMemoryMetadata(
                         user_id=info.get("user_id"),
                         session_id=info.get("session_id"),
                         memory_type="LongTermMemory",
                         status="activated",
                         tags=chunk_res["tags"],
-                        key="",
-                        embedding=self.embedder.embed([chunk_res["summary"]])[0],
+                        key=chunk_res["key"],
+                        embedding=self.embedder.embed([chunk_res["value"]])[0],
                         usage=[],
                         sources=[f"{scene_data_info['file']}_{i}"],
                         background="",
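
The reader now consumes the doc reader's new output schema (key/value/tags in place of the old bare summary field). As a rough sketch, the parsing step that produces a usable chunk_res might look like this; the helper name and the skip-on-malformed behavior are illustrative assumptions, not code shown in this commit:

import json

def parse_chunk_response(raw: str) -> dict | None:
    # The doc-reader prompt requests {"key", "memory_type", "value", "tags"}.
    # Return None on malformed JSON or missing fields so the caller can skip
    # that chunk, matching the `if chunk_res:` guard in the loop above.
    try:
        res = json.loads(raw)
    except json.JSONDecodeError:
        return None
    if not all(field in res for field in ("key", "value", "tags")):
        return None
    return res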

src/memos/templates/mem_reader_prompts.py

Lines changed: 38 additions & 40 deletions
@@ -1,8 +1,5 @@
 SIMPLE_STRUCT_MEM_READER_PROMPT = """You are a memory extraction expert.
-Always respond in the same language as the conversation. If the conversation is in Chinese, respond in Chinese.
-
-Your task is to extract memories from the perspective of ${user_a}, based on a conversation between ${user_a} and ${user_b}. This means identifying what ${user_a} would plausibly remember — including their own experiences, thoughts, plans, or relevant statements and actions made by others (such as ${user_b}) that impacted or were acknowledged by ${user_a}.
-
+Your task is to extract memories from the perspective of user, based on a conversation between user and assistant. This means identifying what user would plausibly remember — including their own experiences, thoughts, plans, or relevant statements and actions made by others (such as assistant) that impacted or were acknowledged by user.
 Please perform:
 1. Identify information that reflects user's experiences, beliefs, concerns, decisions, plans, or reactions — including meaningful input from assistant that user acknowledged or responded to.
 2. Resolve all time, person, and event references clearly:
@@ -27,20 +24,16 @@
     {
       "key": <string, a unique, concise memory title>,
       "memory_type": <string, Either "LongTermMemory" or "UserMemory">,
-      "value": <A detailed, self-contained, and unambiguous memory statement
-      — written in English if the input conversation is in English,
-      or in Chinese if the conversation is in Chinese, or any language which
-      align with the conversation language>,
+      "value": <A detailed, self-contained, and unambiguous memory statement — written in English if the input conversation is in English, or in Chinese if the conversation is in Chinese>,
       "tags": <A list of relevant thematic keywords (e.g., ["deadline", "team", "planning"])>
     },
     ...
   ],
-  "summary": <a natural paragraph summarizing the above memories from user's
-  perspective, 120–200 words, **same language** as the input>
+  "summary": <a natural paragraph summarizing the above memories from user's perspective, 120–200 words, same language as the input>
 }
 
 Language rules:
-- The `key`, `value`, `tags`, `summary` fields must match the language of the input conversation.
+- The `key`, `value`, `tags`, `summary` fields must match the mostly used language of the input conversation. **如果输入是中文,请输出中文**
 - Keep `memory_type` in English.
 
 Example:
@@ -92,37 +85,42 @@
 
 Your Output:"""
 
-SIMPLE_STRUCT_DOC_READER_PROMPT = """
-**ABSOLUTE, NON-NEGOTIABLE, CRITICAL RULE: The language of your entire JSON output's string values (specifically `summary` and `tags`) MUST be identical to the language of the input `[DOCUMENT_CHUNK]`. There are absolutely no exceptions. Do not translate. If the input is Chinese, the output must be Chinese. If English, the output must be English. Any deviation from this rule constitutes a failure to follow instructions.**
-
-You are an expert text analyst for a search and retrieval system. Your task is to process a document chunk and generate a single, structured JSON object.
-Written in English if the input conversation is in English, or in Chinese if
-the conversation is in Chinese, or any language which align with the
-conversation language. 如果输入语言是中文,请务必输出中文。
-
-The input is a single piece of text: `[DOCUMENT_CHUNK]`.
-You must generate a single JSON object with two top-level keys: `summary` and `tags`.
-Written in English if the input conversation is in English, or in Chinese if
-the conversation is in Chinese, or any language which align with the conversation language.
-
-1. `summary`:
-   - A dense, searchable summary of the ENTIRE `[DOCUMENT_CHUNK]`.
-   - The purpose is for semantic search embedding.
-   - A clear and accurate sentence that comprehensively summarizes the main points, arguments, and information within the `[DOCUMENT_CHUNK]`.
-   - The goal is to create a standalone overview that allows a reader to fully understand the essence of the chunk without reading the original text.
-   - The summary should be **no more than 50 words**.
-2. `tags`:
-   - A concise list of **3 to 5 high-level, summative tags**.
-   - **Each tag itself should be a short phrase, ideally 2 to 4 words long.**
-   - These tags must represent the core abstract themes of the text, suitable for broad categorization.
-   - **Crucially, prioritize abstract concepts** over specific entities or phrases mentioned in the text. For example, prefer "Supply Chain Resilience" over "Reshoring Strategies".
-
-Here is the document chunk to process:
-`[DOCUMENT_CHUNK]`
+SIMPLE_STRUCT_DOC_READER_PROMPT = """You are an expert text analyst for a search and retrieval system.
+Your task is to process a document chunk and generate a single, structured JSON object.
+
+Please perform:
+1. Identify key information that reflects factual content, insights, decisions, or implications from the documents — including any notable themes, conclusions, or data points. Allow a reader to fully understand the essence of the chunk without reading the original text.
+2. Resolve all time, person, location, and event references clearly:
+   - Convert relative time expressions (e.g., “last year,” “next quarter”) into absolute dates if context allows.
+   - Clearly distinguish between event time and document time.
+   - If uncertainty exists, state it explicitly (e.g., “around 2024,” “exact date unclear”).
+   - Include specific locations if mentioned.
+   - Resolve all pronouns, aliases, and ambiguous references into full names or identities.
+   - Disambiguate entities with the same name if applicable.
+3. Always write from a third-person perspective, referring to the subject or content clearly rather than using first-person ("I", "me", "my").
+4. Do not omit any information that is likely to be important or memorable from the document summaries.
+   - Include all key facts, insights, emotional tones, and plans — even if they seem minor.
+   - Prioritize completeness and fidelity over conciseness.
+   - Do not generalize or skip details that could be contextually meaningful.
+
+Return a single valid JSON object with the following structure:
+
+Return valid JSON:
+{
+  "key": <string, a concise title of the `value` field>,
+  "memory_type": "LongTermMemory",
+  "value": <A clear and accurate paragraph that comprehensively summarizes the main points, arguments, and information within the document chunk — written in English if the input memory items are in English, or in Chinese if the input is in Chinese>,
+  "tags": <A list of relevant thematic keywords (e.g., ["deadline", "team", "planning"])>
+}
+
+Language rules:
+- The `key`, `value`, `tags`, `summary` fields must match the mostly used language of the input document summaries. **如果输入是中文,请输出中文**
+- Keep `memory_type` in English.
+
+Document chunk:
 {chunk_text}
 
-Produce ONLY the JSON object as your response.
-"""
+Your Output:"""
 
 SIMPLE_STRUCT_MEM_READER_EXAMPLE = """Example:
 Conversation:
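
A side effect of the rewritten template is worth noting: its body now embeds a literal JSON schema in braces next to the {chunk_text} placeholder, so filling the whole string with str.format would choke on the unescaped braces. Below is a brace-safe fill, assuming plain string replacement (build_doc_reader_prompt is a hypothetical helper; the commit does not show how callers actually render the template):

from memos.templates.mem_reader_prompts import SIMPLE_STRUCT_DOC_READER_PROMPT

def build_doc_reader_prompt(chunk_text: str) -> str:
    # Replace only the placeholder; str.format() would raise on the literal
    # {...} JSON schema embedded in the template text.
    return SIMPLE_STRUCT_DOC_READER_PROMPT.replace("{chunk_text}", chunk_text)

prompt = build_doc_reader_prompt("Q3 revenue grew 12% year over year, driven by ...")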

src/memos/templates/tree_reorganize_prompts.py

Lines changed: 38 additions & 0 deletions
@@ -37,6 +37,44 @@
 
 """
 
+DOC_REORGANIZE_PROMPT = """You are a document summarization and knowledge extraction expert.
+
+Given the following summarized document items:
+
+{memory_items_text}
+
+Please perform:
+1. Identify key information that reflects factual content, insights, decisions, or implications from the documents — including any notable themes, conclusions, or data points.
+2. Resolve all time, person, location, and event references clearly:
+   - Convert relative time expressions (e.g., “last year,” “next quarter”) into absolute dates if context allows.
+   - Clearly distinguish between event time and document time.
+   - If uncertainty exists, state it explicitly (e.g., “around 2024,” “exact date unclear”).
+   - Include specific locations if mentioned.
+   - Resolve all pronouns, aliases, and ambiguous references into full names or identities.
+   - Disambiguate entities with the same name if applicable.
+3. Always write from a third-person perspective, referring to the subject or content clearly rather than using first-person ("I", "me", "my").
+4. Do not omit any information that is likely to be important or memorable from the document summaries.
+   - Include all key facts, insights, emotional tones, and plans — even if they seem minor.
+   - Prioritize completeness and fidelity over conciseness.
+   - Do not generalize or skip details that could be contextually meaningful.
+5. Summarize all document summaries into one integrated memory item.
+
+Language rules:
+- The `key`, `value`, `tags`, `summary` fields must match the mostly used language of the input document summaries. **如果输入是中文,请输出中文**
+- Keep `memory_type` in English.
+
+Return valid JSON:
+{
+  "key": <string, a concise title of the `value` field>,
+  "memory_type": "LongTermMemory",
+  "value": <A detailed, self-contained, and unambiguous memory statement, only contain detailed, unaltered information extracted and consolidated from the input `value` fields, do not include summary content — written in English if the input memory items are in English, or in Chinese if the input is in Chinese>,
+  "tags": <A list of relevant thematic keywords (e.g., ["deadline", "team", "planning"])>,
+  "summary": <a natural paragraph summarizing the above memories from user's perspective, only contain information from the input `summary` fields, 120–200 words, same language as the input>
+}
+
+"""
+
+
 LOCAL_SUBCLUSTER_PROMPT = """You are a memory organization expert.
 
 You are given a cluster of memory items, each with an ID and content.
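
The new DOC_REORGANIZE_PROMPT consolidates a batch of already-summarized items into one integrated memory, which pairs with this commit's recursive clustering of nodes up to max_cluster_size. Here is a sketch of how the {memory_items_text} slot might be filled; the numbered-list serialization and the helper name are assumptions for illustration, not the reorganizer's actual code:

from memos.templates.tree_reorganize_prompts import DOC_REORGANIZE_PROMPT

def build_doc_reorganize_prompt(items: list[dict]) -> str:
    # Serialize each clustered item's key/value so the model can merge them
    # into a single integrated memory item (step 5 of the prompt).
    memory_items_text = "\n".join(
        f"{i + 1}. [{item['key']}] {item['value']}" for i, item in enumerate(items)
    )
    # Plain replacement again, since the template embeds a literal JSON schema.
    return DOC_REORGANIZE_PROMPT.replace("{memory_items_text}", memory_items_text)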

tests/mem_reader/test_simple_structure.py

Lines changed: 3 additions & 1 deletion
@@ -75,7 +75,9 @@ def test_process_doc_data(self):
         info = {"user_id": "user1", "session_id": "session1"}
 
         # Mock LLM response
-        mock_response = '{"summary": "A sample document about testing.", "tags": ["document"]}'
+        mock_response = (
+            '{"value": "A sample document about testing.", "tags": ["document"], "key": "title"}'
+        )
         self.reader.llm.generate.return_value = mock_response
         self.reader.chunker.chunk.return_value = [
             Chunk(text="Parsed document text", token_count=3, sentences=["Parsed document text"])
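
The updated mock mirrors the reader's new response schema. As a standalone sanity check, independent of the project's test fixtures, the payload parses into exactly the fields _process_doc_data now reads:

import json

mock_response = (
    '{"value": "A sample document about testing.", "tags": ["document"], "key": "title"}'
)
chunk_res = json.loads(mock_response)
assert chunk_res["value"] == "A sample document about testing."  # memory text and embedding input
assert chunk_res["key"] == "title"  # node title (previously hardcoded to "")
assert chunk_res["tags"] == ["document"]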
