
Commit e069928

Wang-Daojiyuan and wangfridayL authored
modify code in evaluation (#392)
* modify code in evaluation
* modify code in evaluation

---------

Co-authored-by: yuan.wang <[email protected]>
Co-authored-by: chunyu li <[email protected]>
1 parent f6e96d5 commit e069928

File tree

12 files changed: +41 -118 lines changed


evaluation/.env-example

Lines changed: 2 additions & 7 deletions
@@ -22,13 +22,8 @@ SUPERMEMORY_API_KEY="sm_xxx"
 MEMOBASE_API_KEY="xxx"
 MEMOBASE_PROJECT_URL="http://***.***.***.***:8019"

-# pref
-PRE_SPLIT_CHUNK=false # pre split chunk in client end, for personamem and prefeval
-# 1. text_mem + pref_mem + instruction_completion: set INSTRUCT_COMPLETE=true, ABLATION_PREF=false
-# 2. text_mem + pref_mem: set INSTRUCT_COMPLETE=false, ABLATION_PREF=false
-# 3. text_mem: set INSTRUCT_COMPLETE=false, ABLATION_PREF=true
-INSTRUCT_COMPLETE=true # use instruct complete format or not
-ABLATION_PREF=false # remove pref mem, only text mem
+# eval settings
+PRE_SPLIT_CHUNK=false

 # Configuration Only For Scheduler
 # RabbitMQ Configuration
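With the ablation flags (INSTRUCT_COMPLETE, ABLATION_PREF) removed, PRE_SPLIT_CHUNK is the only eval setting left in this block. As a rough sketch of how a boolean env flag like this is typically consumed on the client side — the actual reader is not part of this diff, so the helper below is hypothetical:

import os

# Hypothetical reader for the remaining flag; illustration only, not code from this commit.
def pre_split_chunk_enabled() -> bool:
    # Treat "true" (case-insensitive) as enabled, anything else as disabled.
    return os.getenv("PRE_SPLIT_CHUNK", "false").strip().lower() == "true"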

evaluation/scripts/PrefEval/pref_memos.py

Lines changed: 8 additions & 7 deletions
@@ -72,7 +72,6 @@ def search_memory_for_line(line_data: tuple, mem_client, top_k_value: int) -> di
     """
     Processes a single line of data, searching memory based on the question.
     """
-    from utils.pref_mem_utils import create_mem_string

     i, line = line_data
     try:
@@ -94,7 +93,13 @@ def search_memory_for_line(line_data: tuple, mem_client, top_k_value: int) -> di
         start_time_search = time.monotonic()
         relevant_memories = mem_client.search(query=question, user_id=user_id, top_k=top_k_value)
         search_memories_duration = time.monotonic() - start_time_search
-        memories_str = create_mem_string(relevant_memories)
+        memories_str = (
+            "\n".join(
+                f"- {entry.get('memory', '')}"
+                for entry in relevant_memories["text_mem"][0]["memories"]
+            )
+            + f"\n{relevant_memories['pref_mem']}"
+        )

         memory_tokens_used = len(tokenizer.encode(memories_str))

@@ -119,7 +124,6 @@ def generate_response_for_line(line_data: tuple, openai_client: OpenAI, lib: str
     """
     Generates a response for a single line of data using pre-fetched memories.
     """
-    from utils.pref_mem_utils import add_pref_instruction, remove_pref_mem_from_mem_string
     from utils.prompts import PREFEVAL_ANSWER_PROMPT

     i, line = line_data
@@ -146,10 +150,7 @@ def generate_response_for_line(line_data: tuple, openai_client: OpenAI, lib: str
         )
         return original_data

-    memories_str = remove_pref_mem_from_mem_string(memories_str, frame=lib)
-
-    template = add_pref_instruction(PREFEVAL_ANSWER_PROMPT, frame=lib)
-    system_prompt = template.format(context=memories_str)
+    system_prompt = PREFEVAL_ANSWER_PROMPT.format(context=memories_str)
     messages = [
         {"role": "system", "content": system_prompt},
         {"role": "user", "content": question},

evaluation/scripts/locomo/locomo_responses.py

Lines changed: 1 addition & 7 deletions
@@ -35,10 +35,7 @@ async def locomo_response(frame, llm_client, context: str, question: str) -> str
             question=question,
         )
     else:
-        from utils.pref_mem_utils import add_pref_instruction
-
-        template = add_pref_instruction(ANSWER_PROMPT_MEMOS, frame=frame)
-        prompt = template.format(
+        prompt = ANSWER_PROMPT_MEMOS.format(
             context=context,
             question=question,
         )
@@ -55,16 +52,13 @@ async def locomo_response(frame, llm_client, context: str, question: str) -> str


 async def process_qa(frame, qa, search_result, oai_client):
-    from utils.pref_mem_utils import remove_pref_mem_from_mem_string
-
     start = time()
     query = qa.get("question")
     gold_answer = qa.get("answer")
     qa_category = qa.get("category")

     context = search_result.get("context")

-    context = remove_pref_mem_from_mem_string(context, frame)
     answer = await locomo_response(frame, oai_client, context, query)

     response_duration_ms = (time() - start) * 1000

evaluation/scripts/locomo/locomo_search.py

Lines changed: 8 additions & 3 deletions
@@ -100,14 +100,19 @@ def memos_api_search(
     client, query, speaker_a_user_id, speaker_b_user_id, top_k, speaker_a, speaker_b
 ):
     from prompts import TEMPLATE_MEMOS
-    from utils.pref_mem_utils import create_mem_string

     start = time()
     search_a_results = client.search(query=query, user_id=speaker_a_user_id, top_k=top_k)
     search_b_results = client.search(query=query, user_id=speaker_b_user_id, top_k=top_k)

-    speaker_a_context = create_mem_string(search_a_results)
-    speaker_b_context = create_mem_string(search_b_results)
+    speaker_a_context = (
+        "\n".join([i["memory"] for i in search_a_results["text_mem"][0]["memories"]])
+        + f"\n{search_a_results['pref_mem']}"
+    )
+    speaker_b_context = (
+        "\n".join([i["memory"] for i in search_b_results["text_mem"][0]["memories"]])
+        + f"\n{search_b_results['pref_mem']}"
+    )

     context = TEMPLATE_MEMOS.format(
         speaker_1=speaker_a,
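The same join-plus-pref_mem construction is repeated for both speakers; the commit inlines it rather than keeping a shared helper (utils/pref_mem_utils.py is deleted below). If the duplication ever becomes a concern, an equivalent local helper could look like the sketch below — this is not code from the commit, just a refactoring sketch under the same assumed result shape:

def _build_context(results: dict) -> str:
    # Equivalent to the inlined expressions above: plain-text memories from
    # the first cube, followed by the preference-memory block.
    text_part = "\n".join(i["memory"] for i in results["text_mem"][0]["memories"])
    return f"{text_part}\n{results['pref_mem']}"

# speaker_a_context = _build_context(search_a_results)
# speaker_b_context = _build_context(search_b_results)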

evaluation/scripts/locomo/prompts.py

Lines changed: 1 addition & 16 deletions
@@ -1,14 +1,3 @@
-import os
-
-
-PREF_INSTRUCTIONS = """
-# Note:
-Plaintext memory are summaries of facts, while preference memories are summaries of user preferences.
-Your response must not violate any of the user's preferences, whether explicit or implicit, and briefly explain why you answer this way to avoid conflicts.
-When encountering preference conflicts, the priority is: explicit preference > implicit preference > plaintext memory.
-"""
-
-
 ANSWER_PROMPT_MEM0 = """
 You are an intelligent memory assistant tasked with retrieving accurate information from conversation memories.

@@ -114,18 +103,14 @@
 5. Formulate a precise, concise answer based on the evidence from the memories (and allowed world knowledge).
 6. Double-check that your answer directly addresses the question asked and adheres to all instructions.
 7. Ensure your final answer is specific and avoids vague time references.
-{pref_instructions}
+
 {context}

 Question: {question}

 Answer:
 """

-if os.getenv("INSTRUCT_COMPLETE") == "true":
-    ANSWER_PROMPT_MEMOS = ANSWER_PROMPT_MEMOS.replace("{pref_instructions}", PREF_INSTRUCTIONS)
-else:
-    ANSWER_PROMPT_MEMOS = ANSWER_PROMPT_MEMOS.replace("{pref_instructions}", "")

 custom_instructions = """
 Generate personal memories that follow these guidelines:

evaluation/scripts/longmemeval/lme_responses.py

Lines changed: 5 additions & 8 deletions
@@ -12,13 +12,11 @@


 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from utils.pref_mem_utils import add_pref_instruction, remove_pref_mem_from_mem_string
 from utils.prompts import LME_ANSWER_PROMPT


-def lme_response(llm_client, context, question, question_date, frame):
-    template = add_pref_instruction(LME_ANSWER_PROMPT, frame=frame)
-    prompt = template.format(
+def lme_response(llm_client, context, question, question_date):
+    prompt = LME_ANSWER_PROMPT.format(
         question=question,
         question_date=question_date,
         context=context,
@@ -35,14 +33,13 @@ def lme_response(llm_client, context, question, question_date, frame):
     return result


-def process_qa(user_id, search_result, llm_client, frame):
+def process_qa(user_id, search_result, llm_client):
     start = time()
     search_result = search_result[0]
     question = search_result.get("question")
     question_date = search_result.get("date")
     context = search_result.get("search_context", "")
-    context = remove_pref_mem_from_mem_string(context, frame=frame)
-    anwer = lme_response(llm_client, context, question, question_date, frame)
+    anwer = lme_response(llm_client, context, question, question_date)

     response_duration_ms = (time() - start) * 1000

@@ -97,7 +94,7 @@ def main(frame, version, num_workers=4):
     future_to_user_id = {}

     for user_id, search_results in lme_search_results.items():
-        future = executor.submit(process_qa, user_id, search_results, oai_client, frame)
+        future = executor.submit(process_qa, user_id, search_results, oai_client)
         future_to_user_id[future] = user_id

     for future in tqdm(

evaluation/scripts/longmemeval/lme_search.py

Lines changed: 4 additions & 2 deletions
@@ -13,7 +13,6 @@
 import pandas as pd

 from tqdm import tqdm
-from utils.pref_mem_utils import create_mem_string
 from utils.prompts import (
     MEM0_CONTEXT_TEMPLATE,
     MEM0_GRAPH_CONTEXT_TEMPLATE,
@@ -45,7 +44,10 @@ def mem0_search(client, query, user_id, top_k):
 def memos_search(client, query, user_id, top_k):
     start = time()
     results = client.search(query=query, user_id=user_id, top_k=top_k)
-    context = create_mem_string(results)
+    context = (
+        "\n".join([i["memory"] for i in results["text_mem"][0]["memories"]])
+        + f"\n{results['pref_mem']}"
+    )
     context = MEMOS_CONTEXT_TEMPLATE.format(user_id=user_id, memories=context)
     duration_ms = (time() - start) * 1000
     return context, duration_ms

evaluation/scripts/personamem/pm_responses.py

Lines changed: 5 additions & 11 deletions
@@ -14,7 +14,6 @@
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 import re

-from utils.pref_mem_utils import add_pref_instruction, remove_pref_mem_from_mem_string
 from utils.prompts import PM_ANSWER_PROMPT


@@ -49,9 +48,8 @@ def _extract_only_options(text):
     return False, predicted_answer


-def pm_response(llm_client, context, question, options, frame):
-    template = add_pref_instruction(PM_ANSWER_PROMPT, frame=frame)
-    prompt = template.format(
+def pm_response(llm_client, context, question, options):
+    prompt = PM_ANSWER_PROMPT.format(
         question=question,
         context=context,
         options=options,
@@ -68,19 +66,17 @@ def pm_response(llm_client, context, question, options, frame):
     return result


-def process_qa(user_id, search_result, num_runs, llm_client, frame):
+def process_qa(user_id, search_result, num_runs, llm_client):
     search_result = search_result[0]
     question = search_result.get("question")
     context = search_result.get("search_context", "")
     options = search_result.get("all_options", [])

-    context = remove_pref_mem_from_mem_string(context, frame=frame)
-
     run_results = []

     for idx in range(num_runs):
         start = time()
-        answer = pm_response(llm_client, context, question, options, frame)
+        answer = pm_response(llm_client, context, question, options)
         is_correct, answer = extract_choice_answer(answer, search_result.get("golden_answer", ""))
         response_duration_ms = (time() - start) * 1000

@@ -154,9 +150,7 @@ def main(frame, version, num_runs=3, num_workers=4):
     future_to_user_id = {}

     for user_id, search_results in pm_search_results.items():
-        future = executor.submit(
-            process_qa, user_id, search_results, num_runs, oai_client, frame
-        )
+        future = executor.submit(process_qa, user_id, search_results, num_runs, oai_client)
         future_to_user_id[future] = user_id

     for future in tqdm(

evaluation/scripts/personamem/pm_search.py

Lines changed: 4 additions & 2 deletions
@@ -14,7 +14,6 @@

 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

-from utils.pref_mem_utils import create_mem_string
 from utils.prompts import (
     MEM0_CONTEXT_TEMPLATE,
     MEM0_GRAPH_CONTEXT_TEMPLATE,
@@ -83,7 +82,10 @@ def memobase_search(client, query, user_id, top_k):
 def memos_search(client, user_id, query, top_k):
     start = time()
     results = client.search(query=query, user_id=user_id, top_k=top_k)
-    search_memories = create_mem_string(results)
+    search_memories = (
+        "\n".join(item["memory"] for cube in results["text_mem"] for item in cube["memories"])
+        + f"\n{results['pref_mem']}"
+    )
     context = MEMOS_CONTEXT_TEMPLATE.format(user_id=user_id, memories=search_memories)

     duration_ms = (time() - start) * 1000
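Note one difference from the other scripts: pm_search.py joins memories from every cube in results["text_mem"], while the locomo and longmemeval scripts read only the first cube (results["text_mem"][0]). A small sketch with illustrative data showing how the two access patterns differ:

# Illustrative data only; the cube/memory field names follow the diff above.
results = {
    "text_mem": [
        {"memories": [{"memory": "fact from cube 0"}]},
        {"memories": [{"memory": "fact from cube 1"}]},
    ],
    "pref_mem": "user preference summary",
}

first_cube_only = "\n".join(i["memory"] for i in results["text_mem"][0]["memories"])
# -> "fact from cube 0"

all_cubes = "\n".join(
    item["memory"] for cube in results["text_mem"] for item in cube["memories"]
)
# -> "fact from cube 0\nfact from cube 1"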

evaluation/scripts/utils/pref_mem_utils.py

Lines changed: 0 additions & 43 deletions
This file was deleted.
