
Commit 515b735

Feat: sync test to dev (#375)
## Description

<!-- Please include a summary of the changes below; fill in the issue number that this PR addresses (if applicable); fill in the related MemOS-Docs repository issue or PR link (if applicable); mention the person who will review this PR (if you know who it is); replace (summary), (issue), (docs-issue-or-pr-link), and (reviewer) with the appropriate information. -->

Summary: (summary)

Fix: #(issue)

Docs Issue/PR: (docs-issue-or-pr-link)

Reviewer: @(reviewer)

## Checklist:

- [ ] I have performed a self-review of my own code
- [ ] I have commented my code in hard-to-understand areas
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] I have created a related documentation issue/PR in [MemOS-Docs](https://github.com/MemTensor/MemOS-Docs) (if applicable)
- [ ] I have linked the issue to this PR (if applicable)
- [ ] I have mentioned the person who will review this PR
2 parents 921a9dc + d64c6ba commit 515b735

File tree

73 files changed: +5,237 additions, −860 deletions


evaluation/.env-example

Lines changed: 21 additions & 0 deletions
```diff
@@ -9,3 +9,24 @@ ZEP_API_KEY="z_***REDACTED***"
 CHAT_MODEL="gpt-4o-mini"
 CHAT_MODEL_BASE_URL="http://***.***.***.***:3000/v1"
 CHAT_MODEL_API_KEY="sk-***REDACTED***"
+
+# Configuration Only For Scheduler
+# RabbitMQ Configuration
+MEMSCHEDULER_RABBITMQ_HOST_NAME=rabbitmq-cn-***.cn-***.amqp-32.net.mq.amqp.aliyuncs.com
+MEMSCHEDULER_RABBITMQ_USER_NAME=***
+MEMSCHEDULER_RABBITMQ_PASSWORD=***
+MEMSCHEDULER_RABBITMQ_VIRTUAL_HOST=memos
+MEMSCHEDULER_RABBITMQ_ERASE_ON_CONNECT=true
+MEMSCHEDULER_RABBITMQ_PORT=5672
+
+# OpenAI Configuration
+MEMSCHEDULER_OPENAI_API_KEY=sk-***
+MEMSCHEDULER_OPENAI_BASE_URL=http://***.***.***.***:3000/v1
+MEMSCHEDULER_OPENAI_DEFAULT_MODEL=gpt-4o-mini
+
+# Graph DB Configuration
+MEMSCHEDULER_GRAPHDBAUTH_URI=bolt://localhost:7687
+MEMSCHEDULER_GRAPHDBAUTH_USER=neo4j
+MEMSCHEDULER_GRAPHDBAUTH_PASSWORD=***
+MEMSCHEDULER_GRAPHDBAUTH_DB_NAME=neo4j
+MEMSCHEDULER_GRAPHDBAUTH_AUTO_CREATE=true
```
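The new `MEMSCHEDULER_*` variables follow the same flat `SECTION_KEY=value` convention as the rest of the file. A minimal sketch of how a consumer might read them, assuming `python-dotenv` (which these evaluation scripts already import); the `load_scheduler_config` helper is hypothetical, not part of the commit:

```python
import os

from dotenv import load_dotenv  # python-dotenv, already a dependency of the eval scripts


def load_scheduler_config() -> dict:
    """Read the MEMSCHEDULER_* settings from .env into a plain dict (sketch only)."""
    load_dotenv()  # reads .env from the current working directory
    return {
        "rabbitmq_host": os.getenv("MEMSCHEDULER_RABBITMQ_HOST_NAME"),
        "rabbitmq_port": int(os.getenv("MEMSCHEDULER_RABBITMQ_PORT", "5672")),
        "rabbitmq_vhost": os.getenv("MEMSCHEDULER_RABBITMQ_VIRTUAL_HOST", "memos"),
        "openai_model": os.getenv("MEMSCHEDULER_OPENAI_DEFAULT_MODEL", "gpt-4o-mini"),
        "graphdb_uri": os.getenv("MEMSCHEDULER_GRAPHDBAUTH_URI", "bolt://localhost:7687"),
    }
```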

evaluation/__init__.py

Whitespace-only changes.

evaluation/scripts/__init__.py

Whitespace-only changes.

evaluation/scripts/temporal_locomo/models/__init__.py

Whitespace-only changes.

evaluation/scripts/temporal_locomo/locomo_eval.py renamed to evaluation/scripts/temporal_locomo/models/locomo_eval.py

Lines changed: 132 additions & 18 deletions
```diff
@@ -9,7 +9,6 @@

 from bert_score import score as bert_score
 from dotenv import load_dotenv
-from modules.locomo_eval_module import LocomoEvalModelModules
 from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
 from nltk.translate.meteor_score import meteor_score
 from openai import AsyncOpenAI
@@ -19,6 +18,7 @@
 from sentence_transformers import SentenceTransformer
 from tqdm import tqdm

+from evaluation.scripts.temporal_locomo.modules.locomo_eval_module import LocomoEvalModelModules
 from memos.log import get_logger


```
```diff
@@ -281,33 +281,64 @@ def __init__(self, args):
             api_key=os.getenv("OPENAI_API_KEY"), base_url=os.getenv("OPENAI_BASE_URL")
         )

-    async def run(self):
-        print(
-            f"\n=== Starting LoCoMo evaluation for {self.frame} (version: {self.version}) with {self.num_runs} run(s) per question ==="
-        )
-        print(f"Using {self.max_workers} concurrent workers for processing groups")
+    def _load_response_data(self):
+        """
+        Load response data from the response path file.

+        Returns:
+            dict: The loaded response data
+        """
         with open(self.response_path) as file:
-            locomo_responses = json.load(file)
+            return json.load(file)

-        num_users = 10
+    def _load_existing_evaluation_results(self):
+        """
+        Attempt to load existing evaluation results from the judged path.
+        If the file doesn't exist or there's an error loading it, return an empty dict.
+
+        Returns:
+            dict: Existing evaluation results or empty dict if none available
+        """
         all_grades = {}
+        try:
+            if os.path.exists(self.judged_path):
+                with open(self.judged_path) as f:
+                    all_grades = json.load(f)
+                print(f"Loaded existing evaluation results from {self.judged_path}")
+        except Exception as e:
+            print(f"Error loading existing evaluation results: {e}")

-        total_responses_count = sum(
-            len(locomo_responses.get(f"locomo_exp_user_{i}", [])) for i in range(num_users)
-        )
-        print(f"Found {total_responses_count} total responses across {num_users} users to evaluate")
+        return all_grades
+
+    def _create_evaluation_tasks(self, locomo_responses, all_grades, num_users):
+        """
+        Create evaluation tasks for groups that haven't been evaluated yet.
+
+        Args:
+            locomo_responses (dict): The loaded response data
+            all_grades (dict): Existing evaluation results
+            num_users (int): Number of user groups to process

-        # Create tasks for processing each group
+        Returns:
+            tuple: (tasks list, active users count)
+        """
         tasks = []
         active_users = 0
+
         for group_idx in range(num_users):
             group_id = f"locomo_exp_user_{group_idx}"
             group_responses = locomo_responses.get(group_id, [])
+
             if not group_responses:
                 print(f"No responses found for group {group_id}")
                 continue

+            # Skip groups that already have evaluation results
+            if all_grades.get(group_id):
+                print(f"Skipping group {group_id} as it already has evaluation results")
+                active_users += 1
+                continue
+
             active_users += 1
             tasks.append(
                 process_single_group(
```
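The `_load_existing_evaluation_results` / `_create_evaluation_tasks` pair turns the evaluator into a resumable job: previously judged groups are loaded from disk and skipped rather than re-evaluated. A distilled, standalone sketch of that checkpoint-and-skip pattern (function names are illustrative, not from the diff):

```python
import json
import os


def load_checkpoint(path: str) -> dict:
    """Return previously saved per-group results, or {} on a fresh run."""
    if os.path.exists(path):
        try:
            with open(path) as f:
                return json.load(f)
        except (OSError, json.JSONDecodeError) as e:
            print(f"Error loading existing results: {e}")
    return {}


def pending_groups(all_groups: dict, done: dict) -> list:
    """Keep only non-empty groups that have no saved results yet."""
    return [
        gid for gid, responses in all_groups.items()
        if responses and not done.get(gid)
    ]
```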
```diff
@@ -319,29 +350,50 @@ async def run(self):
                 )
             )

-        print(f"Starting evaluation of {active_users} user groups with responses")
+        return tasks, active_users
+
+    async def _process_tasks(self, tasks):
+        """
+        Process evaluation tasks with concurrency control.
+
+        Args:
+            tasks (list): List of tasks to process
+
+        Returns:
+            list: Results from processing all tasks
+        """
+        if not tasks:
+            return []

         semaphore = asyncio.Semaphore(self.max_workers)

         async def limited_task(task):
+            """Helper function to limit concurrent task execution"""
             async with semaphore:
                 return await task

         limited_tasks = [limited_task(task) for task in tasks]
-        group_results = await asyncio.gather(*limited_tasks)
+        return await asyncio.gather(*limited_tasks)

-        for group_id, graded_responses in group_results:
-            all_grades[group_id] = graded_responses
+    def _calculate_scores(self, all_grades):
+        """
+        Calculate evaluation scores based on all grades.

-        print("\n=== Evaluation Complete: Calculating final scores ===")
+        Args:
+            all_grades (dict): The complete evaluation results

+        Returns:
+            tuple: (run_scores, evaluated_count)
+        """
         run_scores = []
         evaluated_count = 0
+
         if self.num_runs > 0:
             for i in range(1, self.num_runs + 1):
                 judgment_key = f"judgment_{i}"
                 current_run_correct_count = 0
                 current_run_total_count = 0
+
                 for group in all_grades.values():
                     for response in group:
                         if judgment_key in response["llm_judgments"]:
```
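`_process_tasks` keeps the existing semaphore trick: wrap each coroutine so only `max_workers` of them can be awaited at once, then hand the wrappers to `asyncio.gather`. The same pattern in isolation, as a runnable sketch (names hypothetical):

```python
import asyncio


async def gather_bounded(coros, max_workers: int) -> list:
    """Run coroutines concurrently, but at most max_workers at a time."""
    semaphore = asyncio.Semaphore(max_workers)

    async def limited(coro):
        async with semaphore:  # waits while max_workers awaits are in flight
            return await coro

    return await asyncio.gather(*(limited(c) for c in coros))


async def main():
    async def job(i):
        await asyncio.sleep(0.1)
        return i

    print(await gather_bounded([job(i) for i in range(10)], max_workers=3))


if __name__ == "__main__":
    asyncio.run(main())
```

This works because coroutine objects are lazy: wrapping them in `limited` does not start them, so nothing runs until the semaphore admits it.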
```diff
@@ -355,6 +407,16 @@ async def limited_task(task):

                 evaluated_count = current_run_total_count

+        return run_scores, evaluated_count
+
+    def _report_scores(self, run_scores, evaluated_count):
+        """
+        Report evaluation scores to the console.
+
+        Args:
+            run_scores (list): List of accuracy scores for each run
+            evaluated_count (int): Number of evaluated responses
+        """
         if evaluated_count > 0:
             mean_of_scores = np.mean(run_scores)
             std_of_scores = np.std(run_scores)
```
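`_calculate_scores` yields one LLM-as-a-Judge accuracy per run, and `_report_scores` summarizes across runs with `np.mean`/`np.std`. The arithmetic being reported, shown in isolation (the scores below are invented for illustration, not results from the commit):

```python
import numpy as np

# One accuracy per run, e.g. correct / total for judgment_1 .. judgment_3
run_scores = [0.82, 0.85, 0.80]  # illustrative values only

mean_of_scores = np.mean(run_scores)  # average accuracy across runs
std_of_scores = np.std(run_scores)    # run-to-run variability

print(f"LLM-as-a-Judge score: {mean_of_scores:.4f} ± {std_of_scores:.4f}")
```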
```diff
@@ -368,11 +430,63 @@ async def limited_task(task):
             print("No responses were evaluated")
             print("LLM-as-a-Judge score: N/A (0/0)")

+    def _save_results(self, all_grades):
+        """
+        Save evaluation results to the judged path file.
+
+        Args:
+            all_grades (dict): The complete evaluation results to save
+        """
         all_grades = convert_numpy_types(all_grades)
         with open(self.judged_path, "w") as f:
             json.dump(all_grades, f, indent=2)
         print(f"Saved detailed evaluation results to {self.judged_path}")

+    async def run(self):
+        """
+        Main execution method for the LoCoMo evaluation process.
+        This method orchestrates the entire evaluation workflow:
+        1. Loads existing evaluation results if available
+        2. Processes only groups that haven't been evaluated yet
+        3. Calculates and reports final evaluation scores
+        """
+        print(
+            f"\n=== Starting LoCoMo evaluation for {self.frame} (version: {self.version}) with {self.num_runs} run(s) per question ==="
+        )
+        print(f"Using {self.max_workers} concurrent workers for processing groups")
+
+        # Load response data and existing evaluation results
+        locomo_responses = self._load_response_data()
+        all_grades = self._load_existing_evaluation_results()
+
+        # Count total responses for reporting
+        num_users = 10
+        total_responses_count = sum(
+            len(locomo_responses.get(f"locomo_exp_user_{i}", [])) for i in range(num_users)
+        )
+        print(f"Found {total_responses_count} total responses across {num_users} users to evaluate")
+
+        # Create tasks only for groups that haven't been evaluated yet
+        tasks, active_users = self._create_evaluation_tasks(locomo_responses, all_grades, num_users)
+        print(
+            f"Starting evaluation of {len(tasks)} user groups with responses (out of {active_users} active users)"
+        )
+
+        # Process tasks and update all_grades with results
+        if tasks:
+            group_results = await self._process_tasks(tasks)
+            for group_id, graded_responses in group_results:
+                all_grades[group_id] = graded_responses
+
+        print("\n=== Evaluation Complete: Calculating final scores ===")
+
+        # Calculate and report scores
+        run_scores, evaluated_count = self._calculate_scores(all_grades)
+        self._report_scores(run_scores, evaluated_count)
+
+        # Save results
+        self._save_results(all_grades)
+

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
```
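`_save_results` passes the grades through `convert_numpy_types` before `json.dump`, because the `json` module cannot serialize numpy scalars or arrays. A plausible implementation of such a helper, for context — the real one is defined elsewhere in the file and may differ:

```python
import json

import numpy as np


def convert_numpy_types(obj):
    """Recursively replace numpy scalars/arrays with plain Python types."""
    if isinstance(obj, dict):
        return {k: convert_numpy_types(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [convert_numpy_types(v) for v in obj]
    if isinstance(obj, np.generic):  # np.float64, np.int64, np.bool_, ...
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return obj


print(json.dumps(convert_numpy_types({"acc": np.float64(0.85)})))  # {"acc": 0.85}
```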

evaluation/scripts/temporal_locomo/locomo_ingestion.py renamed to evaluation/scripts/temporal_locomo/models/locomo_ingestion.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -6,16 +6,16 @@
 from datetime import datetime, timezone
 from pathlib import Path

-from modules.constants import (
+from tqdm import tqdm
+
+from evaluation.scripts.temporal_locomo.modules.constants import (
     MEM0_GRAPH_MODEL,
     MEM0_MODEL,
     MEMOS_MODEL,
     MEMOS_SCHEDULER_MODEL,
     ZEP_MODEL,
 )
-from modules.locomo_eval_module import LocomoEvalModelModules
-from tqdm import tqdm
-
+from evaluation.scripts.temporal_locomo.modules.locomo_eval_module import LocomoEvalModelModules
 from memos.log import get_logger


```
evaluation/scripts/temporal_locomo/locomo_metric.py renamed to evaluation/scripts/temporal_locomo/models/locomo_metric.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -4,7 +4,7 @@
 import numpy as np
 import pandas as pd

-from modules.locomo_eval_module import LocomoEvalModelModules
+from evaluation.scripts.temporal_locomo.modules.locomo_eval_module import LocomoEvalModelModules


 # Category mapping as per your request
```