Skip to content

Commit 2387f6d

Browse files
authored
feat(eval): add longmemeval evaluation pipeline (#104)
* feat(eval): add eval dependencies
* feat(eval): add configs example
* docs(eval): update README.md
* feat(eval): remove the dependency (pydantic)
* feat(eval): add run locomo eval script
* fix(eval): delete about memos redundant search branches
* chore: fix format
* feat(eval): add openai memory on locomo - eval guide
* docs(eval): modify openai memory on locomo - eval guide
* feat(eval): add longmemeval evaluation pipeline
* chore(eval): formatter
* chore: update
* feat(eval): add configs example
1 parent 90fbbfa commit 2387f6d

File tree

18 files changed

+1980
-5
lines changed

18 files changed

+1980
-5
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@ tmp/
66
**/tmp_data/
77

88
# evaluation data
9-
evaluation/data/langmemeval
109
evaluation/*tmp/
1110
evaluation/results
1211
evaluation/.env
12+
!evaluation/configs-example/*.json
1313
evaluation/configs/*
1414
**tree_textual_memory_locomo**
1515
.env
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
{
2+
"user_id": "__USER_ID__",
3+
"cube_id": "__USER_ID__",
4+
"text_mem": {
5+
"backend": "tree_text",
6+
"config": {
7+
"extractor_llm": {
8+
"backend": "openai",
9+
"config": {
10+
"model_name_or_path": "gpt-4o-mini",
11+
"temperature": 0.8,
12+
"max_tokens": 1024,
13+
"top_p": 0.9,
14+
"top_k": 50,
15+
"api_key": "sk-***REDACTED***",
16+
"api_base": "http://***.***.***.***:3000/v1"
17+
}
18+
},
19+
"dispatcher_llm": {
20+
"backend": "openai",
21+
"config": {
22+
"model_name_or_path": "gpt-4o-mini",
23+
"temperature": 0.8,
24+
"max_tokens": 1024,
25+
"top_p": 0.9,
26+
"top_k": 50,
27+
"api_key": "sk-***REDACTED***",
28+
"api_base": "http://***.***.***.***:3000/v1"
29+
}
30+
},
31+
"graph_db": {
32+
"backend": "neo4j",
33+
"config": {
34+
"uri": "bolt://***.***.***.***:7687",
35+
"user": "***REDACTED***",
36+
"password": "***REDACTED***",
37+
"db_name": "__DB_NAME__",
38+
"auto_create": true
39+
}
40+
},
41+
"embedder": {
42+
"backend": "ollama",
43+
"config": {
44+
"model_name_or_path": "nomic-embed-text:latest"
45+
}
46+
}
47+
}
48+
},
49+
"act_mem": {},
50+
"para_mem": {}
51+
}
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
{
2+
"user_id": "root",
3+
"chat_model": {
4+
"backend": "openai",
5+
"config": {
6+
"model_name_or_path": "gpt-4o-mini",
7+
"api_key": "sk-***REDACTED***",
8+
"api_base": "http://***.***.***.***:3000/v1",
9+
"temperature": 0.1,
10+
"remove_think_prefix": true,
11+
"max_tokens": 4096
12+
}
13+
},
14+
"mem_reader": {
15+
"backend": "simple_struct",
16+
"config": {
17+
"llm": {
18+
"backend": "openai",
19+
"config": {
20+
"model_name_or_path": "gpt-4o-mini",
21+
"temperature": 0.8,
22+
"max_tokens": 1024,
23+
"top_p": 0.9,
24+
"top_k": 50,
25+
"api_key": "sk-***REDACTED***",
26+
"api_base": "http://***.***.***.***:3000/v1"
27+
}
28+
},
29+
"embedder": {
30+
"backend": "ollama",
31+
"config": {
32+
"model_name_or_path": "nomic-embed-text:latest"
33+
}
34+
},
35+
"chunker": {
36+
"backend": "sentence",
37+
"config": {
38+
"tokenizer_or_token_counter": "gpt2",
39+
"chunk_size": 512,
40+
"chunk_overlap": 128,
41+
"min_sentences_per_chunk": 1
42+
}
43+
}
44+
}
45+
},
46+
"max_turns_window": 30,
47+
"top_k": "__TOP_K__",
48+
"enable_textual_memory": true,
49+
"enable_activation_memory": false,
50+
"enable_parametric_memory": false
51+
}

evaluation/data/longmemeval/.gitkeep

Whitespace-only changes.

evaluation/scripts/locomo/locomo_eval.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ async def locomo_grader(llm_client, question: str, gold_answer: str, response: s
5353
"""
5454

5555
accuracy_prompt = f"""
56-
Your task is to label an answer to a question as ’CORRECT’ or ’WRONG’. You williolw23 be given the following data:
56+
Your task is to label an answer to a question as ’CORRECT’ or ’WRONG’. You will be given the following data:
5757
(1) a question (posed by one user to another user),
5858
(2) a ’gold’ (ground truth) answer,
5959
(3) a generated answer

0 commit comments

Comments
 (0)