Commit f8502c5

Merge pull request #198 from e06084/dev
feat: add factcheck example
2 parents edc4f30 + 7619e1d commit f8502c5

3 files changed, +137 −33 lines changed

dingo/model/llm/llm_factcheck_public.py

Lines changed: 17 additions & 32 deletions
@@ -99,6 +99,9 @@ def eval(cls, input_data: Data) -> ModelRes:
 
         except Exception as e:
             return ModelRes(
+                error_status=True,
+                type="QUALITY_BAD_FACTUALITY",
+                name="FACTUALITY_CHECK_ERROR",
                 score=0.0,
                 threshold=cls.threshold,
                 reason=[f"Evaluation failed: {str(e)}"],
@@ -210,41 +213,23 @@ def _parse_check_results(cls, text: str) -> List[FactCheckResult]:
 
             results = []
             for item in data:
-                evidence_list = [
-                    Evidence(**e) for e in item["supporting_evidence"]
-                ]
+                # Process evidence and make sure every required field is present
+                evidence_list = []
+                for e in item.get("supporting_evidence", []):
+                    # Provide defaults so missing required fields do not break parsing
+                    evidence = Evidence(
+                        url=e.get("url", ""),
+                        snippet=e.get("snippet", ""),  # default avoids a missing field
+                        summary=e.get("summary", "")
+                    )
+                    evidence_list.append(evidence)
+
                 results.append(FactCheckResult(
-                    claim=item["claim"],
-                    answer=item["answer"],
-                    reasoning=item["reasoning"],
+                    claim=item.get("claim", ""),
+                    answer=item.get("answer", "unsure"),  # defaults to "unsure"
+                    reasoning=item.get("reasoning", ""),
                     supporting_evidence=evidence_list
                 ))
             return results
         except Exception as e:
             raise ValueError(f"Invalid results format: {str(e)}")
-
-    @classmethod
-    def send_messages(cls, messages: List) -> str:
-        """Override the send-messages method to avoid using models.list()"""
-        if not cls.dynamic_config.model:
-            raise ValueError("model name must be specified")
-
-        params = cls.dynamic_config.parameters or {}
-        cls.validate_config(params)
-
-        completions = cls.client.chat.completions.create(
-            model=cls.dynamic_config.model,
-            messages=messages,
-            temperature=params.get("temperature", 0.3),
-            top_p=params.get("top_p", 1),
-            max_tokens=params.get("max_tokens", 4000),
-            presence_penalty=params.get("presence_penalty", 0),
-            frequency_penalty=params.get("frequency_penalty", 0),
-        )
-
-        if completions.choices[0].finish_reason == "length":
-            raise ExceedMaxTokens(
-                f"Exceed max tokens: {params.get('max_tokens', 4000)}"
-            )
-
-        return str(completions.choices[0].message.content)
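
Taken together, the changes in this file make _parse_check_results tolerant of model output with missing keys instead of raising KeyError mid-parse. A self-contained sketch of the same pattern, using hypothetical dataclass stand-ins for Evidence and FactCheckResult (the real classes live in dingo and may differ):

    from dataclasses import dataclass, field
    from typing import List

    @dataclass
    class Evidence:            # stand-in, not dingo's class
        url: str
        snippet: str
        summary: str

    @dataclass
    class FactCheckResult:     # stand-in, not dingo's class
        claim: str
        answer: str
        reasoning: str
        supporting_evidence: List[Evidence] = field(default_factory=list)

    def parse_item(item: dict) -> FactCheckResult:
        # Every field falls back to a default, so partial items still parse.
        evidence = [
            Evidence(url=e.get("url", ""), snippet=e.get("snippet", ""), summary=e.get("summary", ""))
            for e in item.get("supporting_evidence", [])
        ]
        return FactCheckResult(
            claim=item.get("claim", ""),
            answer=item.get("answer", "unsure"),
            reasoning=item.get("reasoning", ""),
            supporting_evidence=evidence,
        )

    # An item missing "snippet", "summary", "answer" and "reasoning" no longer raises.
    print(parse_item({"claim": "x", "supporting_evidence": [{"url": "https://example.com"}]}))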

docs/factcheck_guide.md

Lines changed: 0 additions & 1 deletion
@@ -221,7 +221,6 @@ dingo/
 │   └── prompt_factcheck.py                  # evaluation prompt
 └── examples/
     └── factcheck/
-        ├── factcheck_demo.py                # single-item evaluation example
         └── dataset_factcheck_evaluation.py  # dataset evaluation example
 ```
 
Lines changed: 120 additions & 0 deletions
@@ -0,0 +1,120 @@
+"""
+Dataset Factuality Evaluation Example
+
+This example demonstrates how to use Dingo's factuality evaluation capability
+for batch evaluation of datasets, particularly useful for:
+- LLM response validation
+- RAG system evaluation
+- SFT data quality assessment
+"""
+import os
+from pathlib import Path
+
+from dingo.config import InputArgs
+from dingo.exec import Executor
+from dingo.io import Data
+# Force import factuality evaluation modules
+from dingo.model.llm.llm_factcheck_public import LLMFactCheckPublic
+from dingo.model.prompt.prompt_factcheck import PromptFactCheck
+
+OPENAI_MODEL = 'deepseek-chat'
+OPENAI_URL = 'https://api.deepseek.com/v1'
+OPENAI_KEY = os.getenv("OPENAI_KEY")
+
+
+def evaluate_factuality_jsonl_dataset():
+    """
+    Example: Evaluate a JSONL dataset for factuality
+    Expected JSONL format:
+    {"data_id": "1", "prompt": "question", "content": "response"}
+    """
+    print("=== Dataset Factuality Evaluation ===")
+
+    input_data = {
+        "input_path": str(Path("test/data/factcheck_test.jsonl")),  # Your JSONL file path
+        "output_path": "output/factcheck_evaluation/",
+        "dataset": {
+            "source": "local",
+            "format": "jsonl",
+            "field": {
+                "prompt": "question",  # note: "question" is used as the prompt field here
+                "content": "content"
+            }
+        },
+        "executor": {
+            "eval_group": "factuality",  # use the factuality eval group
+            "result_save": {
+                "bad": True,   # save non-factual results
+                "good": True   # save factual results
+            }
+        },
+        "evaluator": {
+            "llm_config": {
+                "LLMFactCheckPublic": {
+                    "model": OPENAI_MODEL,
+                    "key": OPENAI_KEY,
+                    "api_url": OPENAI_URL,
+                }
+            }
+        }
+    }
+
+    input_args = InputArgs(**input_data)
+    executor = Executor.exec_map["local"](input_args)
+    result = executor.execute()
+
+    print("\n=== Evaluation Summary ===")
+    print(f"Total processed: {result.total}")
+    print(f"Factual responses: {result.num_good}")
+    print(f"Non-factual responses: {result.num_bad}")
+    print(f"Overall factuality score: {result.score:.2%}")
+    print(f"\nType distribution: {result.type_ratio}")
+    print(f"Name distribution: {result.name_ratio}")
+
+
+def evaluate_single_data_example():
+    """
+    Example: Evaluate a single piece of data for factuality
+    This is useful for testing or real-time evaluation
+    """
+    print("=== Single Data Factuality Evaluation ===")
+
+    # Configure the evaluator
+    evaluator = LLMFactCheckPublic()
+    evaluator.dynamic_config.model = OPENAI_MODEL
+    evaluator.dynamic_config.key = OPENAI_KEY
+    evaluator.dynamic_config.api_url = OPENAI_URL
+    evaluator.dynamic_config.parameters = {
+        "temperature": 0.1,  # lower randomness for more consistent results
+        "max_tokens": 2000
+    }
+
+    # Create test data
+    test_data = Data(
+        data_id="test_1",
+        prompt="Tell me about Albert Einstein's Nobel Prize.",
+        content="Albert Einstein won the Nobel Prize in Physics in 1921 for his work on the photoelectric effect. However, many people mistakenly think he won it for his theory of relativity, which actually never received a Nobel Prize due to the controversial nature of relativity at the time."
+    )
+    # Run the evaluation
+    result = evaluator.eval(test_data)
+
+    print("\n=== Evaluation Result ===")
+    print(f"Error Status: {result.error_status}")
+    print(f"Type: {result.type}")
+    print(f"Name: {result.name}")
+    print(f"Reason: {result.reason}")
+
+
+if __name__ == "__main__":
+    print("📊 Dingo Factuality Evaluation Examples")
+    print("=" * 60)
+    print()
+
+    # Run examples
+    # print("1. Dataset Evaluation Example")
+    # print("-" * 30)
+    # evaluate_factuality_jsonl_dataset()
+
+    print("2. Single Data Evaluation Example")
+    print("-" * 30)
+    evaluate_single_data_example()