Skip to content

Commit a886c34

Browse files
authored
Merge pull request #143 from e06084/dev
feat: add GPT-5 Hallucinations eval (no Web search)
2 parents 9c16fb2 + 033cdbf commit a886c34

File tree

7 files changed

+702
-0
lines changed

7 files changed

+702
-0
lines changed

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ Our evaluation system includes:
181181
- **Classification Metrics**: Topic categorization and content classification
182182
- **Multimodality Assessment Metrics**: Image classification and relevance evaluation
183183
- **Rule-Based Quality Metrics**: Automated quality checks using heuristic rules for effectiveness and similarity detection
184+
- **Factuality Assessment Metrics**: Two-stage factuality evaluation based on GPT-5 System Card
184185
- etc
185186

186187
Most metrics are backed by academic sources to ensure objectivity and scientific rigor.
@@ -215,6 +216,12 @@ For detailed guidance on using Dingo's hallucination detection capabilities, inc
215216

216217
📖 **[View Hallucination Detection Guide →](docs/hallucination_guide.md)**
217218

219+
### Factuality Assessment
220+
221+
For comprehensive guidance on using Dingo's two-stage factuality evaluation system:
222+
223+
📖 **[View Factuality Assessment Guide →](docs/factcheck_guide.md)**
224+
218225
# Rule Groups
219226

220227
Dingo provides pre-configured rule groups for different types of datasets:

README_ja.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,7 @@ Dingoはルールベースおよびプロンプトベースの評価メトリク
178178
- **分類メトリクス**: トピック分類とコンテンツ分類
179179
- **マルチモーダル評価メトリクス**: 画像分類と関連性評価
180180
- **ルールベース品質メトリクス**: ヒューリスティックルールによる効果性と類似性検出を用いた自動品質チェック
181+
- **事実性評価メトリクス**: GPT-5 System Cardに基づく二段階事実性評価
181182
- など
182183

183184
大部分のメトリクスは学術的なソースによって支持されており、客観性と科学的厳密性を保証しています。
@@ -212,6 +213,12 @@ HHEM-2.1-Openローカル推論とLLMベース評価を含む、Dingoの幻覚
212213

213214
📖 **[幻覚検出ガイドを見る →](docs/hallucination_guide.md)**
214215

216+
### 事実性評価
217+
218+
Dingoの二段階事実性評価システムの使用に関する詳細なガイダンス:
219+
220+
📖 **[事実性評価ガイドを見る →](docs/factcheck_guide.md)**
221+
215222
# ルールグループ
216223

217224
Dingoは異なるタイプのデータセット用に事前設定されたルールグループを提供します:

README_zh-CN.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@ Dingo通过基于规则和基于提示的评估指标提供全面的数据质量
179179
- **分类指标**:主题分类和内容分类
180180
- **多模态评估指标**:图像分类和相关性评估
181181
- **基于规则的质量指标**:使用启发式规则进行效果性和相似性检测的自动化质量检查
182+
- **事实性评估指标**:基于 GPT-5 System Card 的两阶段事实性评估
182183
- 等等
183184

184185
大部分指标都由学术来源支持,以确保客观性和科学严谨性。
@@ -213,6 +214,12 @@ input_data = {
213214

214215
📖 **[查看幻觉检测指南 →](docs/hallucination_guide.md)**
215216

217+
### 事实性评估
218+
219+
有关使用Dingo两阶段事实性评估系统的详细指导:
220+
221+
📖 **[查看事实性评估指南 →](docs/factcheck_guide.md)**
222+
216223
# 规则组
217224

218225
Dingo为不同类型的数据集提供预配置的规则组:
Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
import json
from dataclasses import dataclass
from typing import Dict, List, Literal, Optional

from dingo.io import Data
from dingo.model import Model
from dingo.model.llm.base_openai import BaseOpenAI
from dingo.model.modelres import ModelRes
from dingo.model.prompt.prompt_factcheck import PromptFactCheck
from dingo.utils.exception import ExceedMaxTokens
10+
11+
12+
@dataclass
class Evidence:
    """One piece of verification evidence attached to a fact-check verdict."""

    # Where the evidence came from (as reported by the checker).
    url: str
    # Excerpt quoted from the source.
    snippet: str
    # Short summary of how the snippet relates to the claim.
    summary: str
18+
19+
20+
@dataclass
class FactCheckResult:
    """Verification verdict for a single extracted claim."""

    # The factual claim being checked, verbatim.
    claim: str
    # Three-way verdict emitted by the checker.
    answer: Literal["true", "false", "unsure"]
    # Checker's free-text justification for the verdict.
    reasoning: str
    # Evidence items backing the verdict (may be empty).
    supporting_evidence: List[Evidence]
27+
28+
29+
@Model.prompt_register(metric_type="QUALITY_BAD_FACTUALITY", group=["factuality"])
@Model.llm_register("LLMFactCheckPublic")
class LLMFactCheckPublic(BaseOpenAI):
    """Public factuality evaluator based on the GPT-5 System Card.

    Two-stage pipeline:
      1. Extract discrete factual claims from a model response.
      2. Verify the claims (in batches) and aggregate the per-claim
         verdicts into a factual ratio compared against ``threshold``.
    """

    _metric_info = {
        "category": "Factuality Assessment",
        "quality_dimension": "FACTUAL_CORRECTNESS",
        "metric_name": "LLMFactCheckPublic",
        "description": "Two-stage factuality evaluation pipeline from GPT-5",
        "paper_title": "GPT-5 System Card",
        "paper_url": "https://cdn.openai.com/pdf/8124a3ce-ab78-4f06-96eb-49ea29ffb52f/gpt5-system-card-aug7.pdf",
        "paper_authors": "OpenAI"
    }

    prompt = PromptFactCheck
    threshold = 0.8     # minimum factual_ratio required to pass
    batch_size = 10     # number of claims verified per LLM request
    web_enabled = True  # when False, the *_NO_WEB prompt suffixes are appended

    @classmethod
    def eval(cls, input_data: Data) -> ModelRes:
        """Run the two-stage factuality evaluation on ``input_data``.

        Returns a ModelRes whose score is the fraction of claims judged
        "true". Never raises: any failure is reported as a zero-score
        result with the error message in ``reason``.
        """
        try:
            # 0. Lazily create the OpenAI client on first use.
            if cls.client is None:
                cls.create_client()

            # 1. Stage one: extract factual claims from the response.
            claims = cls._extract_claims(input_data.prompt, input_data.content)
            if not claims:
                return ModelRes(
                    score=0.0,
                    threshold=cls.threshold,
                    reason=["No factual claims found"],
                    raw_resp={"claims": [], "results": []}
                )

            # 2. Stage two: verify claims in batches so each request
            #    stays within a manageable prompt size.
            all_results = []
            for i in range(0, len(claims), cls.batch_size):
                batch = claims[i:i + cls.batch_size]
                results = cls._verify_claims(input_data.prompt, input_data.content, batch)
                all_results.extend(results)

            # 3. Aggregate per-claim verdicts.
            metrics = cls._calculate_metrics(all_results)

            # 4. Assemble the result payload.
            result = ModelRes(
                score=metrics["factual_ratio"],
                threshold=cls.threshold,
                reason=[cls._format_reason(metrics)],
                raw_resp={
                    "claims": claims,
                    "results": all_results,
                    "metrics": metrics
                }
            )

            # 5. Pass/fail status from the factual ratio.
            if metrics["factual_ratio"] < cls.threshold:
                result.error_status = True
                result.type = "QUALITY_BAD_FACTUALITY"
                result.name = "FACTUALITY_CHECK_FAILED"
            else:
                result.type = "QUALITY_GOOD"
                result.name = "FACTUALITY_CHECK_PASSED"

            return result

        except Exception as e:
            # Boundary handler: evaluation must not raise to callers.
            return ModelRes(
                score=0.0,
                threshold=cls.threshold,
                reason=[f"Evaluation failed: {str(e)}"],
                raw_resp={"error": str(e)}
            )

    @classmethod
    def _extract_claims(cls, prompt: str, response: str) -> List[str]:
        """Stage one: ask the LLM to list the factual claims in ``response``.

        Raises ValueError when the reply cannot be parsed as a JSON list.
        """
        # NOTE(review): str.format will raise if the prompt templates
        # contain literal braces — confirm the templates are brace-safe.
        template = PromptFactCheck.CLAIM_LISTING
        if not cls.web_enabled:
            template += PromptFactCheck.CLAIM_LISTING_NO_WEB
        messages = [
            {"role": "user", "content": template.format(prompt=prompt, response=response)}
        ]
        result = cls.send_messages(messages)
        try:
            claims = cls._parse_json_list(result)
            # Keep only non-empty string claims: the model occasionally
            # emits blanks or non-string items, and a non-string would
            # crash on .strip().
            return [c for c in claims if isinstance(c, str) and c.strip()]
        except Exception as e:
            raise ValueError(f"Failed to parse claims: {str(e)}")

    @classmethod
    def _verify_claims(cls,
                       prompt: str,
                       response: str,
                       claims: List[str]) -> List[FactCheckResult]:
        """Stage two: verify one batch of ``claims`` against ``response``.

        Raises ValueError when the reply cannot be parsed into results.
        """
        template = PromptFactCheck.FACT_CHECKING
        if not cls.web_enabled:
            template += PromptFactCheck.FACT_CHECKING_NO_WEB
        # ``claims`` is interpolated by str.format, so the prompt receives
        # the Python list repr of the batch.
        messages = [
            {"role": "user", "content": template.format(
                prompt=prompt,
                response=response,
                claims=claims
            )}
        ]
        result = cls.send_messages(messages)
        try:
            return cls._parse_check_results(result)
        except Exception as e:
            raise ValueError(f"Failed to parse check results: {str(e)}")

    @classmethod
    def _calculate_metrics(cls, results: List[FactCheckResult]) -> Dict:
        """Aggregate per-claim verdicts into summary metrics."""
        total = len(results)
        if total == 0:
            # Degenerate case: nothing verifiable.
            return {
                "factual_ratio": 0.0,
                "true_count": 0,
                "false_count": 0,
                "unsure_count": 0,
                "total_claims": 0
            }

        counts = {
            "true": sum(1 for r in results if r.answer == "true"),
            "false": sum(1 for r in results if r.answer == "false"),
            "unsure": sum(1 for r in results if r.answer == "unsure")
        }

        return {
            "factual_ratio": counts["true"] / total,
            "true_count": counts["true"],
            "false_count": counts["false"],
            "unsure_count": counts["unsure"],
            "total_claims": total
        }

    @classmethod
    def _format_reason(cls, metrics: Dict) -> str:
        """Render a one-line human-readable summary of ``metrics``."""
        return (
            f"Found {metrics['total_claims']} claims: "
            f"{metrics['true_count']} true, "
            f"{metrics['false_count']} false, "
            f"{metrics['unsure_count']} unsure. "
            f"Factual ratio: {metrics['factual_ratio']:.2%}"
        )

    @staticmethod
    def _extract_json_array(text: str, missing_msg: str) -> str:
        """Return the first '['..last ']' span of ``text``.

        Raises ValueError(missing_msg) when no bracket pair is present.
        Shared by both reply parsers below.
        """
        start = text.find("[")
        end = text.rfind("]") + 1
        if start == -1 or end == 0:
            raise ValueError(missing_msg)
        return text[start:end]

    @classmethod
    def _parse_json_list(cls, text: str) -> List[str]:
        """Parse a JSON array of claims out of an LLM reply.

        Raises ValueError on malformed or non-list JSON.
        """
        try:
            parsed = json.loads(cls._extract_json_array(text, "No JSON list found"))
            if not isinstance(parsed, list):
                # json.loads can yield any JSON value; callers expect a list.
                raise ValueError("Top-level JSON value is not a list")
            return parsed
        except Exception as e:
            raise ValueError(f"Invalid JSON format: {str(e)}")

    @classmethod
    def _parse_check_results(cls, text: str) -> List[FactCheckResult]:
        """Parse the fact-checking reply into FactCheckResult records.

        Raises ValueError on malformed JSON or missing required keys.
        """
        try:
            data = json.loads(cls._extract_json_array(text, "No JSON results found"))

            results = []
            for item in data:
                # Missing/None evidence is treated as "no evidence"
                # instead of failing the whole batch.
                evidence_list = [
                    Evidence(**e) for e in (item.get("supporting_evidence") or [])
                ]
                results.append(FactCheckResult(
                    claim=item["claim"],
                    # Normalize casing/whitespace so e.g. "True"/"FALSE"
                    # are still counted by _calculate_metrics.
                    answer=str(item["answer"]).strip().lower(),
                    reasoning=item["reasoning"],
                    supporting_evidence=evidence_list
                ))
            return results
        except Exception as e:
            raise ValueError(f"Invalid results format: {str(e)}")

    @classmethod
    def send_messages(cls, messages: List[Dict]) -> str:
        """Send a chat-completion request directly.

        Overrides the base implementation to avoid calling models.list().

        Raises:
            ValueError: when no model name is configured.
            ExceedMaxTokens: when the completion was truncated by max_tokens.
        """
        if not cls.dynamic_config.model:
            raise ValueError("model name must be specified")

        params = cls.dynamic_config.parameters or {}
        cls.validate_config(params)

        completions = cls.client.chat.completions.create(
            model=cls.dynamic_config.model,
            messages=messages,
            temperature=params.get("temperature", 0.3),
            top_p=params.get("top_p", 1),
            max_tokens=params.get("max_tokens", 4000),
            presence_penalty=params.get("presence_penalty", 0),
            frequency_penalty=params.get("frequency_penalty", 0),
        )

        # finish_reason == "length" means the reply was truncated, so any
        # JSON payload in it is almost certainly incomplete.
        if completions.choices[0].finish_reason == "length":
            raise ExceedMaxTokens(
                f"Exceed max tokens: {params.get('max_tokens', 4000)}"
            )

        return str(completions.choices[0].message.content)

0 commit comments

Comments
 (0)