
Commit 1cd4e97

Merge pull request #244 from e06084/dev
feat: add 5 RAG eval metrics
2 parents 674f639 + e479452 commit 1cd4e97

27 files changed, +2790 −20 lines changed

.github/workflows/lint.yml

Lines changed: 32 additions & 2 deletions
@@ -15,7 +15,37 @@ jobs:
         uses: actions/setup-python@v4
         with:
           python-version: "3.10"
-      - name: Run pre-commit
+
+      - name: Install pre-commit
+        run: pip install pre-commit==3.8.0
+
+      - name: Run pre-commit (auto-fix)
+        id: pre_commit_auto_fix
+        run: |
+          # Run pre-commit and allow auto-fixes; do not fail just because files were fixed
+          pre-commit run --all-files || true
+
+      - name: Check for changes
+        id: check_changes
+        run: |
+          if [[ -n $(git status --porcelain) ]]; then
+            echo "changed=true" >> $GITHUB_OUTPUT
+            echo "📝 Files were modified by pre-commit auto-fix"
+          else
+            echo "changed=false" >> $GITHUB_OUTPUT
+            echo "✅ No auto-fix changes"
+          fi
+
+      - name: Commit auto-fix changes
+        if: steps.check_changes.outputs.changed == 'true' && github.event_name == 'push'
+        run: |
+          git config --local user.email "[email protected]"
+          git config --local user.name "GitHub Action"
+          git add -A
+          git commit -m "🎨 Auto-format code with pre-commit"
+          git push
+
+      - name: Run pre-commit (final check)
         run: |
-          pip install pre-commit==3.8.0
+          # Run pre-commit again; this time fail for real if any check reports errors
          pre-commit run --all-files

README.md

Lines changed: 5 additions & 0 deletions
@@ -237,6 +237,10 @@ For detailed guidance on using Dingo's hallucination detection capabilities, inc

 📖 **[View Hallucination Detection Guide →](docs/hallucination_guide.md)**

+For comprehensive guidance on RAG evaluation metrics including Faithfulness, Context Precision, Answer Relevancy, Context Recall, and Context Relevancy:
+
+📖 **[View RAG Evaluation Metrics Guide →](docs/rag_evaluation_metrics_zh.md)**
+
 ### Factuality Assessment

 For comprehensive guidance on using Dingo's two-stage factuality evaluation system:
@@ -431,6 +435,7 @@ The current built-in detection rules and model methods focus on common data qual
 - [RedPajama-Data](https://github.com/togethercomputer/RedPajama-Data)
 - [mlflow](https://github.com/mlflow/mlflow)
 - [deepeval](https://github.com/confident-ai/deepeval)
+- [ragas](https://github.com/explodinggradients/ragas)

 # Contribution
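
The metrics mentioned in the README addition above read their inputs from a shared record shape, visible in the evaluator sources added later in this commit. A minimal illustrative record (field names follow those evaluators' docstrings; the values are placeholders, not from the commit):

    # Illustrative RAG evaluation record; keys match the new evaluators' docstrings,
    # values are made up for the example.
    rag_record = {
        "question": "Which rocket launched Apollo 11?",
        "answer": "Apollo 11 was launched on a Saturn V rocket.",
        "contexts": [
            "Apollo 11 launched on 16 July 1969 atop a Saturn V from Kennedy Space Center.",
        ],
    }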

README_ja.md

Lines changed: 1 addition & 0 deletions
@@ -428,6 +428,7 @@ result = executor.execute()
 - [RedPajama-Data](https://github.com/togethercomputer/RedPajama-Data)
 - [mlflow](https://github.com/mlflow/mlflow)
 - [deepeval](https://github.com/confident-ai/deepeval)
+- [ragas](https://github.com/explodinggradients/ragas)

 # 貢献

README_zh-CN.md

Lines changed: 5 additions & 0 deletions
@@ -233,6 +233,10 @@ input_data = {

 📖 **[查看幻觉检测指南 →](docs/hallucination_guide.md)**

+有关RAG评估指标的完整指导,包括忠实度、上下文精度、答案相关性、上下文召回和上下文相关性:
+
+📖 **[查看RAG评估指标指南 →](docs/rag_evaluation_metrics_zh.md)**
+
 ### 事实性评估

 有关使用Dingo两阶段事实性评估系统的详细指导:
@@ -427,6 +431,7 @@ result = executor.execute()
 - [RedPajama-Data](https://github.com/togethercomputer/RedPajama-Data)
 - [mlflow](https://github.com/mlflow/mlflow)
 - [deepeval](https://github.com/confident-ai/deepeval)
+- [ragas](https://github.com/explodinggradients/ragas)

 # 贡献

dingo/config/input_args.py

Lines changed: 2 additions & 6 deletions
@@ -1,10 +1,6 @@
-import json
-import os
-import time
-import uuid
 from typing import Dict, List, Optional

-from pydantic import BaseModel, ValidationError
+from pydantic import BaseModel


 class DatasetHFConfigArgs(BaseModel):
@@ -37,7 +33,7 @@ class DatasetArgs(BaseModel):


 class ExecutorResultSaveArgs(BaseModel):
-    bad: bool = False
+    bad: bool = True
     good: bool = False
     all_labels: bool = False
     raw: bool = False
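
The flipped default above means bad-sample results are now written unless a run disables them. A minimal sketch of the new defaults (importing from the module path shown in the diff header; the usage is illustrative):

    from dingo.config.input_args import ExecutorResultSaveArgs

    args = ExecutorResultSaveArgs()
    print(args.bad, args.good, args.all_labels, args.raw)  # True False False False

    # The previous behaviour is still available per run by disabling it explicitly.
    quiet = ExecutorResultSaveArgs(bad=False)
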
Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
"""
RAG Answer Relevancy LLM evaluator.

Uses an LLM to judge whether the generated answer directly addresses the question.
"""

import json
from typing import List

from dingo.io import Data
from dingo.model import Model
from dingo.model.llm.base_openai import BaseOpenAI
from dingo.model.modelres import ModelRes
from dingo.model.prompt.prompt_rag_answer_relevancy import PromptRAGAnswerRelevancy
from dingo.model.response.response_class import ResponseScoreReason
from dingo.utils import log
from dingo.utils.exception import ConvertJsonError


@Model.llm_register("LLMRAGAnswerRelevancy")
class LLMRAGAnswerRelevancy(BaseOpenAI):
    """
    RAG answer-relevancy evaluation LLM.

    Input requirements:
    - input_data.prompt or raw_data['question']: the user question
    - input_data.content or raw_data['answer']: the generated answer
    """

    prompt = PromptRAGAnswerRelevancy

    @classmethod
    def build_messages(cls, input_data: Data) -> List:
        """Build the LLM input messages."""
        # Extract the required fields
        question = input_data.prompt or input_data.raw_data.get("question", "")
        answer = input_data.content or input_data.raw_data.get("answer", "")

        if not question:
            raise ValueError("Answer Relevancy evaluation requires a question field")
        if not answer:
            raise ValueError("Answer Relevancy evaluation requires an answer field")

        # Build the prompt content
        prompt_content = cls.prompt.content.format(question, answer)

        messages = [{"role": "user", "content": prompt_content}]

        return messages

    @classmethod
    def process_response(cls, response: str) -> ModelRes:
        """Parse the LLM response."""
        log.info(f"RAG Answer Relevancy response: {response}")

        # Strip Markdown code fences from the response
        if response.startswith("```json"):
            response = response[7:]
        if response.startswith("```"):
            response = response[3:]
        if response.endswith("```"):
            response = response[:-3]

        try:
            response_json = json.loads(response.strip())
        except json.JSONDecodeError:
            raise ConvertJsonError(f"Convert to JSON format failed: {response}")

        # Parse the response into a score/reason pair
        response_model = ResponseScoreReason(**response_json)

        result = ModelRes()
        result.score = response_model.score

        # Decide pass/fail from the score (default threshold 5 on a 10-point scale)
        threshold = 5
        if hasattr(cls, 'dynamic_config') and cls.dynamic_config.parameters:
            threshold = cls.dynamic_config.parameters.get('threshold', 5)

        if response_model.score >= threshold:
            result.error_status = False
            result.type = "QUALITY_GOOD"
            result.name = "ANSWER_RELEVANCY_PASS"
            result.reason = [f"Answer relevancy check passed (score: {response_model.score}/10)\n{response_model.reason}"]
        else:
            result.error_status = True
            result.type = cls.prompt.metric_type
            result.name = cls.prompt.__name__
            result.reason = [f"Answer relevancy check failed (score: {response_model.score}/10)\n{response_model.reason}"]

        return result
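
A short sketch of the contract this evaluator expects, based on the docstring and parsing logic above (the values are illustrative, not taken from the commit):

    # Input record: the question/answer fields named in the docstring above.
    record = {
        "question": "In which year did Apollo 11 land on the Moon?",
        "answer": "Apollo 11 landed on the Moon in 1969.",
    }

    # The prompt asks the model for a JSON object with a score and a reason.
    # With the default threshold of 5 (out of 10), this reply would be marked as passing.
    model_reply = '{"score": 8, "reason": "The answer directly addresses the question."}'
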
Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
"""
RAG Context Precision LLM evaluator.

Uses an LLM to evaluate the precision and ranking quality of the retrieved contexts.
"""

import json
from typing import List

from dingo.io import Data
from dingo.model import Model
from dingo.model.llm.base_openai import BaseOpenAI
from dingo.model.modelres import ModelRes
from dingo.model.prompt.prompt_rag_context_precision import PromptRAGContextPrecision
from dingo.model.response.response_class import ResponseScoreReason
from dingo.utils import log
from dingo.utils.exception import ConvertJsonError


@Model.llm_register("LLMRAGContextPrecision")
class LLMRAGContextPrecision(BaseOpenAI):
    """
    RAG context-precision evaluation LLM.

    Input requirements:
    - input_data.prompt or raw_data['question']: the user question
    - input_data.content or raw_data['answer']: the generated answer
    - input_data.context or raw_data['contexts']: the list of retrieved contexts
    """

    prompt = PromptRAGContextPrecision

    @classmethod
    def build_messages(cls, input_data: Data) -> List:
        """Build the LLM input messages."""
        # Extract the required fields
        question = input_data.prompt or input_data.raw_data.get("question", "")
        answer = input_data.content or input_data.raw_data.get("answer", "")

        if not answer:
            raise ValueError("Context Precision evaluation requires an answer field")

        # Normalize contexts into a list
        contexts = None
        if input_data.context:
            if isinstance(input_data.context, list):
                contexts = input_data.context
            else:
                contexts = [input_data.context]
        elif "contexts" in input_data.raw_data:
            raw_contexts = input_data.raw_data["contexts"]
            if isinstance(raw_contexts, list):
                contexts = raw_contexts
            else:
                contexts = [raw_contexts]

        if not contexts:
            raise ValueError("Context Precision evaluation requires a contexts field")

        # Format the contexts as a numbered listing
        contexts_formatted = "\n".join([f"{i + 1}. {ctx}" for i, ctx in enumerate(contexts)])

        # Build the prompt content
        prompt_content = cls.prompt.content.format(question, answer, contexts_formatted)

        messages = [{"role": "user", "content": prompt_content}]

        return messages

    @classmethod
    def process_response(cls, response: str) -> ModelRes:
        """Parse the LLM response."""
        log.info(f"RAG Context Precision response: {response}")

        # Strip Markdown code fences from the response
        if response.startswith("```json"):
            response = response[7:]
        if response.startswith("```"):
            response = response[3:]
        if response.endswith("```"):
            response = response[:-3]

        try:
            response_json = json.loads(response.strip())
        except json.JSONDecodeError:
            raise ConvertJsonError(f"Convert to JSON format failed: {response}")

        # Parse the response into a score/reason pair
        response_model = ResponseScoreReason(**response_json)

        result = ModelRes()
        result.score = response_model.score

        # Decide pass/fail from the score (default threshold 5 on a 10-point scale)
        threshold = 5
        if hasattr(cls, 'dynamic_config') and cls.dynamic_config.parameters:
            threshold = cls.dynamic_config.parameters.get('threshold', 5)

        if response_model.score >= threshold:
            result.error_status = False
            result.type = "QUALITY_GOOD"
            result.name = "CONTEXT_PRECISION_PASS"
            result.reason = [f"Context precision check passed (score: {response_model.score}/10)\n{response_model.reason}"]
        else:
            result.error_status = True
            result.type = cls.prompt.metric_type
            result.name = cls.prompt.__name__
            result.reason = [f"Context precision check failed (score: {response_model.score}/10)\n{response_model.reason}"]

        return result
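
As with the answer-relevancy evaluator, a small sketch of the record shape this class reads and of the numbered context listing its build_messages produces (values are illustrative):

    # Illustrative input record; keys follow the docstring above.
    record = {
        "question": "When did Apollo 11 land on the Moon?",
        "answer": "Apollo 11 landed on the Moon in July 1969.",
        "contexts": [
            "Apollo 11 was the first crewed mission to land on the Moon, in July 1969.",
            "The Saturn V rocket was developed during the 1960s.",
        ],
    }

    # build_messages numbers the retrieved contexts before inserting them into the prompt:
    numbered = "\n".join(f"{i + 1}. {ctx}" for i, ctx in enumerate(record["contexts"]))
    # -> "1. Apollo 11 was the first crewed mission ...\n2. The Saturn V rocket ..."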
