
Commit 2ecffc6

fix: update answer_relevancy metric (#291)

* fix: update answer_relevancy metric
* fix: update ragflow_eval_data_50.jsonl
* fix
* 🎨 Auto-format code with pre-commit

Co-authored-by: GitHub Action <[email protected]>
1 parent 1375e14 commit 2ecffc6

File tree: 3 files changed (+139, −95 lines)

dingo/model/llm/rag/llm_rag_answer_relevancy.py

Lines changed: 74 additions & 41 deletions
@@ -42,34 +42,40 @@ class LLMRAGAnswerRelevancy(BaseOpenAI):
         "source_frameworks": "Ragas"
     }
 
-    # Prompt template for question generation
-    question_generation_prompt = """Generate a question for the given answer and determine whether the answer is noncommittal. Set noncommittal to 1 if the answer is noncommittal and to 0 if it is committal. A noncommittal answer is one that is evasive, vague, or ambiguous; for example, "I don't know" or "I'm not sure" are noncommittal answers.
-
---------EXAMPLES-----------
-Example 1
-Input: {{
-"response": "Albert Einstein was born in Germany."
-}}
-Output: {{
-"question": "Where was Albert Einstein born?",
-"noncommittal": 0
-}}
-
-Example 2
-Input: {{
-"response": "I don't know about the groundbreaking feature of the smartphone invented in 2023 as I'm unaware of information beyond 2022."
-}}
-Output: {{
-"question": "What was the groundbreaking feature of the smartphone invented in 2023?",
-"noncommittal": 1
-}}
------------------------------
-
-Now perform the same with the following input. Try to generate the question from a different angle, with different wording, while keeping it relevant to the original answer.
-Input: {{
-"response": {0}
-}}
-Output: """
+    question_generation_prompt = """Task: Generate a question for the given answer and identify if the answer is noncommittal.
+
+Instructions:
+1. Generate a single question that directly corresponds to the provided answer content.
+2. Determine if the answer is noncommittal:
+   - Set "noncommittal" to 1 if the answer is evasive, vague, or ambiguous (e.g., "I don't know", "I'm not sure")
+   - Set "noncommittal" to 0 if the answer provides a clear, direct response
+3. Ensure the generated question maintains a consistent language style throughout.
+
+--------EXAMPLES-----------
+Example 1:
+Input: {{
+"response": "Albert Einstein was born in Germany."
+}}
+Output: {{
+"question": "Where was Albert Einstein born?",
+"noncommittal": 0
+}}
+
+Example 2:
+Input: {{
+"response": "I don't know about the groundbreaking feature of the smartphone invented in 2023 as I'm unaware of information beyond 2022."
+}}
+Output: {{
+"question": "What was the groundbreaking feature of the smartphone invented in 2023?",
+"noncommittal": 1
+}}
+-----------------------------
+
+Now perform the same with the following input:
+Input: {{
+"response": {0}
+}}
+Output: """
 
     # Default embedding model
     embedding_model = None
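
Note on the template mechanics: the doubled braces are str.format escapes, so only {0} is substituted and the JSON-style braces survive literally. A minimal standalone sketch (the reduced template and the response value are illustrative, not taken from the repository):

    import json

    # Tail of the template above, reduced to the substitution-relevant part
    template = 'Input: {{\n"response": {0}\n}}\nOutput: '

    # Hypothetical answer under evaluation
    response = "Water boils at 100 degrees Celsius at sea level."

    # Assuming the response is JSON-encoded before substitution, which
    # reproduces the quoting shown in the prompt's examples
    prompt = template.format(json.dumps(response))
    print(prompt)
    # Input: {
    # "response": "Water boils at 100 degrees Celsius at sea level."
    # }
    # Output: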
@@ -159,6 +165,10 @@ def calculate_similarity(cls, question: str, generated_questions: List[str]) ->
         if cls.embedding_model is None:
             cls.init_embedding_model()
 
+        # Check whether the generated questions are an empty list or all empty strings
+        if not generated_questions or all(q == "" for q in generated_questions):
+            return np.array([])
+
         # Generate embeddings
         # Embedding for a single query
         question_response = cls.embedding_model['client'].embeddings.create(
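
The new guard matters because NumPy's mean of an empty array is nan (with a "Mean of empty slice" RuntimeWarning), which would otherwise leak into the final score. A small sketch of the failure mode being avoided:

    import numpy as np

    # What calculate_similarity now returns when no usable questions exist
    empty_sim = np.array([])

    # Without a length check downstream, the mean is nan, not 0
    print(empty_sim.mean())  # nan (plus a RuntimeWarning)

    # The paired guard in calculate_score branches on the empty case instead
    score = 0.0 if len(empty_sim) == 0 else float(empty_sim.mean() * 10)
    print(score)  # 0.0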
@@ -179,15 +189,15 @@ def calculate_similarity(cls, question: str, generated_questions: List[str]) ->
         return np.dot(gen_question_vec, question_vec.T).reshape(-1) / norm
 
     @classmethod
-    def calculate_score(cls, answers: List[Dict[str, Any]], original_question: str) -> float:
-        """Compute the answer relevancy score"""
+    def calculate_score(cls, answers: List[Dict[str, Any]], original_question: str) -> tuple[float, List[Dict[str, Any]]]:
+        """Compute the answer relevancy score and collect detailed information"""
         # Extract the generated questions
         gen_questions = [answer.get("question", "") for answer in answers]
 
         # Check whether all generated questions are empty
         if all(q == "" for q in gen_questions):
             log.warning("Invalid response. Expected dictionary with key 'question'")
-            return 0.0
+            return 0.0, []
 
         # Check whether all answers are noncommittal
         all_noncommittal = np.all([answer.get("noncommittal", 0) for answer in answers])
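
For reference, the expression returned by calculate_similarity is plain cosine similarity, assuming norm (not shown in this hunk) is the product of the two vectors' L2 norms. A self-contained sketch with invented 3-dimensional embeddings standing in for real embedding-model output:

    import numpy as np

    # Hypothetical embeddings: one original question, three generated questions
    question_vec = np.array([[0.2, 0.9, 0.1]])
    gen_question_vec = np.array([
        [0.2, 0.9, 0.1],   # same direction as the original
        [0.9, 0.1, 0.0],   # mostly orthogonal
        [0.1, 0.8, 0.2],   # close to the original
    ])

    # Product of L2 norms, broadcast over the generated questions
    norm = np.linalg.norm(gen_question_vec, axis=1) * np.linalg.norm(question_vec, axis=1)

    # Same expression as in the code above
    cosine_sim = np.dot(gen_question_vec, question_vec.T).reshape(-1) / norm
    print(cosine_sim.round(2))  # approx. [1.   0.32 0.99]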
@@ -196,12 +206,25 @@ def calculate_score(cls, answers: List[Dict[str, Any]], original_question: str)
         cosine_sim = cls.calculate_similarity(original_question, gen_questions)
 
         # Compute the final score
-        score = cosine_sim.mean() * int(not all_noncommittal)
-
-        # Convert to a 0-10 score range
-        score = float(score * 10)
-
-        return score
+        if len(cosine_sim) == 0:
+            score = 0.0
+        else:
+            score = cosine_sim.mean() * int(not all_noncommittal)
+            # Convert to a 0-10 score range
+            score = float(score * 10)
+
+        # Collect detailed information
+        details = []
+        for i, (answer, question, sim) in enumerate(zip(answers, gen_questions, cosine_sim)):
+            is_noncommittal = answer.get("noncommittal", 0) == 1
+            details.append({
+                "question_index": i + 1,
+                "generated_question": question,
+                "similarity_score": sim,
+                "is_noncommittal": is_noncommittal
+            })
+
+        return score, details
 
     @classmethod
     def eval(cls, input_data: Data) -> EvalDetail:
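
A sketch of the new return contract in isolation, with the similarity step stubbed out by invented values (in the real path they come from calculate_similarity):

    import numpy as np

    # Hypothetical parsed LLM output for three generated questions
    answers = [
        {"question": "Where was Einstein born?", "noncommittal": 0},
        {"question": "In which country was Einstein born?", "noncommittal": 0},
        {"question": "What is Einstein's birthplace?", "noncommittal": 0},
    ]
    cosine_sim = np.array([0.91, 0.88, 0.94])  # stand-in for real similarities

    all_noncommittal = np.all([a.get("noncommittal", 0) for a in answers])

    if len(cosine_sim) == 0:
        score = 0.0
    else:
        # Mean similarity, zeroed if every answer dodged, scaled to 0-10
        score = float(cosine_sim.mean() * int(not all_noncommittal) * 10)

    print(round(score, 2))  # 9.1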
@@ -230,8 +253,8 @@ def eval(cls, input_data: Data) -> EvalDetail:
         # Generate multiple related questions
         generated_questions = cls.generate_multiple_questions(input_data, cls.strictness)
 
-        # Compute the relevancy score
-        score = cls.calculate_score(generated_questions, original_question)
+        # Compute the relevancy score and detailed information
+        score, details = cls.calculate_score(generated_questions, original_question)
 
         # Build the result
         result = EvalDetail(metric=cls.__name__)
@@ -249,14 +272,24 @@ def eval(cls, input_data: Data) -> EvalDetail:
         if embedding_model_name:
             cls.init_embedding_model(embedding_model_name)
 
+        # Build the detailed reason text
+        all_reasons = []
+        for detail in details:
+            noncommittal_text = " (noncommittal answer)" if detail["is_noncommittal"] else ""
+            all_reasons.append(f"Generated question {detail['question_index']}: {detail['generated_question']}{noncommittal_text}\nSimilarity to the original question: {detail['similarity_score']:.4f}")
+
+        reason_text = "\n\n".join(all_reasons)
+        if details:
+            reason_text += f"\n\nMean similarity: {np.mean([d['similarity_score'] for d in details]):.4f}\nAll answers noncommittal: {'yes' if np.all([d['is_noncommittal'] for d in details]) else 'no'}"
+
         if score >= threshold:
             result.status = False
             result.label = ["QUALITY_GOOD.ANSWER_RELEVANCY_PASS"]
-            result.reason = [f"Answer relevancy evaluation passed (score: {score:.2f}/10)"]
+            result.reason = [f"Answer relevancy evaluation passed (score: {score:.2f}/10)\n{reason_text}"]
         else:
             result.status = True
             result.label = ["QUALITY_BAD.ANSWER_RELEVANCY_FAIL"]
-            result.reason = [f"Answer relevancy evaluation failed (score: {score:.2f}/10)"]
+            result.reason = [f"Answer relevancy evaluation failed (score: {score:.2f}/10)\n{reason_text}"]
 
         return result
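
For a sense of what callers now see, here is roughly how the assembled reason text renders for two invented detail entries (formatting mirrors the code above):

    details = [
        {"question_index": 1, "generated_question": "Where was Einstein born?",
         "similarity_score": 0.9123, "is_noncommittal": False},
        {"question_index": 2, "generated_question": "What is Einstein's birthplace?",
         "similarity_score": 0.8541, "is_noncommittal": False},
    ]

    parts = []
    for d in details:
        suffix = " (noncommittal answer)" if d["is_noncommittal"] else ""
        parts.append(f"Generated question {d['question_index']}: "
                     f"{d['generated_question']}{suffix}\n"
                     f"Similarity to the original question: {d['similarity_score']:.4f}")
    print("\n\n".join(parts))
    # Generated question 1: Where was Einstein born?
    # Similarity to the original question: 0.9123
    #
    # Generated question 2: What is Einstein's birthplace?
    # Similarity to the original question: 0.8541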

dingo/model/llm/rag/llm_rag_context_recall.py

Lines changed: 15 additions & 4 deletions
@@ -194,8 +194,19 @@ def process_response(cls, response: str) -> EvalDetail:
         else:
             score = (attributed_statements / total_statements) * 10
 
-        # Build the reason
-        reason = f"Of {total_statements} statements, {attributed_statements} can be attributed to the context and {total_statements - attributed_statements} cannot"
+        # Build a detailed reason text containing information about each statement
+        all_reasons = []
+        for i, item in enumerate(classifications):
+            statement = item.get("statement", "")
+            is_attributed = item.get("attributed", 0) == 1
+            reason = item.get("reason", "")
+
+            status_text = "attributable to the context" if is_attributed else "not attributable to the context"
+            all_reasons.append(f"Statement {i+1}: {statement}\nStatus: {status_text}\nReason: {reason}")
+
+        # Assemble the complete reason text
+        reason_text = "\n\n".join(all_reasons)
+        reason_text += f"\n\n{total_statements} statements in total, of which {attributed_statements} are attributable to the context and {total_statements - attributed_statements} are not"
 
         result = EvalDetail(metric=cls.__name__)
         result.score = score
@@ -208,10 +219,10 @@ def process_response(cls, response: str) -> EvalDetail:
         if score >= threshold:
             result.status = False
             result.label = ["QUALITY_GOOD.CONTEXT_RECALL_PASS"]
-            result.reason = [f"Context recall evaluation passed (score: {score:.2f}/10)\n{reason}"]
+            result.reason = [f"Context recall evaluation passed (score: {score:.2f}/10)\n{reason_text}"]
         else:
             result.status = True
             result.label = ["QUALITY_BAD.CONTEXT_RECALL_FAIL"]
-            result.reason = [f"Context recall evaluation failed (score: {score:.2f}/10)\n{reason}"]
+            result.reason = [f"Context recall evaluation failed (score: {score:.2f}/10)\n{reason_text}"]
 
         return result
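
The recall score in this file is simply the attributable fraction scaled to a 0-10 range. A minimal sketch with invented classifications (in the real path they are parsed from the LLM's JSON response):

    # Hypothetical per-statement classifications
    classifications = [
        {"statement": "Einstein was born in 1879.", "attributed": 1,
         "reason": "The context states his birth year."},
        {"statement": "He won the Nobel Prize in 1921.", "attributed": 1,
         "reason": "The context mentions the 1921 prize."},
        {"statement": "He had three siblings.", "attributed": 0,
         "reason": "The context says nothing about siblings."},
    ]

    total_statements = len(classifications)
    attributed_statements = sum(c["attributed"] for c in classifications)

    # Same formula as in process_response above
    score = (attributed_statements / total_statements) * 10
    print(round(score, 2))  # 6.67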

test/data/ragflow_eval_data_50.jsonl

Lines changed: 50 additions & 50 deletions
Large diffs are not rendered by default.
