12 changes: 6 additions & 6 deletions examples/rag/rag_mock_and_eval.py
@@ -199,7 +199,7 @@ def evaluate_rag_result(question: str, rag_result: Dict[str, Any]):
model=OPENAI_MODEL,
)
faith_result = LLMRAGFaithfulness.eval(data)
-print(f"Faithfulness details: {faith_result.eval_details}")
+print(f"Faithfulness details: {faith_result}")

# 2. Evaluate answer relevancy
LLMRAGAnswerRelevancy.dynamic_config = EvaluatorLLMArgs(
@@ -208,7 +208,7 @@ def evaluate_rag_result(question: str, rag_result: Dict[str, Any]):
model=OPENAI_MODEL,
)
ans_rel_result = LLMRAGAnswerRelevancy.eval(data)
-print(f"Answer Relevancy details: {ans_rel_result.eval_details}")
+print(f"Answer Relevancy details: {ans_rel_result}")

# 3. Evaluate context relevancy
LLMRAGContextRelevancy.dynamic_config = EvaluatorLLMArgs(
@@ -217,12 +217,12 @@ def evaluate_rag_result(question: str, rag_result: Dict[str, Any]):
model=OPENAI_MODEL,
)
ctx_rel_result = LLMRAGContextRelevancy.eval(data)
-print(f"Context Relevancy details: {ctx_rel_result.eval_details}")
+print(f"Context Relevancy details: {ctx_rel_result}")

return {
-"faithfulness": faith_result.eval_details,
-"answer_relevancy": ans_rel_result.eval_details,
-"context_relevancy": ctx_rel_result.eval_details
+"faithfulness": faith_result,
+"answer_relevancy": ans_rel_result,
+"context_relevancy": ctx_rel_result
}


36 changes: 18 additions & 18 deletions examples/rag/sdk_rag_eval.py
@@ -48,8 +48,8 @@ def test_faithfulness():

print("\nCase 1 - Faithful answer:")
result1 = LLMRAGFaithfulness.eval(data1)
-print(f" Status: {'✅ Passed' if not result1.eval_status else '❌ Failed'}")
-print(f" Details: {result1.eval_details}")
+print(f" Status: {'✅ Passed' if not result1.status else '❌ Failed'}")
+print(f" Details: {result1}")

# Test case 2: contains hallucination
data2 = Data(
@@ -63,8 +63,8 @@ def test_faithfulness():

print("\nCase 2 - Contains hallucination:")
result2 = LLMRAGFaithfulness.eval(data2)
-print(f" Status: {'✅ Passed' if not result2.eval_status else '❌ Failed'}")
-print(f" Details: {result2.eval_details}")
+print(f" Status: {'✅ Passed' if not result2.status else '❌ Failed'}")
+print(f" Details: {result2}")
print("\nExpected: Case 2 score < Case 1 score")

return result1, result2
@@ -96,8 +96,8 @@ def test_context_precision():
)

result = LLMRAGContextPrecision.eval(data)
-print(f" Status: {'✅ Passed' if not result.eval_status else '❌ Failed'}")
-print(f" Details: {result.eval_details}")
+print(f" Status: {'✅ Passed' if not result.status else '❌ Failed'}")
+print(f" Details: {result}")
print("\nExpected: the first 3 contexts are relevant, the last 1 is not")

return result
@@ -125,8 +125,8 @@ def test_answer_relevancy():

print("\nCase 1 - Direct answer:")
result1 = LLMRAGAnswerRelevancy.eval(data1)
-print(f" Status: {'✅ Passed' if not result1.eval_status else '❌ Failed'}")
-print(f" Details: {result1.eval_details}")
+print(f" Status: {'✅ Passed' if not result1.status else '❌ Failed'}")
+print(f" Details: {result1}")

# Test case 2: contains irrelevant information
data2 = Data(
@@ -137,8 +137,8 @@ def test_answer_relevancy():

print("\nCase 2 - Contains irrelevant information:")
result2 = LLMRAGAnswerRelevancy.eval(data2)
-print(f" Status: {'✅ Passed' if not result2.eval_status else '❌ Failed'}")
-print(f" Details: {result2.eval_details}")
+print(f" Status: {'✅ Passed' if not result2.status else '❌ Failed'}")
+print(f" Details: {result2}")
print("\nExpected: Case 2 score < Case 1 score")

return result1, result2
@@ -170,8 +170,8 @@ def test_context_recall():

print("\nCase 1 - Context fully supports the answer:")
result1 = LLMRAGContextRecall.eval(data1)
-print(f" Status: {'✅ Passed' if not result1.eval_status else '❌ Failed'}")
-print(f" Details: {result1.eval_details}")
+print(f" Status: {'✅ Passed' if not result1.status else '❌ Failed'}")
+print(f" Details: {result1}")

# Test case 2: context partially supports the answer
data2 = Data(
@@ -186,8 +186,8 @@ def test_context_recall():

print("\nCase 2 - Context partially supports the answer:")
result2 = LLMRAGContextRecall.eval(data2)
-print(f" Status: {'✅ Passed' if not result2.eval_status else '❌ Failed'}")
-print(f" Details: {result2.eval_details}")
+print(f" Status: {'✅ Passed' if not result2.status else '❌ Failed'}")
+print(f" Details: {result2}")
print("\nExpected: Case 2 score < Case 1 score")

return result1, result2
@@ -219,8 +219,8 @@ def test_context_relevancy():

print("\nCase 1 - All contexts relevant:")
result1 = LLMRAGContextRelevancy.eval(data1)
-print(f" Status: {'✅ Passed' if not result1.eval_status else '❌ Failed'}")
-print(f" Details: {result1.eval_details}")
+print(f" Status: {'✅ Passed' if not result1.status else '❌ Failed'}")
+print(f" Details: {result1}")

# Test case 2: contains irrelevant contexts
data2 = Data(
@@ -235,8 +235,8 @@ def test_context_relevancy():

print("\nCase 2 - Contains irrelevant contexts:")
result2 = LLMRAGContextRelevancy.eval(data2)
-print(f" Status: {'✅ Passed' if not result2.eval_status else '❌ Failed'}")
-print(f" Details: {result2.eval_details}")
+print(f" Status: {'✅ Passed' if not result2.status else '❌ Failed'}")
+print(f" Details: {result2}")
print("\nExpected: Case 2 score < Case 1 score")

return result1, result2
27 changes: 14 additions & 13 deletions examples/rag/sdk_rag_eval_batch_dataset.py
@@ -11,6 +11,7 @@
import logging
import os
import time
+from pathlib import Path

from dingo.config.input_args import EvaluatorLLMArgs
from dingo.io.input import Data
@@ -50,7 +51,7 @@
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-large")

# Input file path configuration
-CSV_FILE_PATH = "ragflow_eval_data_50.jsonl"  # Supports CSV and JSONL formats
+CSV_FILE_PATH = Path("test/data/ragflow_eval_data_50.jsonl")  # Supports CSV and JSONL formats
Contributor review comment (severity: medium):

The variable name CSV_FILE_PATH is misleading as the comment and the code logic indicate that it can also be a .jsonl file. To improve clarity, consider renaming it to something more generic like INPUT_FILE_PATH. Note that this will require updating its usages in the main function as well.

Suggested change
-CSV_FILE_PATH = Path("test/data/ragflow_eval_data_50.jsonl")  # Supports CSV and JSONL formats
+INPUT_FILE_PATH = Path("test/data/ragflow_eval_data_50.jsonl")  # Supports CSV and JSONL formats
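
If the rename is adopted, the call sites would change with it. A minimal sketch of the follow-up edit in main, assuming a suffix-based dispatch between the two loaders (main's body is not shown in this diff, so the branching below is an assumption, not the PR's code):

# Hypothetical usage update after renaming CSV_FILE_PATH to INPUT_FILE_PATH;
# the suffix check is assumed for illustration only.
if INPUT_FILE_PATH.suffix == ".jsonl":
    evaluate_from_jsonl(INPUT_FILE_PATH)
else:
    evaluate_from_csv(INPUT_FILE_PATH)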



def evaluate_from_jsonl(jsonl_path):
@@ -126,34 +127,34 @@ def evaluate_from_jsonl(jsonl_path):
# # Run all metric evaluations
print("\n1. Faithfulness:")
faithfulness_result = LLMRAGFaithfulness.eval(data)
-print(f" Status: {'✅ Passed' if not faithfulness_result.eval_status else '❌ Failed'}")
+print(f" Status: {'✅ Passed' if not faithfulness_result.status else '❌ Failed'}")
print(f" Score: {faithfulness_result.score}/10")
total_faithfulness += faithfulness_result.score

logger.info("\n2. Context Precision:")
print("\n2. Context Precision:")
precision_result = LLMRAGContextPrecision.eval(data)
-logger.info(f" Status: {'✅ Passed' if not precision_result.eval_status else '❌ Failed'}")
+logger.info(f" Status: {'✅ Passed' if not precision_result.status else '❌ Failed'}")
logger.info(f" Score: {precision_result.score}/10")
-print(f" Status: {'✅ Passed' if not precision_result.eval_status else '❌ Failed'}")
+print(f" Status: {'✅ Passed' if not precision_result.status else '❌ Failed'}")
print(f" Score: {precision_result.score}/10")
total_precision += precision_result.score
Contributor review comment on lines 134 to 141 (severity: medium):

The logging for 'Context Precision' is inconsistent with other metrics in this loop. It logs to both the logger and stdout, and the calls are duplicated, whereas other metrics only print to stdout. For consistency and to remove duplication, I suggest using only print here, similar to the other metrics.

Suggested change
logger.info("\n2. 上下文精度 (Context Precision):")
print("\n2. 上下文精度 (Context Precision):")
precision_result = LLMRAGContextPrecision.eval(data)
logger.info(f" 状态: {'✅ 通过' if not precision_result.eval_status else '❌ 未通过'}")
logger.info(f" 状态: {'✅ 通过' if not precision_result.status else '❌ 未通过'}")
logger.info(f" 分数: {precision_result.score}/10")
print(f" 状态: {'✅ 通过' if not precision_result.eval_status else '❌ 未通过'}")
print(f" 状态: {'✅ 通过' if not precision_result.status else '❌ 未通过'}")
print(f" 分数: {precision_result.score}/10")
total_precision += precision_result.score
print("\n2. 上下文精度 (Context Precision):")
precision_result = LLMRAGContextPrecision.eval(data)
print(f" 状态: {'✅ 通过' if not precision_result.status else '❌ 未通过'}")
print(f" 分数: {precision_result.score}/10")
total_precision += precision_result.score


print("\n3. 上下文召回 (Context Recall):")
recall_result = LLMRAGContextRecall.eval(data)
print(f" 状态: {'✅ 通过' if not recall_result.eval_status else '❌ 未通过'}")
print(f" 状态: {'✅ 通过' if not recall_result.status else '❌ 未通过'}")
print(f" 分数: {recall_result.score}/10")
total_recall += recall_result.score

print("\n4. 上下文相关性 (Context Relevancy):")
relevancy_result = LLMRAGContextRelevancy.eval(data)
print(f" 状态: {'✅ 通过' if not relevancy_result.eval_status else '❌ 未通过'}")
print(f" 状态: {'✅ 通过' if not relevancy_result.status else '❌ 未通过'}")
print(f" 分数: {relevancy_result.score}/10")
total_relevancy += relevancy_result.score
#
print("\n5. 答案相关性 (Answer Relevancy):")
answer_relevancy_result = LLMRAGAnswerRelevancy.eval(data)
print(f" 状态: {'✅ 通过' if not answer_relevancy_result.eval_status else '❌ 未通过'}")
print(f" 状态: {'✅ 通过' if not answer_relevancy_result.status else '❌ 未通过'}")
print(f" 分数: {answer_relevancy_result.score}/10")
total_answer_relevancy += answer_relevancy_result.score
Contributor review comment on lines 128 to 159 (severity: medium):

This block of code for evaluating metrics is almost identical to the one in the evaluate_from_csv function (lines 271-302). This code duplication makes the code harder to maintain, as any change in the evaluation logic needs to be applied in two places. Consider extracting this logic into a separate helper function to improve maintainability and reduce redundancy.
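
One way to act on this suggestion is a small shared helper. A minimal sketch, assuming the evaluator classes are configured before the loop; the names run_metrics and METRICS are hypothetical, while the .eval(data), .status, and .score accesses mirror the calls already used in both functions:

# Hypothetical helper to deduplicate the per-record metric loop.
METRICS = [
    ("1. Faithfulness", LLMRAGFaithfulness),
    ("2. Context Precision", LLMRAGContextPrecision),
    ("3. Context Recall", LLMRAGContextRecall),
    ("4. Context Relevancy", LLMRAGContextRelevancy),
    ("5. Answer Relevancy", LLMRAGAnswerRelevancy),
]

def run_metrics(data):
    """Evaluate one Data record against every metric; return label -> score."""
    scores = {}
    for label, evaluator in METRICS:
        result = evaluator.eval(data)
        print(f"\n{label}:")
        print(f" Status: {'✅ Passed' if not result.status else '❌ Failed'}")
        print(f" Score: {result.score}/10")
        scores[label] = result.score
    return scores

Both evaluate_from_jsonl and evaluate_from_csv could then accumulate their totals from the returned dict instead of repeating the five blocks.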


@@ -269,34 +270,34 @@ def evaluate_from_csv(csv_path):
# # # # Run all metric evaluations
print("\n1. Faithfulness:")
faithfulness_result = LLMRAGFaithfulness.eval(data)
-print(f" Status: {'✅ Passed' if not faithfulness_result.eval_status else '❌ Failed'}")
+print(f" Status: {'✅ Passed' if not faithfulness_result.status else '❌ Failed'}")
print(f" Score: {faithfulness_result.score}/10")
total_faithfulness += faithfulness_result.score

logger.info("\n2. Context Precision:")
print("\n2. Context Precision:")
precision_result = LLMRAGContextPrecision.eval(data)
-logger.info(f" Status: {'✅ Passed' if not precision_result.eval_status else '❌ Failed'}")
+logger.info(f" Status: {'✅ Passed' if not precision_result.status else '❌ Failed'}")
logger.info(f" Score: {precision_result.score}/10")
-print(f" Status: {'✅ Passed' if not precision_result.eval_status else '❌ Failed'}")
+print(f" Status: {'✅ Passed' if not precision_result.status else '❌ Failed'}")
print(f" Score: {precision_result.score}/10")
total_precision += precision_result.score

print("\n3. Context Recall:")
recall_result = LLMRAGContextRecall.eval(data)
-print(f" Status: {'✅ Passed' if not recall_result.eval_status else '❌ Failed'}")
+print(f" Status: {'✅ Passed' if not recall_result.status else '❌ Failed'}")
print(f" Score: {recall_result.score}/10")
total_recall += recall_result.score

print("\n4. Context Relevancy:")
relevancy_result = LLMRAGContextRelevancy.eval(data)
-print(f" Status: {'✅ Passed' if not relevancy_result.eval_status else '❌ Failed'}")
+print(f" Status: {'✅ Passed' if not relevancy_result.status else '❌ Failed'}")
print(f" Score: {relevancy_result.score}/10")
total_relevancy += relevancy_result.score

print("\n5. Answer Relevancy:")
answer_relevancy_result = LLMRAGAnswerRelevancy.eval(data)
-print(f" Status: {'✅ Passed' if not answer_relevancy_result.eval_status else '❌ Failed'}")
+print(f" Status: {'✅ Passed' if not answer_relevancy_result.status else '❌ Failed'}")
print(f" Score: {answer_relevancy_result.score}/10")
total_answer_relevancy += answer_relevancy_result.score
