diff --git a/examples/rag/rag_mock_and_eval.py b/examples/rag/rag_mock_and_eval.py index 29ef89e4..41499557 100644 --- a/examples/rag/rag_mock_and_eval.py +++ b/examples/rag/rag_mock_and_eval.py @@ -199,7 +199,7 @@ def evaluate_rag_result(question: str, rag_result: Dict[str, Any]): model=OPENAI_MODEL, ) faith_result = LLMRAGFaithfulness.eval(data) - print(f"Faithfulness details: {faith_result.eval_details}") + print(f"Faithfulness details: {faith_result}") # 2. 评测答案相关性 LLMRAGAnswerRelevancy.dynamic_config = EvaluatorLLMArgs( @@ -208,7 +208,7 @@ def evaluate_rag_result(question: str, rag_result: Dict[str, Any]): model=OPENAI_MODEL, ) ans_rel_result = LLMRAGAnswerRelevancy.eval(data) - print(f"Answer Relevancy details: {ans_rel_result.eval_details}") + print(f"Answer Relevancy details: {ans_rel_result}") # 3. 评测上下文相关性 LLMRAGContextRelevancy.dynamic_config = EvaluatorLLMArgs( @@ -217,12 +217,12 @@ def evaluate_rag_result(question: str, rag_result: Dict[str, Any]): model=OPENAI_MODEL, ) ctx_rel_result = LLMRAGContextRelevancy.eval(data) - print(f"Context Relevancy details: {ctx_rel_result.eval_details}") + print(f"Context Relevancy details: {ctx_rel_result}") return { - "faithfulness": faith_result.eval_details, - "answer_relevancy": ans_rel_result.eval_details, - "context_relevancy": ctx_rel_result.eval_details + "faithfulness": faith_result, + "answer_relevancy": ans_rel_result, + "context_relevancy": ctx_rel_result } diff --git a/examples/rag/sdk_rag_eval.py b/examples/rag/sdk_rag_eval.py index 7c664bd5..c6ef154f 100644 --- a/examples/rag/sdk_rag_eval.py +++ b/examples/rag/sdk_rag_eval.py @@ -48,8 +48,8 @@ def test_faithfulness(): print("\n用例1 - 忠实的答案:") result1 = LLMRAGFaithfulness.eval(data1) - print(f" 状态: {'✅ 通过' if not result1.eval_status else '❌ 未通过'}") - print(f" 详情: {result1.eval_details}") + print(f" 状态: {'✅ 通过' if not result1.status else '❌ 未通过'}") + print(f" 详情: {result1}") # 测试用例2: 包含幻觉 data2 = Data( @@ -63,8 +63,8 @@ def test_faithfulness(): print("\n用例2 - 包含幻觉:") result2 = LLMRAGFaithfulness.eval(data2) - print(f" 状态: {'✅ 通过' if not result2.eval_status else '❌ 未通过'}") - print(f" 详情: {result2.eval_details}") + print(f" 状态: {'✅ 通过' if not result2.status else '❌ 未通过'}") + print(f" 详情: {result2}") print("\n预期: 用例2分数 < 用例1分数") return result1, result2 @@ -96,8 +96,8 @@ def test_context_precision(): ) result = LLMRAGContextPrecision.eval(data) - print(f" 状态: {'✅ 通过' if not result.eval_status else '❌ 未通过'}") - print(f" 详情: {result.eval_details}") + print(f" 状态: {'✅ 通过' if not result.status else '❌ 未通过'}") + print(f" 详情: {result}") print("\n预期: 前3个上下文相关,最后1个不相关") return result @@ -125,8 +125,8 @@ def test_answer_relevancy(): print("\n用例1 - 直接回答:") result1 = LLMRAGAnswerRelevancy.eval(data1) - print(f" 状态: {'✅ 通过' if not result1.eval_status else '❌ 未通过'}") - print(f" 详情: {result1.eval_details}") + print(f" 状态: {'✅ 通过' if not result1.status else '❌ 未通过'}") + print(f" 详情: {result1}") # 测试用例2: 包含无关信息 data2 = Data( @@ -137,8 +137,8 @@ def test_answer_relevancy(): print("\n用例2 - 包含无关信息:") result2 = LLMRAGAnswerRelevancy.eval(data2) - print(f" 状态: {'✅ 通过' if not result2.eval_status else '❌ 未通过'}") - print(f" 详情: {result2.eval_details}") + print(f" 状态: {'✅ 通过' if not result2.status else '❌ 未通过'}") + print(f" 详情: {result2}") print("\n预期: 用例2分数 < 用例1分数") return result1, result2 @@ -170,8 +170,8 @@ def test_context_recall(): print("\n用例1 - 上下文完全支持:") result1 = LLMRAGContextRecall.eval(data1) - print(f" 状态: {'✅ 通过' if not result1.eval_status else '❌ 未通过'}") - print(f" 详情: {result1.eval_details}") + print(f" 状态: {'✅ 通过' if not result1.status else '❌ 未通过'}") + print(f" 详情: {result1}") # 测试用例2: 上下文部分支持答案 data2 = Data( @@ -186,8 +186,8 @@ def test_context_recall(): print("\n用例2 - 上下文部分支持:") result2 = LLMRAGContextRecall.eval(data2) - print(f" 状态: {'✅ 通过' if not result2.eval_status else '❌ 未通过'}") - print(f" 详情: {result2.eval_details}") + print(f" 状态: {'✅ 通过' if not result2.status else '❌ 未通过'}") + print(f" 详情: {result2}") print("\n预期: 用例2分数 < 用例1分数") return result1, result2 @@ -219,8 +219,8 @@ def test_context_relevancy(): print("\n用例1 - 所有上下文相关:") result1 = LLMRAGContextRelevancy.eval(data1) - print(f" 状态: {'✅ 通过' if not result1.eval_status else '❌ 未通过'}") - print(f" 详情: {result1.eval_details}") + print(f" 状态: {'✅ 通过' if not result1.status else '❌ 未通过'}") + print(f" 详情: {result1}") # 测试用例2: 包含不相关上下文 data2 = Data( @@ -235,8 +235,8 @@ def test_context_relevancy(): print("\n用例2 - 包含不相关上下文:") result2 = LLMRAGContextRelevancy.eval(data2) - print(f" 状态: {'✅ 通过' if not result2.eval_status else '❌ 未通过'}") - print(f" 详情: {result2.eval_details}") + print(f" 状态: {'✅ 通过' if not result2.status else '❌ 未通过'}") + print(f" 详情: {result2}") print("\n预期: 用例2分数 < 用例1分数") return result1, result2 diff --git a/examples/rag/sdk_rag_eval_batch_dataset.py b/examples/rag/sdk_rag_eval_batch_dataset.py index e361d755..5be45ede 100644 --- a/examples/rag/sdk_rag_eval_batch_dataset.py +++ b/examples/rag/sdk_rag_eval_batch_dataset.py @@ -11,6 +11,7 @@ import logging import os import time +from pathlib import Path from dingo.config.input_args import EvaluatorLLMArgs from dingo.io.input import Data @@ -50,7 +51,7 @@ EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-large") # 输入文件路径配置 -CSV_FILE_PATH = "ragflow_eval_data_50.jsonl" # 支持CSV和JSONL格式 +CSV_FILE_PATH = Path("test/data/ragflow_eval_data_50.jsonl") # 支持CSV和JSONL格式 def evaluate_from_jsonl(jsonl_path): @@ -126,34 +127,34 @@ def evaluate_from_jsonl(jsonl_path): # # 进行各项指标评测 print("\n1. 忠实度 (Faithfulness):") faithfulness_result = LLMRAGFaithfulness.eval(data) - print(f" 状态: {'✅ 通过' if not faithfulness_result.eval_status else '❌ 未通过'}") + print(f" 状态: {'✅ 通过' if not faithfulness_result.status else '❌ 未通过'}") print(f" 分数: {faithfulness_result.score}/10") total_faithfulness += faithfulness_result.score logger.info("\n2. 上下文精度 (Context Precision):") print("\n2. 上下文精度 (Context Precision):") precision_result = LLMRAGContextPrecision.eval(data) - logger.info(f" 状态: {'✅ 通过' if not precision_result.eval_status else '❌ 未通过'}") + logger.info(f" 状态: {'✅ 通过' if not precision_result.status else '❌ 未通过'}") logger.info(f" 分数: {precision_result.score}/10") - print(f" 状态: {'✅ 通过' if not precision_result.eval_status else '❌ 未通过'}") + print(f" 状态: {'✅ 通过' if not precision_result.status else '❌ 未通过'}") print(f" 分数: {precision_result.score}/10") total_precision += precision_result.score print("\n3. 上下文召回 (Context Recall):") recall_result = LLMRAGContextRecall.eval(data) - print(f" 状态: {'✅ 通过' if not recall_result.eval_status else '❌ 未通过'}") + print(f" 状态: {'✅ 通过' if not recall_result.status else '❌ 未通过'}") print(f" 分数: {recall_result.score}/10") total_recall += recall_result.score print("\n4. 上下文相关性 (Context Relevancy):") relevancy_result = LLMRAGContextRelevancy.eval(data) - print(f" 状态: {'✅ 通过' if not relevancy_result.eval_status else '❌ 未通过'}") + print(f" 状态: {'✅ 通过' if not relevancy_result.status else '❌ 未通过'}") print(f" 分数: {relevancy_result.score}/10") total_relevancy += relevancy_result.score # print("\n5. 答案相关性 (Answer Relevancy):") answer_relevancy_result = LLMRAGAnswerRelevancy.eval(data) - print(f" 状态: {'✅ 通过' if not answer_relevancy_result.eval_status else '❌ 未通过'}") + print(f" 状态: {'✅ 通过' if not answer_relevancy_result.status else '❌ 未通过'}") print(f" 分数: {answer_relevancy_result.score}/10") total_answer_relevancy += answer_relevancy_result.score @@ -269,34 +270,34 @@ def evaluate_from_csv(csv_path): # # # # 进行各项指标评测 print("\n1. 忠实度 (Faithfulness):") faithfulness_result = LLMRAGFaithfulness.eval(data) - print(f" 状态: {'✅ 通过' if not faithfulness_result.eval_status else '❌ 未通过'}") + print(f" 状态: {'✅ 通过' if not faithfulness_result.status else '❌ 未通过'}") print(f" 分数: {faithfulness_result.score}/10") total_faithfulness += faithfulness_result.score logger.info("\n2. 上下文精度 (Context Precision):") print("\n2. 上下文精度 (Context Precision):") precision_result = LLMRAGContextPrecision.eval(data) - logger.info(f" 状态: {'✅ 通过' if not precision_result.eval_status else '❌ 未通过'}") + logger.info(f" 状态: {'✅ 通过' if not precision_result.status else '❌ 未通过'}") logger.info(f" 分数: {precision_result.score}/10") - print(f" 状态: {'✅ 通过' if not precision_result.eval_status else '❌ 未通过'}") + print(f" 状态: {'✅ 通过' if not precision_result.status else '❌ 未通过'}") print(f" 分数: {precision_result.score}/10") total_precision += precision_result.score print("\n3. 上下文召回 (Context Recall):") recall_result = LLMRAGContextRecall.eval(data) - print(f" 状态: {'✅ 通过' if not recall_result.eval_status else '❌ 未通过'}") + print(f" 状态: {'✅ 通过' if not recall_result.status else '❌ 未通过'}") print(f" 分数: {recall_result.score}/10") total_recall += recall_result.score print("\n4. 上下文相关性 (Context Relevancy):") relevancy_result = LLMRAGContextRelevancy.eval(data) - print(f" 状态: {'✅ 通过' if not relevancy_result.eval_status else '❌ 未通过'}") + print(f" 状态: {'✅ 通过' if not relevancy_result.status else '❌ 未通过'}") print(f" 分数: {relevancy_result.score}/10") total_relevancy += relevancy_result.score print("\n5. 答案相关性 (Answer Relevancy):") answer_relevancy_result = LLMRAGAnswerRelevancy.eval(data) - print(f" 状态: {'✅ 通过' if not answer_relevancy_result.eval_status else '❌ 未通过'}") + print(f" 状态: {'✅ 通过' if not answer_relevancy_result.status else '❌ 未通过'}") print(f" 分数: {answer_relevancy_result.score}/10") total_answer_relevancy += answer_relevancy_result.score diff --git a/examples/rag/ragflow_eval_data_50.jsonl b/test/data/ragflow_eval_data_50.jsonl similarity index 100% rename from examples/rag/ragflow_eval_data_50.jsonl rename to test/data/ragflow_eval_data_50.jsonl