Commit f8502c5

Merge pull request #198 from e06084/dev
feat: add factcheck example
2 parents edc4f30 + 7619e1d commit f8502c5

3 files changed, +137 −33 lines changed

dingo/model/llm/llm_factcheck_public.py

Lines changed: 17 additions & 32 deletions
@@ -99,6 +99,9 @@ def eval(cls, input_data: Data) -> ModelRes:
 
         except Exception as e:
             return ModelRes(
+                error_status=True,
+                type="QUALITY_BAD_FACTUALITY",
+                name="FACTUALITY_CHECK_ERROR",
                 score=0.0,
                 threshold=cls.threshold,
                 reason=[f"Evaluation failed: {str(e)}"],
@@ -210,41 +213,23 @@ def _parse_check_results(cls, text: str) -> List[FactCheckResult]:
 
             results = []
             for item in data:
-                evidence_list = [
-                    Evidence(**e) for e in item["supporting_evidence"]
-                ]
+                # Process evidence and make sure every required field is present
+                evidence_list = []
+                for e in item.get("supporting_evidence", []):
+                    # Provide defaults so missing required fields do not break parsing
+                    evidence = Evidence(
+                        url=e.get("url", ""),
+                        snippet=e.get("snippet", ""),  # default avoids a missing field
+                        summary=e.get("summary", "")
+                    )
+                    evidence_list.append(evidence)
+
                 results.append(FactCheckResult(
-                    claim=item["claim"],
-                    answer=item["answer"],
-                    reasoning=item["reasoning"],
+                    claim=item.get("claim", ""),
+                    answer=item.get("answer", "unsure"),  # defaults to "unsure"
+                    reasoning=item.get("reasoning", ""),
                     supporting_evidence=evidence_list
                 ))
             return results
         except Exception as e:
             raise ValueError(f"Invalid results format: {str(e)}")
-
-    @classmethod
-    def send_messages(cls, messages: List) -> str:
-        """Override the send-messages method to avoid using models.list()"""
-        if not cls.dynamic_config.model:
-            raise ValueError("model name must be specified")
-
-        params = cls.dynamic_config.parameters or {}
-        cls.validate_config(params)
-
-        completions = cls.client.chat.completions.create(
-            model=cls.dynamic_config.model,
-            messages=messages,
-            temperature=params.get("temperature", 0.3),
-            top_p=params.get("top_p", 1),
-            max_tokens=params.get("max_tokens", 4000),
-            presence_penalty=params.get("presence_penalty", 0),
-            frequency_penalty=params.get("frequency_penalty", 0),
-        )
-
-        if completions.choices[0].finish_reason == "length":
-            raise ExceedMaxTokens(
-                f"Exceed max tokens: {params.get('max_tokens', 4000)}"
-            )
-
-        return str(completions.choices[0].message.content)
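
Taken together, the changes in this file make _parse_check_results tolerant of model output with missing keys instead of raising KeyError mid-parse. A self-contained sketch of the same pattern, using hypothetical dataclass stand-ins for Evidence and FactCheckResult (the real classes live in dingo and may differ):

    from dataclasses import dataclass, field
    from typing import List

    @dataclass
    class Evidence:            # stand-in, not dingo's class
        url: str
        snippet: str
        summary: str

    @dataclass
    class FactCheckResult:     # stand-in, not dingo's class
        claim: str
        answer: str
        reasoning: str
        supporting_evidence: List[Evidence] = field(default_factory=list)

    def parse_item(item: dict) -> FactCheckResult:
        # Every field falls back to a default, so partial items still parse.
        evidence = [
            Evidence(url=e.get("url", ""), snippet=e.get("snippet", ""), summary=e.get("summary", ""))
            for e in item.get("supporting_evidence", [])
        ]
        return FactCheckResult(
            claim=item.get("claim", ""),
            answer=item.get("answer", "unsure"),
            reasoning=item.get("reasoning", ""),
            supporting_evidence=evidence,
        )

    # An item missing "snippet", "summary", "answer" and "reasoning" no longer raises.
    print(parse_item({"claim": "x", "supporting_evidence": [{"url": "https://example.com"}]}))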

docs/factcheck_guide.md

Lines changed: 0 additions & 1 deletion
@@ -221,7 +221,6 @@ dingo/
 │   └── prompt_factcheck.py                  # evaluation prompt
 └── examples/
     └── factcheck/
-        ├── factcheck_demo.py                # single-item evaluation example
         └── dataset_factcheck_evaluation.py  # dataset evaluation example
 ```
 
Lines changed: 120 additions & 0 deletions
@@ -0,0 +1,120 @@
+"""
+Dataset Factuality Evaluation Example
+
+This example demonstrates how to use Dingo's factuality evaluation capability
+for batch evaluation of datasets, particularly useful for:
+- LLM response validation
+- RAG system evaluation
+- SFT data quality assessment
+"""
+import os
+from pathlib import Path
+
+from dingo.config import InputArgs
+from dingo.exec import Executor
+from dingo.io import Data
+# Force import factuality evaluation modules
+from dingo.model.llm.llm_factcheck_public import LLMFactCheckPublic
+from dingo.model.prompt.prompt_factcheck import PromptFactCheck
+
+OPENAI_MODEL = 'deepseek-chat'
+OPENAI_URL = 'https://api.deepseek.com/v1'
+OPENAI_KEY = os.getenv("OPENAI_KEY")
+
+
+def evaluate_factuality_jsonl_dataset():
+    """
+    Example: Evaluate a JSONL dataset for factuality
+    Expected JSONL format:
+    {"data_id": "1", "prompt": "question", "content": "response"}
+    """
+    print("=== Dataset Factuality Evaluation ===")
+
+    input_data = {
+        "input_path": str(Path("test/data/factcheck_test.jsonl")),  # Your JSONL file path
+        "output_path": "output/factcheck_evaluation/",
+        "dataset": {
+            "source": "local",
+            "format": "jsonl",
+            "field": {
+                "prompt": "question",  # note: "question" is used as the prompt field here
+                "content": "content"
+            }
+        },
+        "executor": {
+            "eval_group": "factuality",  # use the factuality eval group
+            "result_save": {
+                "bad": True,   # save non-factual results
+                "good": True   # save factual results
+            }
+        },
+        "evaluator": {
+            "llm_config": {
+                "LLMFactCheckPublic": {
+                    "model": OPENAI_MODEL,
+                    "key": OPENAI_KEY,
+                    "api_url": OPENAI_URL,
+                }
+            }
+        }
+    }
+
+    input_args = InputArgs(**input_data)
+    executor = Executor.exec_map["local"](input_args)
+    result = executor.execute()
+
+    print("\n=== Evaluation Summary ===")
+    print(f"Total processed: {result.total}")
+    print(f"Factual responses: {result.num_good}")
+    print(f"Non-factual responses: {result.num_bad}")
+    print(f"Overall factuality score: {result.score:.2%}")
+    print(f"\nType distribution: {result.type_ratio}")
+    print(f"Name distribution: {result.name_ratio}")
+
+
+def evaluate_single_data_example():
+    """
+    Example: Evaluate a single piece of data for factuality
+    This is useful for testing or real-time evaluation
+    """
+    print("=== Single Data Factuality Evaluation ===")
+
+    # Configure the evaluator
+    evaluator = LLMFactCheckPublic()
+    evaluator.dynamic_config.model = OPENAI_MODEL
+    evaluator.dynamic_config.key = OPENAI_KEY
+    evaluator.dynamic_config.api_url = OPENAI_URL
+    evaluator.dynamic_config.parameters = {
+        "temperature": 0.1,  # lower randomness for more consistent results
+        "max_tokens": 2000
+    }
+
+    # Create test data
+    test_data = Data(
+        data_id="test_1",
+        prompt="Tell me about Albert Einstein's Nobel Prize.",
+        content="Albert Einstein won the Nobel Prize in Physics in 1921 for his work on the photoelectric effect. However, many people mistakenly think he won it for his theory of relativity, which actually never received a Nobel Prize due to the controversial nature of relativity at the time."
+    )
+    # Run the evaluation
+    result = evaluator.eval(test_data)
+
+    print("\n=== Evaluation Result ===")
+    print(f"Error Status: {result.error_status}")
+    print(f"Type: {result.type}")
+    print(f"Name: {result.name}")
+    print(f"Reason: {result.reason}")
+
+
+if __name__ == "__main__":
+    print("📊 Dingo Factuality Evaluation Examples")
+    print("=" * 60)
+    print()
+
+    # Run examples
+    # print("1. Dataset Evaluation Example")
+    # print("-" * 30)
+    # evaluate_factuality_jsonl_dataset()
+
+    print("2. Single Data Evaluation Example")
+    print("-" * 30)
+    evaluate_single_data_example()