
Commit 1cd4e97

Merge pull request #244 from e06084/dev
feat: add 5 RAG eval metrics
2 parents 674f639 + e479452 commit 1cd4e97

27 files changed, +2790 −20 lines changed

.github/workflows/lint.yml

Lines changed: 32 additions & 2 deletions
@@ -15,7 +15,37 @@ jobs:
         uses: actions/setup-python@v4
         with:
           python-version: "3.10"
-      - name: Run pre-commit
+
+      - name: Install pre-commit
+        run: pip install pre-commit==3.8.0
+
+      - name: Run pre-commit (auto-fix)
+        id: pre_commit_auto_fix
+        run: |
+          # Run pre-commit and allow auto-fixes; do not fail just because files were fixed
+          pre-commit run --all-files || true
+
+      - name: Check for changes
+        id: check_changes
+        run: |
+          if [[ -n $(git status --porcelain) ]]; then
+            echo "changed=true" >> $GITHUB_OUTPUT
+            echo "📝 Files were modified by pre-commit auto-fix"
+          else
+            echo "changed=false" >> $GITHUB_OUTPUT
+            echo "✅ No auto-fix changes"
+          fi
+
+      - name: Commit auto-fix changes
+        if: steps.check_changes.outputs.changed == 'true' && github.event_name == 'push'
+        run: |
+          git config --local user.email "[email protected]"
+          git config --local user.name "GitHub Action"
+          git add -A
+          git commit -m "🎨 Auto-format code with pre-commit"
+          git push
+
+      - name: Run pre-commit (final check)
         run: |
-          pip install pre-commit==3.8.0
+          # Run pre-commit again; this time fail for real if any check reports errors
          pre-commit run --all-files

README.md

Lines changed: 5 additions & 0 deletions
@@ -237,6 +237,10 @@ For detailed guidance on using Dingo's hallucination detection capabilities, inc

 📖 **[View Hallucination Detection Guide →](docs/hallucination_guide.md)**

+For comprehensive guidance on RAG evaluation metrics including Faithfulness, Context Precision, Answer Relevancy, Context Recall, and Context Relevancy:
+
+📖 **[View RAG Evaluation Metrics Guide →](docs/rag_evaluation_metrics_zh.md)**
+
 ### Factuality Assessment

 For comprehensive guidance on using Dingo's two-stage factuality evaluation system:
@@ -431,6 +435,7 @@ The current built-in detection rules and model methods focus on common data qual
 - [RedPajama-Data](https://github.com/togethercomputer/RedPajama-Data)
 - [mlflow](https://github.com/mlflow/mlflow)
 - [deepeval](https://github.com/confident-ai/deepeval)
+- [ragas](https://github.com/explodinggradients/ragas)

 # Contribution
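
The metrics mentioned in the README addition above read their inputs from a shared record shape, visible in the evaluator sources added later in this commit. A minimal illustrative record (field names follow those evaluators' docstrings; the values are placeholders, not from the commit):

    # Illustrative RAG evaluation record; keys match the new evaluators' docstrings,
    # values are made up for the example.
    rag_record = {
        "question": "Which rocket launched Apollo 11?",
        "answer": "Apollo 11 was launched on a Saturn V rocket.",
        "contexts": [
            "Apollo 11 launched on 16 July 1969 atop a Saturn V from Kennedy Space Center.",
        ],
    }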

README_ja.md

Lines changed: 1 addition & 0 deletions
@@ -428,6 +428,7 @@ result = executor.execute()
 - [RedPajama-Data](https://github.com/togethercomputer/RedPajama-Data)
 - [mlflow](https://github.com/mlflow/mlflow)
 - [deepeval](https://github.com/confident-ai/deepeval)
+- [ragas](https://github.com/explodinggradients/ragas)

 # 貢献

README_zh-CN.md

Lines changed: 5 additions & 0 deletions
@@ -233,6 +233,10 @@ input_data = {

 📖 **[查看幻觉检测指南 →](docs/hallucination_guide.md)**

+有关RAG评估指标的完整指导,包括忠实度、上下文精度、答案相关性、上下文召回和上下文相关性:
+
+📖 **[查看RAG评估指标指南 →](docs/rag_evaluation_metrics_zh.md)**
+
 ### 事实性评估

 有关使用Dingo两阶段事实性评估系统的详细指导:
@@ -427,6 +431,7 @@ result = executor.execute()
 - [RedPajama-Data](https://github.com/togethercomputer/RedPajama-Data)
 - [mlflow](https://github.com/mlflow/mlflow)
 - [deepeval](https://github.com/confident-ai/deepeval)
+- [ragas](https://github.com/explodinggradients/ragas)

 # 贡献

dingo/config/input_args.py

Lines changed: 2 additions & 6 deletions
@@ -1,10 +1,6 @@
-import json
-import os
-import time
-import uuid
 from typing import Dict, List, Optional

-from pydantic import BaseModel, ValidationError
+from pydantic import BaseModel


 class DatasetHFConfigArgs(BaseModel):
@@ -37,7 +33,7 @@ class DatasetArgs(BaseModel):


 class ExecutorResultSaveArgs(BaseModel):
-    bad: bool = False
+    bad: bool = True
     good: bool = False
     all_labels: bool = False
     raw: bool = False
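
The flipped default above means bad-sample results are now written unless a run disables them. A minimal sketch of the new defaults (importing from the module path shown in the diff header; the usage is illustrative):

    from dingo.config.input_args import ExecutorResultSaveArgs

    args = ExecutorResultSaveArgs()
    print(args.bad, args.good, args.all_labels, args.raw)  # True False False False

    # The previous behaviour is still available per run by disabling it explicitly.
    quiet = ExecutorResultSaveArgs(bad=False)
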
Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
"""
RAG Answer Relevancy LLM evaluator.

Uses an LLM to judge whether the generated answer directly addresses the question.
"""

import json
from typing import List

from dingo.io import Data
from dingo.model import Model
from dingo.model.llm.base_openai import BaseOpenAI
from dingo.model.modelres import ModelRes
from dingo.model.prompt.prompt_rag_answer_relevancy import PromptRAGAnswerRelevancy
from dingo.model.response.response_class import ResponseScoreReason
from dingo.utils import log
from dingo.utils.exception import ConvertJsonError


@Model.llm_register("LLMRAGAnswerRelevancy")
class LLMRAGAnswerRelevancy(BaseOpenAI):
    """
    RAG answer-relevancy evaluation LLM.

    Input requirements:
    - input_data.prompt or raw_data['question']: the user question
    - input_data.content or raw_data['answer']: the generated answer
    """

    prompt = PromptRAGAnswerRelevancy

    @classmethod
    def build_messages(cls, input_data: Data) -> List:
        """Build the LLM input messages."""
        # Extract the required fields
        question = input_data.prompt or input_data.raw_data.get("question", "")
        answer = input_data.content or input_data.raw_data.get("answer", "")

        if not question:
            raise ValueError("Answer Relevancy evaluation requires a question field")
        if not answer:
            raise ValueError("Answer Relevancy evaluation requires an answer field")

        # Build the prompt content
        prompt_content = cls.prompt.content.format(question, answer)

        messages = [{"role": "user", "content": prompt_content}]

        return messages

    @classmethod
    def process_response(cls, response: str) -> ModelRes:
        """Parse the LLM response."""
        log.info(f"RAG Answer Relevancy response: {response}")

        # Strip Markdown code fences from the response
        if response.startswith("```json"):
            response = response[7:]
        if response.startswith("```"):
            response = response[3:]
        if response.endswith("```"):
            response = response[:-3]

        try:
            response_json = json.loads(response.strip())
        except json.JSONDecodeError:
            raise ConvertJsonError(f"Convert to JSON format failed: {response}")

        # Parse the response into a score/reason pair
        response_model = ResponseScoreReason(**response_json)

        result = ModelRes()
        result.score = response_model.score

        # Decide pass/fail from the score (default threshold 5 on a 10-point scale)
        threshold = 5
        if hasattr(cls, 'dynamic_config') and cls.dynamic_config.parameters:
            threshold = cls.dynamic_config.parameters.get('threshold', 5)

        if response_model.score >= threshold:
            result.error_status = False
            result.type = "QUALITY_GOOD"
            result.name = "ANSWER_RELEVANCY_PASS"
            result.reason = [f"Answer relevancy check passed (score: {response_model.score}/10)\n{response_model.reason}"]
        else:
            result.error_status = True
            result.type = cls.prompt.metric_type
            result.name = cls.prompt.__name__
            result.reason = [f"Answer relevancy check failed (score: {response_model.score}/10)\n{response_model.reason}"]

        return result
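
A short sketch of the contract this evaluator expects, based on the docstring and parsing logic above (the values are illustrative, not taken from the commit):

    # Input record: the question/answer fields named in the docstring above.
    record = {
        "question": "In which year did Apollo 11 land on the Moon?",
        "answer": "Apollo 11 landed on the Moon in 1969.",
    }

    # The prompt asks the model for a JSON object with a score and a reason.
    # With the default threshold of 5 (out of 10), this reply would be marked as passing.
    model_reply = '{"score": 8, "reason": "The answer directly addresses the question."}'
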
Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
"""
RAG Context Precision LLM evaluator.

Uses an LLM to evaluate the precision and ranking quality of the retrieved contexts.
"""

import json
from typing import List

from dingo.io import Data
from dingo.model import Model
from dingo.model.llm.base_openai import BaseOpenAI
from dingo.model.modelres import ModelRes
from dingo.model.prompt.prompt_rag_context_precision import PromptRAGContextPrecision
from dingo.model.response.response_class import ResponseScoreReason
from dingo.utils import log
from dingo.utils.exception import ConvertJsonError


@Model.llm_register("LLMRAGContextPrecision")
class LLMRAGContextPrecision(BaseOpenAI):
    """
    RAG context-precision evaluation LLM.

    Input requirements:
    - input_data.prompt or raw_data['question']: the user question
    - input_data.content or raw_data['answer']: the generated answer
    - input_data.context or raw_data['contexts']: the list of retrieved contexts
    """

    prompt = PromptRAGContextPrecision

    @classmethod
    def build_messages(cls, input_data: Data) -> List:
        """Build the LLM input messages."""
        # Extract the required fields
        question = input_data.prompt or input_data.raw_data.get("question", "")
        answer = input_data.content or input_data.raw_data.get("answer", "")

        if not answer:
            raise ValueError("Context Precision evaluation requires an answer field")

        # Normalize contexts into a list
        contexts = None
        if input_data.context:
            if isinstance(input_data.context, list):
                contexts = input_data.context
            else:
                contexts = [input_data.context]
        elif "contexts" in input_data.raw_data:
            raw_contexts = input_data.raw_data["contexts"]
            if isinstance(raw_contexts, list):
                contexts = raw_contexts
            else:
                contexts = [raw_contexts]

        if not contexts:
            raise ValueError("Context Precision evaluation requires a contexts field")

        # Format the contexts as a numbered listing
        contexts_formatted = "\n".join([f"{i + 1}. {ctx}" for i, ctx in enumerate(contexts)])

        # Build the prompt content
        prompt_content = cls.prompt.content.format(question, answer, contexts_formatted)

        messages = [{"role": "user", "content": prompt_content}]

        return messages

    @classmethod
    def process_response(cls, response: str) -> ModelRes:
        """Parse the LLM response."""
        log.info(f"RAG Context Precision response: {response}")

        # Strip Markdown code fences from the response
        if response.startswith("```json"):
            response = response[7:]
        if response.startswith("```"):
            response = response[3:]
        if response.endswith("```"):
            response = response[:-3]

        try:
            response_json = json.loads(response.strip())
        except json.JSONDecodeError:
            raise ConvertJsonError(f"Convert to JSON format failed: {response}")

        # Parse the response into a score/reason pair
        response_model = ResponseScoreReason(**response_json)

        result = ModelRes()
        result.score = response_model.score

        # Decide pass/fail from the score (default threshold 5 on a 10-point scale)
        threshold = 5
        if hasattr(cls, 'dynamic_config') and cls.dynamic_config.parameters:
            threshold = cls.dynamic_config.parameters.get('threshold', 5)

        if response_model.score >= threshold:
            result.error_status = False
            result.type = "QUALITY_GOOD"
            result.name = "CONTEXT_PRECISION_PASS"
            result.reason = [f"Context precision check passed (score: {response_model.score}/10)\n{response_model.reason}"]
        else:
            result.error_status = True
            result.type = cls.prompt.metric_type
            result.name = cls.prompt.__name__
            result.reason = [f"Context precision check failed (score: {response_model.score}/10)\n{response_model.reason}"]

        return result
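
As with the answer-relevancy evaluator, a small sketch of the record shape this class reads and of the numbered context listing its build_messages produces (values are illustrative):

    # Illustrative input record; keys follow the docstring above.
    record = {
        "question": "When did Apollo 11 land on the Moon?",
        "answer": "Apollo 11 landed on the Moon in July 1969.",
        "contexts": [
            "Apollo 11 was the first crewed mission to land on the Moon, in July 1969.",
            "The Saturn V rocket was developed during the 1960s.",
        ],
    }

    # build_messages numbers the retrieved contexts before inserting them into the prompt:
    numbered = "\n".join(f"{i + 1}. {ctx}" for i, ctx in enumerate(record["contexts"]))
    # -> "1. Apollo 11 was the first crewed mission ...\n2. The Saturn V rocket ..."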
