diff --git a/README.md b/README.md index 27f51d12..7278c699 100644 --- a/README.md +++ b/README.md @@ -297,7 +297,8 @@ from dingo.model import Model from dingo.model.rule.base import BaseRule from dingo.config.input_args import EvaluatorRuleArgs from dingo.io import Data -from dingo.model.modelres import ModelRes +from dingo.io.output.eval_detail import EvalDetail + @Model.rule_register('QUALITY_BAD_RELEVANCE', ['default']) class MyCustomRule(BaseRule): @@ -306,8 +307,8 @@ class MyCustomRule(BaseRule): dynamic_config = EvaluatorRuleArgs(pattern=r'your_pattern_here') @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail() # Your rule implementation here return res ``` diff --git a/README_ja.md b/README_ja.md index 61023e97..5727140f 100644 --- a/README_ja.md +++ b/README_ja.md @@ -290,7 +290,8 @@ from dingo.model import Model from dingo.model.rule.base import BaseRule from dingo.config.input_args import EvaluatorRuleArgs from dingo.io import Data -from dingo.model.modelres import ModelRes +from dingo.io.output.eval_detail import EvalDetail + @Model.rule_register('QUALITY_BAD_RELEVANCE', ['default']) class MyCustomRule(BaseRule): @@ -299,8 +300,8 @@ class MyCustomRule(BaseRule): dynamic_config = EvaluatorRuleArgs(pattern=r'your_pattern_here') @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail() # ここにルール実装 return res ``` diff --git a/README_zh-CN.md b/README_zh-CN.md index 08c7601a..ebf7a2bc 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -296,7 +296,8 @@ from dingo.model import Model from dingo.model.rule.base import BaseRule from dingo.config.input_args import EvaluatorRuleArgs from dingo.io import Data -from dingo.model.modelres import ModelRes +from dingo.io.output.eval_detail import EvalDetail + @Model.rule_register('QUALITY_BAD_RELEVANCE', ['default']) class MyCustomRule(BaseRule): @@ -305,8 +306,8 @@ class MyCustomRule(BaseRule): dynamic_config = EvaluatorRuleArgs(pattern=r'your_pattern_here') @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail() # 您的规则实现 return res ``` diff --git a/dingo/exec/local.py b/dingo/exec/local.py index 185fd4a9..0b723355 100644 --- a/dingo/exec/local.py +++ b/dingo/exec/local.py @@ -10,14 +10,12 @@ from tqdm import tqdm from dingo.config import InputArgs -from dingo.config.input_args import EvalPipline from dingo.data import Dataset, DataSource, dataset_map, datasource_map from dingo.exec.base import ExecProto, Executor from dingo.io import Data, ResultInfo, SummaryModel +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model from dingo.model.llm.base import BaseLLM -from dingo.model.modelres import EvalDetail, ModelRes -from dingo.model.rule.base import BaseRule from dingo.utils import log @@ -110,23 +108,20 @@ def execute(self) -> SummaryModel: futures_results = self.merge_result_info(futures_results, result_info) for result_info in futures_results: - # 统计eval_details,第一层key是字段名组合,第二层value是EvalDetail + # 统计eval_details,第一层key是字段名组合,第二层value是List[EvalDetail] # 错误类型从EvalDetail.label中获取 - for field_key, eval_detail in result_info.eval_details.items(): + for field_key, eval_detail_list in result_info.eval_details.items(): if field_key not in self.summary.type_ratio: self.summary.type_ratio[field_key] = {} - # 遍历 EvalDetail.label 中的每个错误类型 - # 兼容 dict 
和 EvalDetail 对象两种情况 - if isinstance(eval_detail, dict): - label_list = eval_detail.get('label', []) - else: - label_list = eval_detail.label - - for eval_details_name in label_list: - if eval_details_name not in self.summary.type_ratio[field_key]: - self.summary.type_ratio[field_key][eval_details_name] = 1 - else: - self.summary.type_ratio[field_key][eval_details_name] += 1 + # 遍历 List[EvalDetail] + for eval_detail in eval_detail_list: + # 获取label列表 + label_list = eval_detail.label if eval_detail.label else [] + for label in label_list: + if label not in self.summary.type_ratio[field_key]: + self.summary.type_ratio[field_key][label] = 1 + else: + self.summary.type_ratio[field_key][label] += 1 if result_info.eval_status: self.summary.num_bad += 1 @@ -166,8 +161,7 @@ def evaluate_single_data(self, dingo_id: str, eval_fields: dict, eval_type: str, ResultInfo containing evaluation results """ result_info = ResultInfo(dingo_id=dingo_id) - bad_eval_details = None - good_eval_details = None + eval_detail_list = [] for e_c_i in eval_list: # Get model class and instantiate @@ -183,55 +177,32 @@ def evaluate_single_data(self, dingo_id: str, eval_fields: dict, eval_type: str, raise ValueError(f"Error eval_type: {eval_type}") # Execute evaluation - tmp: ModelRes = model.eval(Data(**map_data)) - if isinstance(tmp.eval_details, dict): - tmp.eval_details = EvalDetail(**tmp.eval_details) + tmp: EvalDetail = model.eval(Data(**map_data)) - # Collect eval_details from ModelRes - if tmp.eval_status: + # 直接添加EvalDetail到列表中,不再merge + eval_detail_list.append(tmp) + + # 如果任意一个EvalDetail的status为True,则result_info.eval_status为True + if tmp.status: result_info.eval_status = True - # 合并 bad 的 eval_details (ModelRes.eval_details 现在直接是 EvalDetail) - if isinstance(bad_eval_details, dict): - bad_eval_details = EvalDetail(**bad_eval_details) - if bad_eval_details: - bad_eval_details.merge(tmp.eval_details) - else: - bad_eval_details = tmp.eval_details.copy() - else: - # 合并 good 的 eval_details (ModelRes.eval_details 现在直接是 EvalDetail) - if isinstance(good_eval_details, dict): - good_eval_details = EvalDetail(**good_eval_details) - if good_eval_details: - good_eval_details.merge(tmp.eval_details) - else: - good_eval_details = tmp.eval_details.copy() - # Set result_info fields based on all_labels configuration and add field - join_fields = ','.join(eval_fields.values()) + # Set result_info fields + join_fields = ','.join(eval_fields.values()) if eval_fields else 'default' + # 根据配置决定保存哪些结果 if self.input_args.executor.result_save.all_labels: - # Always include both good and bad results when they exist - # The final eval_status is True if ANY evaluation failed - # 合并 good 和 bad 的 eval_details (现在是 EvalDetail 对象) - all_eval_details = None - if bad_eval_details: - all_eval_details = bad_eval_details.copy() - if good_eval_details: - if all_eval_details: - all_eval_details.merge(good_eval_details) - else: - all_eval_details = good_eval_details.copy() - # add field (ResultInfo.eval_details 现在是 Dict[str, EvalDetail]) - if all_eval_details: - result_info.eval_details = {join_fields: all_eval_details} + # 保存所有结果 + if eval_detail_list: + result_info.eval_details = {join_fields: eval_detail_list} else: - # add field (ResultInfo.eval_details 现在是 Dict[str, EvalDetail]) + # 只保存bad或good的结果 if result_info.eval_status: - if bad_eval_details: - result_info.eval_details = {join_fields: bad_eval_details} + # 有bad结果,只保留status=True的EvalDetail + result_info.eval_details = {join_fields: [mr for mr in eval_detail_list if mr.status]} else: - if 
good_eval_details and self.input_args.executor.result_save.good: - result_info.eval_details = {join_fields: good_eval_details} + # 都是good结果,根据配置决定是否保存,只保留status=False的EvalDetail + if self.input_args.executor.result_save.good: + result_info.eval_details = {join_fields: [mr for mr in eval_detail_list if not mr.status]} return result_info @@ -241,14 +212,14 @@ def merge_result_info(self, existing_list: List[ResultInfo], new_item: ResultInf if existing_item: existing_item.eval_status = existing_item.eval_status or new_item.eval_status - # 合并 eval_details 字典(第一层是字段名,第二层直接是 EvalDetail) + # 合并 eval_details 字典(第一层是字段名,第二层是List[EvalDetail]) for key, value in new_item.eval_details.items(): - # 第一层是字段名,如果存在,则合并 EvalDetail + # 第一层是字段名,如果存在,则extend List[EvalDetail] if key in existing_item.eval_details: - existing_item.eval_details[key].merge(value) - # 第一层是字段名,如果不存在,则创建副本 + existing_item.eval_details[key].extend(value) + # 第一层是字段名,如果不存在,则直接赋值 else: - existing_item.eval_details[key] = value.copy() + existing_item.eval_details[key] = value else: existing_list.append(new_item) @@ -279,42 +250,53 @@ def write_single_data( if not input_args.executor.result_save.good and not result_info.eval_status: return - # 遍历 eval_details 的第一层(字段名组合),第二层直接是 EvalDetail - for field_name, eval_detail in result_info.eval_details.items(): + # 用集合记录已经写过的(字段名, label名)组合,避免重复写入 + written_labels = set() + + # 遍历 eval_details 的第一层(字段名组合),第二层是List[EvalDetail] + for field_name, eval_detail_list in result_info.eval_details.items(): # 第一层:根据字段名创建文件夹 field_dir = os.path.join(path, field_name) if not os.path.exists(field_dir): os.makedirs(field_dir) - # 从 EvalDetail.label 中获取错误类型列表 - if isinstance(eval_detail, dict): - label_list = eval_detail.get('label', []) - else: - label_list = eval_detail.label - for eval_details_name in label_list: - # 按点分割错误类型名称,创建多层文件夹 - # 例如: "validity_errors.space_issues" -> ["validity_errors", "space_issues"] - parts = eval_details_name.split(".") - - # 除了最后一部分,其他部分都是文件夹 - if len(parts) > 1: - # 创建多层文件夹 - folder_path = os.path.join(field_dir, *parts[:-1]) - if not os.path.exists(folder_path): - os.makedirs(folder_path) - # 最后一部分作为文件名 - file_name = parts[-1] + ".jsonl" - f_n = os.path.join(folder_path, file_name) - else: - # 没有点分割,直接在字段文件夹下创建文件 - f_n = os.path.join(field_dir, parts[0] + ".jsonl") - - with open(f_n, "a", encoding="utf-8") as f: - if input_args.executor.result_save.raw: - str_json = json.dumps(result_info.to_raw_dict(), ensure_ascii=False) + # 遍历 List[EvalDetail] + for eval_detail in eval_detail_list: + # 从 EvalDetail.label 中获取错误类型列表 + label_list = eval_detail.label if eval_detail.label else [] + + for eval_details_name in label_list: + # 检查是否已经写过这个(字段名, label名)组合 + label_key = (field_name, eval_details_name) + if label_key in written_labels: + continue + + # 标记为已写入 + written_labels.add(label_key) + + # 按点分割错误类型名称,创建多层文件夹 + # 例如: "validity_errors.space_issues" -> ["validity_errors", "space_issues"] + parts = eval_details_name.split(".") + + # 除了最后一部分,其他部分都是文件夹 + if len(parts) > 1: + # 创建多层文件夹 + folder_path = os.path.join(field_dir, *parts[:-1]) + if not os.path.exists(folder_path): + os.makedirs(folder_path) + # 最后一部分作为文件名 + file_name = parts[-1] + ".jsonl" + f_n = os.path.join(folder_path, file_name) else: - str_json = json.dumps(result_info.to_dict(), ensure_ascii=False) - f.write(str_json + "\n") + # 没有点分割,直接在字段文件夹下创建文件 + f_n = os.path.join(field_dir, parts[0] + ".jsonl") + + with open(f_n, "a", encoding="utf-8") as f: + if input_args.executor.result_save.raw: + str_json = 
json.dumps(result_info.to_raw_dict(), ensure_ascii=False) + else: + str_json = json.dumps(result_info.to_dict(), ensure_ascii=False) + f.write(str_json + "\n") def write_summary(self, path: str, input_args: InputArgs, summary: SummaryModel): if not input_args.executor.result_save.bad: diff --git a/dingo/exec/spark.py b/dingo/exec/spark.py index 64256665..7d936bae 100644 --- a/dingo/exec/spark.py +++ b/dingo/exec/spark.py @@ -1,7 +1,7 @@ import copy import time import uuid -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Optional from pyspark import SparkConf from pyspark.rdd import RDD @@ -10,11 +10,10 @@ from dingo.config import InputArgs from dingo.exec.base import ExecProto, Executor from dingo.io import Data, ResultInfo, SummaryModel +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model -from dingo.model.llm.base import BaseLLM -from dingo.model.modelres import ModelRes + # from dingo.model.prompt.base import BasePrompt -from dingo.model.rule.base import BaseRule @Executor.register("spark") @@ -154,20 +153,20 @@ def evaluate(self, data_rdd_item) -> Dict[str, Any]: else: raise ValueError(f"Error eval_type: {eval_type}") - if r_i.eval_status: - result_info.eval_status = True - for k,v in r_i.eval_details.items(): - if k not in result_info.eval_details: - result_info.eval_details[k] = v - else: - result_info.eval_details[k].merge(v) + if r_i.eval_status: + result_info.eval_status = True + # Merge eval_details: Dict[str, List[EvalDetail]] + for k, v in r_i.eval_details.items(): + if k not in result_info.eval_details: + result_info.eval_details[k] = v + else: + result_info.eval_details[k].extend(v) return result_info.to_dict() def evaluate_item(self, eval_fields: dict, eval_type: str, map_data: dict, eval_list: list) -> ResultInfo: result_info = ResultInfo() - bad_eval_details = None - good_eval_details = None + eval_detail_list = [] for e_c_i in eval_list: if eval_type == 'rule': @@ -178,40 +177,32 @@ def evaluate_item(self, eval_fields: dict, eval_type: str, map_data: dict, eval_ Model.set_config_llm(model, e_c_i.config) else: raise ValueError(f"Error eval_type: {eval_type}") - tmp: ModelRes = model.eval(Data(**map_data)) - # Collect eval_details from ModelRes - if tmp.eval_status: + + tmp: EvalDetail = model.eval(Data(**map_data)) + eval_detail_list.append(tmp) + + # If any EvalDetail's status is True, result_info.eval_status is True + if tmp.status: result_info.eval_status = True - if bad_eval_details: - bad_eval_details.merge(tmp.eval_details) - else: - bad_eval_details = tmp.eval_details.copy() - else: - if good_eval_details: - good_eval_details.merge(tmp.eval_details) - else: - good_eval_details = tmp.eval_details.copy() - # Set result_info fields based on all_labels configuration and add field - join_fields = ','.join(eval_fields.values()) + # Set result_info fields + join_fields = ','.join(eval_fields.values()) if eval_fields else 'default' + + # Decide which results to save based on configuration if self.input_args.executor.result_save.all_labels: - all_eval_details = None - if bad_eval_details: - all_eval_details = bad_eval_details.copy() - if good_eval_details: - if all_eval_details: - all_eval_details.merge(good_eval_details) - else: - all_eval_details = good_eval_details.copy() - if all_eval_details: - result_info.eval_details = {join_fields: all_eval_details} + # Save all results + if eval_detail_list: + result_info.eval_details = {join_fields: eval_detail_list} else: + # Only save bad or good results if 
result_info.eval_status: - if bad_eval_details: - result_info.eval_details = {join_fields: bad_eval_details} + # Has bad results, only keep EvalDetail with status=True + result_info.eval_details = {join_fields: [ed for ed in eval_detail_list if ed.status]} else: - if good_eval_details and self.input_args.executor.result_save.good: - result_info.eval_details = {join_fields: good_eval_details} + # All good results, decide whether to save based on configuration + if self.input_args.executor.result_save.good: + result_info.eval_details = {join_fields: [ed for ed in eval_detail_list if not ed.status]} + return result_info def summarize(self, summary: SummaryModel) -> SummaryModel: @@ -231,20 +222,22 @@ def aggregate_eval_detailss(acc, item): """聚合单个 item 的 eval_details 到累加器中""" eval_details_dict = item.get('eval_details', {}) - # 遍历第一层:字段名 - for field_key, eval_detail_dict in eval_details_dict.items(): + # 遍历第一层:字段名,第二层是 List[EvalDetail] (序列化为 list of dicts) + for field_key, eval_detail_list in eval_details_dict.items(): if field_key not in acc: acc[field_key] = {} - # 从 EvalDetail 的 label 列表中获取错误类型 - label_list = eval_detail_dict.get('label', []) if isinstance(eval_detail_dict, dict) else eval_detail_dict.label - - # 统计每个 label 的出现次数 - for label in label_list: - if label not in acc[field_key]: - acc[field_key][label] = 1 - else: - acc[field_key][label] += 1 + # 遍历 List[EvalDetail] + for eval_detail in eval_detail_list: + # 从 EvalDetail 的 label 列表中获取错误类型 + label_list = eval_detail.get('label', []) if isinstance(eval_detail, dict) else eval_detail.label + if label_list: + # 统计每个 label 的出现次数 + for label in label_list: + if label not in acc[field_key]: + acc[field_key][label] = 1 + else: + acc[field_key][label] += 1 return acc diff --git a/dingo/io/output/eval_detail.py b/dingo/io/output/eval_detail.py new file mode 100644 index 00000000..f2073dca --- /dev/null +++ b/dingo/io/output/eval_detail.py @@ -0,0 +1,18 @@ +from typing import Any, Dict, List, Optional + +from pydantic import BaseModel, Field + + +class QualityLabel: + """质量标签常量类""" + QUALITY_GOOD = "QUALITY_GOOD" # Indicates pass the quality check + QUALITY_BAD_PREFIX = "QUALITY_BAD_" # Indicates not pass the quality check + + +class EvalDetail(BaseModel): + metric: str + status: bool = False + + score: Optional[float] = None + label: Optional[list[str]] = None + reason: Optional[list] = None diff --git a/dingo/io/output/result_info.py b/dingo/io/output/result_info.py index d604c446..50666446 100644 --- a/dingo/io/output/result_info.py +++ b/dingo/io/output/result_info.py @@ -1,28 +1,44 @@ -from typing import Any, Dict, List +from typing import Dict, List -from pydantic import BaseModel, Field +from pydantic import BaseModel -from dingo.model.modelres import EvalDetail +from dingo.io.output.eval_detail import EvalDetail class ResultInfo(BaseModel): dingo_id: str = '' raw_data: Dict = {} eval_status: bool = False - eval_details: Dict[str, EvalDetail] = {} + eval_details: Dict[str, List[EvalDetail]] = {} def to_dict(self): + """将ResultInfo转换为字典格式 + + Returns: + 包含所有字段的字典,其中eval_details被转换为嵌套字典结构 + """ return { 'dingo_id': self.dingo_id, 'raw_data': self.raw_data, 'eval_status': self.eval_status, - 'eval_details': {k: v.to_dict() for k,v in self.eval_details.items()}, + 'eval_details': { + k: [model_res.model_dump() for model_res in v] + for k, v in self.eval_details.items() + }, } def to_raw_dict(self): + """将ResultInfo合并到raw_data中 + + Returns: + 包含原始数据和dingo_result的字典 + """ dingo_result = { 'eval_status': self.eval_status, - 'eval_details': 
{k: v.to_dict() for k,v in self.eval_details.items()}, + 'eval_details': { + k: [model_res.model_dump() for model_res in v] + for k, v in self.eval_details.items() + }, } self.raw_data['dingo_result'] = dingo_result return self.raw_data diff --git a/dingo/model/llm/base.py b/dingo/model/llm/base.py index 237cd52b..778f7f1f 100644 --- a/dingo/model/llm/base.py +++ b/dingo/model/llm/base.py @@ -2,7 +2,7 @@ from dingo.config.input_args import EvaluatorLLMArgs from dingo.io import Data -from dingo.model.modelres import EvalDetail, ModelRes, QualityLabel +from dingo.io.output.eval_detail import EvalDetail class BaseLLM: @@ -12,5 +12,5 @@ class BaseLLM: dynamic_config: EvaluatorLLMArgs @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: raise NotImplementedError() diff --git a/dingo/model/llm/base_lmdeploy_apiclient.py b/dingo/model/llm/base_lmdeploy_apiclient.py index ac17541f..c3edc79a 100644 --- a/dingo/model/llm/base_lmdeploy_apiclient.py +++ b/dingo/model/llm/base_lmdeploy_apiclient.py @@ -6,8 +6,8 @@ from dingo.config.input_args import EvaluatorLLMArgs from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail, QualityLabel from dingo.model.llm.base import BaseLLM -from dingo.model.modelres import ModelRes, QualityLabel from dingo.model.response.response_class import ResponseScoreReason from dingo.utils import log from dingo.utils.exception import ConvertJsonError, ExceedMaxTokens @@ -44,7 +44,7 @@ def send_messages(cls, messages: List): return str(response) @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: log.info(response) if response.startswith("```json"): @@ -60,30 +60,20 @@ def process_response(cls, response: str) -> ModelRes: response_model = ResponseScoreReason(**response_json) - result = ModelRes() + result = EvalDetail(metric=cls.__name__) # eval_status if response_model.score == 1: - # result.reason = [response_model.reason] - result.eval_details = { - "label": [QualityLabel.QUALITY_GOOD], - "metric": [cls.__name__], - "reason": [response_model.reason] - } + result.label = [QualityLabel.QUALITY_GOOD] + result.reason = [response_model.reason] else: - result.eval_status = True - # result.type = cls.prompt.metric_type - # result.name = cls.prompt.__name__ - # result.reason = [response_model.reason] - result.eval_details = { - "label": [f"QUALITY_BAD.{cls.__name__}"], - "metric": [cls.__name__], - "reason": [response_model.reason] - } + result.status = True + result.label = [f"QUALITY_BAD.{cls.__name__}"] + result.reason = [response_model.reason] return result @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: if cls.client is None: cls.create_client() @@ -106,11 +96,8 @@ def eval(cls, input_data: Data) -> ModelRes: except_msg = str(e) except_name = e.__class__.__name__ - res = ModelRes() - res.eval_status = True - res.eval_details = { - "label": [f"QUALITY_BAD.{except_name}"], - "metric": [cls.__name__], - "reason": [except_msg] - } + res = EvalDetail(metric=cls.__name__) + res.status = True + res.label = [f"QUALITY_BAD.{except_name}"] + res.reason = [except_msg] return res diff --git a/dingo/model/llm/base_openai.py b/dingo/model/llm/base_openai.py index db717cf0..64ca31ec 100644 --- a/dingo/model/llm/base_openai.py +++ b/dingo/model/llm/base_openai.py @@ -6,8 +6,8 @@ from dingo.config.input_args import EvaluatorLLMArgs from dingo.io import Data +from 
dingo.io.output.eval_detail import EvalDetail, QualityLabel from dingo.model.llm.base import BaseLLM -from dingo.model.modelres import ModelRes, QualityLabel from dingo.model.response.response_class import ResponseScoreReason from dingo.utils import log from dingo.utils.exception import ConvertJsonError, ExceedMaxTokens @@ -111,7 +111,7 @@ def validate_config(cls, parameters: Dict): ) @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: log.info(response) if response.startswith("```json"): @@ -127,26 +127,31 @@ def process_response(cls, response: str) -> ModelRes: response_model = ResponseScoreReason(**response_json) - result = ModelRes() + result = EvalDetail(metric=cls.__name__) # eval_status if response_model.score == 1: - result.eval_details = { - "label": [QualityLabel.QUALITY_GOOD], - "metric": [cls.__name__], - "reason": [response_model.reason] - } + # result.eval_details = { + # "label": [QualityLabel.QUALITY_GOOD], + # "metric": [cls.__name__], + # "reason": [response_model.reason] + # } + result.label = [QualityLabel.QUALITY_GOOD] + result.reason = [response_model.reason] else: - result.eval_status = True - result.eval_details = { - "label": [f"QUALITY_BAD.{cls.__name__}"], - "metric": [cls.__name__], - "reason": [response_model.reason] - } + # result.eval_status = True + # result.eval_details = { + # "label": [f"QUALITY_BAD.{cls.__name__}"], + # "metric": [cls.__name__], + # "reason": [response_model.reason] + # } + result.status = True + result.label = [f"QUALITY_BAD.{cls.__name__}"] + result.reason = [response_model.reason] return result @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: if cls.client is None: cls.create_client() @@ -158,7 +163,7 @@ def eval(cls, input_data: Data) -> ModelRes: while attempts < 3: try: response = cls.send_messages(messages) - res: ModelRes = cls.process_response(response) + res: EvalDetail = cls.process_response(response) return res except (ValidationError, ExceedMaxTokens, ConvertJsonError) as e: except_msg = str(e) @@ -170,11 +175,14 @@ def eval(cls, input_data: Data) -> ModelRes: except_msg = str(e) except_name = e.__class__.__name__ - res = ModelRes() - res.eval_status = True - res.eval_details = { - "label": [f"QUALITY_BAD.{except_name}"], - "metric": [cls.__name__], - "reason": [except_msg] - } + res = EvalDetail(metric=cls.__name__) + # res.eval_status = True + # res.eval_details = { + # "label": [f"QUALITY_BAD.{except_name}"], + # "metric": [cls.__name__], + # "reason": [except_msg] + # } + res.status = True + res.label = [f"QUALITY_BAD.{except_name}"] + res.reason = [except_msg] return res diff --git a/dingo/model/llm/compare/llm_code_compare.py b/dingo/model/llm/compare/llm_code_compare.py index 7f5f7725..8aba3599 100644 --- a/dingo/model/llm/compare/llm_code_compare.py +++ b/dingo/model/llm/compare/llm_code_compare.py @@ -3,9 +3,9 @@ from typing import List from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -138,7 +138,7 @@ def build_messages(cls, input_data: Data) -> List: return messages @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: log.info(response) # 提取思考内容和清理响应 @@ -183,28 +183,22 @@ def 
_clean_response(response: str) -> str: return response @staticmethod - def _create_no_code_result(response_json: dict) -> ModelRes: - result = ModelRes() - result.eval_status = False - result.eval_details = { - "label": ["NO_CODE.code"], - "metric": ["LLMCodeCompare"], - "reason": [json.dumps(response_json, ensure_ascii=False)] - } + def _create_no_code_result(response_json: dict) -> EvalDetail: + result = EvalDetail(metric="LLMCodeCompare") + result.status = False + result.label = ["NO_CODE.code"] + result.reason = [json.dumps(response_json, ensure_ascii=False)] return result @staticmethod - def _create_normal_result(response_json: dict) -> ModelRes: - result = ModelRes() + def _create_normal_result(response_json: dict) -> EvalDetail: + result = EvalDetail(metric="LLMCodeCompare") score = response_json.get('score', 0) - result.eval_status = score != 1 + result.status = score != 1 tmp_type = {1: 'TOOL_ONE_BETTER', 2: 'TOOL_TWO_BETTER'}.get(score, 'TOOL_EQUAL') - result.eval_details = { - "label": [f"{tmp_type}.code"], - "metric": ["LLMCodeCompare"], - "reason": [json.dumps(response_json, ensure_ascii=False)] - } + result.label = [f"{tmp_type}.code"] + result.reason = [json.dumps(response_json, ensure_ascii=False)] return result diff --git a/dingo/model/llm/compare/llm_html_extract_compare.py b/dingo/model/llm/compare/llm_html_extract_compare.py index 0215b583..72b9836a 100644 --- a/dingo/model/llm/compare/llm_html_extract_compare.py +++ b/dingo/model/llm/compare/llm_html_extract_compare.py @@ -3,9 +3,9 @@ from typing import List from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes from dingo.model.response.response_class import ResponseScoreTypeNameReason from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -107,7 +107,7 @@ def build_messages(cls, input_data: Data) -> List: return messages @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: log.info(response) response_think = "" @@ -133,10 +133,10 @@ def process_response(cls, response: str) -> ModelRes: response_model = ResponseScoreTypeNameReason(**response_json) - result = ModelRes() + result = EvalDetail(metric=cls.__name__) # status if response_model.score != 1: - result.eval_status = True + result.status = True # type # if response_model.score == 1: @@ -159,11 +159,7 @@ def process_response(cls, response: str) -> ModelRes: tmp_type = "TOOL_TWO_BETTER" if response_model.score == 0: tmp_type = "TOOL_EQUAL" - - result.eval_details = { - "label": [f"{tmp_type}.{response_model.name}"], - "metric": [cls.__name__], - "reason": [json.dumps(response_json, ensure_ascii=False)] - } + result.label = [f"{tmp_type}.{response_model.name}"] + result.reason = [json.dumps(response_json, ensure_ascii=False)] return result diff --git a/dingo/model/llm/compare/llm_html_extract_compare_en.py b/dingo/model/llm/compare/llm_html_extract_compare_en.py index f4b29234..fae84cc1 100644 --- a/dingo/model/llm/compare/llm_html_extract_compare_en.py +++ b/dingo/model/llm/compare/llm_html_extract_compare_en.py @@ -3,9 +3,9 @@ from typing import List from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes from dingo.model.response.response_class import 
ResponseScoreTypeNameReason from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -79,7 +79,7 @@ def build_messages(cls, input_data: Data) -> List: return messages @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: log.info(response) response_think = "" @@ -105,10 +105,10 @@ def process_response(cls, response: str) -> ModelRes: response_model = ResponseScoreTypeNameReason(**response_json) - result = ModelRes() + result = EvalDetail(metric=cls.__name__) # status if response_model.score != 1: - result.eval_status = True + result.status = True # type # if response_model.score == 1: @@ -131,11 +131,7 @@ def process_response(cls, response: str) -> ModelRes: tmp_type = "TOOL_TWO_BETTER" if response_model.score == 0: tmp_type = "TOOL_EQUAL" - - result.eval_details = { - "label": [f"{tmp_type}.{response_model.name}"], - "metric": [cls.__name__], - "reason": [json.dumps(response_json, ensure_ascii=False)] - } + result.label = [f"{tmp_type}.{response_model.name}"] + result.reason = [json.dumps(response_json, ensure_ascii=False)] return result diff --git a/dingo/model/llm/compare/llm_html_extract_compare_v2.py b/dingo/model/llm/compare/llm_html_extract_compare_v2.py index 2f4c9410..891ac673 100644 --- a/dingo/model/llm/compare/llm_html_extract_compare_v2.py +++ b/dingo/model/llm/compare/llm_html_extract_compare_v2.py @@ -4,9 +4,9 @@ import diff_match_patch as dmp_module from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes from dingo.model.response.response_class import ResponseNameReason from dingo.utils import log @@ -244,9 +244,9 @@ def _parse_response_to_structured(cls, response: str) -> ResponseNameReason: ) @classmethod - def _convert_to_model_result(cls, structured_response: ResponseNameReason) -> ModelRes: + def _convert_to_model_result(cls, structured_response: ResponseNameReason) -> EvalDetail: """ - 将结构化响应转换为 ModelRes 对象 + 将结构化响应转换为 EvalDetail 对象 映射规则: - A -> TOOL_ONE_BETTER (工具A更好,eval_status=False) @@ -257,9 +257,9 @@ def _convert_to_model_result(cls, structured_response: ResponseNameReason) -> Mo structured_response: 结构化响应对象,name 字段存储判断结果 (A/B/C) Returns: - ModelRes: 评估结果对象 + EvalDetail: 评估结果对象 """ - result = ModelRes() + result = EvalDetail(metric=cls.__name__) # 从 name 字段获取判断结果 judgement = structured_response.name @@ -287,29 +287,26 @@ def _convert_to_model_result(cls, structured_response: ResponseNameReason) -> Mo if not mapping: raise ValueError(f"无效的判断结果: {judgement}") - result.eval_status = mapping["eval_status"] + result.status = mapping["eval_status"] # result.type = mapping["type"] # result.name = f"Judgement_{judgement}" # result.reason = [structured_response.reason] tmp_type = mapping["type"] tmp_name = f"Judgement_{judgement}" - result.eval_details = { - "label": [f"{tmp_type}.{tmp_name}"], - "metric": [cls.__name__], - "reason": [structured_response.reason] - } + result.label = [f"{tmp_type}.{tmp_name}"] + result.reason = [structured_response.reason] return result @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: """ 处理 LLM 返回结果 数据流: 1. 原始响应 (str) -> 结构化响应 (ResponseNameReason) - 2. 结构化响应 -> 评估结果 (ModelRes) + 2. 
结构化响应 -> 评估结果 (EvalDetail) 这种分层设计的好处: - 更清晰的责任分离 @@ -321,7 +318,7 @@ def process_response(cls, response: str) -> ModelRes: response: LLM 原始响应文本 Returns: - ModelRes: 评估结果对象 + EvalDetail: 评估结果对象 """ # 步骤1: 解析为结构化响应 structured_response = cls._parse_response_to_structured(response) diff --git a/dingo/model/llm/compare/llm_math_compare.py b/dingo/model/llm/compare/llm_math_compare.py index 13285d0d..014b89cb 100644 --- a/dingo/model/llm/compare/llm_math_compare.py +++ b/dingo/model/llm/compare/llm_math_compare.py @@ -3,9 +3,9 @@ from typing import List from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -136,7 +136,7 @@ def build_messages(cls, input_data: Data) -> List: return messages @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: log.info(response) # 提取思考内容和清理响应 @@ -181,30 +181,25 @@ def _clean_response(response: str) -> str: return response @staticmethod - def _create_no_formula_result(response_json: dict) -> ModelRes: - result = ModelRes() - result.eval_status = False - result.eval_details = { - "label": ["NO_FORMULA.math"], - "metric": ["LLMMathCompare"], - "reason": [json.dumps(response_json, ensure_ascii=False)] - } + def _create_no_formula_result(response_json: dict) -> EvalDetail: + result = EvalDetail(metric="LLMMathCompare") + result.status = False + result.label = ["NO_FORMULA.math"] + + result.reason = [json.dumps(response_json, ensure_ascii=False)] return result @staticmethod - def _create_normal_result(response_json: dict) -> ModelRes: - result = ModelRes() + def _create_normal_result(response_json: dict) -> EvalDetail: + result = EvalDetail(metric="LLMMathCompare") score = response_json.get('score', 0) - result.eval_status = score != 1 + result.status = score != 1 # result.type = {1: 'TOOL_ONE_BETTER', 2: 'TOOL_TWO_BETTER'}.get(score, 'TOOL_EQUAL') # result.name = 'math' # result.reason = [json.dumps(response_json, ensure_ascii=False)] tmp_type = {1: 'TOOL_ONE_BETTER', 2: 'TOOL_TWO_BETTER'}.get(score, 'TOOL_EQUAL') - result.eval_details = { - "label": [f"{tmp_type}.math"], - "metric": ["LLMMathCompare"], - "reason": [json.dumps(response_json, ensure_ascii=False)] - } + result.label = [f"{tmp_type}.math"] + result.reason = [json.dumps(response_json, ensure_ascii=False)] return result diff --git a/dingo/model/llm/compare/llm_table_compare.py b/dingo/model/llm/compare/llm_table_compare.py index e1510a0e..1533e6ed 100644 --- a/dingo/model/llm/compare/llm_table_compare.py +++ b/dingo/model/llm/compare/llm_table_compare.py @@ -3,9 +3,9 @@ from typing import List from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -136,7 +136,7 @@ def build_messages(cls, input_data: Data) -> List: return messages @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: log.info(response) # 提取思考内容和清理响应 @@ -181,30 +181,25 @@ def _clean_response(response: str) -> str: return response @staticmethod - def _create_no_table_result(response_json: dict) -> ModelRes: - result = ModelRes() - result.eval_status 
= False - result.eval_details = { - "label": ["NO_TABLE.table"], - "metric": ["LLMTableCompare"], - "reason": [json.dumps(response_json, ensure_ascii=False)] - } + def _create_no_table_result(response_json: dict) -> EvalDetail: + result = EvalDetail(metric="LLMTableCompare") + result.status = False + result.label = ["NO_TABLE.table"] + + result.reason = [json.dumps(response_json, ensure_ascii=False)] return result @staticmethod - def _create_normal_result(response_json: dict) -> ModelRes: - result = ModelRes() + def _create_normal_result(response_json: dict) -> EvalDetail: + result = EvalDetail(metric="LLMTableCompare") score = response_json.get('score', 0) - result.eval_status = score != 1 + result.status = score != 1 # result.type = {1: 'TOOL_ONE_BETTER', 2: 'TOOL_TWO_BETTER'}.get(score, 'TOOL_EQUAL') # result.name = 'table' # result.reason = [json.dumps(response_json, ensure_ascii=False)] tmp_type = {1: 'TOOL_ONE_BETTER', 2: 'TOOL_TWO_BETTER'}.get(score, 'TOOL_EQUAL') - result.eval_details = { - "label": [f"{tmp_type}.table"], - "metric": ["LLMMathCompare"], - "reason": [json.dumps(response_json, ensure_ascii=False)] - } + result.label = [f"{tmp_type}.table"] + result.reason = [json.dumps(response_json, ensure_ascii=False)] return result diff --git a/dingo/model/llm/hhh/llm_text_3h.py b/dingo/model/llm/hhh/llm_text_3h.py index 5cdf0866..919d6bca 100644 --- a/dingo/model/llm/hhh/llm_text_3h.py +++ b/dingo/model/llm/hhh/llm_text_3h.py @@ -1,8 +1,7 @@ import json -from dingo.model import Model +from dingo.io.output.eval_detail import EvalDetail, QualityLabel from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes, QualityLabel from dingo.model.response.response_class import ResponseScoreReason from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -21,7 +20,7 @@ def build_messages(cls, input_data): return messages @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: log.info(response) if response.startswith("```json"): @@ -37,23 +36,17 @@ def process_response(cls, response: str) -> ModelRes: response_model = ResponseScoreReason(**response_json) - result = ModelRes() + result = EvalDetail(metric=cls.__name__) # eval_status if response_model.score == 1: tmp_name = cls.prompt.__name__[8:].upper() - result.eval_details = { - "label": [f"{QualityLabel.QUALITY_GOOD}.{tmp_name}"], - "metric": [cls.__name__], - "reason": [response_model.reason] if response_model.reason else ["Response meets quality criteria"] - } + result.label = [f"{QualityLabel.QUALITY_GOOD}.{tmp_name}"] + result.reason = [response_model.reason] if response_model.reason else ["Response meets quality criteria"] else: - result.eval_status = True + result.status = True tmp_name = "NOT_" + cls.prompt.__name__[8:].upper() - result.eval_details = { - "label": [f"QUALITY_BAD.{tmp_name}"], - "metric": [cls.__name__], - "reason": [response_model.reason] if response_model.reason else ["Response fails quality criteria"] - } + result.label = [f"QUALITY_BAD.{tmp_name}"] + result.reason = [response_model.reason] if response_model.reason else ["Response fails quality criteria"] return result diff --git a/dingo/model/llm/llm_classify_qr.py b/dingo/model/llm/llm_classify_qr.py index 03fcf7fa..ebf9f28e 100644 --- a/dingo/model/llm/llm_classify_qr.py +++ b/dingo/model/llm/llm_classify_qr.py @@ -2,9 +2,9 @@ from typing import List from dingo.io.input import Data +from dingo.io.output.eval_detail import 
EvalDetail from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes from dingo.model.response.response_class import ResponseNameReason from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -44,7 +44,7 @@ def build_messages(cls, input_data: Data) -> List: return messages @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: log.info(response) if response.startswith("```json"): @@ -60,16 +60,9 @@ def process_response(cls, response: str) -> ModelRes: response_model = ResponseNameReason(**response_json) - result = ModelRes() - result.eval_status = False - # result.type = cls.prompt.metric_type - # result.name = response_model.name - # result.reason = [response_model.reason] - - result.eval_details = { - "label": [f"{cls.__name__}.{response_model.name}"], - "metric": [cls.__name__], - "reason": [response_model.reason] - } + result = EvalDetail(metric=cls.__name__) + result.status = False + result.label = [f"{cls.__name__}.{response_model.name}"] + result.reason = [response_model.reason] return result diff --git a/dingo/model/llm/llm_classify_topic.py b/dingo/model/llm/llm_classify_topic.py index d36ffd6a..9dcf4a0b 100644 --- a/dingo/model/llm/llm_classify_topic.py +++ b/dingo/model/llm/llm_classify_topic.py @@ -1,8 +1,8 @@ import json +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes from dingo.model.response.response_class import ResponseNameReason from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -46,7 +46,7 @@ class LLMClassifyTopic(BaseOpenAI): """ @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: log.info(response) if response.startswith("```json"): @@ -62,16 +62,9 @@ def process_response(cls, response: str) -> ModelRes: response_model = ResponseNameReason(**response_json) - result = ModelRes() - result.eval_status = False - # result.type = cls.prompt.metric_type - # result.name = response_model.name - # result.reason = [response_model.reason] - - result.eval_details = { - "label": [f"{cls.__name__}.{response_model.name}"], - "metric": [cls.__name__], - "reason": [response_model.reason] - } + result = EvalDetail(metric=cls.__name__) + result.status = False + result.label = [f"{cls.__name__}.{response_model.name}"] + result.reason = [response_model.reason] return result diff --git a/dingo/model/llm/llm_dataman_assessment.py b/dingo/model/llm/llm_dataman_assessment.py index 3163aaff..468cbc52 100644 --- a/dingo/model/llm/llm_dataman_assessment.py +++ b/dingo/model/llm/llm_dataman_assessment.py @@ -1,8 +1,8 @@ import json +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes from dingo.model.response.response_class import ResponseScoreTypeNameReason from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -103,7 +103,7 @@ class LLMDatamanAssessment(BaseOpenAI): """ @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: log.info(response) if response.startswith("```json"): @@ -121,26 +121,14 @@ def process_response(cls, response: str) -> ModelRes: # Parse the response using the 
ResponseScoreTypeNameReason model response_model = ResponseScoreTypeNameReason(**response_json) - result = ModelRes() + result = EvalDetail(metric=cls.__name__) # Set eval_status based on score (1 = good quality, 0 = low quality) if response_model.score == 1: - result.eval_status = False + result.status = False else: - result.eval_status = True - - # # Set type to the domain classification - # result.type = response_model.type - # - # # Set name to the quality category - # result.name = response_model.name - # - # # Set reason to the detailed assessment - # result.reason = [response_model.reason] - - result.eval_details = { - "label": [f"{response_model.type}.{response_model.name}"], - "metric": [cls.__name__], - "reason": [response_model.reason] - } + result.status = True + + result.label = [f"{response_model.type}.{response_model.name}"] + result.reason = [response_model.reason] return result diff --git a/dingo/model/llm/llm_document_parsing_ocr.py b/dingo/model/llm/llm_document_parsing_ocr.py index e58932e2..bb5465cd 100644 --- a/dingo/model/llm/llm_document_parsing_ocr.py +++ b/dingo/model/llm/llm_document_parsing_ocr.py @@ -1,15 +1,12 @@ -import base64 import json import re from typing import List from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes -from dingo.model.response.response_class import ResponseScoreReason from dingo.utils import log -from dingo.utils.exception import ConvertJsonError @Model.llm_register("LLMMinerURecognizeQuality") @@ -100,7 +97,7 @@ def build_messages(cls, input_data: Data) -> List: return messages @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: log.info(response) json_match = re.search(r'\{[\s\S]*"errors"[\s\S]*\}', response) types = [] @@ -124,18 +121,12 @@ def process_response(cls, response: str) -> ModelRes: else: log.error("未找到JSON内容") - result = ModelRes() - result.eval_status = False - # result.type = types - # result.name = names - # result.reason = [json_str] if 'json_str' in locals() else [response] + result = EvalDetail(metric=cls.__name__) + result.status = False tmp_type = '.'.join(types) tmp_name = '.'.join(names) - result.eval_details = { - "label": [f"{tmp_type}.{tmp_name}"], - "metric": [cls.__name__], - "reason": [json_str] if 'json_str' in locals() else [response] - } + result.label = [f"{tmp_type}.{tmp_name}"] + result.reason = [json_str] if 'json_str' in locals() else [response] return result diff --git a/dingo/model/llm/llm_factcheck_public.py b/dingo/model/llm/llm_factcheck_public.py index 59d20bbc..74b0177e 100644 --- a/dingo/model/llm/llm_factcheck_public.py +++ b/dingo/model/llm/llm_factcheck_public.py @@ -1,11 +1,10 @@ from dataclasses import dataclass -from typing import Dict, List, Literal, Optional +from typing import Dict, List, Literal from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail, QualityLabel from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes, QualityLabel -from dingo.utils.exception import ExceedMaxTokens @dataclass @@ -191,7 +190,7 @@ class LLMFactCheckPublic(BaseOpenAI): } @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: """执行两阶段评估""" try: # 0. 初始化 client @@ -201,12 +200,9 @@ def eval(cls, input_data: Data) -> ModelRes: # 1. 
提取声明 claims = cls._extract_claims(input_data.prompt, input_data.content) if not claims: - return ModelRes( - # score=0.0, - # threshold=cls.threshold, - reason=["No factual claims found"], - # raw_resp={"claims": [], "results": []} - ) + result = EvalDetail(metric=cls.__name__) + result.reason = ["No factual claims found"] + return result # 2. 分批验证 all_results = [] @@ -219,40 +215,24 @@ def eval(cls, input_data: Data) -> ModelRes: metrics = cls._calculate_metrics(all_results) # 4. 设置评估结果 - result = ModelRes( - # score=metrics["factual_ratio"], - # threshold=cls.threshold, - reason=[cls._format_reason(metrics)], - # raw_resp={ - # "claims": claims, - # "results": all_results, - # "metrics": metrics - # } - ) + result = EvalDetail(metric=cls.__name__) + result.reason = [cls._format_reason(metrics)] # 5. 根据分数设置状态 if metrics["factual_ratio"] < cls.threshold: - result.eval_status = True - # result.type = "QUALITY_BAD_FACTUALITY" - # result.name = "FACTUALITY_CHECK_FAILED" - result.eval_details.label = ["QUALITY_BAD_FACTUALITY.FACTUALITY_CHECK_FAILED"] + result.status = True + result.label = ["QUALITY_BAD_FACTUALITY.FACTUALITY_CHECK_FAILED"] else: - # result.type = "QUALITY_GOOD" - # result.name = "FACTUALITY_CHECK_PASSED" - result.eval_details.label = [f"{QualityLabel.QUALITY_GOOD}.FACTUALITY_CHECK_PASSED"] + result.label = [f"{QualityLabel.QUALITY_GOOD}.FACTUALITY_CHECK_PASSED"] return result except Exception as e: - return ModelRes( - eval_status=True, - type="QUALITY_BAD_FACTUALITY", - name="FACTUALITY_CHECK_ERROR", - # score=0.0, - # threshold=cls.threshold, - reason=[f"Evaluation failed: {str(e)}"], - # raw_resp={"error": str(e)} - ) + result = EvalDetail(metric=cls.__name__) + result.status = True + result.label = ["QUALITY_BAD_FACTUALITY.FACTUALITY_CHECK_ERROR"] + result.reason = [f"Evaluation failed: {str(e)}"] + return result @classmethod def _extract_claims(cls, prompt: str, response: str) -> List[str]: diff --git a/dingo/model/llm/llm_hallucination.py b/dingo/model/llm/llm_hallucination.py index 79407b77..36317858 100644 --- a/dingo/model/llm/llm_hallucination.py +++ b/dingo/model/llm/llm_hallucination.py @@ -1,11 +1,11 @@ import json -from typing import List, Union +from typing import List from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail, QualityLabel from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes, QualityLabel -from dingo.model.response.response_hallucination import HallucinationScoreReason, HallucinationVerdict, HallucinationVerdicts +from dingo.model.response.response_hallucination import HallucinationVerdict, HallucinationVerdicts from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -19,7 +19,7 @@ class LLMHallucination(BaseOpenAI): This implementation adapts DeepEval's verdict-based approach to Dingo's architecture: 1. Generates verdicts for each context against the actual output 2. Calculates hallucination score based on contradiction ratio - 3. Returns standardized ModelRes with eval_status based on threshold + 3. Returns standardized EvalDetail with eval_status based on threshold """ # Metadata for documentation generation _metric_info = { @@ -107,7 +107,7 @@ def build_messages(cls, input_data: Data) -> List: return messages @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: """ Process LLM response to calculate hallucination score. 
Follows DeepEval's approach: @@ -142,27 +142,17 @@ def process_response(cls, response: str) -> ModelRes: # Generate detailed reason reason = cls._generate_reason(verdicts, score) - result = ModelRes() + result = EvalDetail(metric=cls.__name__) # Set eval_status based on threshold if score > cls.threshold: - result.eval_status = True - # result.type = "QUALITY_BAD_HALLUCINATION" - # result.name = "HALLUCINATION_DETECTED" - result.eval_details.label = ['QUALITY_BAD_HALLUCINATION.HALLUCINATION_DETECTED'] + result.status = True + result.label = ['QUALITY_BAD_HALLUCINATION.HALLUCINATION_DETECTED'] else: - # result.type = "QUALITY_GOOD" - # result.name = "NO_HALLUCINATION" - result.eval_details.label = [f'{QualityLabel.QUALITY_GOOD}.NO_HALLUCINATION'] + result.label = [f'{QualityLabel.QUALITY_GOOD}.NO_HALLUCINATION'] result.reason = [reason] - # Store additional metadata - # result.score = score - # result.verdict_details = [ - # f"{v.verdict}: {v.reason}" for v in verdicts - # ] - log.info(f"Hallucination score: {score:.3f}, threshold: {cls.threshold}") return result @@ -220,22 +210,17 @@ def _generate_reason(cls, verdicts: List[HallucinationVerdict], score: float) -> return "\n".join(reason_parts) @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: """ Override eval to add context validation """ # Validate that context is provided if not hasattr(input_data, 'context') or not input_data.context: - return ModelRes( - eval_status=True, - # type="QUALITY_BAD", - # name="MISSING_CONTEXT", - # reason=["Context is required for hallucination detection but was not provided"] - eval_details = { - "label": ["QUALITY_BAD.MISSING_CONTEXT"], - "reason": ["Context is required for hallucination detection but was not provided"] - } - ) + result = EvalDetail(metric=cls.__name__) + result.status = True + result.label = ["QUALITY_BAD.MISSING_CONTEXT"] + result.reason = ["Context is required for hallucination detection but was not provided"] + return result # Call parent eval method return super().eval(input_data) diff --git a/dingo/model/llm/llm_long_video_qa.py b/dingo/model/llm/llm_long_video_qa.py index 54178a5f..17af0e22 100644 --- a/dingo/model/llm/llm_long_video_qa.py +++ b/dingo/model/llm/llm_long_video_qa.py @@ -1,8 +1,6 @@ -import json - +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes from dingo.utils import log @@ -115,18 +113,11 @@ class LLMLongVideoQa(BaseOpenAI): """ @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: log.info(response) - result = ModelRes() - result.eval_status = False - # result.type = "text" - # result.name = "qa_pairs" - # result.reason = [response] - - result.eval_details = { - "label": ["text.qa_pairs"], - "metric": [cls.__name__], - "reason": [response] - } + result = EvalDetail(metric=cls.__name__) + result.status = False + result.label = ["text.qa_pairs"] + result.reason = [response] return result diff --git a/dingo/model/llm/llm_perspective.py b/dingo/model/llm/llm_perspective.py index 3fd86754..ec706f1a 100644 --- a/dingo/model/llm/llm_perspective.py +++ b/dingo/model/llm/llm_perspective.py @@ -2,9 +2,9 @@ from dingo.config.input_args import EvaluatorLLMArgs from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail, QualityLabel from dingo.model import Model from dingo.model.llm.base import BaseLLM 
-from dingo.model.modelres import ModelRes, QualityLabel from dingo.utils import log @@ -38,7 +38,7 @@ def create_client(cls): ) @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: cls.create_client() analyze_request = { "comment": {"text": input_data.content}, @@ -69,43 +69,24 @@ def eval(cls, input_data: Data) -> ModelRes: error_list.append(e) if is_good: - res = ModelRes() - res.eval_status = False - res.eval_details = { - "label": [f"{QualityLabel.QUALITY_GOOD}.PERSPECTIVE"], - "metric": [cls.__name__], - "reason": [] - } + res = EvalDetail(metric=cls.__name__) + res.status = False + res.label = [f"{QualityLabel.QUALITY_GOOD}.PERSPECTIVE"] + res.reason = [] return res else: - # return ModelRes( - # eval_status=True, - # type="QUALITY_BAD", - # name="PERSPECTIVE", - # reason=error_list, - # ) - res = ModelRes() - res.eval_status = True - res.eval_details = { - "label": ["QUALITY_BAD.PERSPECTIVE"], - "metric": [cls.__name__], - "reason": error_list - } + res = EvalDetail(metric=cls.__name__) + res.status = True + res.label = ["QUALITY_BAD.PERSPECTIVE"] + res.reason = error_list return res except Exception as e: attempts += 1 time.sleep(1) except_msg = str(e) - # return ModelRes( - # eval_status=True, type="QUALITY_BAD", name="API_LOSS", reason=[except_msg] - # ) - - res = ModelRes() - res.eval_status = True - res.eval_details = { - "label": ["QUALITY_BAD.API_LOSS"], - "metric": [cls.__name__], - "reason": [except_msg] - } + res = EvalDetail(metric=cls.__name__) + res.status = True + res.label = ["QUALITY_BAD.API_LOSS"] + res.reason = [except_msg] return res diff --git a/dingo/model/llm/llm_resume_quality.py b/dingo/model/llm/llm_resume_quality.py index 912b7afb..9b40a12e 100644 --- a/dingo/model/llm/llm_resume_quality.py +++ b/dingo/model/llm/llm_resume_quality.py @@ -1,8 +1,8 @@ import json +from dingo.io.output.eval_detail import EvalDetail, QualityLabel from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes, QualityLabel from dingo.model.response.response_class import ResponseScoreTypeNameReason from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -88,7 +88,7 @@ class LLMResumeQuality(BaseOpenAI): """ @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: log.info(response) # Clean response format @@ -107,23 +107,16 @@ def process_response(cls, response: str) -> ModelRes: # Validate response using Pydantic model response_model = ResponseScoreTypeNameReason(**response_json) - result = ModelRes() + result = EvalDetail(metric=cls.__name__) # Check if resume is good quality if response_model.type == "Good" and response_model.score == 1: - result.eval_status = False - # result.type = "QUALITY_GOOD" - # result.name = "ResumeQualityGood" - # result.reason = [response_model.reason] - - result.eval_details = { - "label": f"{QualityLabel.QUALITY_GOOD}.ResumeQualityGood", - "metric": [cls.__name__], - "reason": [response_model.reason] - } + result.status = False + result.label = [f"{QualityLabel.QUALITY_GOOD}.ResumeQualityGood"] + result.reason = [response_model.reason] else: # Resume has quality issues - result.eval_status = True + result.status = True # Map issue type to metric type type_mapping = { @@ -136,16 +129,9 @@ def process_response(cls, response: str) -> ModelRes: "Completeness": "RESUME_QUALITY_BAD_COMPLETENESS" } - # result.type = 
type_mapping.get(response_model.type, "RESUME_QUALITY_BAD") - # result.name = response_model.name - # result.reason = [response_model.reason] - tmp_type = type_mapping.get(response_model.type, "RESUME_QUALITY_BAD") tmp_name = response_model.name - result.eval_details = { - "label": [f"{tmp_type}.{tmp_name}"], - "metric": [cls.__name__], - "reason": [response_model.reason] - } + result.label = [f"{tmp_type}.{tmp_name}"] + result.reason = [response_model.reason] return result diff --git a/dingo/model/llm/llm_text_chaos.py b/dingo/model/llm/llm_text_chaos.py index fc52f844..f563d691 100644 --- a/dingo/model/llm/llm_text_chaos.py +++ b/dingo/model/llm/llm_text_chaos.py @@ -1,8 +1,8 @@ import json +from dingo.io.output.eval_detail import EvalDetail, QualityLabel from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes, QualityLabel from dingo.model.response.response_class import ResponseScoreTypeNameReason from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -19,7 +19,7 @@ class LLMTextChaos(BaseOpenAI): """ @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: log.info(response) if response.startswith("```json"): @@ -35,24 +35,14 @@ def process_response(cls, response: str) -> ModelRes: response_model = ResponseScoreTypeNameReason(**response_json) - result = ModelRes() + result = EvalDetail(metric=cls.__name__) # eval_status if response_model.score == 1: - # result.reason = [response_model.reason] - result.eval_details = { - "label": [f"{QualityLabel.QUALITY_GOOD}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": [response_model.reason] - } + result.label = [f"{QualityLabel.QUALITY_GOOD}.{cls.__name__}"] + result.reason = [response_model.reason] else: - result.eval_status = True - # result.type = response_model.type - # result.name = response_model.name - # result.reason = [response_model.reason] - result.eval_details = { - "label": [f"{response_model.type}.{response_model.name}"], - "metric": [cls.__name__], - "reason": [response_model.reason] - } + result.status = True + result.label = [f"{response_model.type}.{response_model.name}"] + result.reason = [response_model.reason] return result diff --git a/dingo/model/llm/llm_text_code_list_issue.py b/dingo/model/llm/llm_text_code_list_issue.py index f1821373..47447e39 100644 --- a/dingo/model/llm/llm_text_code_list_issue.py +++ b/dingo/model/llm/llm_text_code_list_issue.py @@ -1,8 +1,8 @@ import json +from dingo.io.output.eval_detail import EvalDetail, QualityLabel from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes, QualityLabel from dingo.model.response.response_class import ResponseScoreTypeNameReason from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -32,7 +32,7 @@ class LLMTextCodeListIssue(BaseOpenAI): """ @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: log.info(response) if response.startswith("```json"): @@ -48,24 +48,14 @@ def process_response(cls, response: str) -> ModelRes: response_model = ResponseScoreTypeNameReason(**response_json) - result = ModelRes() + result = EvalDetail(metric=cls.__name__) # eval_status if response_model.score == 1: - # result.reason = [response_model.reason] - result.eval_details = { - "label": [QualityLabel.QUALITY_GOOD], - "metric": [cls.__name__], - 
"reason": [response_model.reason] - } + result.label = [QualityLabel.QUALITY_GOOD] + result.reason = [response_model.reason] else: - result.eval_status = True - # result.type = response_model.type - # result.name = response_model.name - # result.reason = [response_model.reason] - result.eval_details = { - "label": [f"{response_model.type}.{response_model.name}"], - "metric": [cls.__name__], - "reason": [response_model.reason] - } + result.status = True + result.label = [f"{response_model.type}.{response_model.name}"] + result.reason = [response_model.reason] return result diff --git a/dingo/model/llm/meta_rater/llm_meta_rater_cleanliness.py b/dingo/model/llm/meta_rater/llm_meta_rater_cleanliness.py index ee200247..dedc3018 100644 --- a/dingo/model/llm/meta_rater/llm_meta_rater_cleanliness.py +++ b/dingo/model/llm/meta_rater/llm_meta_rater_cleanliness.py @@ -9,9 +9,9 @@ from typing import List from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -95,7 +95,7 @@ def build_messages(cls, input_data: Data) -> List: return messages @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: """ Process the LLM response for Meta-rater Cleanliness evaluation. @@ -103,7 +103,7 @@ def process_response(cls, response: str) -> ModelRes: response: Raw response string from the LLM Returns: - ModelRes: Processed evaluation results with score and reason + EvalDetail: Processed evaluation results with score and reason """ log.info(response) @@ -125,30 +125,24 @@ def process_response(cls, response: str) -> ModelRes: score = response_json.get('score', 0) reason = response_json.get('reason', '') - result = ModelRes() + result = EvalDetail(metric=cls.__name__) # Meta-rater uses 1-5 scoring, with higher scores being better; # We normalize this to binary classification for compatibility # Scores >= 3 are considered "good quality", < 3 are "low quality" if score >= 3: - result.eval_status = False + result.status = False # result.type = cls.prompt.metric_type # result.name = "HighQuality" # result.reason = [f"Score: {score}/5. {reason}"] - result.eval_details = { - "label": [f"{cls.__name__}.HighQuality"], - "metric": [cls.__name__], - "reason": [f"Score: {score}/5. {reason}"] - } + result.label = [f"{cls.__name__}.HighQuality"] + result.reason = [f"Score: {score}/5. {reason}"] else: - result.eval_status = True + result.status = True # result.type = cls.prompt.metric_type # result.name = "LowQuality" # result.reason = [f"Score: {score}/5. {reason}"] - result.eval_details = { - "label": [f"{cls.__name__}.LowQuality"], - "metric": [cls.__name__], - "reason": [f"Score: {score}/5. {reason}"] - } + result.label = [f"{cls.__name__}.LowQuality"] + result.reason = [f"Score: {score}/5. 
{reason}"] return result diff --git a/dingo/model/llm/meta_rater/llm_meta_rater_professionalism.py b/dingo/model/llm/meta_rater/llm_meta_rater_professionalism.py index 513e8163..55b0ef13 100644 --- a/dingo/model/llm/meta_rater/llm_meta_rater_professionalism.py +++ b/dingo/model/llm/meta_rater/llm_meta_rater_professionalism.py @@ -10,9 +10,9 @@ from typing import List from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -90,7 +90,7 @@ def build_messages(cls, input_data: Data) -> List: return messages @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: """ Process the LLM response for Meta-rater evaluation. @@ -98,7 +98,7 @@ def process_response(cls, response: str) -> ModelRes: response: Raw response string from the LLM Returns: - ModelRes: Processed evaluation results with score and reason + EvalDetail: Processed evaluation results with score and reason """ log.info(response) @@ -120,30 +120,24 @@ def process_response(cls, response: str) -> ModelRes: score = response_json.get('score', 0) reason = response_json.get('reason', '') - result = ModelRes() + result = EvalDetail(metric=cls.__name__) # Meta-rater uses 1-5 scoring, with higher scores being better; # We normalize this to binary classification for compatibility # Scores >= 3 are considered "good quality", < 3 are "low quality" if score >= 3: - result.eval_status = False + result.status = False # result.type = cls.prompt.metric_type # result.name = "HighQuality" # result.reason = [f"Score: {score}/5. {reason}"] - result.eval_details = { - "label": [f"{cls.__name__}.HighQuality"], - "metric": [cls.__name__], - "reason": [f"Score: {score}/5. {reason}"] - } + result.label = [f"{cls.__name__}.HighQuality"] + result.reason = [f"Score: {score}/5. {reason}"] else: - result.eval_status = True + result.status = True # result.type = cls.prompt.metric_type # result.name = "LowQuality" # result.reason = [f"Score: {score}/5. {reason}"] - result.eval_details = { - "label": [f"{cls.__name__}.LowQuality"], - "metric": [cls.__name__], - "reason": [f"Score: {score}/5. {reason}"] - } + result.label = [f"{cls.__name__}.LowQuality"] + result.reason = [f"Score: {score}/5. {reason}"] return result diff --git a/dingo/model/llm/meta_rater/llm_meta_rater_readability.py b/dingo/model/llm/meta_rater/llm_meta_rater_readability.py index b169978f..05f6670b 100644 --- a/dingo/model/llm/meta_rater/llm_meta_rater_readability.py +++ b/dingo/model/llm/meta_rater/llm_meta_rater_readability.py @@ -9,9 +9,9 @@ from typing import List from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -86,7 +86,7 @@ def build_messages(cls, input_data: Data) -> List: return messages @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: """ Process the LLM response for Meta-rater Readability evaluation. 
@@ -94,7 +94,7 @@ def process_response(cls, response: str) -> ModelRes: response: Raw response string from the LLM Returns: - ModelRes: Processed evaluation results with score and reason + EvalDetail: Processed evaluation results with score and reason """ log.info(response) @@ -116,30 +116,24 @@ def process_response(cls, response: str) -> ModelRes: score = response_json.get('score', 0) reason = response_json.get('reason', '') - result = ModelRes() + result = EvalDetail(metric=cls.__name__) # Meta-rater uses 1-5 scoring, with higher scores being better; # We normalize this to binary classification for compatibility # Scores >= 3 are considered "good quality", < 3 are "low quality" if score >= 3: - result.eval_status = False + result.status = False # result.type = cls.prompt.metric_type # result.name = "HighQuality" # result.reason = [f"Score: {score}/5. {reason}"] - result.eval_details = { - "label": [f"{cls.__name__}.HighQuality"], - "metric": [cls.__name__], - "reason": [f"Score: {score}/5. {reason}"] - } + result.label = [f"{cls.__name__}.HighQuality"] + result.reason = [f"Score: {score}/5. {reason}"] else: - result.eval_status = True + result.status = True # result.type = cls.prompt.metric_type # result.name = "LowQuality" # result.reason = [f"Score: {score}/5. {reason}"] - result.eval_details = { - "label": [f"{cls.__name__}.LowQuality"], - "metric": [cls.__name__], - "reason": [f"Score: {score}/5. {reason}"] - } + result.label = [f"{cls.__name__}.LowQuality"] + result.reason = [f"Score: {score}/5. {reason}"] return result diff --git a/dingo/model/llm/meta_rater/llm_meta_rater_reasoning.py b/dingo/model/llm/meta_rater/llm_meta_rater_reasoning.py index b4b180cd..306b6e81 100644 --- a/dingo/model/llm/meta_rater/llm_meta_rater_reasoning.py +++ b/dingo/model/llm/meta_rater/llm_meta_rater_reasoning.py @@ -9,9 +9,9 @@ from typing import List from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -86,7 +86,7 @@ def build_messages(cls, input_data: Data) -> List: return messages @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: """ Process the LLM response for Meta-rater Reasoning evaluation. @@ -94,7 +94,7 @@ def process_response(cls, response: str) -> ModelRes: response: Raw response string from the LLM Returns: - ModelRes: Processed evaluation results with score and reason + EvalDetail: Processed evaluation results with score and reason """ log.info(response) @@ -116,30 +116,24 @@ def process_response(cls, response: str) -> ModelRes: score = response_json.get('score', 0) reason = response_json.get('reason', '') - result = ModelRes() + result = EvalDetail(metric=cls.__name__) # Meta-rater uses 1-5 scoring, with higher scores being better; # We normalize this to binary classification for compatibility # Scores >= 3 are considered "good quality", < 3 are "low quality" if score >= 3: - result.eval_status = False + result.status = False # result.type = cls.prompt.metric_type # result.name = "HighQuality" # result.reason = [f"Score: {score}/5. {reason}"] - result.eval_details = { - "label": [f"{cls.__name__}.HighQuality"], - "metric": [cls.__name__], - "reason": [f"Score: {score}/5. {reason}"] - } + result.label = [f"{cls.__name__}.HighQuality"] + result.reason = [f"Score: {score}/5. 
{reason}"] else: - result.eval_status = True + result.status = True # result.type = cls.prompt.metric_type # result.name = "LowQuality" # result.reason = [f"Score: {score}/5. {reason}"] - result.eval_details = { - "label": [f"{cls.__name__}.LowQuality"], - "metric": [cls.__name__], - "reason": [f"Score: {score}/5. {reason}"] - } + result.label = [f"{cls.__name__}.LowQuality"] + result.reason = [f"Score: {score}/5. {reason}"] return result diff --git a/dingo/model/llm/mineru/vlm_document_parsing.py b/dingo/model/llm/mineru/vlm_document_parsing.py index d122ddf2..2f9a83bd 100644 --- a/dingo/model/llm/mineru/vlm_document_parsing.py +++ b/dingo/model/llm/mineru/vlm_document_parsing.py @@ -3,9 +3,9 @@ from typing import List from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes from dingo.utils import log @@ -192,7 +192,7 @@ def build_messages(cls, input_data: Data) -> List: return messages @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: log.info(response) response = response.replace("```json", "") @@ -218,12 +218,12 @@ def process_response(cls, response: str) -> ModelRes: except json.JSONDecodeError as e: log.error(f"JSON解析错误: {e}") - result = ModelRes() - # result.eval_status = False + result = EvalDetail(metric=cls.__name__) + # result.status = False # result.type = types # result.name = names # result.reason = [response] - result.eval_details.label = tmp_types - result.eval_details.reason = [response] + result.label = tmp_types + result.reason = [response] return result diff --git a/dingo/model/llm/mineru/vlm_document_parsing_ocr_train.py b/dingo/model/llm/mineru/vlm_document_parsing_ocr_train.py index 861d5f9d..85dfea3e 100644 --- a/dingo/model/llm/mineru/vlm_document_parsing_ocr_train.py +++ b/dingo/model/llm/mineru/vlm_document_parsing_ocr_train.py @@ -4,12 +4,10 @@ from typing import List from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes -from dingo.model.response.response_class import ResponseScoreReason from dingo.utils import log -from dingo.utils.exception import ConvertJsonError @Model.llm_register("VLMDocumentParsingOCRTrain") @@ -109,7 +107,7 @@ def build_messages(cls, input_data: Data) -> List: return messages @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: log.info(response) json_match = re.search(r'\{[\s\S]*"errors"[\s\S]*\}', response) # types = [] @@ -135,12 +133,12 @@ def process_response(cls, response: str) -> ModelRes: else: log.error("未找到JSON内容") - result = ModelRes() - result.eval_status = False + result = EvalDetail(metric=cls.__name__) + result.status = False # result.type = types # result.name = names # result.reason = [json_str] if 'json_str' in locals() else [response] - result.eval_details.label = tmp_types - result.eval_details.reason = [json_str] if 'json_str' in locals() else [response] + result.label = tmp_types + result.reason = [json_str] if 'json_str' in locals() else [response] return result diff --git a/dingo/model/llm/rag/llm_rag_answer_relevancy.py b/dingo/model/llm/rag/llm_rag_answer_relevancy.py index 13e859f0..b9d7dbae 100644 --- a/dingo/model/llm/rag/llm_rag_answer_relevancy.py +++ 
b/dingo/model/llm/rag/llm_rag_answer_relevancy.py @@ -11,9 +11,9 @@ import numpy as np from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -204,7 +204,7 @@ def calculate_score(cls, answers: List[Dict[str, Any]], original_question: str) return score @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: """评估答案相关性""" # 初始化embedding模型(如果尚未初始化) if cls.embedding_model is None: @@ -234,7 +234,7 @@ def eval(cls, input_data: Data) -> ModelRes: score = cls.calculate_score(generated_questions, original_question) # 构建结果 - result = ModelRes() + result = EvalDetail(metric=cls.__name__) result.score = score # 根据分数判断是否通过,默认阈值为5 @@ -250,29 +250,20 @@ def eval(cls, input_data: Data) -> ModelRes: cls.init_embedding_model(embedding_model_name) if score >= threshold: - result.eval_status = False - result.eval_details = { - "label": ["QUALITY_GOOD.ANSWER_RELEVANCY_PASS"], - "metric": [cls.__name__], - "reason": [f"答案相关性评估通过 (分数: {score:.2f}/10)"] - } + result.status = False + result.label = ["QUALITY_GOOD.ANSWER_RELEVANCY_PASS"] + result.reason = [f"答案相关性评估通过 (分数: {score:.2f}/10)"] else: - result.eval_status = True - result.eval_details = { - "label": ["QUALITY_BAD.ANSWER_RELEVANCY_FAIL"], - "metric": [cls.__name__], - "reason": [f"答案相关性评估未通过 (分数: {score:.2f}/10)"] - } + result.status = True + result.label = ["QUALITY_BAD.ANSWER_RELEVANCY_FAIL"] + result.reason = [f"答案相关性评估未通过 (分数: {score:.2f}/10)"] return result except Exception as e: log.error(f"Answer Relevancy评估出错: {str(e)}") - result = ModelRes() - result.eval_status = True - result.eval_details = { - "label": ["QUALITY_BAD.ANSWER_RELEVANCY_ERROR"], - "metric": [cls.__name__], - "reason": [f"答案相关性评估出错: {str(e)}"] - } + result = EvalDetail(metric=cls.__name__) + result.status = True + result.label = ["QUALITY_BAD.ANSWER_RELEVANCY_ERROR"] + result.reason = [f"答案相关性评估出错: {str(e)}"] return result diff --git a/dingo/model/llm/rag/llm_rag_context_precision.py b/dingo/model/llm/rag/llm_rag_context_precision.py index 85a514e3..e9cefb5a 100644 --- a/dingo/model/llm/rag/llm_rag_context_precision.py +++ b/dingo/model/llm/rag/llm_rag_context_precision.py @@ -8,10 +8,9 @@ from typing import List from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes -from dingo.model.response.response_class import ResponseScoreReason from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -114,7 +113,6 @@ def _calculate_average_precision(cls, verdicts: List[bool]) -> float: Returns: float: 平均精度分数 """ - import numpy as np # 转换为0/1列表 verdict_list = [1 if v else 0 for v in verdicts] @@ -197,14 +195,14 @@ def build_messages(cls, input_data: Data) -> List: return messages_list @classmethod - def process_response(cls, responses: List[str]) -> ModelRes: + def process_response(cls, responses: List[str]) -> EvalDetail: """处理LLM响应 Args: responses: 每个上下文的评估响应列表 Returns: - ModelRes: 评估结果 + EvalDetail: 评估结果 """ log.info(f"RAG Context Precision responses: {responses}") @@ -251,7 +249,7 @@ def process_response(cls, responses: List[str]) -> ModelRes: reason_text = "\n\n".join(all_reasons) reason_text += f"\n\n平均精度: {avg_precision:.4f},转换为0-10分: 
{score}/10" - result = ModelRes() + result = EvalDetail(metric=cls.__name__) result.score = score # 根据分数判断是否通过,默认阈值为5 @@ -260,24 +258,18 @@ def process_response(cls, responses: List[str]) -> ModelRes: threshold = cls.dynamic_config.parameters.get('threshold', 5) if score >= threshold: - result.eval_status = False - result.eval_details = { - "label": ["QUALITY_GOOD.CONTEXT_PRECISION_PASS"], - "metric": [cls.__name__], - "reason": [f"上下文精度评估通过 (分数: {score}/10)\n{reason_text}"] - } + result.status = False + result.label = ["QUALITY_GOOD.CONTEXT_PRECISION_PASS"] + result.reason = [f"上下文精度评估通过 (分数: {score}/10)\n{reason_text}"] else: - result.eval_status = True - result.eval_details = { - "label": ["QUALITY_BAD.CONTEXT_PRECISION_FAIL"], - "metric": [cls.__name__], - "reason": [f"上下文精度评估未通过 (分数: {score}/10)\n{reason_text}"] - } + result.status = True + result.label = ["QUALITY_BAD.CONTEXT_PRECISION_FAIL"] + result.reason = [f"上下文精度评估未通过 (分数: {score}/10)\n{reason_text}"] return result @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: """重写父类的eval方法,支持为每个上下文发送单独的请求""" if cls.client is None: cls.create_client() @@ -303,13 +295,16 @@ def eval(cls, input_data: Data) -> ModelRes: if response is None: # 如果所有尝试都失败,返回错误结果 - res = ModelRes() - res.eval_status = True - res.eval_details = { - "label": ["QUALITY_BAD.REQUEST_FAILED"], - "metric": [cls.__name__], - "reason": [f"为上下文{item['context_index']+1}发送请求失败"] - } + res = EvalDetail(metric=cls.__name__) + # res.eval_status = True + # res.eval_details = { + # "label": ["QUALITY_BAD.REQUEST_FAILED"], + # "metric": [cls.__name__], + # "reason": [f"为上下文{item['context_index']+1}发送请求失败"] + # } + res.status = True + res.label = ["QUALITY_BAD.REQUEST_FAILED"] + res.reason = [f"为上下文{item['context_index']+1}发送请求失败"] return res responses.append(response) diff --git a/dingo/model/llm/rag/llm_rag_context_recall.py b/dingo/model/llm/rag/llm_rag_context_recall.py index 0b6019b5..2b814101 100644 --- a/dingo/model/llm/rag/llm_rag_context_recall.py +++ b/dingo/model/llm/rag/llm_rag_context_recall.py @@ -8,10 +8,9 @@ from typing import List from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes -from dingo.model.response.response_class import ResponseScoreReason from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -160,7 +159,7 @@ def build_messages(cls, input_data: Data) -> List: return messages @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: """ 处理LLM响应 @@ -168,7 +167,7 @@ def process_response(cls, response: str) -> ModelRes: response: LLM原始响应 Returns: - ModelRes对象 + EvalDetail对象 """ log.info(f"RAG Context Recall response: {response}") @@ -198,7 +197,7 @@ def process_response(cls, response: str) -> ModelRes: # 生成reason reason = f"在 {total_statements} 个陈述中,有 {attributed_statements} 个可以从上下文中归因,{total_statements - attributed_statements} 个不能归因" - result = ModelRes() + result = EvalDetail(metric=cls.__name__) result.score = score # 根据分数判断是否通过,默认阈值为5 @@ -207,18 +206,12 @@ def process_response(cls, response: str) -> ModelRes: threshold = cls.dynamic_config.parameters.get('threshold', 5) if score >= threshold: - result.eval_status = False - result.eval_details = { - "label": ["QUALITY_GOOD.CONTEXT_RECALL_PASS"], - "metric": [cls.__name__], - "reason": [f"上下文召回评估通过 (分数: 
{score:.2f}/10)\n{reason}"] - } + result.status = False + result.label = ["QUALITY_GOOD.CONTEXT_RECALL_PASS"] + result.reason = [f"上下文召回评估通过 (分数: {score:.2f}/10)\n{reason}"] else: - result.eval_status = True - result.eval_details = { - "label": ["QUALITY_BAD.CONTEXT_RECALL_FAIL"], - "metric": [cls.__name__], - "reason": [f"上下文召回评估未通过 (分数: {score:.2f}/10)\n{reason}"] - } + result.status = True + result.label = ["QUALITY_BAD.CONTEXT_RECALL_FAIL"] + result.reason = [f"上下文召回评估未通过 (分数: {score:.2f}/10)\n{reason}"] return result diff --git a/dingo/model/llm/rag/llm_rag_context_relevancy.py b/dingo/model/llm/rag/llm_rag_context_relevancy.py index 734f7314..668f643b 100644 --- a/dingo/model/llm/rag/llm_rag_context_relevancy.py +++ b/dingo/model/llm/rag/llm_rag_context_relevancy.py @@ -8,10 +8,9 @@ from typing import List from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes -from dingo.model.response.response_class import ResponseScoreReason from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -160,7 +159,7 @@ def build_messages(cls, input_data: Data) -> List: return messages @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: """ 处理LLM响应 @@ -168,7 +167,7 @@ def process_response(cls, response: str) -> ModelRes: response: LLM原始响应 Returns: - ModelRes对象 + EvalDetail对象 """ log.info(f"RAG Context Relevancy response: {response}") @@ -199,7 +198,7 @@ def process_response(cls, response: str) -> ModelRes: else: # rating == 2 reason = "上下文包含与问题相关的信息" - result = ModelRes() + result = EvalDetail(metric=cls.__name__) result.score = score # 根据分数判断是否通过,默认阈值为5 @@ -208,18 +207,12 @@ def process_response(cls, response: str) -> ModelRes: threshold = cls.dynamic_config.parameters.get('threshold', 5) if score >= threshold: - result.eval_status = False - result.eval_details = { - "label": ["QUALITY_GOOD.CONTEXT_RELEVANCY_PASS"], - "metric": [cls.__name__], - "reason": [f"上下文相关性评估通过 (分数: {score:.2f}/10)\n{reason}"] - } + result.status = False + result.label = ["QUALITY_GOOD.CONTEXT_RELEVANCY_PASS"] + result.reason = [f"上下文相关性评估通过 (分数: {score:.2f}/10)\n{reason}"] else: - result.eval_status = True - result.eval_details = { - "label": ["QUALITY_BAD.CONTEXT_RELEVANCY_FAIL"], - "metric": [cls.__name__], - "reason": [f"上下文相关性评估未通过 (分数: {score:.2f}/10)\n{reason}"] - } + result.status = True + result.label = ["QUALITY_BAD.CONTEXT_RELEVANCY_FAIL"] + result.reason = [f"上下文相关性评估未通过 (分数: {score:.2f}/10)\n{reason}"] return result diff --git a/dingo/model/llm/rag/llm_rag_faithfulness.py b/dingo/model/llm/rag/llm_rag_faithfulness.py index c31a5a50..09409697 100644 --- a/dingo/model/llm/rag/llm_rag_faithfulness.py +++ b/dingo/model/llm/rag/llm_rag_faithfulness.py @@ -8,10 +8,9 @@ from typing import List from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes -from dingo.model.response.response_class import ResponseScoreReason from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -241,7 +240,7 @@ def build_messages(cls, input_data: Data) -> List: return messages @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: """ 处理LLM响应 @@ -249,7 +248,7 @@ 
def process_response(cls, response: str) -> ModelRes: response: LLM原始响应 Returns: - ModelRes对象 + EvalDetail对象 """ log.info(f"RAG Faithfulness response: {response}") @@ -283,7 +282,7 @@ def process_response(cls, response: str) -> ModelRes: else: reason = "未提取到任何陈述" - result = ModelRes() + result = EvalDetail(metric=cls.__name__) result.score = score # 根据分数判断是否通过,默认阈值为5 @@ -292,18 +291,12 @@ def process_response(cls, response: str) -> ModelRes: threshold = cls.dynamic_config.parameters.get('threshold', 5) if score >= threshold: - result.eval_status = False - result.eval_details = { - "label": ["QUALITY_GOOD.FAITHFULNESS_PASS"], - "metric": [cls.__name__], - "reason": [f"忠实度评估通过 (分数: {score:.2f}/10)\n{reason}"] - } + result.status = False + result.label = ["QUALITY_GOOD.FAITHFULNESS_PASS"] + result.reason = [f"忠实度评估通过 (分数: {score:.2f}/10)\n{reason}"] else: - result.eval_status = True - result.eval_details = { - "label": ["QUALITY_BAD.FAITHFULNESS_FAIL"], - "metric": [cls.__name__], - "reason": [f"忠实度评估未通过 (分数: {score:.2f}/10)\n{reason}"] - } + result.status = True + result.label = ["QUALITY_BAD.FAITHFULNESS_FAIL"] + result.reason = [f"忠实度评估未通过 (分数: {score:.2f}/10)\n{reason}"] return result diff --git a/dingo/model/llm/security/llm_security.py b/dingo/model/llm/security/llm_security.py index 2d9d7aa3..287a5fb5 100644 --- a/dingo/model/llm/security/llm_security.py +++ b/dingo/model/llm/security/llm_security.py @@ -1,8 +1,7 @@ import json -from dingo.model import Model +from dingo.io.output.eval_detail import EvalDetail from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -10,7 +9,7 @@ # @Model.llm_register("LLMSecurity") class LLMSecurity(BaseOpenAI): @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: log.info(response) if response.startswith("```json"): @@ -24,19 +23,13 @@ def process_response(cls, response: str) -> ModelRes: except json.JSONDecodeError: raise ConvertJsonError(f"Convert to JSON format failed: {response}") - result = ModelRes() + result = EvalDetail(metric=cls.__name__) tmp_reason = [] for k, v in response_json.items(): if v == "pos": - result.eval_status = True - # result.type = "Security" - # result.name = cls.prompt.__name__ - # result.reason.append(k) + result.status = True tmp_reason.append(k) - result.eval_details = { - "label": [f"Security.{cls.__name__}"], - "metric": [cls.__name__], - "reason": tmp_reason - } + result.label = [f"Security.{cls.__name__}"] + result.reason = tmp_reason return result diff --git a/dingo/model/llm/text_quality/llm_text_quality_v3.py b/dingo/model/llm/text_quality/llm_text_quality_v3.py index 995b3a35..51c08c7e 100644 --- a/dingo/model/llm/text_quality/llm_text_quality_v3.py +++ b/dingo/model/llm/text_quality/llm_text_quality_v3.py @@ -1,8 +1,8 @@ import json +from dingo.io.output.eval_detail import EvalDetail, QualityLabel from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes, QualityLabel from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -49,7 +49,7 @@ class LLMTextQualityV3(BaseOpenAI): """ @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: log.info(response) # 清理 markdown 代码块 @@ -79,13 +79,10 @@ def process_response(cls, response: str) -> ModelRes: if not 
isinstance(reason_list, list): reason_list = [reason_list] if reason_list else [] - result = ModelRes() + result = EvalDetail(metric=cls.__name__) if score == 1: - result.eval_details = { - "label": [QualityLabel.QUALITY_GOOD], - "metric": [cls.__name__], - "reason": reason_list if reason_list else [""] - } + result.label = [QualityLabel.QUALITY_GOOD] + result.reason = reason_list if reason_list else [""] else: # 构建标签:type.name 格式 labels = [] @@ -94,11 +91,8 @@ def process_response(cls, response: str) -> ModelRes: if not labels: labels = [f"QUALITY_BAD.{cls.__name__}"] - result.eval_status = True - result.eval_details = { - "label": labels, - "metric": [cls.__name__], - "reason": reason_list if reason_list else [""] - } + result.status = True + result.label = labels + result.reason = reason_list if reason_list else [""] return result diff --git a/dingo/model/llm/text_quality/llm_text_repeat.py b/dingo/model/llm/text_quality/llm_text_repeat.py index 516c3386..5a162095 100644 --- a/dingo/model/llm/text_quality/llm_text_repeat.py +++ b/dingo/model/llm/text_quality/llm_text_repeat.py @@ -1,8 +1,8 @@ import json +from dingo.io.output.eval_detail import EvalDetail, QualityLabel from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes, QualityLabel from dingo.model.response.response_class import ResponseScoreTypeNameReason from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -19,7 +19,7 @@ class LLMTextRepeat(BaseOpenAI): """ @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: log.info(response) if response.startswith("```json"): @@ -35,24 +35,14 @@ def process_response(cls, response: str) -> ModelRes: response_model = ResponseScoreTypeNameReason(**response_json) - result = ModelRes() + result = EvalDetail(metric=cls.__name__) # eval_status if response_model.score == 1: - # result.reason = [response_model.reason] - result.eval_details = { - "label": [QualityLabel.QUALITY_GOOD], - "metric": [cls.__name__], - "reason": [response_model.reason] - } + result.label = [QualityLabel.QUALITY_GOOD] + result.reason = [response_model.reason] else: - result.eval_status = True - # result.type = response_model.type - # result.name = response_model.name - # result.reason = [response_model.reason] - result.eval_details = { - "label": [f"{response_model.type}.{response_model.name}"], - "metric": [cls.__name__], - "reason": [response_model.reason] - } + result.status = True + result.label = [f"{response_model.type}.{response_model.name}"] + result.reason = [response_model.reason] return result diff --git a/dingo/model/llm/text_quality/llm_text_unread_issue.py b/dingo/model/llm/text_quality/llm_text_unread_issue.py index ab42fe38..155d5786 100644 --- a/dingo/model/llm/text_quality/llm_text_unread_issue.py +++ b/dingo/model/llm/text_quality/llm_text_unread_issue.py @@ -1,8 +1,8 @@ import json +from dingo.io.output.eval_detail import EvalDetail, QualityLabel from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes, QualityLabel from dingo.model.response.response_class import ResponseScoreTypeNameReason from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -41,7 +41,7 @@ class LLMTextUnreadIssue(BaseOpenAI): """ @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: log.info(response) if 
response.startswith("```json"): @@ -57,24 +57,14 @@ def process_response(cls, response: str) -> ModelRes: response_model = ResponseScoreTypeNameReason(**response_json) - result = ModelRes() + result = EvalDetail(metric=cls.__name__) # eval_status if response_model.score == 1: - # result.reason = [response_model.reason] - result.eval_details = { - "label": [QualityLabel.QUALITY_GOOD], - "metric": [cls.__name__], - "reason": [response_model.reason] - } + result.label = [QualityLabel.QUALITY_GOOD] + result.reason = [response_model.reason] else: - result.eval_status = True - # result.type = response_model.type - # result.name = response_model.name - # result.reason = [response_model.reason] - result.eval_details = { - "label": [f"{response_model.type}.{response_model.name}"], - "metric": [cls.__name__], - "reason": [response_model.reason] - } + result.status = True + result.label = [f"{response_model.type}.{response_model.name}"] + result.reason = [response_model.reason] return result diff --git a/dingo/model/llm/text_quality/llm_text_word_stick.py b/dingo/model/llm/text_quality/llm_text_word_stick.py index 91164a7d..182a3608 100644 --- a/dingo/model/llm/text_quality/llm_text_word_stick.py +++ b/dingo/model/llm/text_quality/llm_text_word_stick.py @@ -1,8 +1,8 @@ import json +from dingo.io.output.eval_detail import EvalDetail, QualityLabel from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes, QualityLabel from dingo.model.response.response_class import ResponseScoreTypeNameReason from dingo.utils import log from dingo.utils.exception import ConvertJsonError @@ -35,7 +35,7 @@ class LLMTextWordStick(BaseOpenAI): """ @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: str) -> EvalDetail: log.info(response) if response.startswith("```json"): @@ -51,24 +51,14 @@ def process_response(cls, response: str) -> ModelRes: response_model = ResponseScoreTypeNameReason(**response_json) - result = ModelRes() + result = EvalDetail(metric=cls.__name__) # eval_status if response_model.score == 1: - # result.reason = [response_model.reason] - result.eval_details = { - "label": [QualityLabel.QUALITY_GOOD], - "metric": [cls.__name__], - "reason": [response_model.reason] - } + result.label = [QualityLabel.QUALITY_GOOD] + result.reason = [response_model.reason] else: - result.eval_status = True - # result.type = response_model.type - # result.name = response_model.name - # result.reason = [response_model.reason] - result.eval_details = { - "label": [f"{response_model.type}.{response_model.name}"], - "metric": [cls.__name__], - "reason": [response_model.reason] - } + result.status = True + result.label = [f"{response_model.type}.{response_model.name}"] + result.reason = [response_model.reason] return result diff --git a/dingo/model/llm/vlm_layout_quality.py b/dingo/model/llm/vlm_layout_quality.py index 91851541..95c9303c 100644 --- a/dingo/model/llm/vlm_layout_quality.py +++ b/dingo/model/llm/vlm_layout_quality.py @@ -4,9 +4,9 @@ from typing import List from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes from dingo.utils import log @@ -212,14 +212,13 @@ def send_messages(cls, messages: List): return str(completions.choices[0].message.content) @classmethod - def process_response(cls, response: str) -> ModelRes: + def process_response(cls, response: 
str) -> EvalDetail: log.info(response) response = response.replace("```json", "") response = response.replace("```", "") types = [] - # names = [] if response: try: @@ -231,16 +230,11 @@ def process_response(cls, response: str) -> ModelRes: if eval_details: types.append(eval_details) - # names.append(eval_details) except json.JSONDecodeError as e: log.error(f"JSON解析错误: {e}") - result = ModelRes() - # result.eval_status = False - # result.type = types - # result.name = names - # result.reason = [response] - result.eval_details.label = types - result.eval_details.reason = [response] + result = EvalDetail(metric=cls.__name__) + result.label = types + result.reason = [response] return result diff --git a/dingo/model/llm/vlm_ocr_understanding.py b/dingo/model/llm/vlm_ocr_understanding.py index 64d4336c..90047cd5 100644 --- a/dingo/model/llm/vlm_ocr_understanding.py +++ b/dingo/model/llm/vlm_ocr_understanding.py @@ -1,13 +1,7 @@ -import base64 -import json -import os -from typing import List - from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes -from dingo.utils import log @Model.llm_register("VLMOCRUnderstanding") @@ -181,5 +175,5 @@ class VLMOCRUnderstanding(BaseOpenAI): """ @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: pass # TODO diff --git a/dingo/model/modelres.py b/dingo/model/modelres.py deleted file mode 100644 index f66e2c03..00000000 --- a/dingo/model/modelres.py +++ /dev/null @@ -1,49 +0,0 @@ -from typing import Any, Dict, List, Optional - -from pydantic import BaseModel, Field - - -class QualityLabel: - """质量标签常量类""" - QUALITY_GOOD = "QUALITY_GOOD" # Indicates pass the quality check - QUALITY_BAD_PREFIX = "QUALITY_BAD_" # Indicates not pass the quality check - - -class EvalDetail(BaseModel): - label: list[str] = [] - metric: list[str] = [] - reason: list = [] - - def merge(self, other: 'EvalDetail') -> None: - # 合并并去重 label 和 metric - self.label = list(set(self.label + other.label)) - self.metric = list(set(self.metric + other.metric)) - self.reason.extend(other.reason) - - def copy(self) -> 'EvalDetail': - """创建当前 EvalDetail 的深拷贝""" - return EvalDetail( - label=self.label.copy(), - metric=self.metric.copy(), - reason=self.reason.copy() - ) - - def to_dict(self) -> Dict[str, Any]: - """将 EvalDetail 转换为字典""" - return { - 'label': self.label, - 'metric': self.metric, - 'reason': self.reason - } - - -class ModelRes(BaseModel): - eval_status: bool = False - eval_details: EvalDetail = EvalDetail() - score: Optional[float] = None - - def __setattr__(self, name, value): - # 在赋值时拦截 eval_details 字段 - if name == 'eval_details' and isinstance(value, dict): - value = EvalDetail(**value) - super().__setattr__(name, value) diff --git a/dingo/model/rule/base.py b/dingo/model/rule/base.py index d6655e34..ff6dded6 100644 --- a/dingo/model/rule/base.py +++ b/dingo/model/rule/base.py @@ -2,7 +2,7 @@ from dingo.config.input_args import EvaluatorRuleArgs from dingo.io import Data -from dingo.model.modelres import ModelRes +from dingo.io.output.eval_detail import EvalDetail class BaseRule: @@ -11,5 +11,5 @@ class BaseRule: dynamic_config: EvaluatorRuleArgs @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: raise NotImplementedError() diff --git a/dingo/model/rule/rule_audio.py b/dingo/model/rule/rule_audio.py index 3e869916..26d99fd2 
100644 --- a/dingo/model/rule/rule_audio.py +++ b/dingo/model/rule/rule_audio.py @@ -4,8 +4,8 @@ from dingo.config.input_args import EvaluatorRuleArgs from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail, QualityLabel from dingo.model.model import Model -from dingo.model.modelres import ModelRes, QualityLabel from dingo.model.rule.base import BaseRule @@ -37,11 +37,11 @@ class RuleAudioDuration(BaseRule): dynamic_config = EvaluatorRuleArgs() @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: import librosa from scipy.signal import welch - res = ModelRes() + res = EvalDetail(metric=cls.__name__) y, sr = librosa.load(input_data.content, sr=16000) f_signal, Pxx_signal = welch(y, fs=sr) @@ -51,26 +51,19 @@ def eval(cls, input_data: Data) -> ModelRes: noise_power = np.sum(Pxx_noise) if noise_power == 0: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["The audio power is zero. Cannot calculate SNR."] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["The audio power is zero. Cannot calculate SNR."] + return res snr_dB = round(10 * np.log10(signal_power / noise_power), 2) if snr_dB < 8: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["The audio signal-to-noise ratio is too low."] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["The audio signal-to-noise ratio is too low."] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -102,10 +95,10 @@ class RuleAudioSnrQuality(BaseRule): dynamic_config = EvaluatorRuleArgs() @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: import wave - res = ModelRes() + res = EvalDetail(metric=cls.__name__) if not input_data.content: return res if isinstance(input_data.content, str): @@ -115,16 +108,11 @@ def eval(cls, input_data: Data) -> ModelRes: duration = frame_count / sample_rate if duration > 10: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["The audio duration is too long."] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["The audio duration is too long."] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res diff --git a/dingo/model/rule/rule_common.py b/dingo/model/rule/rule_common.py index a8d1b879..2a415802 100644 --- a/dingo/model/rule/rule_common.py +++ b/dingo/model/rule/rule_common.py @@ -4,8 +4,8 @@ from dingo.config.input_args import EvaluatorRuleArgs from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail, QualityLabel from dingo.model.model import Model -from dingo.model.modelres import EvalDetail, ModelRes, QualityLabel from dingo.model.rule.base import BaseRule @@ -25,19 +25,18 @@ class RuleAbnormalChar(BaseRule): } @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) for r in [RuleSpecialCharacter, RuleInvisibleChar]: tmp_res = r.eval(input_data) - # print(tmp_res) - if tmp_res.eval_status: - res.eval_status = True - if 
isinstance(tmp_res.eval_details, dict): - tmp_res.eval_details = EvalDetail(**tmp_res.eval_details) - res.eval_details.merge(tmp_res.eval_details) + if tmp_res.status: + res.status = True + # res.merge(tmp_res) + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = (res.reason or []) + (tmp_res.reason or []) # Set QUALITY_GOOD when all checks pass - if not res.eval_status: - res.eval_details = EvalDetail(label=[QualityLabel.QUALITY_GOOD]) + if not res.status: + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -56,18 +55,18 @@ class RuleAbnormalHtml(BaseRule): } @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) for r in [RuleHtmlEntity, RuleHtmlTag]: tmp_res = r.eval(input_data) - if tmp_res.eval_status: - res.eval_status = True - if isinstance(tmp_res.eval_details, dict): - tmp_res.eval_details = EvalDetail(**tmp_res.eval_details) - res.eval_details.merge(tmp_res.eval_details) + if tmp_res.status: + res.status = True + # res.merge(tmp_res) + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = (res.reason or []) + (tmp_res.reason or []) # Set QUALITY_GOOD when all checks pass - if not res.eval_status: - res.eval_details = EvalDetail(label=[QualityLabel.QUALITY_GOOD]) + if not res.status: + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -87,17 +86,16 @@ class RuleAbnormalNumber(BaseRule): dynamic_config = EvaluatorRuleArgs(pattern=r"\n{4}\d+\n{4}") @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content match = re.search(cls.dynamic_config.pattern, content) if match: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": [match.group(0).strip("\n")] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = [match.group(0).strip("\n")] + else: + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -118,9 +116,9 @@ class RuleAlphaWords(BaseRule): dynamic_config = EvaluatorRuleArgs(threshold=0.6) @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: from nltk.tokenize import word_tokenize - res = ModelRes() + res = EvalDetail(metric=cls.__name__) content = input_data.content words = word_tokenize(content) n_words = len(words) @@ -129,19 +127,14 @@ def eval(cls, input_data: Data) -> ModelRes: n_alpha_words = sum([any((c.isalpha() for c in w)) for w in words]) ratio = n_alpha_words / n_words if ratio > cls.dynamic_config.threshold: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] else: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": [ - "The ratio of words that contain at least one alphabetic character is: " - + str(ratio) - ] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = [ + "The ratio of words that contain at least one alphabetic character is: " + + str(ratio) + ] return res @@ -173,23 +166,17 @@ class RuleAudioDataFormat(BaseRule): dynamic_config = EvaluatorRuleArgs() @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res =
EvalDetail(metric=cls.__name__) raw_data = input_data.raw_data key_list = ["id", "audio", "text"] if all(key in raw_data for key in key_list): - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } - return res + res.label = [QualityLabel.QUALITY_GOOD] else: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Audio Data format error"] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Audio Data format error"] return res @@ -211,9 +198,9 @@ class RuleCapitalWords(BaseRule): dynamic_config = EvaluatorRuleArgs(threshold=0.2) @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: from nltk.tokenize import WordPunctTokenizer - res = ModelRes() + res = EvalDetail(metric=cls.__name__) content = input_data.content words = WordPunctTokenizer().tokenize(content) num_words = len(words) @@ -222,16 +209,11 @@ def eval(cls, input_data: Data) -> ModelRes: num_caps_words = sum(map(str.isupper, words)) ratio = num_caps_words / num_words if ratio > cls.dynamic_config.threshold and num_words < 200: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["ratio: " + str(ratio)] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["ratio: " + str(ratio)] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -252,8 +234,8 @@ class RuleCharNumber(BaseRule): dynamic_config = EvaluatorRuleArgs(threshold=100) @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) text = input_data.content text = text.strip() text = text.replace(" ", "") @@ -261,16 +243,11 @@ def eval(cls, input_data: Data) -> ModelRes: text = text.replace("\t", "") num_char = len(text) if num_char < cls.dynamic_config.threshold: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["The number of char is: " + str(num_char)] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["The number of char is: " + str(num_char)] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -293,22 +270,17 @@ class RuleCharSplit(BaseRule): ) @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content matches = re.findall(cls.dynamic_config.pattern, content) count = len(matches) if count >= cls.dynamic_config.threshold: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": matches - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = matches else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -333,22 +305,26 @@ class RuleColonEnd(BaseRule): dynamic_config = EvaluatorRuleArgs() @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = 
input_data.content if len(content) <= 0: return res if content[-1] == ":": - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": [content[-100:]] - } + # res.eval_status = True + # res.eval_details = { + # "label": [f"{cls.metric_type}.{cls.__name__}"], + # "metric": [cls.__name__], + # "reason": [content[-100:]] + # } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = [content[-100:]] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + # res.eval_details = { + # "label": [QualityLabel.QUALITY_GOOD] + # } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -389,20 +365,15 @@ class RuleContentNull(BaseRule): dynamic_config = EvaluatorRuleArgs() @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) count = len(input_data.content.strip()) if count == 0: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Content is empty."] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Content is empty."] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -425,20 +396,15 @@ class RuleContentShort(BaseRule): dynamic_config = EvaluatorRuleArgs(threshold=20) @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content.encode("utf-8") if len(content) <= cls.dynamic_config.threshold: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Content is too short."] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Content is too short."] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -471,23 +437,18 @@ class RuleContentShortMultiLan(BaseRule): dynamic_config = EvaluatorRuleArgs(threshold=20) @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: from nltk.tokenize import WordPunctTokenizer - res = ModelRes() + res = EvalDetail(metric=cls.__name__) tk = WordPunctTokenizer() tokens = tk.tokenize(input_data.content) words = [word for word in tokens if word.isalpha()] if len(words) < cls.dynamic_config.threshold: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Content is too short."] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Content is too short."] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -508,26 +469,21 @@ class RuleCurlyBracket(BaseRule): dynamic_config = EvaluatorRuleArgs(threshold=0.025) @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content if len(content) == 0: return res num = content.count("{") + content.count("}") ratio = num / len(content) if ratio > cls.dynamic_config.threshold: - 
res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": [ - "The ratio of curly bracket and characters is : " + str(ratio) - ] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = [ + "The ratio of curly bracket and characters is : " + str(ratio) + ] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -567,24 +523,19 @@ class RuleDocRepeat(BaseRule): dynamic_config = EvaluatorRuleArgs(threshold=80) @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: from dingo.model.rule.utils.util import base_rps_frac_chars_in_dupe_ngrams - res = ModelRes() + res = EvalDetail(metric=cls.__name__) repeat_score = base_rps_frac_chars_in_dupe_ngrams(6, input_data.content) if repeat_score >= cls.dynamic_config.threshold: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": [ - "Repeatability of text is too high, with ratio: " + str(repeat_score) - ] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = [ + "Repeatability of text is too high, with ratio: " + str(repeat_score) + ] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -613,8 +564,8 @@ class RuleDocFormulaRepeat(BaseRule): dynamic_config = EvaluatorRuleArgs(threshold=20) # 设置阈值为20 @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) # 提取所有公式 pattern = r'(?:\$\$(.*?)\$\$|\\\((.*?)\\\))' @@ -629,20 +580,15 @@ def eval(cls, input_data: Data) -> ModelRes: repeat_analysis = cls.analyze_repeats(formula_content) # 如果总连续重复长度超过阈值,则标记为错误 if repeat_analysis['total_repeat_length'] >= cls.dynamic_config.threshold: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": [ - f"Formula has too many consecutive repeated characters, " - f"total repeat length: {repeat_analysis['total_repeat_length']}, " - f"found {len(repeat_analysis['repeats'])} repeat patterns" - ] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = [ + f"Formula has too many consecutive repeated characters, " + f"total repeat length: {repeat_analysis['total_repeat_length']}, " + f"found {len(repeat_analysis['repeats'])} repeat patterns" + ] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -693,18 +639,18 @@ class RuleEnterAndSpace(BaseRule): } @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) for r in [RuleEnterMore, RuleEnterRatioMore, RuleSpaceMore]: tmp_res = r.eval(input_data) - if tmp_res.eval_status: - res.eval_status = True - if isinstance(tmp_res.eval_details, dict): - tmp_res.eval_details = EvalDetail(**tmp_res.eval_details) - res.eval_details.merge(tmp_res.eval_details) + if tmp_res.status: + res.status = True + # res.merge(tmp_res) + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = (res.reason or []) + (tmp_res.reason or []) # Set QUALITY_GOOD when all checks pass - if not
res.eval_status: - res.eval_details = EvalDetail(label=[QualityLabel.QUALITY_GOOD]) + if not res.status: + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -740,23 +686,18 @@ class RuleEnterMore(BaseRule): dynamic_config = EvaluatorRuleArgs(key_list=[r"\n{8,}", r"\r\n{8,}"]) @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content for p in cls.dynamic_config.key_list: SEARCH_REGEX = re.compile(p) match = SEARCH_REGEX.search(content) if match: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Content has 8 consecutive carriage returns."] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Content has 8 consecutive carriage returns."] return res - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -792,23 +733,18 @@ class RuleEnterRatioMore(BaseRule): dynamic_config = EvaluatorRuleArgs() @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content if len(content) == 0: return res ratio = content.count("\n") / len(content) if ratio > 0.25: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["The number of enter / the number of content > 25%."] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["The number of enter / the number of content > 25%."] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -829,23 +765,18 @@ class RuleHeadWordAr(BaseRule): dynamic_config = EvaluatorRuleArgs() @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: from dingo.model.rule.utils.multi_lan_util import get_xyz_head_word - res = ModelRes() + res = EvalDetail(metric=cls.__name__) keyword = get_xyz_head_word("ar") content_tail = input_data.content[-100:] matches = re.findall("|".join(keyword), content_tail) if len(matches) > 0: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Content has irrelevance tail source info."] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Content has irrelevance tail source info."] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -866,23 +797,18 @@ class RuleHeadWordCs(BaseRule): dynamic_config = EvaluatorRuleArgs() @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: from dingo.model.rule.utils.multi_lan_util import get_xyz_head_word - res = ModelRes() + res = EvalDetail(metric=cls.__name__) keyword = get_xyz_head_word("cs") content_tail = input_data.content[-100:] matches = re.findall("|".join(keyword), content_tail) if len(matches) > 0: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Content has irrelevance tail source info."] - } + res.status = True + res.label = 
[f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Content has irrelevance tail source info."] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -903,23 +829,18 @@ class RuleHeadWordHu(BaseRule): dynamic_config = EvaluatorRuleArgs() @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: from dingo.model.rule.utils.multi_lan_util import get_xyz_head_word - res = ModelRes() + res = EvalDetail(metric=cls.__name__) keyword = get_xyz_head_word("hu") content_tail = input_data.content[-100:] matches = re.findall("|".join(keyword), content_tail) if len(matches) > 0: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Content has irrelevance tail source info."] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Content has irrelevance tail source info."] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -940,23 +861,18 @@ class RuleHeadWordKo(BaseRule): dynamic_config = EvaluatorRuleArgs() @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: from dingo.model.rule.utils.multi_lan_util import get_xyz_head_word - res = ModelRes() + res = EvalDetail(metric=cls.__name__) keyword = get_xyz_head_word("ko") content_tail = input_data.content[-100:] matches = re.findall("|".join(keyword), content_tail) if len(matches) > 0: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Content has irrelevance tail source info."] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Content has irrelevance tail source info."] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -977,23 +893,18 @@ class RuleHeadWordRu(BaseRule): dynamic_config = EvaluatorRuleArgs() @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: from dingo.model.rule.utils.multi_lan_util import get_xyz_head_word - res = ModelRes() + res = EvalDetail(metric=cls.__name__) keyword = get_xyz_head_word("ru") content_tail = input_data.content[-100:] matches = re.findall("|".join(keyword), content_tail) if len(matches) > 0: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Content has irrelevance tail source info."] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Content has irrelevance tail source info."] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -1014,23 +925,18 @@ class RuleHeadWordSr(BaseRule): dynamic_config = EvaluatorRuleArgs() @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: from dingo.model.rule.utils.multi_lan_util import get_xyz_head_word - res = ModelRes() + res = EvalDetail(metric=cls.__name__) keyword = get_xyz_head_word("sr") content_tail = input_data.content[-100:] matches = re.findall("|".join(keyword), content_tail) if len(matches) > 0: - res.eval_status = True - res.eval_details = { - "label": 
[f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Content has irrelevance tail source info."] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Content has irrelevance tail source info."] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -1051,23 +957,18 @@ class RuleHeadWordTh(BaseRule): dynamic_config = EvaluatorRuleArgs() @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: from dingo.model.rule.utils.multi_lan_util import get_xyz_head_word - res = ModelRes() + res = EvalDetail(metric=cls.__name__) keyword = get_xyz_head_word("th") content_tail = input_data.content[-100:] matches = re.findall("|".join(keyword), content_tail) if len(matches) > 0: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Content has irrelevance tail source info."] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Content has irrelevance tail source info."] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -1088,23 +989,18 @@ class RuleHeadWordVi(BaseRule): dynamic_config = EvaluatorRuleArgs() @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: from dingo.model.rule.utils.multi_lan_util import get_xyz_head_word - res = ModelRes() + res = EvalDetail(metric=cls.__name__) keyword = get_xyz_head_word("vi") content_tail = input_data.content[-100:] matches = re.findall("|".join(keyword), content_tail) if len(matches) > 0: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Content has irrelevance tail source info."] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Content has irrelevance tail source info."] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -1159,8 +1055,8 @@ class RuleHtmlEntity(BaseRule): ) @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content if len(content) == 0: return res @@ -1186,16 +1082,11 @@ def eval(cls, input_data: Data) -> ModelRes: num += content.count(entity) error_entity.append(entity) if num / len(content) >= 0.01: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": [list(set(error_entity))] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = [list(set(error_entity))] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -1232,24 +1123,19 @@ class RuleHtmlTag(BaseRule): ) @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content if len(content) == 0: return res matches = re.findall("|".join(cls.dynamic_config.key_list), content) num = len(matches) if num / len(content) >= 0.01: - res.eval_status = True - res.eval_details = { - "label": 
[f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": list(set(matches)) - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = list(set(matches)) else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -1272,23 +1158,18 @@ class RuleIDCard(BaseRule): ) @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: from dingo.model.rule.utils.util import Extractor - res = ModelRes() + res = EvalDetail(metric=cls.__name__) match = re.search(cls.dynamic_config.pattern, input_data.content, re.I) if match: person_id = Extractor().extract_id_card(input_data.content) if len(person_id) != 0: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": [str(person_id)] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = [str(person_id)] return res - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -1324,24 +1205,19 @@ class RuleInvisibleChar(BaseRule): ) @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content if len(content) == 0: return res matches = re.findall(cls.dynamic_config.pattern, content) num = len(matches) if num / len(content) >= 0.01: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": [repr(s) for s in list(set(matches))] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = [repr(s) for s in list(set(matches))] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -1373,23 +1249,17 @@ class RuleImageDataFormat(BaseRule): dynamic_config = EvaluatorRuleArgs() @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) raw_data = input_data.raw_data key_list = ["img_id", "image"] if all(key in raw_data for key in key_list): - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } - return res + res.label = [QualityLabel.QUALITY_GOOD] else: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Image Data format error"] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Image Data format error"] return res @@ -1410,21 +1280,16 @@ class RuleLatexSpecialChar(BaseRule): dynamic_config = EvaluatorRuleArgs(pattern=r"\$\$(.*?\!\!.*?)\$\$") @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content match = re.search(cls.dynamic_config.pattern, content) if match: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": [match.group(0).strip("\n")] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = [match.group(0).strip("\n")] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = 
[QualityLabel.QUALITY_GOOD] return res @@ -1445,9 +1310,9 @@ class RuleLineEndWithEllipsis(BaseRule): dynamic_config = EvaluatorRuleArgs(threshold=0.3, key_list=["...", "…"]) @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: from dingo.model.rule.utils.util import TextSlice, split_paragraphs - res = ModelRes() + res = EvalDetail(metric=cls.__name__) raw_content = input_data.content raw_lines: Tuple[TextSlice] = split_paragraphs( text=raw_content, normalizer=lambda x: x, remove_empty=True @@ -1463,16 +1328,11 @@ def eval(cls, input_data: Data) -> ModelRes: ) ratio = num_occurrences / num_lines if ratio > cls.dynamic_config.threshold: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["The ratio of lines end with ellipsis is: " + str(ratio)] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["The ratio of lines end with ellipsis is: " + str(ratio)] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -1495,9 +1355,9 @@ class RuleLineEndWithTerminal(BaseRule): ) @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: from dingo.model.rule.utils.util import TextSlice, split_paragraphs - res = ModelRes() + res = EvalDetail(metric=cls.__name__) raw_content = input_data.content raw_lines: Tuple[TextSlice] = split_paragraphs( text=raw_content, normalizer=lambda x: x, remove_empty=True @@ -1518,16 +1378,11 @@ def eval(cls, input_data: Data) -> ModelRes: ) ratio = num_occurrences / num_lines if ratio < cls.dynamic_config.threshold: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": list(set(terminal_marks)) - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = list(set(terminal_marks)) else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -1562,9 +1417,9 @@ class RuleLineStartWithBulletpoint(BaseRule): ) @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: from dingo.model.rule.utils.util import TextSlice, split_paragraphs - res = ModelRes() + res = EvalDetail(metric=cls.__name__) raw_content = input_data.content raw_lines: Tuple[TextSlice] = split_paragraphs( text=raw_content, normalizer=lambda x: x, remove_empty=True @@ -1580,16 +1435,11 @@ def eval(cls, input_data: Data) -> ModelRes: ) ratio = num_occurrences / num_lines if ratio > cls.dynamic_config.threshold: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["The ratio of lines start with bulletpoint is: " + str(ratio)] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["The ratio of lines start with bulletpoint is: " + str(ratio)] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -1610,9 +1460,9 @@ class RuleLineJavascriptCount(BaseRule): dynamic_config = EvaluatorRuleArgs(threshold=3) @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: from dingo.model.rule.utils.util import TextSlice, normalize, split_paragraphs - res = 
ModelRes() + res = EvalDetail(metric=cls.__name__) raw_content = input_data.content normalized_lines: Tuple[TextSlice] = split_paragraphs( text=raw_content, normalizer=normalize, remove_empty=True @@ -1623,18 +1473,13 @@ def eval(cls, input_data: Data) -> ModelRes: num_occurrences = sum(["javascript" in line.text for line in normalized_lines]) num_not_occur = num_lines - num_occurrences if num_not_occur < cls.dynamic_config.threshold and num_lines > 3: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": [ - "The lines with the word Javascript is: " + str(num_occurrences) - ] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = [ + "The lines with the word Javascript is: " + str(num_occurrences) + ] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -1655,9 +1500,9 @@ class RuleLoremIpsum(BaseRule): dynamic_config = EvaluatorRuleArgs(threshold=3e-08) @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: from dingo.model.rule.utils.util import normalize - res = ModelRes() + res = EvalDetail(metric=cls.__name__) normalized_content = normalize(input_data.content) num_normalized_content = len(normalized_content) if num_normalized_content == 0: @@ -1666,16 +1511,11 @@ def eval(cls, input_data: Data) -> ModelRes: num_occurrences = len(SEARCH_REGEX.findall(normalized_content)) ratio = num_occurrences / num_normalized_content if ratio > cls.dynamic_config.threshold: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["The ratio of lorem ipsum is: " + str(ratio)] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["The ratio of lorem ipsum is: " + str(ratio)] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -1696,9 +1536,9 @@ class RuleMeanWordLength(BaseRule): dynamic_config = EvaluatorRuleArgs(key_list=["3", "10"]) @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: from dingo.model.rule.utils.util import normalize - res = ModelRes() + res = EvalDetail(metric=cls.__name__) normalized_content = normalize(input_data.content) normalized_words = tuple(normalized_content.split()) num_normalized_words = len(normalized_words) @@ -1708,16 +1548,11 @@ def eval(cls, input_data: Data) -> ModelRes: mean_length = num_chars / num_normalized_words mean_length = round(mean_length, 2) if mean_length >= int(cls.dynamic_config.key_list[0]) and mean_length < int(cls.dynamic_config.key_list[1]): - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] else: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["The mean length of word is: " + str(mean_length)] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["The mean length of word is: " + str(mean_length)] return res @@ -1749,23 +1584,17 @@ class RuleNlpDataFormat(BaseRule): dynamic_config = EvaluatorRuleArgs() @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) 
raw_data = input_data.raw_data key_list = ["track_id", "content"] if all(key in raw_data for key in key_list): - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } - return res + res.label = [QualityLabel.QUALITY_GOOD] else: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["NLP Data format error"] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["NLP Data format error"] return res @@ -1805,8 +1634,8 @@ class RuleNoPunc(BaseRule): dynamic_config = EvaluatorRuleArgs(threshold=112) @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content paragraphs = content.split("\n") longest_sentence = "" @@ -1822,16 +1651,11 @@ def eval(cls, input_data: Data) -> ModelRes: max_word_count = word_count longest_sentence = sentence.strip() if int(max_word_count) > cls.dynamic_config.threshold: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": [longest_sentence] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = [longest_sentence] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -1852,20 +1676,15 @@ class RulePatternSearch(BaseRule): dynamic_config = EvaluatorRuleArgs(pattern="your pattern") @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) matches = re.findall(cls.dynamic_config.pattern, input_data.content) if matches: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": matches - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = matches else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -1886,24 +1705,19 @@ class RuleSentenceNumber(BaseRule): dynamic_config = EvaluatorRuleArgs(key_list=["3", "7500"]) @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) raw_content = input_data.content SENT_PATTERN = re.compile(r"\b[^.!?\n]+[.!?]*", flags=re.UNICODE) num_sentence = len(SENT_PATTERN.findall(raw_content)) if num_sentence < int(cls.dynamic_config.key_list[0]) or num_sentence > int( cls.dynamic_config.key_list[1] ): - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["The number of sentence is: " + str(num_sentence)] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["The number of sentence is: " + str(num_sentence)] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -1935,23 +1749,17 @@ class RuleSftDataFormat(BaseRule): dynamic_config = EvaluatorRuleArgs() @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) raw_data = input_data.raw_data key_list = ["track_id", "type", "prompt", 
"completion"] if all(key in raw_data for key in key_list): - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } - return res + res.label = [QualityLabel.QUALITY_GOOD] else: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["SFT Data format error"] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["SFT Data format error"] return res @@ -1987,22 +1795,17 @@ class RuleSpaceMore(BaseRule): dynamic_config = EvaluatorRuleArgs(pattern=" {500,}") @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content SEARCH_REGEX = re.compile(cls.dynamic_config.pattern) match = SEARCH_REGEX.search(content) if match: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Content has 500 spaces."] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Content has 500 spaces."] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -2051,8 +1854,8 @@ class RuleSpecialCharacter(BaseRule): ) @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content if len(content) == 0: return res @@ -2063,16 +1866,20 @@ def eval(cls, input_data: Data) -> ModelRes: num += len(m) matches = matches + m if num / len(content) >= 0.01: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": list(set(matches)) - } + # res.eval_status = True + # res.eval_details = { + # "label": [f"{cls.metric_type}.{cls.__name__}"], + # "metric": [cls.__name__], + # "reason": list(set(matches)) + # } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = list(set(matches)) else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + # res.eval_details = { + # "label": [QualityLabel.QUALITY_GOOD] + # } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -2093,11 +1900,11 @@ class RuleStopWord(BaseRule): dynamic_config = EvaluatorRuleArgs(threshold=0.06) @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: from nltk.tokenize import WordPunctTokenizer from dingo.model.rule.utils.util import get_stop_words - res = ModelRes() + res = EvalDetail(metric=cls.__name__) raw_content = input_data.content raw_words = list(WordPunctTokenizer().tokenize(raw_content)) raw_words = [str(w).lower() for w in raw_words] @@ -2108,16 +1915,11 @@ def eval(cls, input_data: Data) -> ModelRes: num_stop_words = len(list(filter(lambda word: word in STOP_WORDS, raw_words))) ratio = num_stop_words / num_raw_words if ratio < cls.dynamic_config.threshold or num_stop_words < 2: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["The ratio of stop words is: " + str(ratio)] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["The ratio of stop words is: " + str(ratio)] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = 
[QualityLabel.QUALITY_GOOD] return res @@ -2138,9 +1940,9 @@ class RuleSymbolWordRatio(BaseRule): dynamic_config = EvaluatorRuleArgs(threshold=0.4, key_list=["#", "...", "…"]) @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: from nltk.tokenize import WordPunctTokenizer - res = ModelRes() + res = EvalDetail(metric=cls.__name__) raw_content = input_data.content raw_words = tuple(WordPunctTokenizer().tokenize(raw_content)) num_raw_words = len(raw_words) @@ -2152,16 +1954,11 @@ def eval(cls, input_data: Data) -> ModelRes: ) ratio = num_symbols / num_words if ratio > cls.dynamic_config.threshold: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["The ratio of symbol / word is: " + str(ratio)] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["The ratio of symbol / word is: " + str(ratio)] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -2182,9 +1979,9 @@ class RuleUniqueWords(BaseRule): dynamic_config = EvaluatorRuleArgs(threshold=0.1) @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: from dingo.model.rule.utils.util import normalize - res = ModelRes() + res = EvalDetail(metric=cls.__name__) normalized_content = normalize(input_data.content) normalized_words = tuple(normalized_content.split()) num_normalized_words = len(normalized_words) @@ -2194,16 +1991,11 @@ def eval(cls, input_data: Data) -> ModelRes: num_unique_words = len(set(normalized_words)) ratio = num_unique_words / num_words if ratio > cls.dynamic_config.threshold: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] else: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["The ratio of unique words is: " + str(ratio)] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["The ratio of unique words is: " + str(ratio)] return res @@ -2224,14 +2016,13 @@ class RuleUnsafeWords(BaseRule): dynamic_config = EvaluatorRuleArgs(refer_path=[]) @classmethod - def eval(cls, input_data: Data) -> ModelRes: - import re + def eval(cls, input_data: Data) -> EvalDetail: import ahocorasick from dingo.model.rule.utils.util import get_unsafe_words - res = ModelRes() + res = EvalDetail(metric=cls.__name__) content = input_data.content key_list = cls.dynamic_config.key_list if key_list is None: @@ -2251,16 +2042,11 @@ def eval(cls, input_data: Data) -> ModelRes: matches.append((start_index, keyword)) if matches: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": [value for index, value in matches] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = [value for index, value in matches] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @classmethod @@ -2303,22 +2089,16 @@ class RuleVedioDataFormat(BaseRule): dynamic_config = EvaluatorRuleArgs() @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) raw_data = input_data.raw_data key_list = ["id", 
"video", "text"] if all(key in raw_data for key in key_list): - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } - return res + res.label = [QualityLabel.QUALITY_GOOD] else: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Vedio Data format error"] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Vedio Data format error"] return res @@ -2357,24 +2137,19 @@ class RuleOnlyUrl(BaseRule): ) @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content if len(content.strip()) == 0: return res SEARCH_REGEX = re.compile(cls.dynamic_config.pattern) content_without_url = SEARCH_REGEX.sub("", content) if len(content_without_url.strip()) == 0: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Content is only an url link."] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Content is only an url link."] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -2395,20 +2170,15 @@ class RuleWatermark(BaseRule): dynamic_config = EvaluatorRuleArgs(key_list=[]) @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) matches = re.findall("|".join(cls.dynamic_config.key_list), input_data.content) if matches: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": matches - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = matches else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -2429,25 +2199,20 @@ class RuleWordNumber(BaseRule): dynamic_config = EvaluatorRuleArgs(key_list=["20", "100000"]) @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: from dingo.model.rule.utils.util import normalize - res = ModelRes() + res = EvalDetail(metric=cls.__name__) normalized_content = normalize(input_data.content) normalized_words = tuple(normalized_content.split()) num_normalized_words = len(normalized_words) if num_normalized_words >= int( cls.dynamic_config.key_list[0] ) and num_normalized_words < int(cls.dynamic_config.key_list[1]): - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] else: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["The number of word is: " + str(num_normalized_words)] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["The number of word is: " + str(num_normalized_words)] return res @@ -2468,21 +2233,16 @@ class RuleWordSplit(BaseRule): dynamic_config = EvaluatorRuleArgs(pattern=r"[A-Za-z]+-\s*$") @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content match = re.findall(cls.dynamic_config.pattern, content) if match: - 
res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": match - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = match else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -2525,12 +2285,12 @@ class RuleWordStuck(BaseRule): ) @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: import wordninja from dingo.model.rule.utils.detect_lang import decide_language_by_str from dingo.model.rule.utils.util import is_sha256 - res = ModelRes() + res = EvalDetail(metric=cls.__name__) content = input_data.content for p in cls.dynamic_config.key_list: content = re.sub(p, "", content) @@ -2545,16 +2305,11 @@ def eval(cls, input_data: Data) -> ModelRes: lan = decide_language_by_str(longest_string) cut = wordninja.split(longest_string) if lan == "en" and len(cut) > 1: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": [str(longest_string)] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = [str(longest_string)] return res - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res diff --git a/dingo/model/rule/rule_hallucination_hhem.py b/dingo/model/rule/rule_hallucination_hhem.py index 970456ff..ccd46982 100644 --- a/dingo/model/rule/rule_hallucination_hhem.py +++ b/dingo/model/rule/rule_hallucination_hhem.py @@ -12,12 +12,12 @@ """ import json -from typing import List, Union +from typing import List from dingo.config.input_args import EvaluatorRuleArgs from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail from dingo.model import Model -from dingo.model.modelres import ModelRes from dingo.model.rule.base import BaseRule from dingo.utils import log @@ -71,7 +71,7 @@ def load_model(cls): raise RuntimeError(f"Failed to load HHEM model: {e}") @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: """ Evaluate hallucination using HHEM-2.1-Open model. 
@@ -79,7 +79,7 @@ def eval(cls, input_data: Data) -> ModelRes: input_data: Data object containing content and context Returns: - ModelRes with hallucination detection results + EvalDetail with hallucination detection results """ # Check if context is available if not hasattr(input_data, 'context') or not input_data.context: @@ -88,16 +88,13 @@ def eval(cls, input_data: Data) -> ModelRes: contexts = input_data.raw_data['context'] else: # No context available - cannot evaluate - result = ModelRes() - result.eval_status = True + result = EvalDetail(metric=cls.__name__) + result.status = True # result.type = cls.metric_type # result.name = "MISSING_CONTEXT" # result.reason = ["Context is required for HHEM hallucination detection but was not provided"] - result.eval_details = { - "label": [f"{cls.metric_type}.MISSING_CONTEXT"], - "metric": [cls.__name__], - "reason": ["Context is required for HHEM hallucination detection but was not provided"] - } + result.label = [f"{cls.metric_type}.MISSING_CONTEXT"] + result.reason = ["Context is required for HHEM hallucination detection but was not provided"] return result else: contexts = input_data.context @@ -139,15 +136,15 @@ def eval(cls, input_data: Data) -> ModelRes: avg_hallucination_score = sum(hallucination_scores) / len(hallucination_scores) # Create result - result = ModelRes() + result = EvalDetail(metric=cls.__name__) # result.score = avg_hallucination_score # Determine if hallucination detected based on threshold if avg_hallucination_score > cls.dynamic_config.threshold: - result.eval_status = True + result.status = True # result.type = cls.metric_type # result.name = "HALLUCINATION_DETECTED" - result.eval_details.label = [f"{cls.metric_type}.HALLUCINATION_DETECTED"] + result.label = [f"{cls.metric_type}.HALLUCINATION_DETECTED"] # Generate detailed analysis analysis_parts = [ @@ -190,12 +187,12 @@ def eval(cls, input_data: Data) -> ModelRes: ]) # result.reason = ["\n".join(analysis_parts)] - result.eval_details.reason = ["\n".join(analysis_parts)] + result.reason = ["\n".join(analysis_parts)] else: - result.eval_status = False + result.status = False # result.type = "QUALITY_GOOD" # result.name = "NO_HALLUCINATION" - result.eval_details.label = ['QUALITY_GOOD.NO_HALLUCINATION'] + result.label = ['QUALITY_GOOD.NO_HALLUCINATION'] # Generate analysis for non-hallucination case analysis = ( @@ -206,22 +203,19 @@ def eval(cls, input_data: Data) -> ModelRes: f"💡 模型信息: 使用 Vectara HHEM-2.1-Open (本地推理)" ) # result.reason = [analysis] - result.eval_details.reason = [analysis] + result.reason = [analysis] return result except Exception as e: # Handle model inference errors - result = ModelRes() - result.eval_status = True + result = EvalDetail(metric=cls.__name__) + result.status = True # result.type = cls.metric_type # result.name = "HHEM_ERROR" # result.reason = [f"HHEM model inference failed: {str(e)}"] - result.eval_details = { - "label": [f"{cls.metric_type}.HHEM_ERROR"], - "metric": [cls.__name__], - "reason": [f"HHEM model inference failed: {str(e)}"] - } + result.label = [f"{cls.metric_type}.HHEM_ERROR"] + result.reason = [f"HHEM model inference failed: {str(e)}"] return result @classmethod @@ -245,7 +239,7 @@ def evaluate_with_detailed_output(cls, input_data: Data) -> dict: } @classmethod - def batch_evaluate(cls, data_list: List[Data]) -> List[ModelRes]: + def batch_evaluate(cls, data_list: List[Data]) -> List[EvalDetail]: """ Batch evaluation for efficiency. 
@@ -253,7 +247,7 @@ def batch_evaluate(cls, data_list: List[Data]) -> List[ModelRes]: data_list: List of Data objects to evaluate Returns: - List of ModelRes objects + List of EvalDetail objects """ # Load model once for batch processing cls.load_model() diff --git a/dingo/model/rule/rule_image.py b/dingo/model/rule/rule_image.py index aef107f1..0429a794 100644 --- a/dingo/model/rule/rule_image.py +++ b/dingo/model/rule/rule_image.py @@ -12,8 +12,8 @@ from dingo.config.input_args import EvaluatorRuleArgs from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail, QualityLabel from dingo.model.model import Model -from dingo.model.modelres import ModelRes, QualityLabel from dingo.model.rule.base import BaseRule @@ -36,8 +36,8 @@ class RuleImageValid(BaseRule): dynamic_config = EvaluatorRuleArgs() @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) if isinstance(input_data.image[0], str): img = Image.open(input_data.image[0]) else: @@ -45,16 +45,11 @@ def eval(cls, input_data: Data) -> ModelRes: img_new = img.convert("RGB") img_np = np.asarray(img_new) if np.all(img_np == (255, 255, 255)) or np.all(img_np == (0, 0, 0)): - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Image is not valid: all white or black"] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Image is not valid: all white or black"] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -77,8 +72,8 @@ class RuleImageSizeValid(BaseRule): dynamic_config = EvaluatorRuleArgs() @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) if isinstance(input_data.image[0], str): img = Image.open(input_data.image[0]) else: @@ -86,19 +81,14 @@ def eval(cls, input_data: Data) -> ModelRes: width, height = img.size aspect_ratio = width / height if aspect_ratio > 4 or aspect_ratio < 0.25: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": [ - "Image size is not valid, the ratio of width to height: " - + str(aspect_ratio) - ] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = [ + "Image size is not valid, the ratio of width to height: " + + str(aspect_ratio) + ] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -121,11 +111,11 @@ class RuleImageQuality(BaseRule): dynamic_config = EvaluatorRuleArgs(threshold=5.5) @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: import pyiqa import torch - res = ModelRes() + res = EvalDetail(metric=cls.__name__) if isinstance(input_data.image[0], str): img = Image.open(input_data.image[0]) else: @@ -137,16 +127,11 @@ def eval(cls, input_data: Data) -> ModelRes: score_fr = iqa_metric(img) score = score_fr.item() if score < cls.dynamic_config.threshold: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Image quality is not satisfied, ratio: " + str(score)] - } + res.status = True + res.label = 
[f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Image quality is not satisfied, ratio: " + str(score)] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -170,10 +155,10 @@ class RuleImageRepeat(BaseRule): dynamic_config = EvaluatorRuleArgs() @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: from imagededup.methods import CNN, PHash - res = ModelRes() + res = EvalDetail(metric=cls.__name__) image_dir = input_data.content if len(os.listdir(image_dir)) == 0: raise ZeroDivisionError( @@ -195,19 +180,14 @@ def eval(cls, input_data: Data) -> ModelRes: set(duplicates_cnn.keys()) ) if common_duplicates: - res.eval_status = True + res.status = True tmp_reason = [f"{image} -> {duplicates_cnn[image]}" for image in common_duplicates] tmp_reason.append({"duplicate_ratio": len(common_duplicates) / len(os.listdir(image_dir))}) - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": tmp_reason - } + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = tmp_reason else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -230,7 +210,7 @@ class RuleImageTextSimilarity(BaseRule): dynamic_config = EvaluatorRuleArgs(threshold=0.17) @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: import nltk nltk.download("punkt_tab") @@ -239,7 +219,7 @@ def eval(cls, input_data: Data) -> ModelRes: from dingo.model.rule.utils.image_util import download_similar_tool - res = ModelRes() + res = EvalDetail(metric=cls.__name__) if not input_data.image or not input_data.content: return res if isinstance(input_data.image[0], str): @@ -258,16 +238,11 @@ def eval(cls, input_data: Data) -> ModelRes: scores.append(sim_score[0][0]) average_score = sum(scores) / len(scores) if average_score < cls.dynamic_config.threshold: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Image quality is not satisfied, ratio: " + str(average_score)] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Image quality is not satisfied, ratio: " + str(average_score)] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -288,7 +263,7 @@ class RuleImageArtimuse(BaseRule): dynamic_config = EvaluatorRuleArgs(threshold=6, refer_path=['https://artimuse.intern-ai.org.cn/']) @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: try: response_create_task = requests.post( cls.dynamic_config.refer_path[0] + 'api/v1/task/create_task', @@ -328,28 +303,20 @@ def eval(cls, input_data: Data) -> ModelRes: break time.sleep(5) - res = ModelRes() - res.eval_status = True if status_data['score_overall'] < cls.dynamic_config.threshold else False + res = EvalDetail(metric=cls.__name__) + res.status = True if status_data['score_overall'] < cls.dynamic_config.threshold else False tmp = "BadImage" if status_data['score_overall'] < cls.dynamic_config.threshold else "GoodImage" - if res.eval_status: - res.eval_details = { - "label": [f"Artimuse_Succeeded.{tmp}"], - "metric": [cls.__name__], - "reason": [json.dumps(status_data, ensure_ascii=False)] - } + if res.status: + res.label = 
[f"Artimuse_Succeeded.{tmp}"] + res.reason = [json.dumps(status_data, ensure_ascii=False)] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res except Exception as e: - res = ModelRes() - res.eval_status = False - res.eval_details = { - "label": ["Artimuse_Fail.Exception"], - "metric": [cls.__name__], - "reason": [str(e)] - } + res = EvalDetail(metric=cls.__name__) + res.status = False + res.label = ["Artimuse_Fail.Exception"] + res.reason = [str(e)] return res @@ -372,9 +339,9 @@ class RuleImageLabelOverlap(BaseRule): ) @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: - res = ModelRes() + res = EvalDetail(metric=cls.__name__) try: # 1. 阈值参数 @@ -390,44 +357,32 @@ def eval(cls, input_data: Data) -> ModelRes: try: annotations = json.loads(content) except json.JSONDecodeError as e: - res = ModelRes() - res.eval_status = False - res.eval_details = { - "label": ["LabelOverlap_Fail.ParseError"], - "metric": [cls.__name__], - "reason": [f"content解析失败:{str(e)},前50字符:{content[:50]}..."] - } + res = EvalDetail(metric=cls.__name__) + res.status = False + res.label = ["LabelOverlap_Fail.ParseError"] + res.reason = [f"content解析失败:{str(e)},前50字符:{content[:50]}..."] return res elif isinstance(content, dict): annotations = content else: - res = ModelRes() - res.eval_status = False - res.eval_details = { - "label": ["LabelOverlap_Fail.InvalidContentType"], - "metric": [cls.__name__], - "reason": [f"content类型错误:需dict/str,实际是{type(content).__name__}"] - } + res = EvalDetail(metric=cls.__name__) + res.status = False + res.label = ["LabelOverlap_Fail.InvalidContentType"] + res.reason = [f"content类型错误:需dict/str,实际是{type(content).__name__}"] return res # 4. 验证数据有效性 if not annotations: - res = ModelRes() - res.eval_status = False - res.eval_details = { - "label": ["LabelOverlap_Fail.EmptyAnnotations"], - "metric": [cls.__name__], - "reason": ["annotations为空"] - } + res = EvalDetail(metric=cls.__name__) + res.status = False + res.label = ["LabelOverlap_Fail.EmptyAnnotations"] + res.reason = ["annotations为空"] return res if not image_path or not os.path.exists(image_path): - res = ModelRes() - res.eval_status = False - res.eval_details = { - "label": ["LabelOverlap_Fail.InvalidImagePath"], - "metric": [cls.__name__], - "reason": [f"图片路径无效:{image_path}"] - } + res = EvalDetail(metric=cls.__name__) + res.status = False + res.label = ["LabelOverlap_Fail.InvalidImagePath"] + res.reason = [f"图片路径无效:{image_path}"] return res # 5. 提取边界框并计算重叠 @@ -480,15 +435,12 @@ def eval(cls, input_data: Data) -> ModelRes: # 6. 根据重叠状态设置错误信息 if has_overlap: # 符合阈值重叠:标记为错误状态 - res.eval_status = True - res.eval_details = { - "label": ["LabelOverlap_Fail.RuleImageLabelOverlap"], - "metric": [cls.__name__], - "reason": [f"重叠检测:完全重叠={len(full_overlap_pairs)},部分重叠={len(partial_overlap_pairs)}"] - } + res.status = True + res.label = ["LabelOverlap_Fail.RuleImageLabelOverlap"] + res.reason = [f"重叠检测:完全重叠={len(full_overlap_pairs)},部分重叠={len(partial_overlap_pairs)}"] else: # 不符合阈值重叠:正常状态 - res.eval_status = False + res.status = False # 7. 生成可视化标注框重叠图片 vis_path = None # 初始化vis_path变量 @@ -560,13 +512,10 @@ def eval(cls, input_data: Data) -> ModelRes: # 8. 
整理结果(结果已通过eval_status和eval_details返回) except Exception as global_e: - res = ModelRes() - res.eval_status = False - res.eval_details = { - "label": ["LabelOverlap_Fail.GlobalError"], - "metric": [cls.__name__], - "reason": [f"全局处理错误:{str(global_e)}"] - } + res = EvalDetail(metric=cls.__name__) + res.status = False + res.label = ["LabelOverlap_Fail.GlobalError"] + res.reason = [f"全局处理错误:{str(global_e)}"] return res @@ -590,9 +539,9 @@ class RuleImageLabelVisualization(BaseRule): ) @classmethod - def eval(cls, input_data: Data) -> ModelRes: + def eval(cls, input_data: Data) -> EvalDetail: - res = ModelRes() + res = EvalDetail(metric=cls.__name__) try: # -------------------------- @@ -674,13 +623,10 @@ def draw_bboxes(draw_obj, elements, color_map, font_obj): # 验证图片路径有效性 if not image_path or not os.path.exists(image_path): - res = ModelRes() - res.eval_status = False - res.eval_details = { - "label": ["LabelVisualization_Fail.InvalidImagePath"], - "metric": [cls.__name__], - "reason": [f"图片路径无效/不存在:{image_path}"] - } + res = EvalDetail(metric=cls.__name__) + res.status = False + res.label = ["LabelVisualization_Fail.InvalidImagePath"] + res.reason = [f"图片路径无效/不存在:{image_path}"] return res # 解析标注内容 @@ -688,41 +634,32 @@ def draw_bboxes(draw_obj, elements, color_map, font_obj): try: annotations = json.loads(content) except json.JSONDecodeError as e: - res = ModelRes() - res.eval_status = False - res.eval_details = { - "label": ["LabelVisualization_Fail.ParseError"], - "metric": [cls.__name__], - "reason": [f"标注解析失败:{str(e)},前50字符:{content[:50]}..."] - } + res = EvalDetail(metric=cls.__name__) + res.status = False + res.label = ["LabelVisualization_Fail.ParseError"] + res.reason = [f"标注解析失败:{str(e)},前50字符:{content[:50]}..."] return res elif isinstance(content, dict): annotations = content else: - res = ModelRes() - res.eval_status = False - res.eval_details = { - "label": ["LabelVisualization_Fail.InvalidAnnotationType"], - "metric": [cls.__name__], - "reason": [f"标注类型错误:需dict/str,实际{type(content).__name__}"] - } + res = EvalDetail(metric=cls.__name__) + res.status = False + res.label = ["LabelVisualization_Fail.InvalidAnnotationType"] + res.reason = [f"标注类型错误:需dict/str,实际{type(content).__name__}"] return res # 提取布局标注(适配"layout_dets"字段) layout_dets = annotations.get("layout_dets", []) if not layout_dets: # 无标注数据时的处理 - res = ModelRes() - res.eval_status = False - res.eval_details = { - "label": ["LabelVisualization_Fail.EmptyLayoutData"], - "metric": [cls.__name__], - "reason": [json.dumps({ - "message": "无布局标注数据(layout_dets为空)", - "visualization_path": None, - "label_stats": {"total_labels": 0} - }, ensure_ascii=False)] - } + res = EvalDetail(metric=cls.__name__) + res.status = False + res.label = ["LabelVisualization_Fail.EmptyLayoutData"] + res.reason = [json.dumps({ + "message": "无布局标注数据(layout_dets为空)", + "visualization_path": None, + "label_stats": {"total_labels": 0} + }, ensure_ascii=False)] return res # -------------------------- @@ -770,30 +707,24 @@ def draw_bboxes(draw_obj, elements, color_map, font_obj): try: img.save(vis_path) except Exception as e: - res = ModelRes() - res.eval_status = False - res.eval_details = { - "label": ["LabelVisualization_Fail.SaveImageError"], - "metric": [cls.__name__], - "reason": [f"保存图像失败:{str(e)}"] - } + res = EvalDetail(metric=cls.__name__) + res.status = False + res.label = ["LabelVisualization_Fail.SaveImageError"] + res.reason = [f"保存图像失败:{str(e)}"] return res # -------------------------- # 5. 
整理结果(结果已通过eval_status返回) # -------------------------- - res.eval_status = False + res.status = False except Exception as global_e: # 全局异常处理 - res = ModelRes() - res.eval_status = False - res.eval_details = { - "label": ["LabelVisualization_Fail.GlobalError"], - "metric": [cls.__name__], - "reason": [f"可视化处理全局错误:{str(global_e)}"] - } + res = EvalDetail(metric=cls.__name__) + res.status = False + res.label = ["LabelVisualization_Fail.GlobalError"] + res.reason = [f"可视化处理全局错误:{str(global_e)}"] return res diff --git a/dingo/model/rule/rule_resume.py b/dingo/model/rule/rule_resume.py index 880be4f6..f0ac185c 100644 --- a/dingo/model/rule/rule_resume.py +++ b/dingo/model/rule/rule_resume.py @@ -2,8 +2,8 @@ from dingo.config.input_args import EvaluatorRuleArgs from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail, QualityLabel from dingo.model.model import Model -from dingo.model.modelres import ModelRes, QualityLabel from dingo.model.rule.base import BaseRule # ========== Privacy Issues ========== @@ -28,21 +28,16 @@ class RuleResumeIDCard(BaseRule): dynamic_config = EvaluatorRuleArgs(pattern=r'\b\d{17}[\dXx]\b') @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content match = re.search(cls.dynamic_config.pattern, content) if match: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Found ID card number: " + match.group(0)[:6] + "****" + match.group(0)[-4:]] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Found ID card number: " + match.group(0)[:6] + "****" + match.group(0)[-4:]] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -65,21 +60,16 @@ class RuleResumeDetailedAddress(BaseRule): dynamic_config = EvaluatorRuleArgs(pattern=r'(省|市|区|县|镇|街道|路|号|室|栋|单元|楼).{0,20}(省|市|区|县|镇|街道|路|号|室|栋|单元|楼)') @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content match = re.search(cls.dynamic_config.pattern, content) if match: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Found detailed address: " + match.group(0)] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Found detailed address: " + match.group(0)] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -105,21 +95,16 @@ class RuleResumeEmailMissing(BaseRule): dynamic_config = EvaluatorRuleArgs(pattern=r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b') @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content match = re.search(cls.dynamic_config.pattern, content) if not match: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Email address not found in resume"] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Email address not found in resume"] else: - 
res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -142,21 +127,16 @@ class RuleResumePhoneMissing(BaseRule): dynamic_config = EvaluatorRuleArgs(pattern=r'(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3,4}[-.\s]?\d{4}') @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content match = re.search(cls.dynamic_config.pattern, content) if not match: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Phone number not found in resume"] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Phone number not found in resume"] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -179,22 +159,17 @@ class RuleResumePhoneFormat(BaseRule): dynamic_config = EvaluatorRuleArgs(pattern=r'\b\d{11}\b') @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content matches = re.findall(cls.dynamic_config.pattern, content) invalid_phones = [m for m in matches if not m.startswith(('13', '14', '15', '16', '17', '18', '19'))] if invalid_phones: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Invalid phone format: " + ", ".join(invalid_phones)] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Invalid phone format: " + ", ".join(invalid_phones)] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -220,21 +195,16 @@ class RuleResumeExcessiveWhitespace(BaseRule): dynamic_config = EvaluatorRuleArgs(pattern=r' {3,}', threshold=3) @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content matches = re.findall(cls.dynamic_config.pattern, content) if len(matches) >= cls.dynamic_config.threshold: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Found " + str(len(matches)) + " instances of excessive whitespace"] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Found " + str(len(matches)) + " instances of excessive whitespace"] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -257,21 +227,16 @@ class RuleResumeMarkdown(BaseRule): dynamic_config = EvaluatorRuleArgs(pattern=r'(#{7,}|(\*{3,})|(\_{3,}))') @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content match = re.search(cls.dynamic_config.pattern, content) if match: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Markdown syntax error: " + match.group(0)] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Markdown syntax 
error: " + match.group(0)] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -297,22 +262,17 @@ class RuleResumeNameMissing(BaseRule): dynamic_config = EvaluatorRuleArgs() @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content first_section = content[:200] # Check if first section contains Chinese name pattern or heading if not re.search(r'(^#\s*.+|^.{2,4}$)', first_section, re.MULTILINE): - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Name or heading not found in the first section"] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Name or heading not found in the first section"] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -335,21 +295,16 @@ class RuleResumeSectionMissing(BaseRule): dynamic_config = EvaluatorRuleArgs(pattern=r'(教育|学历|工作|经历|experience|education)', threshold=1) @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content.lower() matches = re.findall(cls.dynamic_config.pattern, content, re.IGNORECASE) if len(matches) < cls.dynamic_config.threshold: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Required sections (education/experience) not found"] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Required sections (education/experience) not found"] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -375,21 +330,16 @@ class RuleResumeEmoji(BaseRule): dynamic_config = EvaluatorRuleArgs(pattern=r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]') @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content matches = re.findall(cls.dynamic_config.pattern, content) if matches: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Found " + str(len(matches)) + " emoji characters"] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Found " + str(len(matches)) + " emoji characters"] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -412,21 +362,16 @@ class RuleResumeInformal(BaseRule): dynamic_config = EvaluatorRuleArgs(pattern=r'(搞定|牛逼|厉害|哈哈|嘿嘿|呵呵|啊|呀|吧|哦)') @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content matches = re.findall(cls.dynamic_config.pattern, content) if matches: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Found informal language: " + ", ".join(set(matches))] - } + res.status = True + 
res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Found informal language: " + ", ".join(set(matches))] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -452,8 +397,8 @@ class RuleResumeDateFormat(BaseRule): dynamic_config = EvaluatorRuleArgs(pattern=r'\d{4}[-./年]\d{1,2}') @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content matches = re.findall(cls.dynamic_config.pattern, content) if matches: @@ -470,9 +415,7 @@ def eval(cls, input_data: Data) -> ModelRes: "label": [QualityLabel.QUALITY_GOOD] } else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -498,21 +441,16 @@ class RuleResumeEducationMissing(BaseRule): dynamic_config = EvaluatorRuleArgs(pattern=r'(教育|学历|education|university|college|bachelor|master|phd)') @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content.lower() match = re.search(cls.dynamic_config.pattern, content, re.IGNORECASE) if not match: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Education section not found in resume"] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Education section not found in resume"] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res @@ -535,19 +473,14 @@ class RuleResumeExperienceMissing(BaseRule): dynamic_config = EvaluatorRuleArgs(pattern=r'(工作|经历|experience|employment|position|职位)') @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content.lower() match = re.search(cls.dynamic_config.pattern, content, re.IGNORECASE) if not match: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": ["Work experience section not found in resume"] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = ["Work experience section not found in resume"] else: - res.eval_details = { - "label": [QualityLabel.QUALITY_GOOD] - } + res.label = [QualityLabel.QUALITY_GOOD] return res diff --git a/dingo/model/rule/rule_xinghe.py b/dingo/model/rule/rule_xinghe.py index 5432fae1..73cce5da 100644 --- a/dingo/model/rule/rule_xinghe.py +++ b/dingo/model/rule/rule_xinghe.py @@ -1,11 +1,9 @@ import re -import string -from typing import Tuple from dingo.config.input_args import EvaluatorRuleArgs from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail, QualityLabel from dingo.model.model import Model -from dingo.model.modelres import ModelRes, QualityLabel from dingo.model.rule.base import BaseRule @@ -25,18 +23,15 @@ class RuleDoi(BaseRule): dynamic_config = EvaluatorRuleArgs(pattern=r'^10\.\d{4,9}/([^A-Z\s]*)$') @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content if re.match(cls.dynamic_config.pattern, 
content): - res.eval_details.label = [QualityLabel.QUALITY_GOOD] + res.label = [QualityLabel.QUALITY_GOOD] else: - res.eval_status = True - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": [content] - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = [content] return res @@ -94,9 +89,9 @@ def _validate_isbn13(cls, isbn: str) -> bool: return total % 10 == 0 @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() - res.eval_details.label = [QualityLabel.QUALITY_GOOD] + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) + res.label = [QualityLabel.QUALITY_GOOD] content = input_data.content content = str(content).replace('-', '') @@ -104,20 +99,17 @@ def eval(cls, input_data: Data) -> ModelRes: if cls._validate_isbn10(content): pass else: - res.eval_status = True + res.status = True elif len(content) == 13: if cls._validate_isbn13(content): pass else: - res.eval_status = True + res.status = True else: - res.eval_status = True + res.status = True # add details - if res.eval_status: - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": [content] - } + if res.status: + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = [content] return res diff --git a/docs/en/CONTRIBUTING.md b/docs/en/CONTRIBUTING.md index bf2226ba..169c8913 100644 --- a/docs/en/CONTRIBUTING.md +++ b/docs/en/CONTRIBUTING.md @@ -178,35 +178,35 @@ Style configurations can be found in `setup.cfg` and `.pre-commit-config.yaml`. from typing import List, Optional from dingo.io.input import Data -from dingo.model.modelres import ModelRes +from dingo.io.output.eval_detail import EvalDetail class ExampleRule: - """Example rule for demonstration purposes. + """Example rule for demonstration purposes. - This rule checks for specific patterns in text data. + This rule checks for specific patterns in text data. - Args: - pattern: Regular expression pattern to match - threshold: Minimum threshold for rule activation - """ + Args: + pattern: Regular expression pattern to match + threshold: Minimum threshold for rule activation + """ - def __init__(self, pattern: str, threshold: float = 0.5) -> None: - self.pattern = pattern - self.threshold = threshold + def __init__(self, pattern: str, threshold: float = 0.5) -> None: + self.pattern = pattern + self.threshold = threshold - def eval(self, input_data: Data) -> ModelRes: - """Evaluate input data against the rule. + def eval(self, input_data: Data) -> EvalDetail: + """Evaluate input data against the rule. - Args: - input_data: Input data to evaluate + Args: + input_data: Input data to evaluate - Returns: - ModelRes: Evaluation result - """ - res = ModelRes() - # Implementation here - return res + Returns: + EvalDetail: Evaluation result + """ + res = EvalDetail() + # Implementation here + return res ``` ## Contributing Guidelines @@ -227,24 +227,26 @@ class ExampleRule: 4. 
**Document the rule** with clear docstrings and examples Example: + ```python from dingo.model import Model from dingo.model.rule.base import BaseRule from dingo.config.input_args import EvaluatorRuleArgs from dingo.io import Data -from dingo.model.modelres import ModelRes +from dingo.io.output.eval_detail import EvalDetail + @Model.rule_register('QUALITY_BAD_CUSTOM', ['default']) class CustomRule(BaseRule): - """Custom rule for specific quality check.""" + """Custom rule for specific quality check.""" - dynamic_config = EvaluatorRuleArgs(pattern=r'custom_pattern') + dynamic_config = EvaluatorRuleArgs(pattern=r'custom_pattern') - @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() - # Implementation - return res + @classmethod + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail() + # Implementation + return res ``` ### Adding New LLM Models diff --git a/examples/register/sdk_register_llm.py b/examples/register/sdk_register_llm.py index a06b57a8..c28ea179 100644 --- a/examples/register/sdk_register_llm.py +++ b/examples/register/sdk_register_llm.py @@ -1,12 +1,7 @@ -import json import os from dingo.model import Model from dingo.model.llm.base_openai import BaseOpenAI -from dingo.model.modelres import ModelRes -from dingo.model.response.response_class import ResponseScoreTypeNameReason -from dingo.utils import log -from dingo.utils.exception import ConvertJsonError OPENAI_MODEL = 'deepseek-chat' OPENAI_URL = 'https://api.deepseek.com/v1' diff --git a/examples/register/sdk_register_rule.py b/examples/register/sdk_register_rule.py index 31017af1..4b33f3de 100644 --- a/examples/register/sdk_register_rule.py +++ b/examples/register/sdk_register_rule.py @@ -2,8 +2,8 @@ from dingo.config.input_args import EvaluatorRuleArgs from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail from dingo.model.model import Model -from dingo.model.modelres import ModelRes from dingo.model.rule.base import BaseRule @@ -13,19 +13,13 @@ class CommonPatternDemo(BaseRule): dynamic_config = EvaluatorRuleArgs(pattern = "blue") @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) matches = re.findall(cls.dynamic_config.pattern, input_data.content) if matches: - res.eval_status = True - # res.type = cls.metric_type - # res.name = cls.__name__ - # res.reason = matches - res.eval_details = { - "label": [f"{cls.metric_type}.{cls.__name__}"], - "metric": [cls.__name__], - "reason": matches - } + res.status = True + res.label = [f"{cls.metric_type}.{cls.__name__}"] + res.reason = matches return res diff --git a/test/scripts/exec/test_local.py b/test/scripts/exec/test_local.py index aa50ad42..5b9a1836 100644 --- a/test/scripts/exec/test_local.py +++ b/test/scripts/exec/test_local.py @@ -3,6 +3,7 @@ from dingo.config import InputArgs from dingo.exec import Executor, LocalExecutor from dingo.io import ResultInfo +from dingo.io.output.eval_detail import EvalDetail class TestLocal: @@ -15,11 +16,14 @@ def test_merge_result_info(self): }, eval_status = True, eval_details = { - "content": { - "label": ["QUALITY_BAD_EFFECTIVENESS-RuleColonEnd"], - "metric": ["RuleColonEnd"], - "reason": ["�I am 8 years old. ^I love apple because:"] - } + "content": [ + EvalDetail( + metric="RuleColonEnd", + status=True, + label=["QUALITY_BAD_EFFECTIVENESS-RuleColonEnd"], + reason=["�I am 8 years old. 
^I love apple because:"] + ) + ] } ) new_item2 = ResultInfo( @@ -29,11 +33,14 @@ def test_merge_result_info(self): }, eval_status = True, eval_details = { - "content": { - "label": ["QUALITY_BAD_EFFECTIVENESS-PromptContentChaos"], - "metric": ["PromptContentChaos"], - "reason": ["文本中包含不可见字符或乱码(如�和^),可能影响阅读理解。"] - } + "content": [ + EvalDetail( + metric="PromptContentChaos", + status=True, + label=["QUALITY_BAD_EFFECTIVENESS-PromptContentChaos"], + reason=["文本中包含不可见字符或乱码(如�和^),可能影响阅读理解。"] + ) + ] } ) @@ -46,13 +53,30 @@ def test_merge_result_info(self): new_existing_list = localexecutor.merge_result_info(existing_list, new_item1) new_existing_list = localexecutor.merge_result_info(new_existing_list, new_item2) assert len(new_existing_list) == 1 - assert len(new_existing_list[0].eval_details.get('content').label) == 2 - assert len(new_existing_list[0].eval_details.get('content').metric) == 2 - assert len(new_existing_list[0].eval_details.get('content').reason) == 2 - assert "QUALITY_BAD_EFFECTIVENESS-RuleColonEnd" in new_existing_list[0].eval_details.get('content').label - assert "QUALITY_BAD_EFFECTIVENESS-PromptContentChaos" in new_existing_list[0].eval_details.get('content').label - assert "�I am 8 years old. ^I love apple because:" in new_existing_list[0].eval_details.get('content').reason - assert "文本中包含不可见字符或乱码(如�和^),可能影响阅读理解。" in new_existing_list[0].eval_details.get('content').reason + + # 获取合并后的 content 字段的 EvalDetail 列表 + content_details = new_existing_list[0].eval_details.get('content') + assert len(content_details) == 2 + + # 收集所有的 label, metric, reason + all_labels = [] + all_metrics = [] + all_reasons = [] + for detail in content_details: + if detail.label: + all_labels.extend(detail.label) + if detail.metric: + all_metrics.append(detail.metric) + if detail.reason: + all_reasons.extend(detail.reason) + + assert len(all_labels) == 2 + assert len(all_metrics) == 2 + assert len(all_reasons) == 2 + assert "QUALITY_BAD_EFFECTIVENESS-RuleColonEnd" in all_labels + assert "QUALITY_BAD_EFFECTIVENESS-PromptContentChaos" in all_labels + assert "�I am 8 years old. 
^I love apple because:" in all_reasons + assert "文本中包含不可见字符或乱码(如�和^),可能影响阅读理解。" in all_reasons def test_all_labels_config(self): input_data = { diff --git a/test/scripts/io/input/test_continue.py b/test/scripts/io/input/test_continue.py index b734265c..f260fb54 100644 --- a/test/scripts/io/input/test_continue.py +++ b/test/scripts/io/input/test_continue.py @@ -1,16 +1,20 @@ import json import os.path +from pathlib import Path import pytest from dingo.config import InputArgs from dingo.exec import Executor +# 获取项目根目录 +ROOT_DIR = Path(__file__).parent.parent.parent.parent.parent + class TestContinue: def test_continue_local_jsonl(self): input_data = { - "input_path": "test/data/test_local_jsonl.jsonl", + "input_path": str(ROOT_DIR / "test/data/test_local_jsonl.jsonl"), "dataset": { "source": "local", "format": "jsonl", diff --git a/test/scripts/io/input/test_write.py b/test/scripts/io/input/test_write.py index 044d6281..dc65a12a 100644 --- a/test/scripts/io/input/test_write.py +++ b/test/scripts/io/input/test_write.py @@ -1,16 +1,20 @@ import os import shutil +from pathlib import Path import pytest from dingo.config import InputArgs from dingo.exec import Executor +# 获取项目根目录 +ROOT_DIR = Path(__file__).parent.parent.parent.parent.parent + class TestWrite: def test_write_local_jsonl(self): input_data = { - "input_path": "test/data/test_local_jsonl.jsonl", + "input_path": str(ROOT_DIR / "test/data/test_local_jsonl.jsonl"), "dataset": { "source": "local", "format": "jsonl" diff --git a/test/scripts/model/rule/test_rule_common.py b/test/scripts/model/rule/test_rule_common.py index 4493c9f4..e872672e 100644 --- a/test/scripts/model/rule/test_rule_common.py +++ b/test/scripts/model/rule/test_rule_common.py @@ -1,7 +1,5 @@ -import pytest - from dingo.io import Data -from dingo.model.modelres import EvalDetail +from dingo.io.output.eval_detail import EvalDetail from dingo.model.rule.rule_common import RuleDocFormulaRepeat, RuleUnsafeWords @@ -10,21 +8,17 @@ def test_rule_doc_formula_repeat(self): data = Data(data_id="1",content="we are a $$x^2 + y^2 + z^2 == z^\\sqrt{4}\\dots\\dots\\dots\\dots\\dots\\dots\\dots\\dots\\dots\\dots\\dots\\dots\\dots\\dots\\dots\\dots\\dots\\dots\\dots\\dots\\dots\\dots\\dots\\dots\\dots\\dots$$ , we are a $$x^2 + y^2 = z^2$$ ") res = RuleDocFormulaRepeat.eval(data) # print(res) - assert res.eval_status is True - if isinstance(res.eval_details, dict): - res.eval_details = EvalDetail(**res.eval_details) - assert res.eval_details.label == ["QUALITY_BAD_SIMILARITY.RuleDocFormulaRepeat"] - assert res.eval_details.metric == ["RuleDocFormulaRepeat"] - assert res.eval_details.reason == ["Formula has too many consecutive repeated characters, total repeat length: 130, found 1 repeat patterns"] + assert res.status is True + assert res.label == ["QUALITY_BAD_SIMILARITY.RuleDocFormulaRepeat"] + assert res.metric == "RuleDocFormulaRepeat" + assert res.reason == ["Formula has too many consecutive repeated characters, total repeat length: 130, found 1 repeat patterns"] def test_rule_unsafe_words(self): data = Data(data_id="", prompt="", content="java is good\n \n \n \n hello \n \n but python is better") r = RuleUnsafeWords r.dynamic_config.key_list = ['av', 'b', 'java'] tmp = r.eval(data) - assert tmp.eval_status is True - if isinstance(tmp.eval_details, dict): - tmp.eval_details = EvalDetail(**tmp.eval_details) - assert 'av' not in tmp.eval_details.reason - assert 'b' not in tmp.eval_details.reason - assert 'java' in tmp.eval_details.reason + assert tmp.status is True + assert 'av' 
not in tmp.reason + assert 'b' not in tmp.reason + assert 'java' in tmp.reason diff --git a/test/scripts/model/test_modelres.py b/test/scripts/model/test_modelres.py index efa6ee3e..b9a6211c 100644 --- a/test/scripts/model/test_modelres.py +++ b/test/scripts/model/test_modelres.py @@ -1,11 +1,9 @@ -import os -import re from typing import List from dingo.config.input_args import EvaluatorRuleArgs from dingo.io import Data +from dingo.io.output.eval_detail import EvalDetail from dingo.model.model import Model -from dingo.model.modelres import ModelRes from dingo.model.rule.base import BaseRule @@ -15,21 +13,24 @@ class RegisterRuleColon(BaseRule): dynamic_config = EvaluatorRuleArgs(pattern = "blue") @classmethod - def eval(cls, input_data: Data) -> ModelRes: - res = ModelRes() + def eval(cls, input_data: Data) -> EvalDetail: + res = EvalDetail(metric=cls.__name__) content = input_data.content if len(content) <= 0: return res if content[-1] == ":": - res.eval_status = True + # res.eval_status = True # res.type = [cls.metric_type, 'TestType'] # res.name = [cls.__name__, 'TestName'] # res.reason = [content[-100:]] - res.eval_details = { - "label": [cls.metric_type, 'TestType'], - "metric": [cls.__name__], - "reason": [content[-100:]] - } + # res.eval_details = { + # "label": [cls.metric_type, 'TestType'], + # "metric": [cls.__name__], + # "reason": [content[-100:]] + # } + res.status = True + res.label = [cls.metric_type, 'TestType'] + res.reason = [content[-100:]] return res @@ -44,7 +45,7 @@ def test_type_name_list(self): res = RegisterRuleColon().eval(data) # print(res) - assert isinstance(res.eval_details.label, List) - assert isinstance(res.eval_details.reason, List) - assert len(res.eval_details.label) == 2 - assert 'TestType' in res.eval_details.label + assert isinstance(res.label, List) + assert isinstance(res.reason, List) + assert len(res.label) == 2 + assert 'TestType' in res.label
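
For orientation, a minimal sketch of the `EvalDetail`-based rule shape that the hunks above converge on. The rule name, registration group, and regex below are hypothetical; the result fields (`metric`, `status`, `label`, `reason`) and `QualityLabel.QUALITY_GOOD` follow the converted rules in this patch, so treat it as an illustration of the migration pattern rather than a definitive API reference.

```python
import re

from dingo.config.input_args import EvaluatorRuleArgs
from dingo.io import Data
from dingo.io.output.eval_detail import EvalDetail, QualityLabel
from dingo.model.model import Model
from dingo.model.rule.base import BaseRule


@Model.rule_register('QUALITY_BAD_CUSTOM', ['default'])  # hypothetical registration group
class RuleTrailingColonSketch(BaseRule):
    """Hypothetical example rule: flags content that ends with a colon."""

    dynamic_config = EvaluatorRuleArgs(pattern=r':\s*$')

    @classmethod
    def eval(cls, input_data: Data) -> EvalDetail:
        # metric is set once on the result object instead of inside an eval_details dict
        res = EvalDetail(metric=cls.__name__)
        if re.search(cls.dynamic_config.pattern, input_data.content):
            # bad case: mark status and record the label/reason lists directly on the result
            res.status = True
            res.label = [f"{cls.metric_type}.{cls.__name__}"]
            res.reason = ["Content ends with a colon"]
        else:
            # good case: only the quality label is set
            res.label = [QualityLabel.QUALITY_GOOD]
        return res
```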