|
| 1 | +import re |
| 2 | +from typing import List |
| 3 | + |
| 4 | +import diff_match_patch as dmp_module |
| 5 | + |
| 6 | +from dingo.io import Data |
| 7 | +from dingo.model import Model |
| 8 | +from dingo.model.llm.base_openai import BaseOpenAI |
| 9 | +from dingo.model.modelres import ModelRes |
| 10 | +from dingo.model.prompt.prompt_html_extract_compare_v2 import PromptHtmlExtractCompareV2 |
| 11 | +from dingo.model.response.response_class import ResponseNameReason |
| 12 | +from dingo.utils import log |
| 13 | + |
| 14 | + |
@Model.llm_register("LLMHtmlExtractCompareV2")
class LLMHtmlExtractCompareV2(BaseOpenAI):
    """Compare the output of two HTML-extraction tools with an LLM (V2).

    How this version works:
        1. The two extracted texts are diffed up front with diff-match-patch.
        2. Only the fragments unique to each tool and the shared fragments
           are placed into the prompt, instead of both full texts.
        3. A Chinese or an English prompt template is chosen per sample.
        4. The LLM answers with an A/B/C judgement that is mapped onto a
           ``ModelRes``.

    Expected input fields:
        - ``input_data.prompt``: text extracted by tool A
        - ``input_data.content``: text extracted by tool B
        - ``input_data.raw_data.get("language", "en")``: ``"zh"`` selects the
          Chinese template; any other value selects the English one
    """

    prompt = PromptHtmlExtractCompareV2

    @classmethod
    def extract_text_diff(cls, text_a: str, text_b: str, max_diff_length: int = 10000) -> dict:
        """Split two texts into A-only, B-only and shared fragments.

        Args:
            text_a: Text extracted by tool A. ``None`` is treated as "".
            text_b: Text extracted by tool B. ``None`` is treated as "".
            max_diff_length: Maximum number of characters kept for each of
                the three returned strings.

        Returns:
            dict: Keys ``unique_a``, ``unique_b`` and ``common``; each value
            is the concatenation of the matching diff fragments, truncated
            to ``max_diff_length`` characters.
        """
        # diff_main() raises ValueError on None inputs; a tool that extracted
        # nothing should simply compare as empty text.
        text_a = "" if text_a is None else text_a
        text_b = "" if text_b is None else text_b

        dmp = dmp_module.diff_match_patch()
        diff = dmp.diff_main(text_a, text_b)
        dmp.diff_cleanupEfficiency(diff)  # cleans up the diff list in place

        # Operation code -> collected fragments:
        # -1 = only in text_a, 1 = only in text_b, 0 = present in both.
        fragments = {-1: [], 1: [], 0: []}
        for operation, segment in diff:
            bucket = fragments.get(operation)
            if bucket is not None:
                bucket.append(segment)

        return {
            "unique_a": "".join(fragments[-1])[:max_diff_length],
            "unique_b": "".join(fragments[1])[:max_diff_length],
            "common": "".join(fragments[0])[:max_diff_length],
        }

    @classmethod
    def build_messages(cls, input_data: Data) -> List:
        """Build the chat messages sent to the LLM.

        Steps:
            1. Read the texts of tool A and tool B from ``input_data``.
            2. Compute their diff with :meth:`extract_text_diff`.
            3. Pick the prompt template that matches the sample language.
            4. Fill the diff fragments into the template.

        Args:
            input_data: Sample carrying ``prompt`` (tool A text), ``content``
                (tool B text) and ``raw_data`` (optional ``"language"`` key).

        Returns:
            List: A single-element list holding one ``user`` message.
        """
        text_tool_a = input_data.prompt
        text_tool_b = input_data.content

        # Tolerate a missing raw_data mapping and fall back to English.
        raw_data = input_data.raw_data or {}
        language = raw_data.get("language", "en")

        diff_result = cls.extract_text_diff(text_tool_a, text_tool_b)

        # Only "zh" selects the Chinese template; everything else is English.
        if language == "zh":
            prompt_template = cls.prompt.content_cn
        else:
            prompt_template = cls.prompt.content_en

        prompt_content = prompt_template.format(
            text_unique_tool_a=diff_result["unique_a"],
            text_unique_tool_b=diff_result["unique_b"],
            text_common=diff_result["common"],
        )

        return [{"role": "user", "content": prompt_content}]

    @classmethod
    def _parse_response_to_structured(cls, response: str) -> ResponseNameReason:
        """Parse the raw LLM response into a ``ResponseNameReason``.

        Parsing rules:
            1. Prefer the ``<Judgement>A/B/C</Judgement>`` tag; whitespace
               around the letter inside the tag is tolerated.
            2. Otherwise fall back to the ``判断:X`` / ``答案:X`` forms.
            3. The response with the judgement tag removed becomes the reason.

        Args:
            response: Raw response text returned by the LLM.

        Returns:
            ResponseNameReason: ``name`` holds the judgement letter (A/B/C),
            ``reason`` holds the remaining response text.

        Raises:
            ValueError: If no judgement can be extracted from the response.
        """
        log.info(response)

        # LLMs frequently pad the letter with spaces or newlines inside the
        # tag, so allow optional whitespace on both sides.
        tag_pattern = r"<Judgement>\s*([ABC])\s*</Judgement>"
        # Tried in order; the first pattern that matches wins.
        candidate_patterns = (
            tag_pattern,
            r"判断[::]\s*([ABC])",
            r"答案[::]\s*([ABC])",
        )

        judgement_match = None
        for pattern in candidate_patterns:
            judgement_match = re.search(pattern, response)
            if judgement_match:
                break

        if not judgement_match:
            raise ValueError(f"无法从响应中提取判断结果: {response}")

        judgement = judgement_match.group(1)

        # Strip the tag with the same tolerant pattern so it never leaks
        # into the stored reasoning text.
        reason = re.sub(tag_pattern, "", response).strip()

        return ResponseNameReason(name=judgement, reason=reason)

    @classmethod
    def _convert_to_model_result(cls, structured_response: ResponseNameReason) -> ModelRes:
        """Convert a structured response into a ``ModelRes``.

        Mapping:
            - A -> ``TOOL_ONE_BETTER``, ``error_status=False``
            - B -> ``TOOL_EQUAL``, ``error_status=False``
            - C -> ``TOOL_TWO_BETTER``, ``error_status=True``

        Args:
            structured_response: Parsed response whose ``name`` field holds
                the judgement letter (A/B/C).

        Returns:
            ModelRes: Result with ``type``, ``error_status``, ``name``
            (``Judgement_<letter>``) and ``reason`` (single-element list) set.

        Raises:
            ValueError: If the judgement is not one of A, B or C.
        """
        judgement = structured_response.name

        # judgement letter -> (result type, error_status)
        judgement_mapping = {
            "A": ("TOOL_ONE_BETTER", False),  # tool A extracted more complete information
            "B": ("TOOL_EQUAL", False),  # both tools extracted the same amount
            "C": ("TOOL_TWO_BETTER", True),  # tool B extracted more; flagged as an issue
        }

        mapping = judgement_mapping.get(judgement)
        if mapping is None:
            raise ValueError(f"无效的判断结果: {judgement}")

        result = ModelRes()
        result.type, result.error_status = mapping
        result.name = f"Judgement_{judgement}"
        result.reason = [structured_response.reason]

        return result

    @classmethod
    def process_response(cls, response: str) -> ModelRes:
        """Turn the raw LLM response into the final evaluation result.

        Data flow:
            1. raw response (str) -> structured response (ResponseNameReason)
            2. structured response -> evaluation result (ModelRes)

        Keeping the two steps separate lets parsing and result mapping be
        tested and changed independently.

        Args:
            response: Raw response text returned by the LLM.

        Returns:
            ModelRes: The evaluation result.

        Raises:
            ValueError: Propagated from parsing or mapping when the response
                carries no valid A/B/C judgement.
        """
        structured_response = cls._parse_response_to_structured(response)
        return cls._convert_to_model_result(structured_response)
0 commit comments