Skip to content

Commit 30a8fc1

Browse files
authored
Merge pull request #208 from e06084/dev
feat: add html_extract_compare_v2
2 parents 0dfa62a + f095d02 commit 30a8fc1

File tree

9 files changed

+1036
-0
lines changed

9 files changed

+1036
-0
lines changed
Lines changed: 235 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,235 @@
1+
import re
2+
from typing import List
3+
4+
import diff_match_patch as dmp_module
5+
6+
from dingo.io import Data
7+
from dingo.model import Model
8+
from dingo.model.llm.base_openai import BaseOpenAI
9+
from dingo.model.modelres import ModelRes
10+
from dingo.model.prompt.prompt_html_extract_compare_v2 import PromptHtmlExtractCompareV2
11+
from dingo.model.response.response_class import ResponseNameReason
12+
from dingo.utils import log
13+
14+
15+
@Model.llm_register("LLMHtmlExtractCompareV2")
class LLMHtmlExtractCompareV2(BaseOpenAI):
    """
    Comparative evaluation of two HTML extraction tools (V2).

    Main improvements:
    1. Text differences are pre-computed with the diff-match-patch algorithm.
    2. Only the unique and the common content are handed to the LLM, which
       greatly reduces token consumption.
    3. Prompts are available in both Chinese and English.
    4. The verdict uses an A/B/C format that states more clearly which tool
       performed better.

    Expected input data:
    - input_data.prompt: text extracted by tool A
    - input_data.content: text extracted by tool B
    - input_data.raw_data.get("language", "en"): language ("zh" or "en")
    """

    prompt = PromptHtmlExtractCompareV2

    @classmethod
    def extract_text_diff(cls, text_a: str, text_b: str, max_diff_length: int = 10000) -> dict:
        """
        Split two texts into A-only, B-only and shared content using
        diff-match-patch.

        Args:
            text_a: text extracted by tool A (``None`` is treated as empty)
            text_b: text extracted by tool B (``None`` is treated as empty)
            max_diff_length: maximum number of characters kept for each of
                the three returned strings

        Returns:
            dict: keys ``unique_a``, ``unique_b`` and ``common``; each value
            is the concatenation of the matching diff segments, truncated to
            ``max_diff_length`` characters.
        """
        dmp = dmp_module.diff_match_patch()

        # diff_main rejects None inputs; a tool that extracted nothing is
        # compared as an empty text instead of aborting the evaluation.
        diff = dmp.diff_main(text_a or "", text_b or "")
        # Cleans up the diff list in place before it is bucketed.
        dmp.diff_cleanupEfficiency(diff)

        # Operation code -> collected segments:
        #   -1: only in text_a, 1: only in text_b, 0: present in both.
        segments = {-1: [], 1: [], 0: []}
        for operation, fragment in diff:
            bucket = segments.get(operation)
            if bucket is not None:
                bucket.append(fragment)

        return {
            "unique_a": "".join(segments[-1])[:max_diff_length],
            "unique_b": "".join(segments[1])[:max_diff_length],
            "common": "".join(segments[0])[:max_diff_length],
        }

    @classmethod
    def build_messages(cls, input_data: Data) -> List:
        """
        Build the LLM input messages.

        Steps:
        1. Read the texts extracted by tool A and tool B.
        2. Compute their differences with diff-match-patch.
        3. Pick the prompt template that matches the language.
        4. Fill the diff content into the template.

        Args:
            input_data: record providing ``prompt`` (tool A text), ``content``
                (tool B text) and ``raw_data`` (optional ``language`` key)

        Returns:
            List: a single-element list holding one ``user`` message.
        """
        # Language defaults to English when the record does not specify one.
        language = input_data.raw_data.get("language", "en")

        diff_result = cls.extract_text_diff(input_data.prompt, input_data.content)

        # Only "zh" selects the Chinese template; anything else uses English.
        if language == "zh":
            prompt_template = cls.prompt.content_cn
        else:
            prompt_template = cls.prompt.content_en

        prompt_content = prompt_template.format(
            text_unique_tool_a=diff_result["unique_a"],
            text_unique_tool_b=diff_result["unique_b"],
            text_common=diff_result["common"],
        )

        return [
            {
                "role": "user",
                "content": prompt_content,
            }
        ]

    @classmethod
    def _parse_response_to_structured(cls, response: str) -> ResponseNameReason:
        """
        Parse the raw LLM response into a ResponseNameReason object.

        Parsing rules:
        1. Take the verdict from the LAST ``<Judgement>A/B/C</Judgement>``
           tag. The prompt asks for the reasoning first and the final verdict
           last, so an earlier tag may just be the model restating the
           available options.
        2. If no tag is present, fall back to the last "判断:X" or "答案:X"
           occurrence (ASCII or full-width colon).
        3. The response with all verdict tags removed becomes the reason.

        Args:
            response: raw LLM response text

        Returns:
            ResponseNameReason: structured response whose ``name`` field holds
            the verdict (A/B/C)

        Raises:
            ValueError: if no valid verdict can be extracted
        """
        log.info(response)

        tag_pattern = r"<Judgement>\s*([ABC])\s*</Judgement>"
        # Tried in order; the first pattern that matches at all wins.
        candidate_patterns = (
            tag_pattern,
            r"判断[::]\s*([ABC])",
            r"答案[::]\s*([ABC])",
        )

        judgement = None
        for pattern in candidate_patterns:
            found = re.findall(pattern, response)
            if found:
                # The final occurrence is the model's concluding verdict.
                judgement = found[-1]
                break

        if judgement is None:
            raise ValueError(f"无法从响应中提取判断结果: {response}")

        # Reasoning text is the response with the verdict tags stripped out.
        reason = re.sub(tag_pattern, "", response).strip()

        # Built through the Pydantic model so its validation applies.
        return ResponseNameReason(
            name=judgement,
            reason=reason,
        )

    @classmethod
    def _convert_to_model_result(cls, structured_response: ResponseNameReason) -> ModelRes:
        """
        Convert a structured response into a ModelRes object.

        Mapping:
        - A -> TOOL_ONE_BETTER (tool A is better, error_status=False)
        - B -> TOOL_EQUAL (both are equivalent, error_status=False)
        - C -> TOOL_TWO_BETTER (tool B is better, error_status=True)

        Args:
            structured_response: structured response whose ``name`` field
                holds the verdict (A/B/C)

        Returns:
            ModelRes: evaluation result object

        Raises:
            ValueError: if the verdict is not one of A, B or C
        """
        judgement = structured_response.name

        # Verdict -> result type and error flag. Only verdict C (tool B is
        # better) is flagged as an error.
        judgement_mapping = {
            "A": {
                "type": "TOOL_ONE_BETTER",
                "error_status": False,
                "description": "工具A提取的信息更完整",
            },
            "B": {
                "type": "TOOL_EQUAL",
                "error_status": False,
                "description": "两个工具提取的信息量相同",
            },
            "C": {
                "type": "TOOL_TWO_BETTER",
                "error_status": True,
                "description": "工具B提取的信息更完整",
            },
        }

        mapping = judgement_mapping.get(judgement)
        if mapping is None:
            raise ValueError(f"无效的判断结果: {judgement}")

        result = ModelRes()
        result.type = mapping["type"]
        result.error_status = mapping["error_status"]
        result.name = f"Judgement_{judgement}"
        result.reason = [structured_response.reason]

        return result

    @classmethod
    def process_response(cls, response: str) -> ModelRes:
        """
        Process the result returned by the LLM.

        Data flow:
        1. raw response (str) -> structured response (ResponseNameReason)
        2. structured response -> evaluation result (ModelRes)

        Benefits of this layered design:
        - clearer separation of responsibilities
        - Pydantic validation is applied to the parsed response
        - easier to unit test
        - easier to extend and maintain

        Args:
            response: raw LLM response text

        Returns:
            ModelRes: evaluation result object

        Raises:
            ValueError: if the response contains no valid verdict
        """
        # Step 1: parse into a structured response.
        structured_response = cls._parse_response_to_structured(response)

        # Step 2: convert into the model result.
        return cls._convert_to_model_result(structured_response)
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
from dingo.model.model import Model
2+
from dingo.model.prompt.base import BasePrompt
3+
4+
5+
# NOTE(review): the three register arguments look like the prompt name, a list
# of metric/group tags and the list of LLM classes that use this prompt --
# confirm against Model.prompt_register.
@Model.prompt_register("Html_Extract_Compare_V2", ['html_extract_compare'], ['LLMHtmlExtractCompareV2'])
class PromptHtmlExtractCompareV2(BasePrompt):
    """
    Bilingual prompt templates for comparing two HTML extraction results.

    Both templates expose three ``str.format`` placeholders:
    ``{text_unique_tool_a}``, ``{text_unique_tool_b}`` and ``{text_common}``.
    They ask the model to explain its reasoning and then answer with a
    ``<Judgement>A|B|C</Judgement>`` tag.
    """

    # Descriptive metadata for this metric: category, name, description and
    # (currently empty) paper reference fields.
    _metric_info = {
        'category': 'SFT Data Assessment Metrics',
        'metric_name': 'PromptHtmlExtractCompareV2',
        'description': 'Compares HTML extraction results using diff-match-patch algorithm to identify unique and common content, then evaluates core informational content differences',
        'paper_title': '',
        'paper_url': '',
        'paper_authors': '',
        'evaluation_results': ''
    }

    # English template. Verdict A: Text A has more core content; B: same
    # amount; C: Text A has less.
    content_en = r"""Please compare the following two texts, each extracted from the same webpage using different HTML parsing methods. Your task is to determine whether there is a difference in the core informational content between them.

Guidelines:

Core informational content refers to: main facts, key ideas, central explanations, important data, and the primary textual body of the page.

DO NOT consider the following as core content:

Related questions
Related topics
Recommended articles
"You might also like" sections
Titles or section headings
Author names, credentials, affiliations, or bylines
Reference lists, citations, or bibliographies (e.g., "[1] Smith, J. 2020…")
Hyperlinks, URLs, or navigation elements (e.g., "Back to homepage", "Related articles", "Next/Previous")

Other autogenerated content
These elements are considered supplementary and should not influence your assessment of content differences.

You should ignore differences in formatting, word order, or minor stylistic variations unless they affect the actual meaning or presence of important information.

content 1:
{text_unique_tool_a}

content 2:
{text_unique_tool_b}

content 3:
{text_common}

Text A contains content 1 + content 3
Text B contains content 2 + content 3
You should focus on the intrinsic logic between the unique content (content 1, content 2) and the common content (content 3) as the crucial basis for judging whether there is significant informational content.
Explain your reasoning briefly. Then judge the compare result as one of:
A. Text A contains more core informational content than Text B
B. Text A contains the same amount of core informational content as Text B
C. Text A contains less core informational content than Text B

Return the judgment using this format:
<Judgement>A</Judgement> or <Judgement>B</Judgement> or <Judgement>C</Judgement>
Please output your thought process first, and then provide your final judgement.
"""

    # Chinese template with the same placeholders and the same A/B/C verdict
    # meaning as the English one. Its exclusion list has one extra entry
    # ("主题总结") that the English list does not contain.
    content_cn = r"""请比较以下两段文本,它们是使用不同的 HTML 解析方法从同一网页中提取的。你的任务是判断这两段文本在核心信息内容上是否存在差异。

评判指南:

"核心信息内容"是指:主要事实、关键信息、核心解释、重要数据以及网页的主要正文内容。

请不要将以下内容视为核心信息:

- 相关问题
- 相关主题
- 推荐文章
- "你可能还喜欢" 类内容
- 标题或章节标题
- 作者姓名、资历、机构或署名
- 参考文献、引用或文献列表
- 超链接、网址或导航元素
- 其他自动生成的内容
- 主题总结

这些元素被视为附加信息,不应影响你对信息差异的判断。

除非会影响实际含义或重要信息的存在,否则请忽略格式、措辞顺序或轻微风格差异。

content 1:
{text_unique_tool_a}

content 2:
{text_unique_tool_b}

content 3:
{text_common}

Text A 由 content 1 + content 3 构成
Text B 由 content 2 + content 3 构成
你应重点关注"独有内容(content 1、content 2)"与"共同内容(content 3)"之间的内在逻辑,作为判断是否存在重要信息差异的关键依据。

请简要说明你的推理过程。然后给出如下三种判断之一:

A. Text A 包含的核心信息内容多于 Text B
B. Text A 与 Text B 包含相同量的核心信息内容
C. Text A 包含的核心信息内容少于 Text B

请按以下格式返回你的判断:
<Judgement>A</Judgement> 或 <Judgement>B</Judgement> 或 <Judgement>C</Judgement>
请首先输出思考过程,最后再输出你的答案。
"""

0 commit comments

Comments
 (0)