Skip to content

Commit 0001252

Browse files
committed
feat: add pre main html
1 parent b9d6f5d commit 0001252

File tree

4 files changed

+301
-11
lines changed

4 files changed

+301
-11
lines changed

examples/multi_extractor_compare.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,22 +16,24 @@ def all_extractor_comparison():
1616
config = {
1717
"use_preprocessed_html": True, # 🔑 关键配置:启用预处理HTML模式
1818
"preprocessed_html_field": "llm_webkit_html" # 指定预处理HTML字段名
19+
"llm_response_html_field": "llm_response_html", # 指定LLM响应HTML字段名
20+
"typical_raw_tag_html_field": "typical_raw_tag_html", # 指定典型原始标签HTML字段名
1921
}
2022

2123
webkit_extractor = ExtractorFactory.create("llm-webkit", config=config)
2224
# 创建magic-extractor抽取器
23-
magic_extractor = ExtractorFactory.create("magic-html")
24-
# 创建trafilatura抽取器,抽取成markdown
25-
trafilatura_extractor = ExtractorFactory.create("trafilatura")
26-
# 创建trafilatura抽取器,抽取成txt
27-
trafilatura_txt_extractor = ExtractorFactory.create("trafilatura_txt")
28-
# 创建resiliparse抽取器
29-
resiliparse_extractor = ExtractorFactory.create("resiliparse")
25+
# magic_extractor = ExtractorFactory.create("magic-html")
26+
# # 创建trafilatura抽取器,抽取成markdown
27+
# trafilatura_extractor = ExtractorFactory.create("trafilatura")
28+
# # 创建trafilatura抽取器,抽取成txt
29+
# trafilatura_txt_extractor = ExtractorFactory.create("trafilatura_txt")
30+
# # 创建resiliparse抽取器
31+
# resiliparse_extractor = ExtractorFactory.create("resiliparse")
3032

3133
# 运行对比
3234
evaluator = Evaluator()
33-
extractors = [webkit_extractor, magic_extractor, trafilatura_extractor,trafilatura_txt_extractor, resiliparse_extractor]
34-
# extractors = [webkit_extractor]
35+
# extractors = [webkit_extractor, magic_extractor, trafilatura_extractor,trafilatura_txt_extractor, resiliparse_extractor]
36+
extractors = [webkit_extractor]
3537

3638

3739
results = evaluator.compare_extractors(

scripts/process_dataset.py

Lines changed: 275 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,275 @@
1+
#!/usr/bin/env python3
2+
"""
3+
为数据集添加 llm_webkit430_main_html 字段
4+
5+
根据 llm_webkit_extractor.py 的逻辑,从现有字段构建 main_html
6+
7+
使用方法:
8+
python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl
9+
"""
10+
11+
import json
12+
import argparse
13+
from pathlib import Path
14+
15+
16+
def process_single_item(data: dict, verbose: bool = False) -> dict:
    """Add the ``llm_webkit430_main_html`` field to a single JSONL record.

    Mirrors the mapping logic of llm_webkit_extractor.py:665-670: build a
    pre-data dict from the record's preprocessed-HTML fields, run the
    tag-mapping parser, and store the resulting main HTML back on the record.

    Args:
        data: One parsed JSONL record; mutated in place.
        verbose: When True, print step-by-step diagnostics.

    Returns:
        The same ``data`` dict. On any failure (missing dependency, empty
        input field, parser error) ``llm_webkit430_main_html`` is set to "".
    """
    try:
        # Imported lazily so a missing optional dependency degrades
        # gracefully instead of breaking module import.
        from llm_web_kit.input.pre_data_json import PreDataJson, PreDataJsonKey
        from llm_web_kit.main_html_parser.parser.tag_mapping import MapItemToHtmlTagsParser

        # Pull the source fields off the record.
        # NOTE(review): both variables intentionally read the SAME field
        # ('llm_webkit_html'); presumably 'typical_raw_html' has no separate
        # source here — TODO confirm against llm_webkit_extractor.py.
        typical_raw_tag_html = data.get('llm_webkit_html', '')  # preprocessed HTML
        html = data.get('llm_webkit_html', '')  # preprocessed HTML
        llm_response = data.get('llm_response_html', '')  # LLM response HTML

        if verbose:
            print("\n📝 字段信息:")
            print(f" llm_webkit_html 长度: {len(typical_raw_tag_html)}")
            print(f" llm_response_html 长度: {len(llm_response)}")

        # Nothing to map without the preprocessed HTML — record an empty
        # result rather than raising.
        if not typical_raw_tag_html:
            if verbose:
                print(" ⚠️ llm_webkit_html 字段为空,跳过")
            data['llm_webkit430_main_html'] = ""
            return data

        # Build pre_data (same shape as llm_webkit_extractor.py:665).
        pre_data = {
            'typical_raw_tag_html': typical_raw_tag_html,
            'typical_raw_html': html,
            'llm_response': llm_response,
            'html_source': typical_raw_tag_html
        }

        if verbose:
            print(" 构建 pre_data 完成")

        # Wrap in the kit's PreDataJson container expected by the parser.
        pre_data = PreDataJson(pre_data)

        if verbose:
            print(" PreDataJson 创建完成")

        # Map item to HTML tags via MapItemToHtmlTagsParser.
        parser = MapItemToHtmlTagsParser({})
        pre_data = parser.parse_single(pre_data)

        if verbose:
            print(" 映射解析完成")

        # Extract the resulting main HTML (empty string if absent).
        main_html = pre_data.get(PreDataJsonKey.TYPICAL_MAIN_HTML, "")

        if verbose:
            print(f" main_html 长度: {len(main_html)}")

        # Attach the new field to the record.
        data['llm_webkit430_main_html'] = main_html

        return data

    except ImportError as e:
        # llm_web_kit is not installed — degrade to an empty field.
        if verbose:
            print(f"\n❌ 导入错误: {e}")
            print(" 请确保安装了 llm_web_kit: pip install llm-webkit")
        data['llm_webkit430_main_html'] = ""
        return data
    except Exception as e:
        # Best-effort: any parser failure yields an empty field so the
        # surrounding batch run can continue.
        if verbose:
            import traceback
            print(f"\n⚠️ 处理失败: {e}")
            print(f" 错误详情: {traceback.format_exc()}")
        data['llm_webkit430_main_html'] = ""
        return data
94+
95+
def process_dataset(input_file: str, output_file: str = None, verbose: bool = False, test_first: int = None):
    """Process a whole JSONL dataset, adding ``llm_webkit430_main_html``.

    Reads ``input_file`` line by line, runs :func:`process_single_item` on
    each record, and writes the augmented records to ``output_file``.
    Lines that fail to parse or process are written through (with an empty
    field when possible) so the output stays line-aligned with the input.

    Args:
        input_file: Path to the input JSONL file.
        output_file: Path to the output JSONL file (defaults to
            ``<input stem>_with_main_html.jsonl`` next to the input).
        verbose: Show detailed diagnostics for the first 3 records.
        test_first: Only process the first N records (for testing).
    """
    input_path = Path(input_file)

    if not input_path.exists():
        print(f"❌ 文件不存在: {input_file}")
        return

    # Derive the default output filename from the input.
    if output_file is None:
        output_file = str(input_path.parent / f"{input_path.stem}_with_main_html.jsonl")

    print(f"📄 输入文件: {input_file}")
    print(f"📄 输出文件: {output_file}")
    if test_first:
        print(f"🧪 测试模式: 仅处理前 {test_first} 条数据")

    # Verify the optional dependency up front so we fail before creating
    # (and truncating) the output file.
    print("\n🔍 检查依赖...")
    try:
        from llm_web_kit.input.pre_data_json import PreDataJson, PreDataJsonKey
        from llm_web_kit.main_html_parser.parser.tag_mapping import MapItemToHtmlTagsParser
        print("✅ llm_web_kit 模块可用")
    except ImportError as e:
        print(f"❌ llm_web_kit 模块未安装: {e}")
        print(" 请运行: pip install llm-webkit")
        return

    # Counters for the final summary.
    total = 0
    success = 0
    failed = 0

    # First pass: count lines so progress can be reported as a percentage.
    print("\n📊 统计总行数...")
    with open(input_file, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)

    if test_first:
        total_lines = min(total_lines, test_first)

    print(f"📦 总共 {total_lines:,} 条数据\n")

    # Second pass: process and write.
    print("🔄 开始处理...\n")
    try:
        with open(input_file, 'r', encoding='utf-8') as fin, \
             open(output_file, 'w', encoding='utf-8') as fout:

            for idx, line in enumerate(fin, 1):
                # Test mode: stop after the first N records.
                if test_first and idx > test_first:
                    break

                if not line.strip():
                    continue

                try:
                    data = json.loads(line)
                    total += 1

                    # Report progress every 100 records.
                    if total % 100 == 0:
                        print(f" 处理进度: {total}/{total_lines} ({total/total_lines*100:.1f}%)")

                    # Verbose diagnostics only for the first 3 records.
                    if verbose and idx <= 3:
                        print(f"\n处理第 {idx} 条数据...")

                    processed_data = process_single_item(data, verbose=(verbose and idx <= 3))

                    # A truthy field value counts as success; empty means the
                    # item was skipped or failed inside process_single_item.
                    if processed_data.get('llm_webkit430_main_html'):
                        success += 1
                    else:
                        failed += 1

                    fout.write(json.dumps(processed_data, ensure_ascii=False) + '\n')

                except json.JSONDecodeError as e:
                    print(f"\n⚠️ 行 {idx} JSON解析错误: {e}")
                    failed += 1
                    # Pass the unparseable line through unchanged.
                    fout.write(line)
                except Exception as e:
                    print(f"\n❌ 行 {idx} 处理错误: {e}")
                    if verbose:
                        import traceback
                        print(traceback.format_exc())
                    failed += 1
                    # Best-effort: write the record with an empty field, or
                    # fall back to the raw line if re-serialization fails.
                    try:
                        data['llm_webkit430_main_html'] = ""
                        fout.write(json.dumps(data, ensure_ascii=False) + '\n')
                    except Exception:  # fixed: was a bare except
                        fout.write(line)

    except Exception as e:
        print(f"\n❌ 处理过程中发生严重错误: {e}")
        import traceback
        print(traceback.format_exc())
        return

    # Final summary.
    print("\n" + "="*60)
    print("✅ 处理完成!")
    print("="*60)
    print(f"总处理数: {total:,}")
    print(f"成功: {success:,} ({success/total*100:.1f}%)" if total > 0 else "成功: 0")
    print(f"失败: {failed:,} ({failed/total*100:.1f}%)" if total > 0 else "失败: 0")
    print(f"\n输出文件: {output_file}")
    print("="*60)
219+
def main():
    """CLI entry point: parse command-line arguments and run the dataset job."""
    usage_examples = '''
示例:
# 基本使用
python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl

# 指定输出文件
python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl \\
--output data/WebMainBench_7887_with_main_html.jsonl

# 测试前10条数据
python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl \\
--test-first 10 --verbose

# 详细模式(显示前3条的处理细节)
python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl \\
--verbose
'''

    arg_parser = argparse.ArgumentParser(
        description='为数据集添加 llm_webkit430_main_html 字段',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Positional: the JSONL file to process.
    arg_parser.add_argument('input_file', help='输入JSONL文件路径')
    # Optional destination path.
    arg_parser.add_argument('--output', '-o',
                            help='输出JSONL文件路径(默认:输入文件名_with_main_html.jsonl)')
    # Diagnostics flag (detail shown for first 3 records only).
    arg_parser.add_argument('--verbose', '-v', action='store_true',
                            help='显示详细处理信息(仅显示前3条)')
    # Limit processing to the first N records.
    arg_parser.add_argument('--test-first', '-t', type=int,
                            help='仅处理前N条数据(用于测试)')

    opts = arg_parser.parse_args()

    # Hand off to the dataset processor.
    process_dataset(opts.input_file, opts.output, opts.verbose, opts.test_first)


if __name__ == '__main__':
    main()
275+

webmainbench/data/dataset.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ class DataSample:
3030
tags: Optional[List[str]] = None
3131
llm_webkit_md: Optional[str] = None
3232
llm_webkit_html: Optional[str] = None # 预处理HTML字段
33+
llm_response_html: Optional[str] = None # LLM响应HTML字段
34+
typical_raw_tag_html: Optional[str] = None # 典型原始标签HTML字段
3335
main_html: Optional[str] = None # 主要HTML内容字段
3436

3537
# Extracted results (populated during evaluation)

webmainbench/extractors/llm_webkit_extractor.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ class LLMInferenceConfig:
3030
enforce_eager: bool = True # 使用eager模式
3131
use_preprocessed_html: bool = False # 是否使用预处理的HTML(跳过HTML简化步骤)
3232
preprocessed_html_field: str = "llm_webkit_html" # 预处理HTML字段名
33+
llm_response_html_field: str = "llm_response_html" # LLM响应HTML字段名
34+
typical_raw_tag_html_field: str = "typical_raw_tag_html" # 典型原始标签HTML字段名
3335

3436

3537
class TokenState(Enum):
@@ -654,11 +656,20 @@ def _extract_content(self, html: str, url: str = None) -> ExtractionResult:
654656
start_time = time.time()
655657

656658
try:
657-
# 检查是否使用预处理的HTML(跳过HTML简化步骤)
659+
# 检查是否使用预处理的HTML(跳过HTML简化步骤)
658660
if self.inference_config.use_preprocessed_html:
659661
# 传入的html已经是预处理的内容(由Evaluator从指定字段提取),直接用作main_html
660662
print(f"📥 使用预处理HTML,跳过HTML简化步骤")
661-
content, content_list = self._extract_content_from_main_html(html, url)
663+
llm_response = self.inference_config.llm_response_html_field
664+
typical_raw_tag_html = self.inference_config.typical_raw_tag_html_field
665+
pre_data = {'typical_raw_tag_html': typical_raw_tag_html, 'typical_raw_html': html, 'llm_response': llm_response, 'html_source': typical_raw_tag_html}
666+
pre_data = self._PreDataJson(pre_data)
667+
# 映射
668+
parser = self._MapItemToHtmlTagsParser({})
669+
pre_data = parser.parse_single(pre_data)
670+
main_html = pre_data.get(self._PreDataJsonKey.TYPICAL_MAIN_HTML, "")
671+
672+
content, content_list = self._extract_content_from_main_html(main_html, url)
662673

663674
extraction_time = time.time() - start_time
664675

0 commit comments

Comments
 (0)