Skip to content

Commit 3396bbd

Browse files
committed
feat: add pre main html
1 parent b9d6f5d commit 3396bbd

File tree

1 file changed

+272
-0
lines changed

1 file changed

+272
-0
lines changed

scripts/process_dataset.py

Lines changed: 272 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,272 @@
1+
#!/usr/bin/env python3
2+
"""
3+
为数据集添加 llm_webkit430_main_html 字段
4+
根据 llm_webkit_extractor.py 的逻辑,从现有字段构建 main_html
5+
使用方法:
6+
python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl
7+
"""
8+
9+
import json
10+
import argparse
11+
from pathlib import Path
12+
13+
14+
def process_single_item(data: dict, verbose: bool = False) -> dict:
    """Attach an ``llm_webkit430_main_html`` field to one JSONL record.

    Mirrors the mapping logic of ``llm_webkit_extractor.py`` (lines 665-670):
    the pre-processed HTML and the LLM response HTML are packed into a
    ``PreDataJson`` payload, run through ``MapItemToHtmlTagsParser``, and the
    resulting main HTML is stored back onto *data*. On any failure (missing
    dependency, empty input, parser error) the field is set to ``""`` so that
    downstream consumers always see it.

    Args:
        data: One decoded JSONL record; mutated in place and also returned.
        verbose: When True, print per-step diagnostics.

    Returns:
        The same ``data`` dict, always carrying ``llm_webkit430_main_html``.
    """
    try:
        # Imported lazily so a missing llm_web_kit install degrades to an
        # empty output field instead of failing at module import time.
        from llm_web_kit.input.pre_data_json import PreDataJson, PreDataJsonKey
        from llm_web_kit.main_html_parser.parser.tag_mapping import MapItemToHtmlTagsParser

        # Pull the source fields off the record. Both HTML inputs come from
        # the same 'llm_webkit_html' field, kept as two names to match the
        # reference logic in llm_webkit_extractor.py.
        webkit_html = data.get('llm_webkit_html', '')       # pre-processed HTML
        raw_html = data.get('llm_webkit_html', '')          # pre-processed HTML
        response_html = data.get('llm_response_html', '')   # LLM response HTML

        if verbose:
            print(f"\n📝 字段信息:")
            print(f" llm_webkit_html 长度: {len(webkit_html)}")
            print(f" llm_response_html 长度: {len(response_html)}")

        # Without the pre-processed HTML there is nothing to map.
        if not webkit_html:
            if verbose:
                print(" ⚠️ llm_webkit_html 字段为空,跳过")
            data['llm_webkit430_main_html'] = ""
            return data

        # Assemble the payload expected by the parser
        # (see llm_webkit_extractor.py:665).
        payload = {
            'typical_raw_tag_html': webkit_html,
            'typical_raw_html': raw_html,
            'llm_response': response_html,
            'html_source': webkit_html
        }
        if verbose:
            print(f" 构建 pre_data 完成")

        pre_json = PreDataJson(payload)
        if verbose:
            print(f" PreDataJson 创建完成")

        # Run the tag-mapping parser over the single record.
        mapper = MapItemToHtmlTagsParser({})
        pre_json = mapper.parse_single(pre_json)
        if verbose:
            print(f" 映射解析完成")

        # Extract the mapped main HTML and store it on the record.
        extracted = pre_json.get(PreDataJsonKey.TYPICAL_MAIN_HTML, "")
        if verbose:
            print(f" main_html 长度: {len(extracted)}")

        data['llm_webkit430_main_html'] = extracted
        return data

    except ImportError as e:
        if verbose:
            print(f"\n❌ 导入错误: {e}")
            print(" 请确保安装了 llm_web_kit: pip install llm-webkit")
        data['llm_webkit430_main_html'] = ""
        return data
    except Exception as e:
        if verbose:
            import traceback
            print(f"\n⚠️ 处理失败: {e}")
            print(f" 错误详情: {traceback.format_exc()}")
        # Failure still yields the field, just empty.
        data['llm_webkit430_main_html'] = ""
        return data
91+
92+
93+
def process_dataset(input_file: str, output_file: str = None, verbose: bool = False, test_first: int = None):
    """Process a whole JSONL dataset, adding ``llm_webkit430_main_html``.

    Reads *input_file* line by line, runs every record through
    ``process_single_item``, and writes the augmented records to
    *output_file*. Unparseable or failing lines are preserved in the output
    (verbatim, or with an empty ``llm_webkit430_main_html``) so the output
    stays line-aligned with the input.

    Fixes over the original version:
      * the fallback writer used a bare ``except:`` which would also swallow
        ``SystemExit``/``KeyboardInterrupt`` — narrowed to ``except Exception``;
      * ``data`` is now reset every iteration, so a failure raised before or
        during ``json.loads`` can no longer re-serialize the *previous*
        record's dict for the failing line.

    Args:
        input_file: Path to the input JSONL file.
        output_file: Path for the output JSONL file; defaults to
            ``<input stem>_with_main_html.jsonl`` next to the input.
        verbose: Print detailed diagnostics for the first few records.
        test_first: If given, only the first N records are processed.
    """
    input_path = Path(input_file)
    if not input_path.exists():
        print(f"❌ 文件不存在: {input_file}")
        return

    # Derive the default output path from the input file name.
    if output_file is None:
        output_file = str(input_path.parent / f"{input_path.stem}_with_main_html.jsonl")

    print(f"📄 输入文件: {input_file}")
    print(f"📄 输出文件: {output_file}")
    if test_first:
        print(f"🧪 测试模式: 仅处理前 {test_first} 条数据")

    # Fail fast if llm_web_kit is missing rather than erroring on every record.
    print("\n🔍 检查依赖...")
    try:
        from llm_web_kit.input.pre_data_json import PreDataJson, PreDataJsonKey  # noqa: F401
        from llm_web_kit.main_html_parser.parser.tag_mapping import MapItemToHtmlTagsParser  # noqa: F401
        print("✅ llm_web_kit 模块可用")
    except ImportError as e:
        print(f"❌ llm_web_kit 模块未安装: {e}")
        print(" 请运行: pip install llm-webkit")
        return

    # Running counters for the final summary.
    total = 0
    success = 0
    failed = 0

    # First pass: count lines so progress can be shown as a percentage.
    print("\n📊 统计总行数...")
    with open(input_file, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)
    if test_first:
        total_lines = min(total_lines, test_first)
    print(f"📦 总共 {total_lines:,} 条数据\n")

    # Second pass: process and write.
    print("🔄 开始处理...\n")
    try:
        with open(input_file, 'r', encoding='utf-8') as fin, \
                open(output_file, 'w', encoding='utf-8') as fout:

            for idx, line in enumerate(fin, 1):
                # Test mode: stop after the first N records.
                if test_first and idx > test_first:
                    break

                # Blank lines are skipped and not counted.
                if not line.strip():
                    continue

                # Reset per iteration so the fallback writer below cannot
                # accidentally re-emit a previous record's dict.
                data = None
                try:
                    data = json.loads(line)
                    total += 1

                    # Progress report every 100 records.
                    if total % 100 == 0:
                        print(f" 处理进度: {total}/{total_lines} ({total/total_lines*100:.1f}%)")

                    # Only the first three records get verbose diagnostics.
                    if verbose and idx <= 3:
                        print(f"\n处理第 {idx} 条数据...")

                    processed_data = process_single_item(data, verbose=(verbose and idx <= 3))

                    # A record counts as successful when the new field is non-empty.
                    if processed_data.get('llm_webkit430_main_html'):
                        success += 1
                    else:
                        failed += 1

                    fout.write(json.dumps(processed_data, ensure_ascii=False) + '\n')

                except json.JSONDecodeError as e:
                    print(f"\n⚠️ 行 {idx} JSON解析错误: {e}")
                    failed += 1
                    # Preserve the unparseable line verbatim in the output.
                    fout.write(line)
                except Exception as e:
                    print(f"\n❌ 行 {idx} 处理错误: {e}")
                    if verbose:
                        import traceback
                        print(traceback.format_exc())
                    failed += 1
                    # Best effort: emit the record with an empty field; fall
                    # back to the raw line if it cannot be serialized (or if
                    # json.loads never produced a dict — data is still None).
                    try:
                        data['llm_webkit430_main_html'] = ""
                        fout.write(json.dumps(data, ensure_ascii=False) + '\n')
                    except Exception:
                        fout.write(line)

    except Exception as e:
        print(f"\n❌ 处理过程中发生严重错误: {e}")
        import traceback
        print(traceback.format_exc())
        return

    # Final summary.
    print("\n" + "="*60)
    print("✅ 处理完成!")
    print("="*60)
    print(f"总处理数: {total:,}")
    print(f"成功: {success:,} ({success/total*100:.1f}%)" if total > 0 else "成功: 0")
    print(f"失败: {failed:,} ({failed/total*100:.1f}%)" if total > 0 else "失败: 0")
    print(f"\n输出文件: {output_file}")
    print("="*60)
215+
216+
217+
def main():
    """Command-line entry point: parse arguments and run the dataset processor."""
    cli = argparse.ArgumentParser(
        description='为数据集添加 llm_webkit430_main_html 字段',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
示例:
  # 基本使用
  python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl

  # 指定输出文件
  python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl \\
    --output data/WebMainBench_7887_with_main_html.jsonl

  # 测试前10条数据
  python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl \\
    --test-first 10 --verbose

  # 详细模式(显示前3条的处理细节)
  python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl \\
    --verbose
''',
    )

    # Positional input path plus three optional switches.
    cli.add_argument('input_file', help='输入JSONL文件路径')
    cli.add_argument('--output', '-o', help='输出JSONL文件路径(默认:输入文件名_with_main_html.jsonl)')
    cli.add_argument('--verbose', '-v', action='store_true', help='显示详细处理信息(仅显示前3条)')
    cli.add_argument('--test-first', '-t', type=int, help='仅处理前N条数据(用于测试)')

    opts = cli.parse_args()

    # Hand everything off to the batch processor.
    process_dataset(opts.input_file, opts.output, opts.verbose, opts.test_first)
269+
270+
271+
# Script entry point: only run the CLI when executed directly, not on import.
if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)