Skip to content

Commit 1571c65

Browse files
authored
Merge pull request #58 from e06084/main
feat: add pre main html
2 parents b9d6f5d + ef3321d commit 1571c65

File tree

1 file changed

+254
-0
lines changed

1 file changed

+254
-0
lines changed

scripts/process_dataset.py

Lines changed: 254 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,254 @@
1+
#!/usr/bin/env python3
2+
"""
3+
为数据集添加 llm_webkit430_main_html 字段
4+
根据 llm_webkit_extractor.py 的逻辑,从现有字段构建 main_html
5+
使用方法:
6+
python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl
7+
"""
8+
9+
import json
10+
import argparse
11+
from pathlib import Path
12+
13+
14+
def process_single_item(data: dict, verbose: bool = False) -> dict:
    """Add the ``llm_webkit430_main_html`` field to a single JSON record.

    The field is built from the record's ``typical_raw_tag_html`` and
    ``llm_response_html`` values by running them through
    ``MapItemToHtmlTagsParser`` (the original comments point at
    llm_webkit_extractor.py:665-670 as the reference logic).

    Args:
        data: Parsed JSON record. It is modified in place.
        verbose: When true, print diagnostics for skipped or failed records.

    Returns:
        The same ``data`` object with ``llm_webkit430_main_html`` set. The
        field is the empty string when ``typical_raw_tag_html`` is missing or
        empty, when ``llm_web_kit`` cannot be imported, or when parsing fails.
    """
    try:
        # Read the source fields from the record.
        typical_raw_tag_html = data.get('typical_raw_tag_html', '')  # preprocessed HTML
        llm_response = data.get('llm_response_html', '')  # LLM response HTML

        # Nothing to parse: skip before touching the optional dependency.
        if not typical_raw_tag_html:
            if verbose:
                print(" ⚠️ typical_raw_tag_html 字段为空,跳过")
            data['llm_webkit430_main_html'] = ""
            return data

        # Imported lazily so a missing package degrades to an empty field
        # instead of breaking module import.
        from llm_web_kit.input.pre_data_json import PreDataJson, PreDataJsonKey
        from llm_web_kit.main_html_parser.parser.tag_mapping import MapItemToHtmlTagsParser

        # Build the parser input; the same HTML feeds all three HTML keys.
        pre_data = PreDataJson({
            'typical_raw_tag_html': typical_raw_tag_html,
            'typical_raw_html': typical_raw_tag_html,
            'llm_response': llm_response,
            'html_source': typical_raw_tag_html,
        })

        # Map the LLM response onto the HTML tags.
        pre_data = MapItemToHtmlTagsParser({}).parse(pre_data)

        # Extract the main HTML and store it on the record.
        data['llm_webkit430_main_html'] = pre_data.get(PreDataJsonKey.TYPICAL_MAIN_HTML, "")
        return data

    except ImportError as e:
        if verbose:
            print(f"\n❌ 导入错误: {e}")
            print(" 请确保安装了 llm_web_kit: pip install llm-webkit")
        data['llm_webkit430_main_html'] = ""
        return data
    except Exception as e:
        # Deliberate best-effort boundary: one bad record must not abort the
        # whole run, so it gets an empty field instead.
        if verbose:
            import traceback
            print(f"\n⚠️ 处理失败: {e}")
            print(f" 错误详情: {traceback.format_exc()}")
        data['llm_webkit430_main_html'] = ""
        return data
73+
74+
75+
def process_dataset(input_file: str, output_file: str = None, verbose: bool = False, test_first: int = None):
    """Add ``llm_webkit430_main_html`` to every record of a JSONL file.

    Each non-blank input line is parsed as JSON, passed to
    ``process_single_item`` and written to the output file. Lines that cannot
    be parsed are copied through unchanged and counted as failures.

    Args:
        input_file: Path of the input JSONL file.
        output_file: Path of the output JSONL file. Defaults to
            ``<input stem>_with_main_html.jsonl`` next to the input.
        verbose: Print per-record details for the first three lines and
            tracebacks for failing lines.
        test_first: Only process the first N records (for testing). Blank
            lines do not count towards N.

    Returns:
        None. Progress and statistics are printed. The function returns early
        when the input is missing, when the output would overwrite the input,
        when ``llm_web_kit`` is not importable, or on an unexpected I/O error.
    """
    import traceback

    input_path = Path(input_file)

    if not input_path.exists():
        print(f"❌ 文件不存在: {input_file}")
        return

    # Derive the default output name.
    if output_file is None:
        output_file = str(input_path.parent / f"{input_path.stem}_with_main_html.jsonl")

    # The output is opened with 'w', which truncates it; writing onto the
    # input would destroy the data before it is read.
    if Path(output_file).resolve() == input_path.resolve():
        print(f"❌ 输出文件不能与输入文件相同: {output_file}")
        return

    print(f"📄 输入文件: {input_file}")
    print(f"📄 输出文件: {output_file}")
    if test_first:
        print(f"🧪 测试模式: 仅处理前 {test_first} 条数据")

    # Fail fast if the dependency is missing, instead of producing a file
    # where every record has an empty field.
    print("\n🔍 检查依赖...")
    try:
        from llm_web_kit.input.pre_data_json import PreDataJson, PreDataJsonKey  # noqa: F401
        from llm_web_kit.main_html_parser.parser.tag_mapping import MapItemToHtmlTagsParser  # noqa: F401
        print("✅ llm_web_kit 模块可用")
    except ImportError as e:
        print(f"❌ llm_web_kit 模块未安装: {e}")
        print(" 请运行: pip install llm-webkit")
        return

    # Statistics.
    total = 0
    success = 0
    failed = 0

    # Count records up front for the progress display. Blank lines are
    # skipped during processing, so they are excluded here as well.
    print("\n📊 统计总行数...")
    with open(input_file, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for raw in f if raw.strip())

    if test_first:
        total_lines = min(total_lines, test_first)

    print(f"📦 总共 {total_lines:,} 条数据\n")

    # Process the data.
    print("🔄 开始处理...\n")
    try:
        with open(input_file, 'r', encoding='utf-8') as fin, \
                open(output_file, 'w', encoding='utf-8') as fout:

            for idx, line in enumerate(fin, 1):
                if not line.strip():
                    continue

                # Test mode: stop after the first N records.
                if test_first and total >= test_first:
                    break

                # Every attempted record counts, including ones that fail to
                # parse, so the percentages below stay within 0-100%.
                total += 1

                # Progress report every 100 records.
                if total % 100 == 0:
                    print(f" 处理进度: {total}/{total_lines} ({total/total_lines*100:.1f}%)")

                # Reset per line so the error handler never sees the record
                # of a previous iteration.
                data = None
                show_details = verbose and idx <= 3

                try:
                    data = json.loads(line)

                    if show_details:
                        print(f"\n处理第 {idx} 条数据...")

                    processed_data = process_single_item(data, verbose=show_details)

                    # A non-empty field means the record was processed successfully.
                    if processed_data.get('llm_webkit430_main_html'):
                        success += 1
                    else:
                        failed += 1

                    fout.write(json.dumps(processed_data, ensure_ascii=False) + '\n')

                except json.JSONDecodeError as e:
                    print(f"\n⚠️ 行 {idx} JSON解析错误: {e}")
                    failed += 1
                    # Keep the unparseable line so no input is lost.
                    fout.write(line)
                except Exception as e:
                    print(f"\n❌ 行 {idx} 处理错误: {e}")
                    if verbose:
                        print(traceback.format_exc())
                    failed += 1
                    # Write the record with an empty field when it is a JSON
                    # object; otherwise fall back to the raw line.
                    if isinstance(data, dict):
                        data['llm_webkit430_main_html'] = ""
                        fout.write(json.dumps(data, ensure_ascii=False) + '\n')
                    else:
                        fout.write(line)

    except Exception as e:
        print(f"\n❌ 处理过程中发生严重错误: {e}")
        print(traceback.format_exc())
        return

    # Print the summary.
    print("\n" + "="*60)
    print("✅ 处理完成!")
    print("="*60)
    print(f"总处理数: {total:,}")
    print(f"成功: {success:,} ({success/total*100:.1f}%)" if total > 0 else "成功: 0")
    print(f"失败: {failed:,} ({failed/total*100:.1f}%)" if total > 0 else "失败: 0")
    print(f"\n输出文件: {output_file}")
    print("="*60)
197+
198+
199+
def main():
    """Command-line entry point: parse the arguments and process the dataset."""
    # Usage examples shown verbatim after the option list in --help.
    usage_examples = '''
示例:
# 基本使用
python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl

# 指定输出文件
python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl \\
--output data/WebMainBench_7887_with_main_html.jsonl

# 测试前10条数据
python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl \\
--test-first 10 --verbose

# 详细模式(显示前3条的处理细节)
python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl \\
--verbose
'''

    cli = argparse.ArgumentParser(
        description='为数据集添加 llm_webkit430_main_html 字段',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples
    )

    # Positional: the JSONL file to read.
    cli.add_argument('input_file', help='输入JSONL文件路径')

    # Optional: where to write the result.
    cli.add_argument('--output', '-o', help='输出JSONL文件路径(默认:输入文件名_with_main_html.jsonl)')

    # Optional: extra diagnostics.
    cli.add_argument('--verbose', '-v', action='store_true', help='显示详细处理信息(仅显示前3条)')

    # Optional: limit the run to the first N records.
    cli.add_argument('--test-first', '-t', type=int, help='仅处理前N条数据(用于测试)')

    options = cli.parse_args()

    # Hand the parsed options to the dataset processor.
    process_dataset(options.input_file, options.output, options.verbose, options.test_first)
251+
252+
253+
# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)