1+ #!/usr/bin/env python3
2+ """
3+ 为数据集添加 llm_webkit430_main_html 字段
4+ 根据 llm_webkit_extractor.py 的逻辑,从现有字段构建 main_html
5+ 使用方法:
6+ python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl
7+ """
8+
9+ import json
10+ import argparse
11+ from pathlib import Path
12+
13+
def process_single_item(data: dict, verbose: bool = False) -> dict:
    """
    Attach the ``llm_webkit430_main_html`` field to one JSON record.

    Reads ``llm_webkit_html`` and ``llm_response_html`` from ``data``, wraps
    them in a ``PreDataJson``, runs ``MapItemToHtmlTagsParser.parse_single``
    on it and stores the resulting ``TYPICAL_MAIN_HTML`` value in the record.
    The source comments point at llm_webkit_extractor.py:665-670 as the
    reference logic.

    Args:
        data: Record to update; it is modified in place.
        verbose: When true, print progress details and error information.

    Returns:
        The same ``data`` object. The new field is set to an empty string
        when ``llm_webkit_html`` is missing or empty, when ``llm_web_kit``
        cannot be imported, or when processing raises any other exception.
    """
    field = 'llm_webkit430_main_html'
    try:
        # Imported inside the function so a missing llm_web_kit is reported
        # through the handler below instead of failing at module import.
        from llm_web_kit.input.pre_data_json import PreDataJson, PreDataJsonKey
        from llm_web_kit.main_html_parser.parser.tag_mapping import MapItemToHtmlTagsParser

        # The pre-processed HTML feeds three of the four pre_data entries.
        source_html = data.get('llm_webkit_html', '')
        response_html = data.get('llm_response_html', '')

        if verbose:
            print(f"\n 📝 字段信息:")
            print(f" llm_webkit_html 长度: {len(source_html)} ")
            print(f" llm_response_html 长度: {len(response_html)} ")

        # Nothing to map without the pre-processed HTML.
        if not source_html:
            if verbose:
                print(" ⚠️ llm_webkit_html 字段为空,跳过")
            data[field] = ""
            return data

        if verbose:
            print(f" 构建 pre_data 完成")

        payload = PreDataJson({
            'typical_raw_tag_html': source_html,
            'typical_raw_html': source_html,
            'llm_response': response_html,
            'html_source': source_html,
        })

        if verbose:
            print(f" PreDataJson 创建完成")

        mapped = MapItemToHtmlTagsParser({}).parse_single(payload)

        if verbose:
            print(f" 映射解析完成")

        main_html = mapped.get(PreDataJsonKey.TYPICAL_MAIN_HTML, "")

        if verbose:
            print(f" main_html 长度: {len(main_html)} ")

        data[field] = main_html
        return data

    except Exception as err:
        if verbose:
            if isinstance(err, ImportError):
                print(f"\n ❌ 导入错误: {err} ")
                print(" 请确保安装了 llm_web_kit: pip install llm-webkit")
            else:
                import traceback
                print(f"\n ⚠️ 处理失败: {err} ")
                print(f" 错误详情: {traceback.format_exc()} ")
        # Every failure path still yields the field, as an empty string.
        data[field] = ""
        return data
91+
92+
def process_dataset(input_file: str, output_file: str = None, verbose: bool = False, test_first: int = None):
    """
    Add the ``llm_webkit430_main_html`` field to every record of a JSONL file.

    Each non-blank input line is parsed as JSON, passed through
    ``process_single_item`` and written to the output file as one JSON line.
    Lines that cannot be parsed are copied through unchanged; records whose
    processing raises are written with an empty ``llm_webkit430_main_html``
    (or copied through unchanged if that is not possible). Progress and a
    final summary are printed to stdout.

    Args:
        input_file: Path of the input JSONL file.
        output_file: Path of the output JSONL file. Defaults to
            ``<input stem>_with_main_html.jsonl`` next to the input file.
        verbose: Print per-record details for the first three lines and
            tracebacks for per-record failures.
        test_first: If set (and non-zero), stop after this many input lines.

    Returns:
        None. The function returns early, after printing a message, when the
        input file does not exist, when the output path resolves to the input
        path, when ``llm_web_kit`` cannot be imported, or when an unexpected
        error aborts the run.
    """
    import traceback

    input_path = Path(input_file)

    if not input_path.exists():
        print(f"❌ 文件不存在: {input_file} ")
        return

    # Default output name, as documented: <stem>_with_main_html.jsonl
    if output_file is None:
        output_file = str(input_path.parent / f"{input_path.stem}_with_main_html.jsonl")

    # Opening the output with 'w' truncates it, so writing onto the input
    # would destroy the data before it is read.
    if Path(output_file).resolve() == input_path.resolve():
        print(f"❌ 输出文件不能与输入文件相同: {output_file}")
        return

    print(f"📄 输入文件: {input_file} ")
    print(f"📄 输出文件: {output_file} ")
    if test_first:
        print(f"🧪 测试模式: 仅处理前 {test_first} 条数据")

    # Fail fast if the dependency is missing; otherwise every record would
    # be written with an empty field.
    print("\n 🔍 检查依赖...")
    try:
        from llm_web_kit.input.pre_data_json import PreDataJson, PreDataJsonKey  # noqa: F401
        from llm_web_kit.main_html_parser.parser.tag_mapping import MapItemToHtmlTagsParser  # noqa: F401
        print("✅ llm_web_kit 模块可用")
    except ImportError as e:
        print(f"❌ llm_web_kit 模块未安装: {e} ")
        print(" 请运行: pip install llm-webkit")
        return

    # Counters: every non-blank line attempted is counted in ``total`` and
    # lands in exactly one of ``success`` / ``failed``.
    total = 0
    success = 0
    failed = 0

    # First pass: count lines so progress can be shown as a percentage.
    print("\n 📊 统计总行数...")
    with open(input_file, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)

    if test_first:
        total_lines = min(total_lines, test_first)

    print(f"📦 总共 {total_lines:,} 条数据\n ")

    print("🔄 开始处理...\n ")
    try:
        with open(input_file, 'r', encoding='utf-8') as fin, \
                open(output_file, 'w', encoding='utf-8') as fout:

            for idx, line in enumerate(fin, 1):
                # Test mode: only look at the first N input lines.
                if test_first and idx > test_first:
                    break

                if not line.strip():
                    continue

                total += 1

                # Report progress every 100 records.
                if total % 100 == 0:
                    print(f" 处理进度: {total} /{total_lines} ({total / total_lines * 100:.1f} %)")

                show_details = verbose and idx <= 3
                # Reset per line so a failure can never re-emit the record
                # left over from the previous iteration.
                data = None
                ok = False
                try:
                    data = json.loads(line)

                    if show_details:
                        print(f"\n 处理第 {idx} 条数据...")

                    processed_data = process_single_item(data, verbose=show_details)

                    # A record counts as a success only if the field is non-empty.
                    ok = bool(processed_data.get('llm_webkit430_main_html'))
                    out_line = json.dumps(processed_data, ensure_ascii=False) + '\n'

                except json.JSONDecodeError as e:
                    print(f"\n ⚠️ 行 {idx} JSON解析错误: {e} ")
                    ok = False
                    # Keep the unparseable line as-is so no input is lost.
                    out_line = line
                except Exception as e:
                    print(f"\n ❌ 行 {idx} 处理错误: {e} ")
                    if verbose:
                        print(traceback.format_exc())
                    ok = False
                    # Prefer the parsed record with an empty field; fall back
                    # to the raw line when that cannot be produced.
                    out_line = line
                    if isinstance(data, dict):
                        data['llm_webkit430_main_html'] = ""
                        try:
                            out_line = json.dumps(data, ensure_ascii=False) + '\n'
                        except (TypeError, ValueError, RecursionError):
                            out_line = line

                if ok:
                    success += 1
                else:
                    failed += 1

                # Write errors propagate to the outer handler and abort the run.
                fout.write(out_line)

    except Exception as e:
        print(f"\n ❌ 处理过程中发生严重错误: {e} ")
        print(traceback.format_exc())
        return

    # Final summary.
    print("\n " + "=" * 60)
    print("✅ 处理完成!")
    print("=" * 60)
    print(f"总处理数: {total:,} ")
    print(f"成功: {success:,} ({success / total * 100:.1f} %)" if total > 0 else "成功: 0")
    print(f"失败: {failed:,} ({failed / total * 100:.1f} %)" if total > 0 else "失败: 0")
    print(f"\n 输出文件: {output_file} ")
    print("=" * 60)
215+
216+
def main():
    """Parse the command-line arguments and run ``process_dataset`` with them."""
    # Shown verbatim after the option list (RawDescriptionHelpFormatter
    # keeps the line breaks).
    usage_examples = '''
示例:
# 基本使用
python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl

# 指定输出文件
python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl \\
--output data/WebMainBench_7887_with_main_html.jsonl

# 测试前10条数据
python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl \\
--test-first 10 --verbose

# 详细模式(显示前3条的处理细节)
python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl \\
--verbose
'''

    cli = argparse.ArgumentParser(
        description='为数据集添加 llm_webkit430_main_html 字段',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Positional: the JSONL file to read.
    cli.add_argument('input_file', help='输入JSONL文件路径')
    # Optional: where to write the result.
    cli.add_argument('--output', '-o', help='输出JSONL文件路径(默认:输入文件名_with_main_html.jsonl)')
    # Optional: per-record details for the first three records.
    cli.add_argument('--verbose', '-v', action='store_true', help='显示详细处理信息(仅显示前3条)')
    # Optional: limit the run to the first N records.
    cli.add_argument('--test-first', '-t', type=int, help='仅处理前N条数据(用于测试)')

    opts = cli.parse_args()

    process_dataset(opts.input_file, opts.output, opts.verbose, opts.test_first)
269+
270+
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
0 commit comments