1+ #!/usr/bin/env python3
2+ """
3+ 为数据集添加 llm_webkit430_main_html 字段
4+ 根据 llm_webkit_extractor.py 的逻辑,从现有字段构建 main_html
5+ 使用方法:
6+ python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl
7+ """
8+
9+ import json
10+ import argparse
11+ from pathlib import Path
12+
13+
def process_single_item(data: dict, verbose: bool = False) -> dict:
    """Add the ``llm_webkit430_main_html`` field to a single record.

    Wraps the record's ``typical_raw_tag_html`` and ``llm_response_html``
    values in a ``PreDataJson``, runs ``MapItemToHtmlTagsParser`` over it and
    stores the resulting ``TYPICAL_MAIN_HTML`` value on the record (mirrors the
    logic referenced at llm_webkit_extractor.py:665-670).

    Args:
        data: Parsed JSON object for one dataset row. It is modified in place.
        verbose: When true, print diagnostics for skipped or failed records.

    Returns:
        The same ``data`` object. ``llm_webkit430_main_html`` holds the value
        produced by the parser, or ``""`` when ``typical_raw_tag_html`` is
        empty, when llm_web_kit cannot be imported, or when parsing raises.
    """
    try:
        # Validate the required field before importing llm_web_kit so that an
        # empty record is reported as "empty", not as an import problem.
        typical_raw_tag_html = data.get('typical_raw_tag_html', '')  # pre-processed HTML
        if not typical_raw_tag_html:
            if verbose:
                # The message names the field that is actually checked.
                print(" ⚠️ typical_raw_tag_html 字段为空,跳过")
            data['llm_webkit430_main_html'] = ""
            return data

        llm_response = data.get('llm_response_html', '')  # LLM response HTML

        # Imported lazily so the module can be loaded without llm_web_kit.
        from llm_web_kit.input.pre_data_json import PreDataJson, PreDataJsonKey
        from llm_web_kit.main_html_parser.parser.tag_mapping import MapItemToHtmlTagsParser

        # The same HTML is supplied under every key the parser input carries
        # (see llm_webkit_extractor.py:665).
        pre_data = PreDataJson({
            'typical_raw_tag_html': typical_raw_tag_html,
            'typical_raw_html': typical_raw_tag_html,
            'llm_response': llm_response,
            'html_source': typical_raw_tag_html,
        })

        # Map the LLM response back onto the HTML tags.
        pre_data = MapItemToHtmlTagsParser({}).parse(pre_data)

        data['llm_webkit430_main_html'] = pre_data.get(PreDataJsonKey.TYPICAL_MAIN_HTML, "")
        return data

    except ImportError as e:
        if verbose:
            print(f"\n ❌ 导入错误: {e} ")
            print(" 请确保安装了 llm_web_kit: pip install llm-webkit")
        data['llm_webkit430_main_html'] = ""
        return data
    except Exception as e:
        # Best effort: a record that cannot be parsed still gets the (empty)
        # field so downstream consumers see a uniform schema.
        if verbose:
            import traceback
            print(f"\n ⚠️ 处理失败: {e} ")
            print(f" 错误详情: {traceback.format_exc()} ")
        data['llm_webkit430_main_html'] = ""
        return data
73+
74+
def process_dataset(input_file: str, output_file: str = None, verbose: bool = False, test_first: int = None):
    """Process a whole JSONL dataset, adding ``llm_webkit430_main_html`` to each row.

    Every non-blank input line is parsed as JSON, passed to
    ``process_single_item`` and written to the output file as one JSON line.
    Lines that cannot be parsed are copied through unchanged; rows that fail
    for another reason are written with an empty ``llm_webkit430_main_html``.
    A summary (total / success / failed) is printed at the end.

    Args:
        input_file: Path of the input JSONL file.
        output_file: Path of the output JSONL file. Defaults to
            ``<input stem>_with_main_html.jsonl`` next to the input file.
        verbose: Print per-record details for the first three lines and
            tracebacks for row-level errors.
        test_first: Only look at the first N input lines (for testing).

    Returns:
        None. The function returns early, after printing a message, when the
        input file is missing, when the output path equals the input path,
        when llm_web_kit is not importable, or when a fatal error occurs.
    """
    import traceback

    input_path = Path(input_file)

    if not input_path.exists():
        print(f"❌ 文件不存在: {input_file} ")
        return

    # Derive the default output name; no space between stem and suffix.
    if output_file is None:
        output_file = str(input_path.parent / f"{input_path.stem}_with_main_html.jsonl")

    # Opening the output in 'w' mode truncates it, so writing onto the input
    # would destroy the data before it is read.
    if Path(output_file).resolve() == input_path.resolve():
        print(f"❌ 输出文件不能与输入文件相同: {output_file}")
        return

    print(f"📄 输入文件: {input_file} ")
    print(f"📄 输出文件: {output_file} ")
    if test_first:
        print(f"🧪 测试模式: 仅处理前 {test_first} 条数据")

    # Fail fast when the dependency is missing instead of producing a file
    # full of empty fields.
    print("\n 🔍 检查依赖...")
    try:
        from llm_web_kit.input.pre_data_json import PreDataJson, PreDataJsonKey  # noqa: F401
        from llm_web_kit.main_html_parser.parser.tag_mapping import MapItemToHtmlTagsParser  # noqa: F401
        print("✅ llm_web_kit 模块可用")
    except ImportError as e:
        print(f"❌ llm_web_kit 模块未安装: {e} ")
        print(" 请运行: pip install llm-webkit")
        return

    # Counters: every non-blank line attempted ends up in exactly one of
    # success / failed, so success + failed == total.
    total = 0
    success = 0
    failed = 0

    # Count lines up front so progress can be shown as a percentage.
    print("\n 📊 统计总行数...")
    with open(input_file, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)

    if test_first:
        total_lines = min(total_lines, test_first)

    print(f"📦 总共 {total_lines:,} 条数据\n ")

    print("🔄 开始处理...\n ")
    try:
        with open(input_file, 'r', encoding='utf-8') as fin, \
                open(output_file, 'w', encoding='utf-8') as fout:

            for idx, line in enumerate(fin, 1):
                # Test mode: stop after the first N input lines.
                if test_first and idx > test_first:
                    break

                if not line.strip():
                    continue

                total += 1
                # Reset per line so the error fallback can never re-emit the
                # record parsed on a previous iteration.
                data = None
                # Raw copy of the line, always newline-terminated so a
                # pass-through of the last line cannot glue two records.
                raw_line = line if line.endswith('\n') else line + '\n'
                show_details = verbose and idx <= 3

                # Progress report every 100 attempted records.
                if total % 100 == 0:
                    print(f" 处理进度: {total} /{total_lines} ({total / total_lines * 100:.1f} %)")

                try:
                    data = json.loads(line)

                    if show_details:
                        print(f"\n 处理第 {idx} 条数据...")

                    processed_data = process_single_item(data, verbose=show_details)

                    # Serialise before counting so a serialisation failure is
                    # recorded once (as failed), not as success and failed.
                    out_line = json.dumps(processed_data, ensure_ascii=False) + '\n'

                    # A non-empty field is what counts as success.
                    if processed_data.get('llm_webkit430_main_html'):
                        success += 1
                    else:
                        failed += 1

                    fout.write(out_line)

                except json.JSONDecodeError as e:
                    print(f"\n ⚠️ 行 {idx} JSON解析错误: {e} ")
                    failed += 1
                    # Keep the unparseable line so no input is lost.
                    fout.write(raw_line)
                except Exception as e:
                    print(f"\n ❌ 行 {idx} 处理错误: {e} ")
                    if verbose:
                        print(traceback.format_exc())
                    failed += 1
                    # Prefer the parsed record with an empty field; fall back
                    # to the raw line when that is not possible (e.g. the row
                    # was never parsed or is not a JSON object).
                    try:
                        data['llm_webkit430_main_html'] = ""
                        fallback = json.dumps(data, ensure_ascii=False) + '\n'
                    except Exception:
                        fallback = raw_line
                    fout.write(fallback)

    except Exception as e:
        # I/O level failure (or an error in the fallback write): report it and
        # stop without printing a summary.
        print(f"\n ❌ 处理过程中发生严重错误: {e} ")
        print(traceback.format_exc())
        return

    # Final summary.
    print("\n " + "=" * 60)
    print("✅ 处理完成!")
    print("=" * 60)
    print(f"总处理数: {total:,} ")
    print(f"成功: {success:,} ({success / total * 100:.1f} %)" if total > 0 else "成功: 0")
    print(f"失败: {failed:,} ({failed / total * 100:.1f} %)" if total > 0 else "失败: 0")
    print(f"\n 输出文件: {output_file} ")
    print("=" * 60)
197+
198+
def main():
    """Command-line entry point: parse the arguments and convert the dataset."""
    # Usage examples shown verbatim at the end of ``--help``.
    usage_examples = '''
示例:
# 基本使用
python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl

# 指定输出文件
python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl \\
--output data/WebMainBench_7887_with_main_html.jsonl

# 测试前10条数据
python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl \\
--test-first 10 --verbose

# 详细模式(显示前3条的处理细节)
python scripts/process_dataset.py data/WebMainBench_7887_within_formula_code.jsonl \\
--verbose
'''

    cli = argparse.ArgumentParser(
        description='为数据集添加 llm_webkit430_main_html 字段',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # (flags, keyword settings) for each argument, in the order they are
    # registered with the parser.
    argument_table = (
        (('input_file',), {'help': '输入JSONL文件路径'}),
        (('--output', '-o'), {'help': '输出JSONL文件路径(默认:输入文件名_with_main_html.jsonl)'}),
        (('--verbose', '-v'), {'action': 'store_true', 'help': '显示详细处理信息(仅显示前3条)'}),
        (('--test-first', '-t'), {'type': int, 'help': '仅处理前N条数据(用于测试)'}),
    )
    for flags, settings in argument_table:
        cli.add_argument(*flags, **settings)

    options = cli.parse_args()

    # Hand the parsed options to the dataset processor.
    process_dataset(
        options.input_file,
        options.output,
        options.verbose,
        options.test_first,
    )
251+
252+
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
0 commit comments