@@ -392,72 +392,6 @@ def _extract_content(self, html, url=None):
392392 print (f"报告已保存到: { report_path } " )
393393
394394
395- def demo_extractor_comparison ():
396- """演示多抽取器对比"""
397-
398- print ("\n === 多抽取器对比演示 ===\n " )
399-
400- # 创建数据集
401- dataset = create_sample_dataset ()
402-
403- # 创建多个模拟抽取器
404- from webmainbench .extractors import BaseExtractor , ExtractionResult
405-
406- class ExtractorA (BaseExtractor ):
407- def _setup (self ):
408- pass
409- def _extract_content (self , html , url = None ):
410- return ExtractionResult (
411- content = "抽取器A的结果" ,
412- # content_list=[{"type": "paragraph", "content": "抽取器A的结果"}],
413- success = True ,
414- confidence_score = 0.9
415- )
416-
417- class ExtractorB (BaseExtractor ):
418- def _setup (self ):
419- pass
420- def _extract_content (self , html , url = None ):
421- return ExtractionResult (
422- content = "抽取器B的结果" ,
423- # content_list=[{"type": "paragraph", "content": "抽取器B的结果"}],
424- success = True ,
425- confidence_score = 0.8
426- )
427-
428- # 注册抽取器
429- ExtractorFactory .register ("extractor_a" , ExtractorA )
430- ExtractorFactory .register ("extractor_b" , ExtractorB )
431-
432- # 运行对比
433- evaluator = Evaluator ()
434- extractors = ["extractor_a" , "extractor_b" ]
435-
436- results = evaluator .compare_extractors (
437- dataset = dataset ,
438- extractors = extractors ,
439- max_samples = 2
440- )
441-
442- # 显示对比结果
443- print ("对比结果:" )
444- print ("-" * 40 )
445- for extractor_name , result in results .items ():
446- overall_score = result .overall_metrics .get ('overall' , 0 )
447- print (f"{ extractor_name } : { overall_score :.4f} " )
448-
449- # 保存多抽取器对比榜单
450- all_results = []
451- for extractor_name , result in results .items ():
452- all_results .append (result .to_dict ())
453-
454- results_dir = Path ("results" )
455- results_dir .mkdir (exist_ok = True )
456- leaderboard_path = results_dir / "leaderboard.csv"
457- DataSaver .save_summary_report (all_results , leaderboard_path )
458- print (f"\n 📊 榜单已保存到: { leaderboard_path } " )
459-
460-
461395def demo_llm_webkit_evaluation ():
462396 """演示LLM-WebKit抽取器的6项指标评测"""
463397
@@ -955,16 +889,9 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
955889
956890 # 1. 从真实数据集加载包含预处理HTML的数据
957891 print ("1. 从真实数据集加载预处理HTML数据..." )
958-
959- # 使用DataLoader加载真实的样本数据
960- dataset_path = Path ("data/WebMainBench_dataset_merge_with_llm_webkit.jsonl" )
892+ dataset_path = Path ("data/WebMainBench_1827_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl" )
961893 print (f"📂 数据集文件: { dataset_path } " )
962894
963- if not dataset_path .exists ():
964- print (f"❌ 数据文件不存在: { dataset_path } " )
965- print ("请确保已运行数据提取命令创建样本数据集" )
966- return
967-
968895 # 加载数据集
969896 dataset = DataLoader .load_jsonl (dataset_path , include_results = False )
970897 dataset .name = "real_preprocessed_html_test"
@@ -977,26 +904,6 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
977904 print (" - groundtruth_content: 人工标注的标准答案" )
978905 print (" - llm_webkit_md: LLM提取的markdown内容" )
979906
980- # 显示第一个样本的预览
981- if len (dataset .samples ) > 0 :
982- first_sample = dataset .samples [0 ]
983- sample_dict = first_sample .to_dict ()
984-
985- print (f"\n 🔍 第一个样本预览:" )
986- print (f" - ID: { sample_dict .get ('track_id' , 'N/A' )} " )
987- print (f" - URL: { sample_dict .get ('url' , 'N/A' )[:60 ]} ..." )
988-
989- # 检查是否有llm_webkit_html字段
990- if hasattr (first_sample , 'llm_webkit_html' ) or 'llm_webkit_html' in sample_dict :
991- llm_html = getattr (first_sample , 'llm_webkit_html' , sample_dict .get ('llm_webkit_html' , '' ))
992- if llm_html :
993- print (f" - 预处理HTML长度: { len (llm_html )} 字符" )
994- print (f" - 包含_item_id数量: { llm_html .count ('_item_id' )} " )
995- else :
996- print (f" - ⚠️ 预处理HTML字段为空" )
997- else :
998- print (f" - ❌ 未找到llm_webkit_html字段" )
999- print ()
1000907
1001908 # 2. 创建预处理HTML模式的LLM-WebKit抽取器
1002909 print ("2. 创建预处理HTML模式的LLM-WebKit抽取器..." )
@@ -1007,12 +914,6 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
1007914 }
1008915
1009916 extractor = ExtractorFactory .create ("llm-webkit" , config = config )
1010- print (f"✅ 抽取器创建成功" )
1011- print (f"📋 配置信息:" )
1012- print (f" - use_preprocessed_html: { extractor .inference_config .use_preprocessed_html } " )
1013- print (f" - preprocessed_html_field: { extractor .inference_config .preprocessed_html_field } " )
1014- print (f" - 跳过LLM推理: 是(直接处理预处理HTML)" )
1015- print ()
1016917
1017918 # 4. 运行评测
1018919 print ("4. 开始评测..." )
@@ -1054,20 +955,6 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
1054955 success_count = len ([s for s in sample_results if s .get ('extraction_success' , False )])
1055956 print (f" 成功样本数: { success_count } /{ len (dataset )} " )
1056957
1057- # 6. 展示样本提取结果
1058- print (f"\n 6. 📄 样本提取结果预览:" )
1059- print ("-" * 50 )
1060-
1061- for i , sample_result in enumerate (sample_results [:2 ]): # 只显示前2个样本
1062- print (f"\n 样本 { i + 1 } : { sample_result .get ('sample_id' , 'Unknown' )} " )
1063- if sample_result .get ('extraction_success' ):
1064- content = sample_result .get ('extracted_content' , '' )
1065- preview = content [:100 ].replace ('\n ' , ' ' ) if content else '无内容'
1066- print (f" ✅ 提取成功" )
1067- print (f" 📝 内容预览: { preview } ..." )
1068- print (f" ⏱️ 提取时间: { sample_result .get ('extraction_time' , 0 ):.3f} 秒" )
1069- else :
1070- print (f" ❌ 提取失败" )
1071958 # 7. 保存结果
1072959 print (f"\n 7. 💾 保存评测结果..." )
1073960
0 commit comments