Skip to content

Commit d90a1e6

Browse files
authored
Merge pull request #34 from e06084/main
feat: add multi extractor compare script
2 parents 3cdd9e6 + 41d903b commit d90a1e6

File tree

9 files changed

+1676
-165
lines changed

9 files changed

+1676
-165
lines changed

README.md

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ print(f"Overall Score: {result.overall_metrics['overall']:.4f}")
103103

104104
## 高级功能
105105

106-
### 多抽取器对比
106+
### 多抽取器对比评估
107107

108108
```python
109109
# 对比多个抽取器
@@ -114,6 +114,43 @@ for name, result in results.items():
114114
print(f"{name}: {result.overall_metrics['overall']:.4f}")
115115
```
116116

117+
#### 具体示例
118+
119+
```bash
120+
python examples/multi_extractor_compare.py
121+
```
122+
123+
这个例子演示了如何:
124+
125+
1. **加载测试数据集**:使用包含代码、公式、表格、文本等多种内容类型的样本数据
126+
2. **创建多个抽取器**:
127+
- `llm-webkit`:支持预处理HTML的智能抽取器
128+
- `magic-html`:基于 magic-html 库的抽取器
129+
- `trafilatura`:基于 trafilatura 库的抽取器
130+
- `resiliparse`:基于 resiliparse 库的抽取器
131+
3. **批量评估对比**:使用 `evaluator.compare_extractors()` 同时评估所有抽取器
132+
4. **生成对比报告**:自动保存多种格式的评估结果
133+
134+
#### 输出文件说明
135+
136+
评估完成后会在 `results/` 目录下生成三个重要文件:
137+
138+
| 文件名 | 格式 | 内容描述 |
139+
|--------|------|----------|
140+
| `leaderboard.csv` | CSV | **排行榜文件**:包含各抽取器的整体排名和分项指标对比,便于快速查看性能差异 |
141+
| `evaluation_results.json` | JSON | **详细评估结果**:包含每个抽取器的完整评估数据、指标详情和元数据信息 |
142+
| `dataset_with_results.jsonl` | JSONL | **增强数据集**:原始测试数据加上所有抽取器的提取结果,便于人工检查和分析 |
143+
144+
145+
`leaderboard.csv` 内容示例:
146+
```csv
147+
extractor,dataset,total_samples,success_rate,overall,code_edit,formula_edit,table_TEDS,table_edit,text_edit
148+
llm-webkit,sample_dataset,4,1.0,0.2196,0.5,0.0,0.0,0.0,0.5982
149+
magic-html,sample_dataset,4,1.0,0.1526,0.1007,0.0,0.0,0.0,0.6624
150+
resiliparse,sample_dataset,4,1.0,0.1379,0.0,0.0,0.0,0.0,0.6897
151+
trafilatura,sample_dataset,4,1.0,0.1151,0.1007,0.0,0.0,0.0,0.4746
152+
```
153+
117154
### 自定义指标
118155

119156
```python

data/sample_dataset.jsonl

Lines changed: 4 additions & 4 deletions
Large diffs are not rendered by default.

examples/basic_usage.py

Lines changed: 1 addition & 114 deletions
Original file line numberDiff line numberDiff line change
@@ -392,72 +392,6 @@ def _extract_content(self, html, url=None):
392392
print(f"报告已保存到: {report_path}")
393393

394394

395-
def demo_extractor_comparison():
396-
"""演示多抽取器对比"""
397-
398-
print("\n=== 多抽取器对比演示 ===\n")
399-
400-
# 创建数据集
401-
dataset = create_sample_dataset()
402-
403-
# 创建多个模拟抽取器
404-
from webmainbench.extractors import BaseExtractor, ExtractionResult
405-
406-
class ExtractorA(BaseExtractor):
407-
def _setup(self):
408-
pass
409-
def _extract_content(self, html, url=None):
410-
return ExtractionResult(
411-
content="抽取器A的结果",
412-
# content_list=[{"type": "paragraph", "content": "抽取器A的结果"}],
413-
success=True,
414-
confidence_score=0.9
415-
)
416-
417-
class ExtractorB(BaseExtractor):
418-
def _setup(self):
419-
pass
420-
def _extract_content(self, html, url=None):
421-
return ExtractionResult(
422-
content="抽取器B的结果",
423-
# content_list=[{"type": "paragraph", "content": "抽取器B的结果"}],
424-
success=True,
425-
confidence_score=0.8
426-
)
427-
428-
# 注册抽取器
429-
ExtractorFactory.register("extractor_a", ExtractorA)
430-
ExtractorFactory.register("extractor_b", ExtractorB)
431-
432-
# 运行对比
433-
evaluator = Evaluator()
434-
extractors = ["extractor_a", "extractor_b"]
435-
436-
results = evaluator.compare_extractors(
437-
dataset=dataset,
438-
extractors=extractors,
439-
max_samples=2
440-
)
441-
442-
# 显示对比结果
443-
print("对比结果:")
444-
print("-" * 40)
445-
for extractor_name, result in results.items():
446-
overall_score = result.overall_metrics.get('overall', 0)
447-
print(f"{extractor_name}: {overall_score:.4f}")
448-
449-
# 保存多抽取器对比榜单
450-
all_results = []
451-
for extractor_name, result in results.items():
452-
all_results.append(result.to_dict())
453-
454-
results_dir = Path("results")
455-
results_dir.mkdir(exist_ok=True)
456-
leaderboard_path = results_dir / "leaderboard.csv"
457-
DataSaver.save_summary_report(all_results, leaderboard_path)
458-
print(f"\n📊 榜单已保存到: {leaderboard_path}")
459-
460-
461395
def demo_llm_webkit_evaluation():
462396
"""演示LLM-WebKit抽取器的6项指标评测"""
463397

@@ -955,16 +889,9 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
955889

956890
# 1. 从真实数据集加载包含预处理HTML的数据
957891
print("1. 从真实数据集加载预处理HTML数据...")
958-
959-
# 使用DataLoader加载真实的样本数据
960-
dataset_path = Path("data/WebMainBench_dataset_merge_with_llm_webkit.jsonl")
892+
dataset_path = Path("data/WebMainBench_1827_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl")
961893
print(f"📂 数据集文件: {dataset_path}")
962894

963-
if not dataset_path.exists():
964-
print(f"❌ 数据文件不存在: {dataset_path}")
965-
print("请确保已运行数据提取命令创建样本数据集")
966-
return
967-
968895
# 加载数据集
969896
dataset = DataLoader.load_jsonl(dataset_path, include_results=False)
970897
dataset.name = "real_preprocessed_html_test"
@@ -977,26 +904,6 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
977904
print(" - groundtruth_content: 人工标注的标准答案")
978905
print(" - llm_webkit_md: LLM提取的markdown内容")
979906

980-
# 显示第一个样本的预览
981-
if len(dataset.samples) > 0:
982-
first_sample = dataset.samples[0]
983-
sample_dict = first_sample.to_dict()
984-
985-
print(f"\n🔍 第一个样本预览:")
986-
print(f" - ID: {sample_dict.get('track_id', 'N/A')}")
987-
print(f" - URL: {sample_dict.get('url', 'N/A')[:60]}...")
988-
989-
# 检查是否有llm_webkit_html字段
990-
if hasattr(first_sample, 'llm_webkit_html') or 'llm_webkit_html' in sample_dict:
991-
llm_html = getattr(first_sample, 'llm_webkit_html', sample_dict.get('llm_webkit_html', ''))
992-
if llm_html:
993-
print(f" - 预处理HTML长度: {len(llm_html)} 字符")
994-
print(f" - 包含_item_id数量: {llm_html.count('_item_id')}")
995-
else:
996-
print(f" - ⚠️ 预处理HTML字段为空")
997-
else:
998-
print(f" - ❌ 未找到llm_webkit_html字段")
999-
print()
1000907

1001908
# 2. 创建预处理HTML模式的LLM-WebKit抽取器
1002909
print("2. 创建预处理HTML模式的LLM-WebKit抽取器...")
@@ -1007,12 +914,6 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
1007914
}
1008915

1009916
extractor = ExtractorFactory.create("llm-webkit", config=config)
1010-
print(f"✅ 抽取器创建成功")
1011-
print(f"📋 配置信息:")
1012-
print(f" - use_preprocessed_html: {extractor.inference_config.use_preprocessed_html}")
1013-
print(f" - preprocessed_html_field: {extractor.inference_config.preprocessed_html_field}")
1014-
print(f" - 跳过LLM推理: 是(直接处理预处理HTML)")
1015-
print()
1016917

1017918
# 4. 运行评测
1018919
print("4. 开始评测...")
@@ -1054,20 +955,6 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
1054955
success_count = len([s for s in sample_results if s.get('extraction_success', False)])
1055956
print(f" 成功样本数: {success_count}/{len(dataset)}")
1056957

1057-
# 6. 展示样本提取结果
1058-
print(f"\n6. 📄 样本提取结果预览:")
1059-
print("-" * 50)
1060-
1061-
for i, sample_result in enumerate(sample_results[:2]): # 只显示前2个样本
1062-
print(f"\n样本 {i+1}: {sample_result.get('sample_id', 'Unknown')}")
1063-
if sample_result.get('extraction_success'):
1064-
content = sample_result.get('extracted_content', '')
1065-
preview = content[:100].replace('\n', ' ') if content else '无内容'
1066-
print(f" ✅ 提取成功")
1067-
print(f" 📝 内容预览: {preview}...")
1068-
print(f" ⏱️ 提取时间: {sample_result.get('extraction_time', 0):.3f}秒")
1069-
else:
1070-
print(f" ❌ 提取失败")
1071958
# 7. 保存结果
1072959
print(f"\n7. 💾 保存评测结果...")
1073960

examples/demo.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from webmainbench import DataLoader, Evaluator, ExtractorFactory
2+
from pathlib import Path
23

34
# 1. 加载评测数据集
4-
dataset = DataLoader.load_jsonl("../data/sample_dataset.jsonl")
5+
dataset = DataLoader.load_jsonl(Path("data/sample_dataset.jsonl"))
56

67
# 2. 创建抽取器
78
extractor = ExtractorFactory.create("llm-webkit")
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
from webmainbench import DataLoader, Evaluator, ExtractorFactory, DataSaver
from pathlib import Path


def all_extractor_comparison():
    """Demonstrate multi-extractor comparison.

    Loads the sample dataset, evaluates four extractors
    (llm-webkit, magic-html, trafilatura, resiliparse) with
    ``Evaluator.compare_extractors``, prints the overall score per
    extractor, and saves three result files under ``results/``:
    leaderboard.csv, evaluation_results.json and
    dataset_with_results.jsonl.
    """
    print("\n=== 多抽取器对比演示 ===\n")

    # Load the evaluation dataset.
    dataset_path = Path("data/sample_dataset.jsonl")
    dataset = DataLoader.load_jsonl(dataset_path)

    # Create the llm-webkit extractor.
    config = {
        "use_preprocessed_html": True,  # 🔑 key setting: enable preprocessed-HTML mode
        "preprocessed_html_field": "llm_webkit_html",  # field holding the preprocessed HTML
    }
    webkit_extractor = ExtractorFactory.create("llm-webkit", config=config)
    # Create the magic-html extractor.
    magic_extractor = ExtractorFactory.create("magic-html")
    # Create the trafilatura extractor.
    trafilatura_extractor = ExtractorFactory.create("trafilatura")
    # Create the resiliparse extractor.
    resiliparse_extractor = ExtractorFactory.create("resiliparse")

    # Run the comparison over all extractors.
    evaluator = Evaluator()
    extractors = [webkit_extractor, magic_extractor, trafilatura_extractor, resiliparse_extractor]

    results = evaluator.compare_extractors(
        dataset=dataset,
        extractors=extractors
    )

    # Show per-extractor overall scores.
    print("对比结果:")
    print("-" * 40)
    for extractor_name, result in results.items():
        overall_score = result.overall_metrics.get('overall', 0)
        print(f"{extractor_name}: {overall_score:.4f}")

    # Collect serializable results for the leaderboard and reports.
    # (Only the values are needed here — keys are the extractor names.)
    all_results = [result.to_dict() for result in results.values()]

    results_dir = Path("results")
    results_dir.mkdir(exist_ok=True)
    leaderboard_path = results_dir / "leaderboard.csv"
    evaluation_results_path = results_dir / "evaluation_results.json"
    # Fixed name — no interpolation needed (was a placeholder-free f-string).
    jsonl_dataset_path = results_dir / "dataset_with_results.jsonl"
    DataSaver.save_summary_report(all_results, leaderboard_path)
    DataSaver.save_evaluation_results(all_results, evaluation_results_path)
    DataSaver.save_dataset_with_extraction(
        results=all_results,
        dataset=dataset,  # the original dataset object
        file_path=jsonl_dataset_path
    )
    # Report every file that was written, not just the leaderboard.
    print(f"\n📊 榜单已保存到: {leaderboard_path}")
    print(f"📄 详细评估结果已保存到: {evaluation_results_path}")
    print(f"🗂️ 增强数据集已保存到: {jsonl_dataset_path}")


if __name__ == "__main__":
    all_extractor_comparison()

results/dataset_with_results.jsonl

Lines changed: 4 additions & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)