39 changes: 38 additions & 1 deletion README.md
@@ -103,7 +103,7 @@ print(f"Overall Score: {result.overall_metrics['overall']:.4f}")

## Advanced Features

### Multi-Extractor Comparison
### Multi-Extractor Comparison Evaluation

```python
# Compare multiple extractors
results = evaluator.compare_extractors(dataset, extractors)
for name, result in results.items():
    print(f"{name}: {result.overall_metrics['overall']:.4f}")
```

#### Concrete Example

```bash
python examples/multi_extractor_compare.py
```

This example demonstrates how to:

1. **Load the test dataset**: uses sample data that mixes several content types, including code, formulas, tables, and plain text
2. **Create multiple extractors**:
   - `llm-webkit`: an intelligent extractor that supports preprocessed HTML
   - `magic-html`: an extractor based on the magic-html library
   - `trafilatura`: an extractor based on the trafilatura library
   - `resiliparse`: an extractor based on the resiliparse library
3. **Run a batch comparison**: uses `evaluator.compare_extractors()` to evaluate all extractors in one pass
4. **Generate comparison reports**: automatically saves the evaluation results in several formats (see the sketch below)
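
For reference, here is a condensed sketch of those four steps, adapted from `examples/multi_extractor_compare.py` in this repository (the paths, config keys, and API calls follow that script):

```python
from pathlib import Path
from webmainbench import DataLoader, DataSaver, Evaluator, ExtractorFactory

# Load the sample dataset shipped with the repo
dataset = DataLoader.load_jsonl(Path("data/sample_dataset.jsonl"))

# llm-webkit runs in preprocessed-HTML mode; the other extractors need no config
webkit = ExtractorFactory.create("llm-webkit", config={
    "use_preprocessed_html": True,
    "preprocessed_html_field": "llm_webkit_html",
})
others = [ExtractorFactory.create(name)
          for name in ("magic-html", "trafilatura", "resiliparse")]

# Evaluate every extractor against the same dataset
results = Evaluator().compare_extractors(
    dataset=dataset,
    extractors=[webkit, *others],
)

# Persist the leaderboard
Path("results").mkdir(exist_ok=True)
DataSaver.save_summary_report(
    [r.to_dict() for r in results.values()],
    Path("results/leaderboard.csv"),
)
```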

#### Output Files

After the evaluation completes, three key files are written to the `results/` directory:

| File | Format | Description |
|------|--------|-------------|
| `leaderboard.csv` | CSV | **Leaderboard**: overall ranking plus per-metric scores for each extractor, for a quick view of performance differences |
| `evaluation_results.json` | JSON | **Detailed evaluation results**: complete evaluation data, metric details, and metadata for each extractor |
| `dataset_with_results.jsonl` | JSONL | **Augmented dataset**: the original test data together with every extractor's output, convenient for manual inspection and analysis |


Example `leaderboard.csv` contents:
```csv
extractor,dataset,total_samples,success_rate,overall,code_edit,formula_edit,table_TEDS,table_edit,text_edit
llm-webkit,sample_dataset,4,1.0,0.2196,0.5,0.0,0.0,0.0,0.5982
magic-html,sample_dataset,4,1.0,0.1526,0.1007,0.0,0.0,0.0,0.6624
resiliparse,sample_dataset,4,1.0,0.1379,0.0,0.0,0.0,0.0,0.6897
trafilatura,sample_dataset,4,1.0,0.1151,0.1007,0.0,0.0,0.0,0.4746
```
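
To post-process the leaderboard yourself, a minimal standard-library sketch could look like this (the `results/leaderboard.csv` path and the `extractor`/`overall` column names are taken from the example above):

```python
import csv
from pathlib import Path

# Rank extractors by the "overall" score, best first
with Path("results/leaderboard.csv").open(newline="") as f:
    rows = sorted(csv.DictReader(f),
                  key=lambda row: float(row["overall"]),
                  reverse=True)

for rank, row in enumerate(rows, start=1):
    print(f"{rank}. {row['extractor']}: {row['overall']}")
```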

### Custom Metrics

```python
# ...
```
8 changes: 4 additions & 4 deletions data/sample_dataset.jsonl

Large diffs are not rendered by default.

115 changes: 1 addition & 114 deletions examples/basic_usage.py
@@ -392,72 +392,6 @@ def _extract_content(self, html, url=None):
print(f"报告已保存到: {report_path}")


def demo_extractor_comparison():
    """Demonstrate multi-extractor comparison"""

    print("\n=== Multi-Extractor Comparison Demo ===\n")

    # Create the dataset
    dataset = create_sample_dataset()

    # Create several mock extractors
    from webmainbench.extractors import BaseExtractor, ExtractionResult

    class ExtractorA(BaseExtractor):
        def _setup(self):
            pass
        def _extract_content(self, html, url=None):
            return ExtractionResult(
                content="Result from extractor A",
                # content_list=[{"type": "paragraph", "content": "Result from extractor A"}],
                success=True,
                confidence_score=0.9
            )

    class ExtractorB(BaseExtractor):
        def _setup(self):
            pass
        def _extract_content(self, html, url=None):
            return ExtractionResult(
                content="Result from extractor B",
                # content_list=[{"type": "paragraph", "content": "Result from extractor B"}],
                success=True,
                confidence_score=0.8
            )

    # Register the extractors
    ExtractorFactory.register("extractor_a", ExtractorA)
    ExtractorFactory.register("extractor_b", ExtractorB)

    # Run the comparison
    evaluator = Evaluator()
    extractors = ["extractor_a", "extractor_b"]

    results = evaluator.compare_extractors(
        dataset=dataset,
        extractors=extractors,
        max_samples=2
    )

    # Display the comparison results
    print("Comparison results:")
    print("-" * 40)
    for extractor_name, result in results.items():
        overall_score = result.overall_metrics.get('overall', 0)
        print(f"{extractor_name}: {overall_score:.4f}")

    # Save the multi-extractor leaderboard
    all_results = []
    for extractor_name, result in results.items():
        all_results.append(result.to_dict())

    results_dir = Path("results")
    results_dir.mkdir(exist_ok=True)
    leaderboard_path = results_dir / "leaderboard.csv"
    DataSaver.save_summary_report(all_results, leaderboard_path)
    print(f"\n📊 Leaderboard saved to: {leaderboard_path}")


def demo_llm_webkit_evaluation():
    """Demonstrate the 6-metric evaluation of the LLM-WebKit extractor"""

@@ -955,16 +889,9 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():

    # 1. Load data containing preprocessed HTML from the real dataset
    print("1. Loading preprocessed HTML data from the real dataset...")

    # Load the real sample data with DataLoader
    dataset_path = Path("data/WebMainBench_dataset_merge_with_llm_webkit.jsonl")
    dataset_path = Path("data/WebMainBench_1827_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl")
    print(f"📂 Dataset file: {dataset_path}")

    if not dataset_path.exists():
        print(f"❌ Data file does not exist: {dataset_path}")
        print("Make sure the data-extraction command has been run to create the sample dataset")
        return

    # Load the dataset
    dataset = DataLoader.load_jsonl(dataset_path, include_results=False)
    dataset.name = "real_preprocessed_html_test"
@@ -977,26 +904,6 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
    print(" - groundtruth_content: manually annotated ground truth")
    print(" - llm_webkit_md: markdown content extracted by the LLM")

    # Show a preview of the first sample
    if len(dataset.samples) > 0:
        first_sample = dataset.samples[0]
        sample_dict = first_sample.to_dict()

        print(f"\n🔍 First sample preview:")
        print(f" - ID: {sample_dict.get('track_id', 'N/A')}")
        print(f" - URL: {sample_dict.get('url', 'N/A')[:60]}...")

        # Check for the llm_webkit_html field
        if hasattr(first_sample, 'llm_webkit_html') or 'llm_webkit_html' in sample_dict:
            llm_html = getattr(first_sample, 'llm_webkit_html', sample_dict.get('llm_webkit_html', ''))
            if llm_html:
                print(f" - Preprocessed HTML length: {len(llm_html)} characters")
                print(f" - Number of _item_id occurrences: {llm_html.count('_item_id')}")
            else:
                print(f" - ⚠️ Preprocessed HTML field is empty")
        else:
            print(f" - ❌ llm_webkit_html field not found")
    print()

    # 2. Create the LLM-WebKit extractor in preprocessed-HTML mode
    print("2. Creating the LLM-WebKit extractor in preprocessed-HTML mode...")
@@ -1007,12 +914,6 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
    }

    extractor = ExtractorFactory.create("llm-webkit", config=config)
    print(f"✅ Extractor created successfully")
    print(f"📋 Configuration:")
    print(f" - use_preprocessed_html: {extractor.inference_config.use_preprocessed_html}")
    print(f" - preprocessed_html_field: {extractor.inference_config.preprocessed_html_field}")
    print(f" - Skip LLM inference: yes (the preprocessed HTML is processed directly)")
    print()

    # 4. Run the evaluation
    print("4. Starting evaluation...")
@@ -1054,20 +955,6 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
    success_count = len([s for s in sample_results if s.get('extraction_success', False)])
    print(f" Successful samples: {success_count}/{len(dataset)}")

    # 6. Show sample extraction results
    print(f"\n6. 📄 Sample extraction preview:")
    print("-" * 50)

    for i, sample_result in enumerate(sample_results[:2]):  # show only the first 2 samples
        print(f"\nSample {i+1}: {sample_result.get('sample_id', 'Unknown')}")
        if sample_result.get('extraction_success'):
            content = sample_result.get('extracted_content', '')
            preview = content[:100].replace('\n', ' ') if content else 'no content'
            print(f" ✅ Extraction succeeded")
            print(f" 📝 Content preview: {preview}...")
            print(f" ⏱️ Extraction time: {sample_result.get('extraction_time', 0):.3f}s")
        else:
            print(f" ❌ Extraction failed")
    # 7. Save results
    print(f"\n7. 💾 Saving evaluation results...")

3 changes: 2 additions & 1 deletion examples/demo.py
@@ -1,7 +1,8 @@
from webmainbench import DataLoader, Evaluator, ExtractorFactory
from pathlib import Path

# 1. Load the evaluation dataset
dataset = DataLoader.load_jsonl("../data/sample_dataset.jsonl")
dataset = DataLoader.load_jsonl(Path("data/sample_dataset.jsonl"))

# 2. Create the extractor
extractor = ExtractorFactory.create("llm-webkit")
64 changes: 64 additions & 0 deletions examples/multi_extractor_compare.py
@@ -0,0 +1,64 @@
from webmainbench import DataLoader, Evaluator, ExtractorFactory, DataSaver
from pathlib import Path


def all_extractor_comparison():
    """Demonstrate multi-extractor comparison"""

    print("\n=== Multi-Extractor Comparison Demo ===\n")

    # Load the dataset
    dataset_path = Path("data/sample_dataset.jsonl")
    dataset = DataLoader.load_jsonl(dataset_path)

    # Create the llm-webkit extractor
    config = {
        "use_preprocessed_html": True,  # 🔑 key setting: enable preprocessed-HTML mode
        "preprocessed_html_field": "llm_webkit_html"  # name of the field holding the preprocessed HTML
    }
    webkit_extractor = ExtractorFactory.create("llm-webkit", config=config)
    # Create the magic-html extractor
    magic_extractor = ExtractorFactory.create("magic-html")
    # Create the trafilatura extractor
    trafilatura_extractor = ExtractorFactory.create("trafilatura")
    # Create the resiliparse extractor
    resiliparse_extractor = ExtractorFactory.create("resiliparse")

    # Run the comparison
    evaluator = Evaluator()
    extractors = [webkit_extractor, magic_extractor, trafilatura_extractor, resiliparse_extractor]

    results = evaluator.compare_extractors(
        dataset=dataset,
        extractors=extractors
    )

    # Display the comparison results
    print("Comparison results:")
    print("-" * 40)
    for extractor_name, result in results.items():
        overall_score = result.overall_metrics.get('overall', 0)
        print(f"{extractor_name}: {overall_score:.4f}")

    # Save the comparison outputs (leaderboard, detailed results, augmented dataset)
    all_results = []
    for extractor_name, result in results.items():
        all_results.append(result.to_dict())

    results_dir = Path("results")
    results_dir.mkdir(exist_ok=True)
    leaderboard_path = results_dir / "leaderboard.csv"
    evaluation_results_path = results_dir / "evaluation_results.json"
    jsonl_dataset_path = results_dir / "dataset_with_results.jsonl"
    DataSaver.save_summary_report(all_results, leaderboard_path)
    DataSaver.save_evaluation_results(all_results, evaluation_results_path)
    DataSaver.save_dataset_with_extraction(
        results=all_results,
        dataset=dataset,  # the original dataset object
        file_path=jsonl_dataset_path
    )
    print(f"\n📊 Leaderboard saved to: {leaderboard_path}")


if __name__ == "__main__":
    all_extractor_comparison()
4 changes: 4 additions & 0 deletions results/dataset_with_results.jsonl

Large diffs are not rendered by default.
