Skip to content

Commit 480b889

Browse files
authored
Merge pull request opendatalab#37 from darkrush/main_html
Main html
2 parents cfbfd02 + a389021 commit 480b889

File tree

13 files changed

+795
-5
lines changed

13 files changed

+795
-5
lines changed

examples/main_html_eval.py

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
#!/usr/bin/env python3
2+
"""
3+
WebMainBench 基本使用示例
4+
"""
5+
6+
import json
7+
from pathlib import Path
8+
9+
# 导入 WebMainBench 模块
10+
from webmainbench import (
11+
DataLoader, DataSaver, BenchmarkDataset, DataSample,
12+
ExtractorFactory, MainHTMLEvaluator,
13+
format_results, setup_logging
14+
)
15+
16+
17+
def load_benchdata(dataset_path: str) -> BenchmarkDataset:
18+
dataset_path = Path(dataset_path)
19+
print(f"📂 数据集文件: {dataset_path}")
20+
21+
if not dataset_path.exists():
22+
print(f"❌ 数据文件不存在: {dataset_path}")
23+
print("请确保已运行数据提取命令创建样本数据集")
24+
return
25+
26+
# 加载数据集
27+
dataset = DataLoader.load_jsonl(dataset_path, include_results=False)
28+
dataset.name = "real_preprocessed_html_test"
29+
dataset.description = "基于真实数据的预处理HTML功能测试"
30+
return dataset
31+
32+
33+
def load_extractor(model_path: str):
    """Create a "dripper" extractor configured with the given model path.

    Args:
        model_path: Filesystem path to the LLM model used by the extractor.

    Returns:
        The extractor instance produced by ExtractorFactory.
    """
    return ExtractorFactory.create("dripper", config={"model_path": model_path})
36+
37+
38+
def save_results(result_file: Path, results: list[dict]):
    """Write each result dict as one JSON line (JSONL) to *result_file*.

    Args:
        result_file: Destination path; overwritten if it exists.
        results: List of JSON-serializable result records.
    """
    # Serialize lazily and hand the whole stream to writelines in one go.
    lines = (json.dumps(record, ensure_ascii=False) + "\n" for record in results)
    with result_file.open("w", encoding="utf-8") as fh:
        fh.writelines(lines)
42+
43+
44+
45+
def demo_llm_webkit_with_preprocessed_html_evaluation(model_path: str):
    """Demo: evaluate LLM-WebKit main-HTML extraction on preprocessed HTML.

    Loads a JSONL benchmark dataset, runs the "dripper" extractor through
    MainHTMLEvaluator, prints the overall metrics plus simple timing stats,
    and saves per-sample results, detailed results, and a CSV report under
    ./results.

    Args:
        model_path: Path to the LLM model used by the extractor.
    """

    print("\n=== LLM-WebKit 预处理HTML功能演示 ===\n")

    # Configure logging for the run.
    setup_logging(level="INFO")

    # 1. Load the real dataset containing preprocessed HTML.
    print("1. 从真实数据集加载预处理HTML数据...")

    dataset = load_benchdata("data/WebMainBench_llm-webkit_v1_WebMainBench_1827_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl")
    print(f"✅ 真实数据集加载成功,包含 {len(dataset)} 个样本")

    # 2. Create the LLM-WebKit extractor in preprocessed-HTML mode.
    print("2. 创建预处理HTML模式的LLM-WebKit抽取器...")

    extractor = load_extractor(model_path)
    print(f"✅ 抽取器创建成功")
    print(f"📋 配置信息:")
    print(f" - 跳过LLM推理: 是(直接处理预处理HTML)")
    print()

    # 3. Run the evaluation over the whole dataset (max_samples=None).
    print("4. 开始评测...")
    print("=" * 50)

    evaluator = MainHTMLEvaluator()
    result = evaluator.evaluate(
        dataset=dataset,
        extractor=extractor,
        max_samples=None
    )

    # 4. Display the evaluation results.
    print("\n5. 📊 预处理HTML模式评测结果:")
    print("=" * 50)

    results_dict = result.to_dict()
    metrics = results_dict.get('overall_metrics', {})

    # Overall metrics.
    print(f"\n🏆 综合指标:")
    for key, value in metrics.items():
        print(f" {key}: {value:.4f}")

    print(f"\n⚡ 性能统计:")
    sample_results = results_dict.get('sample_results', [])
    if sample_results:
        extraction_times = [s.get('extraction_time', 0) for s in sample_results if s.get('extraction_success')]
        if extraction_times:
            avg_time = sum(extraction_times) / len(extraction_times)
            print(f" 平均提取时间: {avg_time:.3f}秒")
            # Guard: if every recorded time is 0, 1/avg_time previously
            # raised ZeroDivisionError.
            if avg_time > 0:
                print(f" 处理速度: {1/avg_time:.1f}样本/秒")

        success_count = len([s for s in sample_results if s.get('extraction_success', False)])
        print(f" 成功样本数: {success_count}/{len(dataset)}")

    # 5. Persist the results.
    print(f"\n6. 💾 保存评测结果...")

    results_dir = Path("results")
    results_dir.mkdir(exist_ok=True)
    # Save the enriched dataset (with extraction results) in JSONL format.
    jsonl_dataset_path = results_dir / f"{extractor.name}_preprocessed_html_dataset_with_results.jsonl"
    save_results(jsonl_dataset_path, result.sample_results)
    # NOTE(review): the original printed two near-identical "saved" messages
    # for this same file; only one is kept.
    print(f"✅ 带抽取结果的JSONL数据集已保存到: {jsonl_dataset_path}")

    results_path = results_dir / f"{extractor.name}_preprocessed_html_evaluation_results.json"
    report_path = results_dir / f"{extractor.name}_preprocessed_html_evaluation_report.csv"

    DataSaver.save_evaluation_results(result, results_path)
    DataSaver.save_summary_report(result, report_path)

    print(f"✅ 详细结果已保存到: {results_path}")
    print(f"✅ CSV报告已保存到: {report_path}")
127+
128+
129+
130+
if __name__ == "__main__":
    # CLI entry point: require --model_path, run the demo, and report any
    # failure with a full traceback instead of crashing silently.
    from argparse import ArgumentParser

    cli = ArgumentParser(description="WebMainBench 基本使用示例")
    cli.add_argument("--model_path", required=True, help="LLM model路径")
    options = cli.parse_args()
    try:
        demo_llm_webkit_with_preprocessed_html_evaluation(options.model_path)
        print("\n✅ 示例运行完成!")
    except Exception as e:
        print(f"\n❌ 运行出错: {e}")
        import traceback
        traceback.print_exc()

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,5 @@ trafilatura
1010
# llm-web-kit==3.2.0
1111
https://github.com/opendatalab/magic-html/releases/download/magic_html-0.1.5-released/magic_html-0.1.5-py3-none-any.whl
1212
streamlit
13-
markdown
13+
markdown
14+
jieba

webmainbench/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from .data import DataLoader, DataSaver, BenchmarkDataset, DataSample
1212
from .extractors import BaseExtractor, ExtractorFactory, ExtractionResult
1313
from .metrics import BaseMetric, MetricCalculator, MetricResult
14-
from .evaluator import Evaluator, EvaluationResult
14+
from .evaluator import Evaluator, EvaluationResult, MainHTMLEvaluator
1515
from .utils import setup_logging, format_results
1616

1717
__all__ = [
@@ -29,4 +29,5 @@
2929
"EvaluationResult",
3030
"setup_logging",
3131
"format_results",
32+
"MainHTMLEvaluator"
3233
]

webmainbench/evaluator/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,9 @@
55
"""
66

77
from .evaluator import Evaluator, EvaluationResult
8-
8+
from .main_html_evaluator import MainHTMLEvaluator
99
__all__ = [
1010
"Evaluator",
1111
"EvaluationResult",
12+
"MainHTMLEvaluator"
1213
]

0 commit comments

Comments
 (0)