Skip to content

Commit c4474e0

Browse files
committed
feat: refine data saver
1 parent 1ba2323 commit c4474e0

File tree

6 files changed

+533
-184
lines changed

6 files changed

+533
-184
lines changed

examples/basic_usage.py

Lines changed: 244 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -383,12 +383,12 @@ def _extract_content(self, html, url=None):
383383
results_dir.mkdir(exist_ok=True)
384384

385385
results_path = results_dir / "evaluation_results.json"
386-
DataSaver.save_evaluation_results(result.to_dict(), results_path)
386+
DataSaver.save_evaluation_results(result, results_path)
387387
print(f"\n结果已保存到: {results_path}")
388388

389389
# 10. 生成报告
390390
report_path = results_dir / "evaluation_report.csv"
391-
DataSaver.save_summary_report(result.to_dict(), report_path)
391+
DataSaver.save_summary_report(result, report_path)
392392
print(f"报告已保存到: {report_path}")
393393

394394

@@ -458,10 +458,251 @@ def _extract_content(self, html, url=None):
458458
print(f"\n📊 榜单已保存到: {leaderboard_path}")
459459

460460

461+
def demo_llm_webkit_evaluation(model_path: str = "/Users/chupei/model/checkpoint-3296") -> None:
    """Demonstrate a six-metric evaluation of the LLM-WebKit extractor.

    Builds a three-sample in-memory dataset (text + code, table, formula),
    creates the ``llm-webkit`` extractor, runs the evaluator over every
    sample, prints the overall metrics grouped by category, and writes the
    JSON results and the CSV summary into the ``results/`` directory.

    Args:
        model_path: Value passed to the extractor as ``config["model_path"]``.
            The default is the checkpoint location that was previously
            hard-coded in this function; pass your own path on other
            machines.
    """

    print("=== LLM-WebKit Extractor 6项指标评测示例 ===\n")

    # Configure logging.
    setup_logging(level="INFO")

    # 1. Build a test dataset that covers several content types.
    print("1. 创建包含多种内容类型的测试数据集...")

    samples = []

    # Sample 1: text plus a code block. The snippet inside <pre> is kept
    # flush-left because <pre> content is whitespace-sensitive and should
    # line up with the groundtruth below.
    samples.append(DataSample(
        id="text_code_sample",
        html="""
        <html>
        <body>
            <h1>Python编程示例</h1>
            <p>这是一段关于Python编程的介绍文本。</p>
            <pre><code>
def hello_world():
    print("Hello, World!")
    return True
            </code></pre>
            <p>以上代码展示了一个简单的Python函数。</p>
        </body>
        </html>
        """,
        groundtruth_content="""# Python编程示例

这是一段关于Python编程的介绍文本。

```python
def hello_world():
    print("Hello, World!")
    return True
```

以上代码展示了一个简单的Python函数。""",
        groundtruth_content_list=[
            {"type": "heading", "content": "Python编程示例", "level": 1},
            {"type": "text", "content": "这是一段关于Python编程的介绍文本。"},
            {"type": "code", "content": "def hello_world():\n    print(\"Hello, World!\")\n    return True", "language": "python"},
            {"type": "text", "content": "以上代码展示了一个简单的Python函数。"},
        ],
    ))

    # Sample 2: a table.
    samples.append(DataSample(
        id="table_sample",
        html="""
        <html>
        <body>
            <h2>销售数据统计</h2>
            <table>
                <thead>
                    <tr>
                        <th>产品</th>
                        <th>销量</th>
                        <th>收入</th>
                    </tr>
                </thead>
                <tbody>
                    <tr>
                        <td>产品A</td>
                        <td>100</td>
                        <td>1000</td>
                    </tr>
                    <tr>
                        <td>产品B</td>
                        <td>200</td>
                        <td>3000</td>
                    </tr>
                </tbody>
            </table>
        </body>
        </html>
        """,
        groundtruth_content="""## 销售数据统计

| 产品 | 销量 | 收入 |
|------|------|------|
| 产品A | 100 | 1000 |
| 产品B | 200 | 3000 |""",
        groundtruth_content_list=[
            {"type": "heading", "content": "销售数据统计", "level": 2},
            {"type": "table", "content": "| 产品 | 销量 | 收入 |\n|------|------|------|\n| 产品A | 100 | 1000 |\n| 产品B | 200 | 3000 |"},
        ],
    ))

    # Sample 3: inline and display formulas. Backslashes are doubled because
    # these are ordinary (non-raw) string literals.
    samples.append(DataSample(
        id="formula_sample",
        html="""
        <html>
        <body>
            <h2>数学公式示例</h2>
            <p>这是一个行内公式: $E = mc^2$</p>
            <p>这是一个行间公式:</p>
            <div>$$\\int_{-\\infty}^{\\infty} e^{-x^2} dx = \\sqrt{\\pi}$$</div>
        </body>
        </html>
        """,
        groundtruth_content="""## 数学公式示例

这是一个行内公式: $E = mc^2$

这是一个行间公式:

$$\\int_{-\\infty}^{\\infty} e^{-x^2} dx = \\sqrt{\\pi}$$""",
        groundtruth_content_list=[
            {"type": "heading", "content": "数学公式示例", "level": 2},
            {"type": "text", "content": "这是一个行内公式: $E = mc^2$"},
            {"type": "text", "content": "这是一个行间公式:"},
            {"type": "formula", "content": "\\int_{-\\infty}^{\\infty} e^{-x^2} dx = \\sqrt{\\pi}"},
        ],
    ))

    # Create the dataset and register every sample with it.
    dataset = BenchmarkDataset(name="llm_webkit_test", description="LLM-WebKit 6项指标测试数据集")
    for sample in samples:
        dataset.add_sample(sample)

    print(f"测试数据集包含 {len(dataset)} 个样本")
    print("样本类型: 文本+代码, 表格, 公式\n")

    # 2. Create the LLM-WebKit extractor.
    print("2. 创建LLM-WebKit抽取器...")

    # Show every extractor the factory knows about.
    available_extractors = ExtractorFactory.list_available()
    print(f"可用的抽取器: {available_extractors}")

    # Build the extractor directly, pointing it at the requested checkpoint.
    config = {
        "model_path": model_path,
    }
    extractor = ExtractorFactory.create("llm-webkit", config=config)
    print(f"✅ LLM-WebKit抽取器创建成功,模型路径: {config['model_path']}")

    print()

    # 3. Create the evaluator and report which target metrics are registered.
    print("3. 创建评测器...")
    evaluator = Evaluator()
    available_metrics = evaluator.metric_calculator.list_available_metrics()
    print(f"✅ 可用的评测指标 ({len(available_metrics)}项):")

    # The six metrics this demo is about, in display order.
    target_metrics = ["overall", "text_edit", "code_edit", "table_edit", "table_TEDS", "formula_edit"]

    for metric in target_metrics:
        if metric in available_metrics:
            print(f" ✅ {metric}")
        else:
            print(f" ❌ {metric} (未注册)")

    print()

    # 4. Run the evaluation.
    print("4. 开始评测...")
    print("=" * 60)

    result = evaluator.evaluate(
        dataset=dataset,
        extractor=extractor,
        max_samples=None,  # evaluate every sample
    )

    # 5. Print the six metrics in detail.
    print("\n5. 📊 6项指标详细评测结果:")
    print("=" * 60)

    results_dict = result.to_dict()

    # Metric values are read from the "overall_metrics" entry of the result.
    metrics = results_dict.get('overall_metrics', {})

    # (section title, ((metric key, human-readable label), ...)) — one table
    # instead of a hand-written if/else stanza per metric.
    metric_sections = (
        ("🏆 综合指标:", (("overall", "综合得分"),)),
        ("📝 文本相关指标:", (("text_edit", "文本编辑距离"), ("code_edit", "代码编辑距离"))),
        ("📊 表格相关指标:", (("table_edit", "表格编辑距离"), ("table_TEDS", "表格结构相似度"))),
        ("🧮 公式相关指标:", (("formula_edit", "公式编辑距离"),)),
    )

    for title, entries in metric_sections:
        print(f"\n{title}")
        for key, label in entries:
            if key in metrics:
                print(f" {key} ({label}): {metrics[key]:.4f}")
            else:
                print(f" {key}: 未计算")

    print("\n📈 详细统计:")
    print(f" 总样本数: {len(dataset)}")
    # Count successes without materialising an intermediate list.
    success_count = sum(
        1
        for sample_result in results_dict.get('sample_results', [])
        if sample_result.get('extraction_success', False)
    )
    failure_count = len(dataset) - success_count
    print(f" 成功样本数: {success_count}")
    print(f" 失败样本数: {failure_count}")

    # 6. Persist the results.
    print("\n" + "=" * 60)
    print("6. 保存评测结果...")

    results_dir = Path("results")
    results_dir.mkdir(exist_ok=True)

    # Detailed results (the result object is passed as-is, not its dict form).
    results_path = results_dir / "llm_webkit_evaluation_results.json"
    DataSaver.save_evaluation_results(result, results_path)
    print(f"✅ 详细结果已保存到: {results_path}")

    # CSV summary report (again passing the result object directly).
    report_path = results_dir / "llm_webkit_evaluation_report.csv"
    DataSaver.save_summary_report(result, report_path)
    print(f"✅ CSV报告已保存到: {report_path}")

    print("\n" + "=" * 60)
    print("✅ LLM-WebKit 6项指标评测完成!")
701+
461702
if __name__ == "__main__":
462703
try:
463704
demo_basic_evaluation()
464-
# demo_extractor_comparison()
705+
# demo_llm_webkit_evaluation() # 使用新的LLM-WebKit评测示例
465706
print("\n✅ 示例运行完成!")
466707

467708
except Exception as e:

results/evaluation_report.csv

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
extractor,total_samples,success_rate,overall
2-
mock,2,1.0,0.8
1+
extractor,total_samples,success_rate,overall,code_edit,formula_edit,table_TEDS,table_edit,text_edit
2+
mock,2,1.0,0.8,1.0,1.0,1.0,1.0,0.0

results/evaluation_results.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"metadata": {
33
"dataset_name": "sample_dataset",
44
"extractor_name": "mock",
5-
"timestamp": "2025-07-30T10:05:54.322334",
5+
"timestamp": "2025-07-31T14:29:43.477342",
66
"total_samples": 2
77
},
88
"overall_metrics": {
@@ -17,7 +17,7 @@
1717
{
1818
"sample_id": "sample-001-programming-tutorial",
1919
"extraction_success": true,
20-
"extraction_time": 3.0994415283203125e-06,
20+
"extraction_time": 4.0531158447265625e-06,
2121
"metrics": {
2222
"code_edit": {
2323
"score": 1.0,
@@ -113,7 +113,7 @@
113113
{
114114
"sample_id": "sample-002-math-formulas",
115115
"extraction_success": true,
116-
"extraction_time": 2.1457672119140625e-06,
116+
"extraction_time": 1.9073486328125e-06,
117117
"metrics": {
118118
"code_edit": {
119119
"score": 1.0,

0 commit comments

Comments
 (0)