Skip to content

Commit 0784253

Browse files
authored
Merge pull request #19 from pekopoke/dev
Dev:update metrics
2 parents b90b73d + 2460023 commit 0784253

File tree

8 files changed

+549
-154
lines changed

8 files changed

+549
-154
lines changed

examples/basic_usage.py

Lines changed: 18 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -796,13 +796,15 @@ def demo_multi_extraction():
796796
from webmainbench import DataLoader, DataSaver, Evaluator, ExtractorFactory
797797
from pathlib import Path
798798
import time
799+
800+
799801
# 设置日志
800802
setup_logging(level="INFO")
801803

802804
# 配置文件路径
803805
data_dir = Path("../data")
804-
dataset_path = data_dir / "sample_dataset.jsonl"
805-
# dataset_path = "/home/lulindong/Pycharm_projects/cc/test.jsonl"
806+
# dataset_path = data_dir / "sample_dataset.jsonl"
807+
dataset_path = "/home/lulindong/Pycharm_projects/cc/WebMainBench_llm-webkit_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl"
806808

807809
print(f"📂 数据集文件: {dataset_path}")
808810

@@ -815,7 +817,6 @@ def demo_multi_extraction():
815817
"list_bullets": True,
816818
"preserve_formatting": True
817819
}},
818-
819820
{"name": "trafilatura", "config": {}},
820821
{"name": "magic-html", "config": {}},
821822
]
@@ -902,7 +903,7 @@ def demo_multi_extraction():
902903
all_results.append(result)
903904

904905
# 保存带有当前抽取器内容的数据集
905-
enriched_dataset_path = results_dir / f"{dataset.name}_with_{extractor.name}_extraction.jsonl"
906+
enriched_dataset_path = results_dir / f"{dataset.name}_{extractor.name}_extraction_infer.jsonl"
906907
DataSaver.save_dataset_with_extraction(
907908
results=result,
908909
dataset=dataset,
@@ -1014,7 +1015,7 @@ def forward(self, x):
10141015
{"type": "code", "content": "import torch\nimport torch.nn as nn\n\nclass SimpleNet(nn.Module):\n def __init__(self):\n super().__init__()\n self.fc = nn.Linear(784, 10)\n \n def forward(self, x):\n return self.fc(x)", "language": "python"}
10151016
]
10161017
}
1017-
samples.append(DataSample.from_dict(sample_1_data))
1018+
# samples.append(DataSample.from_dict(sample_1_data))
10181019

10191020
# 样本2: 包含表格的预处理HTML
10201021
sample_2_data = {
@@ -1063,10 +1064,18 @@ def forward(self, x):
10631064
{"type": "table", "content": "| 模型 | 准确率 | 参数量 |\n|------|--------|---------|\n| ResNet-18 | 95.3% | 11.7M |\n| VGG-16 | 92.7% | 138M |"}
10641065
]
10651066
}
1066-
samples.append(DataSample.from_dict(sample_2_data))
1067-
1068-
# 创建数据集并添加样本
1069-
dataset = BenchmarkDataset(name="preprocessed_html_test", description="预处理HTML功能测试数据集")
1067+
# samples.append(DataSample.from_dict(sample_2_data))
1068+
#
1069+
# # 创建数据集并添加样本
1070+
# dataset = BenchmarkDataset(name="preprocessed_html_test", description="预处理HTML功能测试数据集")
1071+
1072+
1073+
1074+
# 本地加载数据集
1075+
jsonl_file_path = "/home/lulindong/Pycharm_projects/cc/WebMainBench_llm-webkit_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl"
1076+
1077+
# 使用DataLoader加载本地JSONL数据
1078+
dataset = DataLoader.load_jsonl(jsonl_file_path)
10701079
for sample in samples:
10711080
dataset.add_sample(sample)
10721081

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,2 @@
1-
extractor,dataset,total_samples,success_rate,overall,code_edit,formula_edit,table_TEDS,table_edit,text_edit
2-
llm-webkit,preprocessed_html_test,2,1.0,0.5029,0.5,1.0,0.5,0.5,0.0143
1+
extractor,dataset,total_samples,success_rate,overall,code_edit,formula_edit,table_TEDS,table_edit,text_edit
2+
llm-webkit,preprocessed_html_test,2,1.0,0.5029,0.5,1.0,0.5,0.5,0.0143

0 commit comments

Comments (0)