@@ -796,13 +796,15 @@ def demo_multi_extraction():
796796 from webmainbench import DataLoader , DataSaver , Evaluator , ExtractorFactory
797797 from pathlib import Path
798798 import time
799+
800+
799801 # 设置日志
800802 setup_logging (level = "INFO" )
801803
802804 # 配置文件路径
803805 data_dir = Path ("../data" )
804- dataset_path = data_dir / "sample_dataset.jsonl"
805- # dataset_path = "/home/lulindong/Pycharm_projects/cc/test .jsonl"
806+ # dataset_path = data_dir / "sample_dataset.jsonl"
807+ dataset_path = "/home/lulindong/Pycharm_projects/cc/WebMainBench_llm-webkit_v1_WebMainBench_dataset_merge_with_llm_webkit .jsonl"
806808
807809 print (f"📂 数据集文件: { dataset_path } " )
808810
@@ -815,7 +817,6 @@ def demo_multi_extraction():
815817 "list_bullets" : True ,
816818 "preserve_formatting" : True
817819 }},
818-
819820 {"name" : "trafilatura" , "config" : {}},
820821 {"name" : "magic-html" , "config" : {}},
821822 ]
@@ -902,7 +903,7 @@ def demo_multi_extraction():
902903 all_results .append (result )
903904
904905 # 保存带有当前抽取器内容的数据集
905- enriched_dataset_path = results_dir / f"{ dataset .name } _with_ { extractor .name } _extraction .jsonl"
906+ enriched_dataset_path = results_dir / f"{ dataset .name } _ { extractor .name } _extraction_infer .jsonl"
906907 DataSaver .save_dataset_with_extraction (
907908 results = result ,
908909 dataset = dataset ,
@@ -1014,7 +1015,7 @@ def forward(self, x):
10141015 {"type" : "code" , "content" : "import torch\n import torch.nn as nn\n \n class SimpleNet(nn.Module):\n def __init__(self):\n super().__init__()\n self.fc = nn.Linear(784, 10)\n \n def forward(self, x):\n return self.fc(x)" , "language" : "python" }
10151016 ]
10161017 }
1017- samples .append (DataSample .from_dict (sample_1_data ))
1018+ # samples.append(DataSample.from_dict(sample_1_data))
10181019
10191020 # 样本2: 包含表格的预处理HTML
10201021 sample_2_data = {
@@ -1063,10 +1064,18 @@ def forward(self, x):
10631064 {"type" : "table" , "content" : "| 模型 | 准确率 | 参数量 |\n |------|--------|---------|\n | ResNet-18 | 95.3% | 11.7M |\n | VGG-16 | 92.7% | 138M |" }
10641065 ]
10651066 }
1066- samples .append (DataSample .from_dict (sample_2_data ))
1067-
1068- # 创建数据集并添加样本
1069- dataset = BenchmarkDataset (name = "preprocessed_html_test" , description = "预处理HTML功能测试数据集" )
1067+ # samples.append(DataSample.from_dict(sample_2_data))
1068+ #
1069+ # # 创建数据集并添加样本
1070+ # dataset = BenchmarkDataset(name="preprocessed_html_test", description="预处理HTML功能测试数据集")
1071+
1072+
1073+
1074+ # 本地加载数据集
1075+ jsonl_file_path = "/home/lulindong/Pycharm_projects/cc/WebMainBench_llm-webkit_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl"
1076+
1077+ # 使用DataLoader加载本地JSONL数据
1078+ dataset = DataLoader .load_jsonl (jsonl_file_path )
10701079 for sample in samples :
10711080 dataset .add_sample (sample )
10721081
0 commit comments