Skip to content

Commit 6b9bb62

Browse files
committed
fix: llm_web_kit extractor
1 parent 30e2650 commit 6b9bb62

File tree

6 files changed

+88
-93
lines changed

6 files changed

+88
-93
lines changed

examples/basic_usage.py

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1015,15 +1015,6 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
10151015
print(f" - 跳过LLM推理: 是(直接处理预处理HTML)")
10161016
print()
10171017

1018-
# 3. 性能对比:展示预处理HTML模式的优势
1019-
print("3. 性能优势演示...")
1020-
print("🚀 预处理HTML模式的优势:")
1021-
print(" ✅ 无需加载大型LLM模型(节省内存)")
1022-
print(" ✅ 跳过HTML简化推理步骤(节省时间)")
1023-
print(" ✅ 只需要基础的llm_web_kit依赖")
1024-
print(" ✅ 适合批量处理已预处理的数据")
1025-
print()
1026-
10271018
# 4. 运行评测
10281019
print("4. 开始评测...")
10291020
print("=" * 50)
@@ -1094,25 +1085,6 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
10941085
print(f"✅ 详细结果已保存到: {results_path}")
10951086
print(f"✅ CSV报告已保存到: {report_path}")
10961087

1097-
# 8. 使用建议
1098-
print(f"\n8. 💡 实际使用建议:")
1099-
print("=" * 50)
1100-
print("🔧 何时使用预处理HTML模式:")
1101-
print(" 1. 已有LLM简化后的HTML数据")
1102-
print(" 2. 需要批量处理大量数据")
1103-
print(" 3. 部署环境内存有限")
1104-
print(" 4. 对提取速度有较高要求")
1105-
print()
1106-
print("📝 数据准备要求:")
1107-
print(" 1. 确保预处理HTML包含_item_id属性")
1108-
print(" 2. 保持原始HTML作为备用")
1109-
print(" 3. 验证预处理质量")
1110-
print()
1111-
print("⚙️ 配置参数说明:")
1112-
print(" - use_preprocessed_html: True/False")
1113-
print(" - preprocessed_html_field: 字段名(默认'llm_webkit_html')")
1114-
1115-
print("\n✅ 预处理HTML功能演示完成!")
11161088

11171089

11181090
if __name__ == "__main__":
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
extractor,dataset,total_samples,success_rate,overall,code_edit,formula_edit,table_TEDS,table_edit,text_edit
2-
llm-webkit,real_preprocessed_html_test,2,1.0,0.1042,0.0,0.0,0.0,0.0,0.521
2+
llm-webkit,real_preprocessed_html_test,2,1.0,0.0091,0.0,0.0,0.0,0.0,0.0453

results/preprocessed_html_evaluation_results.json

Lines changed: 42 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -2,32 +2,32 @@
22
"metadata": {
33
"dataset_name": "real_preprocessed_html_test",
44
"extractor_name": "llm-webkit",
5-
"timestamp": "2025-08-08T16:08:47.516977",
5+
"timestamp": "2025-08-13T14:53:57.558572",
66
"total_samples": 2
77
},
88
"overall_metrics": {
9-
"text_edit": 0.5209619232317955,
9+
"text_edit": 0.045309156685715835,
1010
"code_edit": 0.0,
1111
"table_edit": 0.0,
1212
"table_TEDS": 0.0,
1313
"formula_edit": 0.0,
14-
"overall": 0.10419238464635909
14+
"overall": 0.009061831337143167
1515
},
1616
"sample_results": [
1717
{
1818
"sample_id": "33e291cd-5b26-48b1-977f-3c63b45e6d13",
1919
"extraction_success": true,
20-
"extraction_time": 1.0598080158233643,
20+
"extraction_time": 0.6193361282348633,
2121
"metrics": {
2222
"code_edit": {
2323
"score": 0.0,
2424
"success": true,
2525
"details": {
26-
"distance": 499,
27-
"predicted_length": 499,
26+
"distance": 505,
27+
"predicted_length": 505,
2828
"groundtruth_length": 0,
2929
"normalized": true,
30-
"predicted_code_length": 499,
30+
"predicted_code_length": 505,
3131
"groundtruth_code_length": 0,
3232
"content_type": "code"
3333
}
@@ -43,57 +43,48 @@
4343
"error": "Both predicted and groundtruth are empty"
4444
},
4545
"text_edit": {
46-
"score": 0.16059957173447537,
46+
"score": 0.09025270758122739,
4747
"success": true,
4848
"details": {
49-
"distance": 392,
50-
"predicted_length": 467,
49+
"distance": 252,
50+
"predicted_length": 25,
5151
"groundtruth_length": 277,
5252
"normalized": true,
53-
"predicted_text_length": 467,
53+
"predicted_text_length": 25,
5454
"groundtruth_text_length": 277,
5555
"content_type": "text"
5656
}
5757
},
5858
"table_edit": {
5959
"score": 0.0,
60-
"success": true,
60+
"success": false,
6161
"details": {
62-
"distance": 43,
63-
"predicted_length": 43,
64-
"groundtruth_length": 0,
65-
"normalized": true,
66-
"predicted_table_length": 43,
62+
"predicted_table_length": 0,
6763
"groundtruth_table_length": 0,
6864
"content_type": "table"
69-
}
65+
},
66+
"error": "Both predicted and groundtruth are empty"
7067
},
7168
"table_TEDS": {
7269
"score": 0.0,
73-
"success": true,
70+
"success": false,
7471
"details": {
75-
"edit_distance": 4.0,
76-
"predicted_nodes": 4,
77-
"groundtruth_nodes": 3,
78-
"max_nodes": 4,
79-
"structure_only": false,
80-
"algorithm": "TEDS",
81-
"content_type": "table"
82-
}
72+
"content_type": "table",
73+
"algorithm": "TEDS"
74+
},
75+
"error": "Skipped due to table_edit failure: unknown reason"
8376
},
8477
"overall": {
85-
"score": 0.04014989293361884,
78+
"score": 0.045126353790613694,
8679
"success": true,
8780
"details": {
8881
"source": "average_of_all_metrics",
8982
"description": "Overall score as average of all successful metrics",
90-
"successful_metrics": 4,
91-
"failed_metrics": 1,
83+
"successful_metrics": 2,
84+
"failed_metrics": 3,
9285
"individual_scores": {
9386
"code_edit": 0.0,
94-
"text_edit": 0.16059957173447537,
95-
"table_edit": 0.0,
96-
"table_TEDS": 0.0
87+
"text_edit": 0.09025270758122739
9788
}
9889
}
9990
}
@@ -109,7 +100,7 @@
109100
{
110101
"sample_id": "93898d00-0d6c-451d-9f99-4c386c6c2918",
111102
"extraction_success": true,
112-
"extraction_time": 0.9913830757141113,
103+
"extraction_time": 0.0010640621185302734,
113104
"metrics": {
114105
"code_edit": {
115106
"score": 0.0,
@@ -132,56 +123,47 @@
132123
"error": "Both predicted and groundtruth are empty"
133124
},
134125
"text_edit": {
135-
"score": 0.8813242747291157,
126+
"score": 0.00036560579020428197,
136127
"success": true,
137128
"details": {
138-
"distance": 21730,
139-
"predicted_length": 183104,
129+
"distance": 161317,
130+
"predicted_length": 59,
140131
"groundtruth_length": 161376,
141132
"normalized": true,
142-
"predicted_text_length": 183104,
133+
"predicted_text_length": 59,
143134
"groundtruth_text_length": 161376,
144135
"content_type": "text"
145136
}
146137
},
147138
"table_edit": {
148139
"score": 0.0,
149-
"success": true,
140+
"success": false,
150141
"details": {
151-
"distance": 33,
152-
"predicted_length": 33,
153-
"groundtruth_length": 0,
154-
"normalized": true,
155-
"predicted_table_length": 33,
142+
"predicted_table_length": 0,
156143
"groundtruth_table_length": 0,
157144
"content_type": "table"
158-
}
145+
},
146+
"error": "Both predicted and groundtruth are empty"
159147
},
160148
"table_TEDS": {
161149
"score": 0.0,
162-
"success": true,
150+
"success": false,
163151
"details": {
164-
"edit_distance": 8.0,
165-
"predicted_nodes": 8,
166-
"groundtruth_nodes": 3,
167-
"max_nodes": 8,
168-
"structure_only": false,
169-
"algorithm": "TEDS",
170-
"content_type": "table"
171-
}
152+
"content_type": "table",
153+
"algorithm": "TEDS"
154+
},
155+
"error": "Skipped due to table_edit failure: unknown reason"
172156
},
173157
"overall": {
174-
"score": 0.29377475824303856,
158+
"score": 0.00036560579020428197,
175159
"success": true,
176160
"details": {
177161
"source": "average_of_all_metrics",
178162
"description": "Overall score as average of all successful metrics",
179-
"successful_metrics": 3,
180-
"failed_metrics": 2,
163+
"successful_metrics": 1,
164+
"failed_metrics": 4,
181165
"individual_scores": {
182-
"text_edit": 0.8813242747291157,
183-
"table_edit": 0.0,
184-
"table_TEDS": 0.0
166+
"text_edit": 0.00036560579020428197
185167
}
186168
}
187169
}
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,5 +310,7 @@ def test_preprocessed_html_e2e(self):
310310
self.skipTest(f"LLM-WebKit dependencies not available: {e}")
311311

312312

313+
314+
313315
if __name__ == '__main__':
314316
unittest.main()

webmainbench/evaluator/evaluator.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,9 @@ def _evaluate_sample(self, sample: DataSample, extractor: BaseExtractor) -> Dict
309309
"""Evaluate a single sample."""
310310
if extractor.__class__.__name__ == 'TestModelExtractor':
311311
extraction_result = extractor.extract_from_sample(sample)
312+
elif extractor.__class__.__name__ == 'LlmWebkitExtractor':
313+
# LlmWebkitExtractor可以接受DataSample对象来支持预处理HTML
314+
extraction_result = extractor.extract(sample, sample.url)
312315
else:
313316
# Extract content
314317
extraction_result = extractor.extract(sample.html, sample.url)

webmainbench/extractors/llm_webkit_extractor.py

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -605,14 +605,50 @@ def _extract_content_from_main_html(self, main_html: str, url: str = None) -> tu
605605
print(f"❌ llm-webkit提取失败: {e}")
606606
print(f"❌ 错误详情: {traceback.format_exc()}")
607607
raise RuntimeError(f"llm-webkit提取失败: {str(e)}") from e
608-
609-
608+
609+
610+
def extract(self, html_or_sample, url: str = None) -> ExtractionResult:
    """Extract content from a raw HTML string or from a DataSample.

    Overrides the base ``extract`` so that a whole sample can be passed in,
    which lets the extractor pick a preprocessed HTML field off the sample
    when ``inference_config.use_preprocessed_html`` is enabled.

    Args:
        html_or_sample: Either an HTML string or a DataSample-like object
            exposing ``html`` and ``url`` attributes.
        url: Optional page URL. Only used when ``html_or_sample`` is a plain
            HTML string; for a sample, ``sample.url`` is used instead.

    Returns:
        The ExtractionResult produced by the base class ``extract``.
    """
    # Match on the class name along the MRO rather than on the exact type, so
    # subclasses of DataSample are recognised as samples too. The name-based
    # check is kept (instead of isinstance) so this block does not need
    # DataSample to be in scope.
    is_sample = any(
        cls.__name__ == 'DataSample' for cls in type(html_or_sample).__mro__
    )
    if not is_sample:
        # Plain HTML string: standard processing.
        return super().extract(html_or_sample, url)

    sample = html_or_sample

    if self.inference_config.use_preprocessed_html:
        preprocessed_field = self.inference_config.preprocessed_html_field

        # Single lookup; the sentinel distinguishes "attribute missing" from
        # "attribute present but empty" so each case keeps its own message.
        missing = object()
        preprocessed_html = getattr(sample, preprocessed_field, missing)

        if preprocessed_html is missing:
            print(f"⚠️ 样本中缺少预处理HTML字段 {preprocessed_field},回退到原始HTML")
        elif preprocessed_html:
            print(f"📥 使用预处理HTML字段: {preprocessed_field}")
            return super().extract(preprocessed_html, sample.url)
        else:
            print(f"⚠️ 预处理HTML字段 {preprocessed_field} 为空,回退到原始HTML")

    # Fall back to the sample's original HTML.
    # NOTE(review): when use_preprocessed_html is True, _extract_content treats
    # the HTML it receives as already preprocessed and skips simplification.
    # Presumably super().extract dispatches to _extract_content, in which case
    # this fallback feeds raw HTML down the preprocessed path — confirm and,
    # if so, route the fallback through the full inference pipeline.
    return super().extract(sample.html, sample.url)
645+
610646
def _extract_content(self, html: str, url: str = None) -> ExtractionResult:
611647
"""
612648
使用高级LLM推理提取内容.
613649
614650
Args:
615-
html: HTML内容或主HTML内容(如果配置了use_preprocessed_html
651+
html: HTML内容。如果配置了use_preprocessed_html=True,则由Evaluator传入预处理的HTML内容
616652
url: 可选的页面URL
617653
618654
Returns:
@@ -623,7 +659,7 @@ def _extract_content(self, html: str, url: str = None) -> ExtractionResult:
623659
try:
624660
# 检查是否使用预处理的HTML(跳过HTML简化步骤)
625661
if self.inference_config.use_preprocessed_html:
626-
# 直接使用传入的html作为main_html,调用_extract_content_from_main_html
662+
# 传入的html已经是预处理的内容(由Evaluator从指定字段提取),直接用作main_html
627663
print(f"📥 使用预处理HTML,跳过HTML简化步骤")
628664
content, content_list = self._extract_content_from_main_html(html, url)
629665

0 commit comments

Comments
 (0)