Skip to content

Commit 4cf0655

Browse files
authored
Merge pull request #27 from e06084/main
fix: llm_web_kit extractor
2 parents 27245c7 + e46a245 commit 4cf0655

File tree

4 files changed

+32
-100
lines changed

4 files changed

+32
-100
lines changed

data/WebMainBench_dataset_sample2.jsonl

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
extractor,dataset,total_samples,success_rate,overall,code_edit,formula_edit,table_TEDS,table_edit,text_edit
2-
llm-webkit,real_preprocessed_html_test,2,1.0,0.0091,0.0,0.0,0.0,0.0,0.0453
2+
llm-webkit,real_preprocessed_html_test,2,0.5,0.0181,0.0,0.0,0.0,0.0,0.0903

results/preprocessed_html_evaluation_results.json

Lines changed: 19 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,22 @@
22
"metadata": {
33
"dataset_name": "real_preprocessed_html_test",
44
"extractor_name": "llm-webkit",
5-
"timestamp": "2025-08-13T14:53:57.558572",
5+
"timestamp": "2025-08-14T11:22:33.005587",
66
"total_samples": 2
77
},
88
"overall_metrics": {
9-
"text_edit": 0.045309156685715835,
9+
"text_edit": 0.09025270758122739,
1010
"code_edit": 0.0,
1111
"table_edit": 0.0,
1212
"table_TEDS": 0.0,
1313
"formula_edit": 0.0,
14-
"overall": 0.009061831337143167
14+
"overall": 0.018050541516245477
1515
},
1616
"sample_results": [
1717
{
1818
"sample_id": "33e291cd-5b26-48b1-977f-3c63b45e6d13",
1919
"extraction_success": true,
20-
"extraction_time": 0.6193361282348633,
20+
"extraction_time": 0.6739089488983154,
2121
"metrics": {
2222
"code_edit": {
2323
"score": 0.0,
@@ -99,91 +99,26 @@
9999
},
100100
{
101101
"sample_id": "93898d00-0d6c-451d-9f99-4c386c6c2918",
102-
"extraction_success": true,
103-
"extraction_time": 0.0010640621185302734,
104-
"metrics": {
105-
"code_edit": {
106-
"score": 0.0,
107-
"success": false,
108-
"details": {
109-
"predicted_code_length": 0,
110-
"groundtruth_code_length": 0,
111-
"content_type": "code"
112-
},
113-
"error": "Both predicted and groundtruth are empty"
114-
},
115-
"formula_edit": {
116-
"score": 0.0,
117-
"success": false,
118-
"details": {
119-
"predicted_formula_length": 0,
120-
"groundtruth_formula_length": 0,
121-
"content_type": "formula"
122-
},
123-
"error": "Both predicted and groundtruth are empty"
124-
},
125-
"text_edit": {
126-
"score": 0.00036560579020428197,
127-
"success": true,
128-
"details": {
129-
"distance": 161317,
130-
"predicted_length": 59,
131-
"groundtruth_length": 161376,
132-
"normalized": true,
133-
"predicted_text_length": 59,
134-
"groundtruth_text_length": 161376,
135-
"content_type": "text"
136-
}
137-
},
138-
"table_edit": {
139-
"score": 0.0,
140-
"success": false,
141-
"details": {
142-
"predicted_table_length": 0,
143-
"groundtruth_table_length": 0,
144-
"content_type": "table"
145-
},
146-
"error": "Both predicted and groundtruth are empty"
147-
},
148-
"table_TEDS": {
149-
"score": 0.0,
150-
"success": false,
151-
"details": {
152-
"content_type": "table",
153-
"algorithm": "TEDS"
154-
},
155-
"error": "Skipped due to table_edit failure: unknown reason"
156-
},
157-
"overall": {
158-
"score": 0.00036560579020428197,
159-
"success": true,
160-
"details": {
161-
"source": "average_of_all_metrics",
162-
"description": "Overall score as average of all successful metrics",
163-
"successful_metrics": 1,
164-
"failed_metrics": 4,
165-
"individual_scores": {
166-
"text_edit": 0.00036560579020428197
167-
}
168-
}
169-
}
170-
},
171-
"sample_metadata": {
172-
"url": "https://www.15shuba.net/html/58/58618/index.html",
173-
"domain": null,
174-
"language": "zh",
175-
"content_type": null,
176-
"difficulty": null
177-
}
102+
"extraction_success": false,
103+
"extraction_time": 0.0,
104+
"extraction_error": "Empty HTML input",
105+
"metrics": {}
178106
}
179107
],
180108
"category_metrics": null,
181109
"error_analysis": {
182110
"total_samples": 2,
183-
"failed_count": 0,
184-
"success_rate": 1.0,
185-
"common_errors": {},
186-
"sample_errors": []
111+
"failed_count": 1,
112+
"success_rate": 0.5,
113+
"common_errors": {
114+
"empty_input": 1
115+
},
116+
"sample_errors": [
117+
{
118+
"sample_id": "93898d00-0d6c-451d-9f99-4c386c6c2918",
119+
"error": "Empty HTML input"
120+
}
121+
]
187122
},
188123
"extractor_config": {
189124
"use_preprocessed_html": true,

webmainbench/extractors/llm_webkit_extractor.py

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -623,22 +623,19 @@ def extract(self, html_or_sample, url: str = None) -> ExtractionResult:
623623
sample = html_or_sample
624624

625625
# 检查是否使用预处理的HTML
626-
if self.inference_config.use_preprocessed_html:
627-
preprocessed_field = self.inference_config.preprocessed_html_field
628-
629-
# 从sample中获取预处理的HTML内容
630-
if hasattr(sample, preprocessed_field):
631-
preprocessed_html = getattr(sample, preprocessed_field)
632-
if preprocessed_html:
626+
try:
627+
if self.inference_config.use_preprocessed_html:
628+
preprocessed_field = self.inference_config.preprocessed_html_field
629+
630+
# 从sample中获取预处理的HTML内容
631+
if hasattr(sample, preprocessed_field):
632+
preprocessed_html = getattr(sample, preprocessed_field)
633633
print(f"📥 使用预处理HTML字段: {preprocessed_field}")
634634
return super().extract(preprocessed_html, sample.url)
635-
else:
636-
print(f"⚠️ 预处理HTML字段 {preprocessed_field} 为空,回退到原始HTML")
637-
else:
638-
print(f"⚠️ 样本中缺少预处理HTML字段 {preprocessed_field},回退到原始HTML")
639-
640-
# 使用原始HTML
641-
return super().extract(sample.html, sample.url)
635+
except Exception as e:
636+
return ExtractionResult.create_error_result(
637+
f"访问预处理HTML字段 {preprocessed_field} 时发生异常: {str(e)}"
638+
)
642639
else:
643640
# 这是普通的HTML字符串,使用标准处理
644641
return super().extract(html_or_sample, url)

0 commit comments

Comments
 (0)