Skip to content

Commit 40f0617

Browse files
authored
Merge pull request #42 from pekopoke/dev
fix bug:table 重复
2 parents b34a46f + 5bb6dd4 commit 40f0617

File tree

5 files changed

+40
-47
lines changed

5 files changed

+40
-47
lines changed

tests/test_metrics.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -874,8 +874,8 @@ def test_html_table_edit_distance(self):
874874
# 验证表格编辑距离(分隔符长度差异导致的固定分数)
875875
self.assertIn("table_edit", results)
876876
self.assertTrue(results["table_edit"].success)
877-
self.assertAlmostEqual(results["table_edit"].score, 0.48314306100606497, places=5,
878-
msg=f"table_edit分数应该是0.48314306100606497,实际: {results['table_edit'].score}")
877+
self.assertAlmostEqual(results["table_edit"].score, 0.593573, places=5,
878+
msg=f"table_edit分数应该是0.593573,实际: {results['table_edit'].score}")
879879

880880
# 验证TEDS指标(表格结构完全相同,满分)
881881
self.assertIn("table_TEDS", results)

tests/test_table_extraction.py

Lines changed: 7 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -205,34 +205,17 @@ def test_table_with_complex_content(self):
205205
| 包含*斜体* | 包含$公式$ | 包含>引用 |"""
206206
self.assertIn(expected_table, result['table'])
207207

208-
def test_nested_html_tables(self):
209-
"""测试嵌套HTML表格"""
210-
text = """嵌套表格:
211-
<table>
212-
<tr><td>外层表格</td></tr>
213-
<tr><td>
214-
<table>
215-
<tr><td>内层表格</td></tr>
216-
</table>
217-
</td></tr>
218-
</table>"""
208+
def test_abnormal_html_table(self):
209+
"""测试复杂html表格,不要重复抽取"""
210+
text = """<table><tbody><tr><td><table><tbody><tr><td><table><tbody><tr><td><strong>Better Management of /$800 Bln Forex Reserves Urged</strong></td></tr></tbody></table></td></tr><tr><td><p>A number of political advisors on Sunday called for more rationally managing China's massive foreign exchange reserves, which doubled over the 2004-05 period to an equivalent of US/$818.9 billion, second only to Japan.</p><p>The quick buildup is largely a result of China's booming exports and foreign exchange controls by the government, as well as speculation on the yuan's rise, industry watchers agree.</p><p>A big part of China's foreign exchange reserves are US dollar-denominated assets, including bonds issued by the US government. "Risks in the international foreign exchange market should be lowered when China manages its reserves," said Professor Guo Guoqing of a business school of the People's University of China.</p><p>Guo, a member of the National Committee of the Chinese People's Political Consultative Conference (CPPCC), the country's top advisory body, urged the government to cut back on subsidies for exports and take other measures to reduce foreign trade surpluses appropriately and achieve the balance in international payments.</p><p>Part of the reserves should be channeled into the imports of more high-tech machinery, equipment and other products, he suggested on the sidelines of the CPPCC's annual session.</p><p>The United States has been contending that the value of yuan, also known as renminbi or RMB, is too low, giving Chinese exporters an "unfair" advantage. 
But China said its huge trade surpluses are also a result of the US reluctance to export goods involving state-of-the-art technologies.</p><p>Fu Rui, also a CPPCC member, said with ample foreign exchange reserves, China could intentionally bulk up the reserves of strategic resources.</p><p>The international consensus is a country's rational foreign exchange reserves should equal to its imports demand for a full quarter. Also taking into consideration of payments for foreign debts, returns for foreign investors and other demands in China, many believe it is enough for the country to retain US/$300 billion.</p><p>But Lin Yifu, a popular economist, underscored China's per capita foreign exchange reserves remains not large - less than one-tenth of Japan's and far below that of Hong Kong and Singapore.</p><p>The reserves were "tremendous fruits" from China's reform and opening-up drive, he said.</p><p>His remarks were echoed by Xiao Zhuoji, a well-known economics professor with Beijing University. 
"The rise of foreign exchange reserves reflects China's fast, sustained economic growth and sound international payments," he said.</p><p>"The reserves are of significant importance to upgrade the China image in the international economic arena, strengthen the nation's macro-control capabilities and guard against financial risks," added Xiao, a Standing Committee member of the CPPCC National Committee.</p><p>But as the People's Bank of China, or the central bank, has to buy foreign exchange reserves under the current foreign exchange control policies, the country's monetary base will be enlarged, increasing its inflationary pressure and difficulties on macro-economic controls, analysts acknowledge.</p><p>Another prevailing view is that China's hefty foreign exchange reserves actually "occupied" large amounts of fund resources that otherwise can be diverted for domestic investment and consumption.</p><p>Some CPPCC members said they believe it is already "meaningless" now to talk about whether China's foreign exchange reserves size is big or not. "The key lies on how to raise the reserves' yields."</p><p>"If the annual yields from foreign exchange reserves could reach a stable 5 percent, the nation will reap in 300 billion yuan a year. What a big fortune!" one advisor told Xinhua.</p><p>Central banker Zhou Xiaochuan reiterated earlier that China will "pay attention to and maintain the flexibility" of foreign reserves structure, which is unknown to the public.</p></td></tr></tbody></table></td></tr></tbody></table>"""
219211

220212
result = self.metric._extract_from_markdown(text)
221-
print("result['table']",result['table'])
222-
# 验证嵌套表格被完整提取
223-
expected_table = """<table>
224-
<tr><td>外层表格</td></tr>
225-
<tr><td>
226-
<table>
227-
<tr><td>内层表格</td></tr>
228-
</table>
229-
</td></tr>
230-
</table>
231-
<table>
232-
<tr><td>内层表格</td></tr>
233-
</table>"""
213+
214+
# 验证复杂表格被完整提取
215+
expected_table = """<table><tbody><tr><td><table><tbody><tr><td><table><tbody><tr><td><strong>Better Management of /$800 Bln Forex Reserves Urged</strong></td></tr></tbody></table></td></tr><tr><td><p>A number of political advisors on Sunday called for more rationally managing China's massive foreign exchange reserves, which doubled over the 2004-05 period to an equivalent of US/$818.9 billion, second only to Japan.</p><p>The quick buildup is largely a result of China's booming exports and foreign exchange controls by the government, as well as speculation on the yuan's rise, industry watchers agree.</p><p>A big part of China's foreign exchange reserves are US dollar-denominated assets, including bonds issued by the US government. "Risks in the international foreign exchange market should be lowered when China manages its reserves," said Professor Guo Guoqing of a business school of the People's University of China.</p><p>Guo, a member of the National Committee of the Chinese People's Political Consultative Conference (CPPCC), the country's top advisory body, urged the government to cut back on subsidies for exports and take other measures to reduce foreign trade surpluses appropriately and achieve the balance in international payments.</p><p>Part of the reserves should be channeled into the imports of more high-tech machinery, equipment and other products, he suggested on the sidelines of the CPPCC's annual session.</p><p>The United States has been contending that the value of yuan, also known as renminbi or RMB, is too low, giving Chinese exporters an "unfair" advantage. 
But China said its huge trade surpluses are also a result of the US reluctance to export goods involving state-of-the-art technologies.</p><p>Fu Rui, also a CPPCC member, said with ample foreign exchange reserves, China could intentionally bulk up the reserves of strategic resources.</p><p>The international consensus is a country's rational foreign exchange reserves should equal to its imports demand for a full quarter. Also taking into consideration of payments for foreign debts, returns for foreign investors and other demands in China, many believe it is enough for the country to retain US/$300 billion.</p><p>But Lin Yifu, a popular economist, underscored China's per capita foreign exchange reserves remains not large - less than one-tenth of Japan's and far below that of Hong Kong and Singapore.</p><p>The reserves were "tremendous fruits" from China's reform and opening-up drive, he said.</p><p>His remarks were echoed by Xiao Zhuoji, a well-known economics professor with Beijing University. 
"The rise of foreign exchange reserves reflects China's fast, sustained economic growth and sound international payments," he said.</p><p>"The reserves are of significant importance to upgrade the China image in the international economic arena, strengthen the nation's macro-control capabilities and guard against financial risks," added Xiao, a Standing Committee member of the CPPCC National Committee.</p><p>But as the People's Bank of China, or the central bank, has to buy foreign exchange reserves under the current foreign exchange control policies, the country's monetary base will be enlarged, increasing its inflationary pressure and difficulties on macro-economic controls, analysts acknowledge.</p><p>Another prevailing view is that China's hefty foreign exchange reserves actually "occupied" large amounts of fund resources that otherwise can be diverted for domestic investment and consumption.</p><p>Some CPPCC members said they believe it is already "meaningless" now to talk about whether China's foreign exchange reserves size is big or not. "The key lies on how to raise the reserves' yields."</p><p>"If the annual yields from foreign exchange reserves could reach a stable 5 percent, the nation will reap in 300 billion yuan a year. What a big fortune!" one advisor told Xinhua.</p><p>Central banker Zhou Xiaochuan reiterated earlier that China will "pay attention to and maintain the flexibility" of foreign reserves structure, which is unknown to the public.</p></td></tr></tbody></table></td></tr></tbody></table>"""
234216
self.assertIn(expected_table, result['table'])
235217

236218

219+
237220
if __name__ == '__main__':
238221
unittest.main()

webmainbench/extractors/resiliparse_extractor.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,19 @@
1111
@dataclass
1212
class ResiliparseInferenceConfig:
1313
"""Configuration for Resiliparse extractor."""
14-
main_content: bool = True
15-
alt_texts: bool = True
16-
links: bool = False
17-
form_fields: bool = False
18-
noscript: bool = False
19-
list_bullets: bool = True
20-
preserve_formatting: bool = True
21-
comments: bool = True
14+
main_content: bool = True # 是否只提取主要内容,默认开启(丢弃<nav>(导航列表)、<aside>(侧边栏)、<footer>(页脚/版权信息)等)
15+
alt_texts: bool = True # 是否提取 <img> 的 alt 属性文本,默认开启
16+
links: bool = False # 是否提取超链接,默认关闭
17+
form_fields: bool = False # 是否提取表单控件,默认关闭
18+
noscript: bool = False # 是否提取 <noscript> 标签的内容,默认关闭
19+
list_bullets: bool = True # 是否用 • 标记列表项,默认开启
20+
preserve_formatting: bool = True ## 控制格式保留:True(默认):保留列表、换行等基础格式,False:完全压缩(无换行、无列表,所有文本连在一起)
21+
comments: bool = True # 是否保留用户评论,默认开启
22+
post_meta: bool = True # 是否保留文章元信息,默认开启
23+
hidden_elements: bool = False # 是否保留CSS隐藏元素,默认关闭
24+
25+
26+
2227
# 可根据需要添加更多resiliparse支持的参数
2328

2429

webmainbench/extractors/trafilatura_extractor.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,22 +6,22 @@
66
from dataclasses import dataclass
77
from .base import BaseExtractor, ExtractionResult
88
from .factory import extractor
9-
from trafilatura import extract
9+
from trafilatura import extract,html2txt
1010
import re
1111

1212

1313
@dataclass
1414
class TrafilaturaInferenceConfig:
1515
"""Configuration for Trafilatura extractor."""
16-
favor_precision: bool = True
17-
favor_recall: bool = True
18-
include_comments: bool = False
19-
include_tables: bool = True
20-
# 可根据需要添加更多trafilatura支持的参数
21-
include_images: bool = False
22-
include_links: bool = False
23-
# 新增:支持的输出格式(txt/markdown/json/xml等)
24-
output_format: str = "markdown" # 默认保持纯文本
16+
favor_precision: bool = True # 优先精度:只提取最核心的内容,过滤更多冗余(如侧边栏、广告),默认开启
17+
favor_recall: bool = True # 优先召回:尽可能提取所有潜在有效内容,减少遗漏,默认开启
18+
include_comments: bool = False # 是否保留评论,默认关闭
19+
include_tables: bool = True # 是否保留提取html表格,默认开启
20+
include_images: bool = False # 是否保留提取图片信息,默认关闭
21+
include_links: bool = False # 是否保留链接,默认关闭
22+
with_metadata: bool = False # 是否保留元信息,默认关闭
23+
skip_elements: bool = False # 是否保留CSS隐藏元素,默认关闭
24+
output_format: str = "markdown" # 支持多种格式输出:"csv", "json", "html", "markdown", "txt", "xml"等
2525

2626

2727
@extractor("trafilatura")
@@ -68,7 +68,9 @@ def _extract_content(self, html: str, url: str = None) -> ExtractionResult:
6868
include_tables=self.inference_config.include_tables,
6969
include_images=self.inference_config.include_images,
7070
include_links=self.inference_config.include_links,
71+
with_metadata=self.inference_config.with_metadata,
7172
output_format=self.inference_config.output_format # 传入输出格式
73+
7274
)
7375

7476
# 创建 content_list(简单分割段落)

webmainbench/metrics/base.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -326,9 +326,12 @@ def _extract_from_markdown(text: str) -> Dict[str, str]:
326326
# 用 BeautifulSoup 替代正则,防止嵌套或匹配不全
327327
soup = BeautifulSoup(text, "html.parser")
328328
for table in soup.find_all("table"):
329-
html_table = str(table)
330-
extracted_segments.append(html_table)
331-
table_parts.append(html_table)
329+
# 判断当前表格是否嵌套在其他表格内:find_parent 会向上查找所有祖先节点(不只是直接父级),只要存在 <td>、<tr>、<tbody>、<table> 等表格相关祖先即视为内层表格
330+
parent_is_table_related = table.find_parent(["td", "tr", "tbody", "table"]) is not None
331+
if not parent_is_table_related: # 父级不是表格相关标签 → 是外层表格
332+
html_table = str(table)
333+
extracted_segments.append(html_table)
334+
table_parts.append(html_table)
332335

333336
# ===== 2. 提取 Markdown 表格 =====
334337
lines = text.split('\n')

0 commit comments

Comments
 (0)