Skip to content

Commit eb55029

Browse files
authored
Merge pull request #4318 from myhloli/dev
Dev
2 parents 1dfbea1 + 2eef53a commit eb55029

File tree

1 file changed

+30
-9
lines changed

1 file changed

+30
-9
lines changed

mineru/utils/table_merge.py

Lines changed: 30 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
from mineru.utils.enum_class import BlockType, SplitFlag
1010

1111

12-
CONTINUATION_MARKERS = [
12+
CONTINUATION_END_MARKERS = [
1313
"(续)",
1414
"(续表)",
1515
"(续上表)",
@@ -18,6 +18,10 @@
1818
"(cont’d)",
1919
]
2020

21+
CONTINUATION_INLINE_MARKERS = [
22+
"(continued)",
23+
]
24+
2125

2226
def calculate_table_total_columns(soup):
2327
"""计算表格的总列数,通过分析整个表格结构来处理rowspan和colspan
@@ -163,20 +167,32 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
163167
def can_merge_tables(current_table_block, previous_table_block):
164168
"""判断两个表格是否可以合并"""
165169
# 检查表格是否有caption和footnote
170+
# 计算previous_table_block中的footnote数量
171+
footnote_count = sum(1 for block in previous_table_block["blocks"] if block["type"] == BlockType.TABLE_FOOTNOTE)
166172
# 如果有TABLE_CAPTION类型的块,检查是否至少有一个以"(续)"结尾
167173
caption_blocks = [block for block in current_table_block["blocks"] if block["type"] == BlockType.TABLE_CAPTION]
168174
if caption_blocks:
169-
# 如果所有caption都不以"(续)"、"(续表)"、"(continued)"或"(cont.)"结尾,则不合并
175+
# 检查是否至少有一个caption包含续表标识
176+
has_continuation_marker = False
177+
for block in caption_blocks:
178+
caption_text = full_to_half(merge_para_with_text(block).strip()).lower()
179+
if (
180+
any(caption_text.endswith(marker.lower()) for marker in CONTINUATION_END_MARKERS)
181+
or any(marker.lower() in caption_text for marker in CONTINUATION_INLINE_MARKERS)
182+
):
183+
has_continuation_marker = True
184+
break
170185

171-
if not any(
172-
any(full_to_half(merge_para_with_text(block).strip()).lower().endswith(marker.lower())
173-
for marker in CONTINUATION_MARKERS)
174-
for block in caption_blocks
175-
):
186+
# 如果所有caption都不包含续表标识,则不允许合并
187+
if not has_continuation_marker:
176188
return False, None, None, None, None
177189

178-
if any(block["type"] == BlockType.TABLE_FOOTNOTE for block in previous_table_block["blocks"]):
179-
return False, None, None, None, None
190+
# 如果current_table_block的caption存在续标识,放宽footnote的限制允许previous_table_block有最多一条footnote
191+
if footnote_count > 1:
192+
return False, None, None, None, None
193+
else:
194+
if footnote_count > 0:
195+
return False, None, None, None, None
180196

181197
# 获取两个表格的HTML内容
182198
current_html = ""
@@ -363,6 +379,11 @@ def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_foo
363379
row.extract()
364380
tbody1.append(row)
365381

382+
# 清空previous_table_block的footnote
383+
previous_table_block["blocks"] = [
384+
block for block in previous_table_block["blocks"]
385+
if block["type"] != BlockType.TABLE_FOOTNOTE
386+
]
366387
# 添加待合并表格的footnote到前一个表格中
367388
for table_footnote in wait_merge_table_footnotes:
368389
temp_table_footnote = table_footnote.copy()

0 commit comments

Comments
 (0)