|
9 | 9 | from mineru.utils.enum_class import BlockType, SplitFlag |
10 | 10 |
|
11 | 11 |
|
12 | | -CONTINUATION_MARKERS = [ |
| 12 | +CONTINUATION_END_MARKERS = [ |
13 | 13 | "(续)", |
14 | 14 | "(续表)", |
15 | 15 | "(续上表)", |
|
18 | 18 | "(cont’d)", |
19 | 19 | ] |
20 | 20 |
|
| 21 | +CONTINUATION_INLINE_MARKERS = [ |
| 22 | + "(continued)", |
| 23 | +] |
| 24 | + |
21 | 25 |
|
22 | 26 | def calculate_table_total_columns(soup): |
23 | 27 | """计算表格的总列数,通过分析整个表格结构来处理rowspan和colspan |
@@ -163,20 +167,32 @@ def detect_table_headers(soup1, soup2, max_header_rows=5): |
163 | 167 | def can_merge_tables(current_table_block, previous_table_block): |
164 | 168 | """判断两个表格是否可以合并""" |
165 | 169 | # 检查表格是否有caption和footnote |
| 170 | + # 计算previous_table_block中的footnote数量 |
| 171 | + footnote_count = sum(1 for block in previous_table_block["blocks"] if block["type"] == BlockType.TABLE_FOOTNOTE) |
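166 | 172 | # If there are TABLE_CAPTION blocks, require at least one caption with a continuation marker: it either ends with one of CONTINUATION_END_MARKERS (e.g. "(续)") or contains one of CONTINUATION_INLINE_MARKERS (e.g. "(continued)") |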
167 | 173 | caption_blocks = [block for block in current_table_block["blocks"] if block["type"] == BlockType.TABLE_CAPTION] |
168 | 174 | if caption_blocks: |
169 | | - # 如果所有caption都不以"(续)"、"(续表)"、"(continued)"或"(cont.)"结尾,则不合并 |
| 175 | + # 检查是否至少有一个caption包含续表标识 |
| 176 | + has_continuation_marker = False |
| 177 | + for block in caption_blocks: |
| 178 | + caption_text = full_to_half(merge_para_with_text(block).strip()).lower() |
| 179 | + if ( |
| 180 | + any(caption_text.endswith(marker.lower()) for marker in CONTINUATION_END_MARKERS) |
| 181 | + or any(marker.lower() in caption_text for marker in CONTINUATION_INLINE_MARKERS) |
| 182 | + ): |
| 183 | + has_continuation_marker = True |
| 184 | + break |
170 | 185 |
|
171 | | - if not any( |
172 | | - any(full_to_half(merge_para_with_text(block).strip()).lower().endswith(marker.lower()) |
173 | | - for marker in CONTINUATION_MARKERS) |
174 | | - for block in caption_blocks |
175 | | - ): |
| 186 | + # 如果所有caption都不包含续表标识,则不允许合并 |
| 187 | + if not has_continuation_marker: |
176 | 188 | return False, None, None, None, None |
177 | 189 |
|
178 | | - if any(block["type"] == BlockType.TABLE_FOOTNOTE for block in previous_table_block["blocks"]): |
179 | | - return False, None, None, None, None |
| 190 | + # 如果current_table_block的caption存在续标识,放宽footnote的限制允许previous_table_block有最多一条footnote |
| 191 | + if footnote_count > 1: |
| 192 | + return False, None, None, None, None |
| 193 | + else: |
| 194 | + if footnote_count > 0: |
| 195 | + return False, None, None, None, None |
180 | 196 |
|
181 | 197 | # 获取两个表格的HTML内容 |
182 | 198 | current_html = "" |
@@ -363,6 +379,11 @@ def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_foo |
363 | 379 | row.extract() |
364 | 380 | tbody1.append(row) |
365 | 381 |
|
| 382 | + # 清空previous_table_block的footnote |
| 383 | + previous_table_block["blocks"] = [ |
| 384 | + block for block in previous_table_block["blocks"] |
| 385 | + if block["type"] != BlockType.TABLE_FOOTNOTE |
| 386 | + ] |
366 | 387 | # 添加待合并表格的footnote到前一个表格中 |
367 | 388 | for table_footnote in wait_merge_table_footnotes: |
368 | 389 | temp_table_footnote = table_footnote.copy() |
|
0 commit comments