Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion docs/api/parsers.md
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,15 @@ parser = ChartTablePDFParser(
# Layout Detection
layout_model_name: str = "PP-DocLayout_plus-L",
dpi: int = 200,
min_score: float = 0.0
min_score: float = 0.0,

# Split Table Merging
merge_split_tables: bool = False,
bottom_threshold_ratio: float = 0.20,
top_threshold_ratio: float = 0.15,
max_gap_ratio: float = 0.25,
column_alignment_tolerance: float = 10.0,
min_merge_confidence: float = 0.65,
)

# Extract charts/tables
Expand Down
69 changes: 68 additions & 1 deletion docs/user-guide/parsers/chart-table-extractor.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ The `ChartTablePDFParser` is a specialized parser focused exclusively on extract
- **Focused Extraction**: Extract only charts and/or tables
- **Selective Processing**: Choose what to extract
- **VLM Integration**: Convert visuals to structured data
- **Split Table Merging**: Optional detection and merging of tables split across pages (enable with `merge_split_tables=True`)
- **Faster Processing**: Skips unnecessary elements

## Basic Usage
Expand Down Expand Up @@ -64,6 +65,71 @@ parser.parse("report.pdf")
# Outputs: tables.xlsx, tables.html, vlm_items.json
```

## Split Table Merging

The `ChartTablePDFParser` can automatically detect and merge tables that are split across multiple pages. The feature is disabled by default (`merge_split_tables=False`) and must be enabled explicitly. It is especially useful for processing financial reports, data tables, and other documents where large tables span page boundaries.

### Enabling Split Table Merging

```python
from doctra import ChartTablePDFParser

# Enable split table merging with default settings
parser = ChartTablePDFParser(
extract_tables=True,
merge_split_tables=True
)

parser.parse("document.pdf")
```

### Configuration Options

```python
parser = ChartTablePDFParser(
extract_tables=True,
merge_split_tables=True,

# Position thresholds
bottom_threshold_ratio=0.20, # 20% from bottom of page
top_threshold_ratio=0.15, # 15% from top of page

# Gap tolerance
max_gap_ratio=0.25, # 25% of page height max gap

# Structural validation
column_alignment_tolerance=10.0, # Pixel tolerance for column alignment
min_merge_confidence=0.65, # Minimum confidence to merge (0-1)
)
```

### How It Works

The split table detection uses a two-phase approach:

1. **Phase 1: Proximity Detection** - Fast spatial heuristics to identify candidate pairs based on position, overlap, gap, and width similarity
2. **Phase 2: Structural Validation** - Deep structural analysis using LSD (Line Segment Detector) to validate column alignment and structure

For detailed information about the algorithm, see the [Split Table Merging Guide](../features/split-table-merging.md).

### Output

When split tables are detected and merged:

- Individual table segments are skipped (not saved separately)
- Merged table images are saved as `merged_table_<page1>_<page2>.png` in the tables directory
- If VLM is enabled, merged tables are processed and included in the structured output (Excel, HTML, JSON)
- When VLM is enabled, each merged table entry includes metadata: the page range (`page`), a `split_merge` flag, and the merge confidence score (`merge_confidence`)

### When to Use Split Table Merging

Enable split table merging when:

- Processing financial reports or data tables
- Tables span multiple pages
- You need complete table data for analysis
- Working with documents that have large data tables

## When to Use

Use `ChartTablePDFParser` when:
Expand All @@ -76,6 +142,7 @@ Use `ChartTablePDFParser` when:
## See Also

- [VLM Integration](../engines/vlm-integration.md) - Structured data extraction
- [Structured Parser](structured-parser.md) - Full document parsing
- [Structured Parser](structured-parser.md) - Full document parsing with split table merging details
- [Split Table Merging Guide](../features/split-table-merging.md) - Comprehensive guide to split table detection
- [API Reference](../../api/parsers.md) - Complete API documentation

114 changes: 113 additions & 1 deletion doctra/parsers/table_chart_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
from doctra.utils.structured_utils import to_structured_dict
from doctra.exporters.markdown_table import render_markdown_table
from doctra.exporters.markdown_writer import write_markdown
from doctra.exporters.html_writer import write_structured_html
from doctra.exporters.html_writer import write_structured_html, render_html_table
from doctra.parsers.split_table_detector import SplitTableDetector, SplitTableMatch
import json


Expand All @@ -42,6 +43,12 @@ class ChartTablePDFParser:
:param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
:param dpi: DPI for PDF rendering (default: 200)
:param min_score: Minimum confidence score for layout detection (default: 0.0)
:param merge_split_tables: Whether to detect and merge split tables (default: False)
:param bottom_threshold_ratio: Ratio for "too close to bottom" detection (default: 0.20)
:param top_threshold_ratio: Ratio for "too close to top" detection (default: 0.15)
:param max_gap_ratio: Maximum allowed gap between tables (default: 0.25, accounts for headers/footers)
:param column_alignment_tolerance: Pixel tolerance for column alignment (default: 10.0)
:param min_merge_confidence: Minimum confidence score for merging (default: 0.65)
"""

def __init__(
Expand All @@ -53,6 +60,12 @@ def __init__(
layout_model_name: str = "PP-DocLayout_plus-L",
dpi: int = 200,
min_score: float = 0.0,
merge_split_tables: bool = False,
bottom_threshold_ratio: float = 0.20,
top_threshold_ratio: float = 0.15,
max_gap_ratio: float = 0.25,
column_alignment_tolerance: float = 10.0,
min_merge_confidence: float = 0.65,
):
"""
Initialize the ChartTablePDFParser with extraction configuration.
Expand All @@ -63,6 +76,12 @@ def __init__(
:param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
:param dpi: DPI for PDF rendering (default: 200)
:param min_score: Minimum confidence score for layout detection (default: 0.0)
:param merge_split_tables: Whether to detect and merge split tables (default: False)
:param bottom_threshold_ratio: Ratio for "too close to bottom" detection (default: 0.20)
:param top_threshold_ratio: Ratio for "too close to top" detection (default: 0.15)
:param max_gap_ratio: Maximum allowed gap between tables (default: 0.25, accounts for headers/footers)
:param column_alignment_tolerance: Pixel tolerance for column alignment (default: 10.0)
:param min_merge_confidence: Minimum confidence score for merging (default: 0.65)
"""
if not extract_charts and not extract_tables:
raise ValueError("At least one of extract_charts or extract_tables must be True")
Expand All @@ -83,6 +102,19 @@ def __init__(
f"vlm must be an instance of VLMStructuredExtractor or None, "
f"got {type(vlm).__name__}"
)

# Initialize split table detector if enabled
self.merge_split_tables = merge_split_tables
if self.merge_split_tables and self.extract_tables:
self.split_table_detector = SplitTableDetector(
bottom_threshold_ratio=bottom_threshold_ratio,
top_threshold_ratio=top_threshold_ratio,
max_gap_ratio=max_gap_ratio,
column_alignment_tolerance=column_alignment_tolerance,
min_merge_confidence=min_merge_confidence,
)
else:
self.split_table_detector = None

def parse(self, pdf_path: str, output_base_dir: str = "outputs") -> None:
"""
Expand Down Expand Up @@ -112,6 +144,24 @@ def parse(self, pdf_path: str, output_base_dir: str = "outputs") -> None:
)
pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]

# Detect split tables if enabled
split_table_matches: List[SplitTableMatch] = []
merged_table_segments = []

if self.merge_split_tables and self.extract_tables:
if self.split_table_detector:
try:
split_table_matches = self.split_table_detector.detect_split_tables(pages, pil_pages)
if split_table_matches:
print(f"🔗 Detected {len(split_table_matches)} split table(s) to merge")
for match in split_table_matches:
merged_table_segments.append(match.segment1)
merged_table_segments.append(match.segment2)
except Exception as e:
import traceback
traceback.print_exc()
split_table_matches = []

target_labels = []
if self.extract_charts:
target_labels.append("chart")
Expand Down Expand Up @@ -203,6 +253,11 @@ def parse(self, pdf_path: str, output_base_dir: str = "outputs") -> None:
charts_bar.update(1)

elif box.label == "table" and self.extract_tables:
# Skip table segments that are part of merged tables
is_merged = any(seg.match_box(box, page_num) for seg in merged_table_segments)
if is_merged:
continue

table_filename = f"table_{table_counter:03d}.png"
table_path = os.path.join(tables_dir, table_filename)

Expand Down Expand Up @@ -247,6 +302,63 @@ def parse(self, pdf_path: str, output_base_dir: str = "outputs") -> None:
if tables_bar:
tables_bar.update(1)

# Process merged tables if any were detected
if split_table_matches and self.split_table_detector and self.extract_tables:
for match_idx, match in enumerate(split_table_matches):
try:
merged_img = self.split_table_detector.merge_table_images(match)

merged_filename = f"merged_table_{match.segment1.page_index}_{match.segment2.page_index}.png"
merged_path = os.path.join(tables_dir, merged_filename)
merged_img.save(merged_path)

abs_merged_path = os.path.abspath(merged_path)
rel_merged = os.path.relpath(abs_merged_path, out_dir)

pages_str = f"pages {match.segment1.page_index}-{match.segment2.page_index}"

if self.vlm is not None:
wrote_table = False
try:
extracted_table = self.vlm.extract_table(abs_merged_path)
structured_item = to_structured_dict(extracted_table)
if structured_item:
structured_item["page"] = f"{match.segment1.page_index}-{match.segment2.page_index}"
structured_item["type"] = "Table (Merged)"
structured_item["split_merge"] = True
structured_item["merge_confidence"] = match.confidence
structured_items.append(structured_item)

vlm_items.append({
"kind": "table",
"page": pages_str,
"image_rel_path": rel_merged,
"title": structured_item.get("title"),
"headers": structured_item.get("headers"),
"rows": structured_item.get("rows"),
"split_merge": True,
"merge_confidence": match.confidence,
})

md_lines.append(f"\n### Merged Table ({pages_str})\n")
md_lines.append(
render_markdown_table(
structured_item.get("headers"),
structured_item.get("rows"),
title=structured_item.get("title") or f"Merged Table ({pages_str})"
)
)
wrote_table = True
except Exception as e:
pass

if not wrote_table:
md_lines.append(f"\n### Merged Table ({pages_str})\n")
md_lines.append(f"![Merged Table ({pages_str})]({rel_merged})\n")
except Exception as e:
import traceback
traceback.print_exc()

excel_path = None

if self.vlm is not None:
Expand Down