diff --git a/.gitignore b/.gitignore index aa4abd389..15613ea8a 100644 --- a/.gitignore +++ b/.gitignore @@ -52,6 +52,7 @@ coverage.xml .hypothesis/ .pytest_cache/ cover/ +.test-logs/ # Translations *.mo diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml index 8ca9b11b9..701a461f2 100644 --- a/packages/markitdown/pyproject.toml +++ b/packages/markitdown/pyproject.toml @@ -41,19 +41,20 @@ all = [ "openpyxl", "xlrd", "lxml", - "pdfminer.six>=20251107", + "pdfminer.six>=20251230", + "pdfplumber>=0.11.9", "olefile", "pydub", "SpeechRecognition", "youtube-transcript-api~=1.0.0", "azure-ai-documentintelligence", - "azure-identity" + "azure-identity", ] pptx = ["python-pptx"] docx = ["mammoth~=1.11.0", "lxml"] xlsx = ["pandas", "openpyxl"] xls = ["pandas", "xlrd"] -pdf = ["pdfminer.six"] +pdf = ["pdfminer.six>=20251230", "pdfplumber>=0.11.9"] outlook = ["olefile"] audio-transcription = ["pydub", "SpeechRecognition"] youtube-transcription = ["youtube-transcript-api"] diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index 63162d523..b692f169f 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -1,22 +1,18 @@ import sys import io - from typing import BinaryIO, Any - from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE - -# Try loading optional (but in this case, required) dependencies -# Save reporting of any exceptions for later +# Load dependencies _dependency_exc_info = None try: import pdfminer import pdfminer.high_level + import pdfplumber except ImportError: - # Preserve the error and stack trace for later _dependency_exc_info = sys.exc_info() @@ -28,16 +24,374 @@ ACCEPTED_FILE_EXTENSIONS = [".pdf"] +def 
_to_markdown_table(table: list[list[str]], include_separator: bool = True) -> str: + """Convert a 2D list (rows/columns) into a nicely aligned Markdown table. + + Args: + table: 2D list of cell values + include_separator: If True, include header separator row (standard markdown). + If False, output simple pipe-separated rows. + """ + if not table: + return "" + + # Normalize None → "" + table = [[cell if cell is not None else "" for cell in row] for row in table] + + # Filter out empty rows + table = [row for row in table if any(cell.strip() for cell in row)] + + if not table: + return "" + + # Column widths + col_widths = [max(len(str(cell)) for cell in col) for col in zip(*table)] + + def fmt_row(row: list[str]) -> str: + return ( + "|" + + "|".join(str(cell).ljust(width) for cell, width in zip(row, col_widths)) + + "|" + ) + + if include_separator: + header, *rows = table + md = [fmt_row(header)] + md.append("|" + "|".join("-" * w for w in col_widths) + "|") + for row in rows: + md.append(fmt_row(row)) + else: + md = [fmt_row(row) for row in table] + + return "\n".join(md) + + +def _extract_form_content_from_words(page: Any) -> str | None: + """ + Extract form-style content from a PDF page by analyzing word positions. + This handles borderless forms/tables where words are aligned in columns. + + Returns markdown with proper table formatting: + - Tables have pipe-separated columns with header separator rows + - Non-table content is rendered as plain text + + Returns None if the page doesn't appear to be a form-style document, + indicating that pdfminer should be used instead for better text spacing. 
+ """ + words = page.extract_words(keep_blank_chars=True, x_tolerance=3, y_tolerance=3) + if not words: + return None + + # Group words by their Y position (rows) + y_tolerance = 5 + rows_by_y: dict[float, list[dict]] = {} + for word in words: + y_key = round(word["top"] / y_tolerance) * y_tolerance + if y_key not in rows_by_y: + rows_by_y[y_key] = [] + rows_by_y[y_key].append(word) + + # Sort rows by Y position + sorted_y_keys = sorted(rows_by_y.keys()) + page_width = page.width if hasattr(page, "width") else 612 + + # First pass: analyze each row + row_info: list[dict] = [] + for y_key in sorted_y_keys: + row_words = sorted(rows_by_y[y_key], key=lambda w: w["x0"]) + if not row_words: + continue + + first_x0 = row_words[0]["x0"] + last_x1 = row_words[-1]["x1"] + line_width = last_x1 - first_x0 + combined_text = " ".join(w["text"] for w in row_words) + + # Count distinct x-position groups (columns) + x_positions = [w["x0"] for w in row_words] + x_groups: list[float] = [] + for x in sorted(x_positions): + if not x_groups or x - x_groups[-1] > 50: + x_groups.append(x) + + # Determine row type + is_paragraph = line_width > page_width * 0.55 and len(combined_text) > 60 + + row_info.append( + { + "y_key": y_key, + "words": row_words, + "text": combined_text, + "x_groups": x_groups, + "is_paragraph": is_paragraph, + "num_columns": len(x_groups), + } + ) + + # Collect ALL x-positions from rows with 3+ columns (table-like rows) + # This gives us the global column structure + all_table_x_positions: list[float] = [] + for info in row_info: + if info["num_columns"] >= 3 and not info["is_paragraph"]: + all_table_x_positions.extend(info["x_groups"]) + + if not all_table_x_positions: + return None + + # Compute global column boundaries + all_table_x_positions.sort() + global_columns: list[float] = [] + for x in all_table_x_positions: + if not global_columns or x - global_columns[-1] > 30: + global_columns.append(x) + + # Too many columns suggests dense text, not a form + if 
len(global_columns) > 8: + return None + + # Now classify each row as table row or not + # A row is a table row if it has words that align with 2+ of the global columns + for info in row_info: + if info["is_paragraph"]: + info["is_table_row"] = False + continue + + # Count how many global columns this row's words align with + aligned_columns: set[int] = set() + for word in info["words"]: + word_x = word["x0"] + for col_idx, col_x in enumerate(global_columns): + if abs(word_x - col_x) < 40: + aligned_columns.add(col_idx) + break + + # If row uses 2+ of the established columns, it's a table row + info["is_table_row"] = len(aligned_columns) >= 2 + + # Find table regions (consecutive table rows) + table_regions: list[tuple[int, int]] = [] # (start_idx, end_idx) + i = 0 + while i < len(row_info): + if row_info[i]["is_table_row"]: + start_idx = i + while i < len(row_info) and row_info[i]["is_table_row"]: + i += 1 + end_idx = i + table_regions.append((start_idx, end_idx)) + else: + i += 1 + + # Check if enough rows are table rows (at least 20%) + total_table_rows = sum(end - start for start, end in table_regions) + if len(row_info) > 0 and total_table_rows / len(row_info) < 0.2: + return None + + # Build output - collect table data first, then format with proper column widths + result_lines: list[str] = [] + num_cols = len(global_columns) + + # Helper function to extract cells from a row + def extract_cells(info: dict) -> list[str]: + cells: list[str] = ["" for _ in range(num_cols)] + for word in info["words"]: + word_x = word["x0"] + # Find the correct column using boundary ranges + assigned_col = num_cols - 1 # Default to last column + for col_idx in range(num_cols - 1): + col_end = global_columns[col_idx + 1] + if word_x < col_end - 20: + assigned_col = col_idx + break + if cells[assigned_col]: + cells[assigned_col] += " " + word["text"] + else: + cells[assigned_col] = word["text"] + return cells + + # Process rows, collecting table data for proper formatting + idx = 0 
+ while idx < len(row_info): + info = row_info[idx] + + # Check if this row starts a table region + table_region = None + for start, end in table_regions: + if idx == start: + table_region = (start, end) + break + + if table_region: + start, end = table_region + # Collect all rows in this table + table_data: list[list[str]] = [] + for table_idx in range(start, end): + cells = extract_cells(row_info[table_idx]) + table_data.append(cells) + + # Calculate column widths for this table + if table_data: + col_widths = [ + max(len(row[col]) for row in table_data) for col in range(num_cols) + ] + # Ensure minimum width of 3 for separator dashes + col_widths = [max(w, 3) for w in col_widths] + + # Format header row + header = table_data[0] + header_str = ( + "| " + + " | ".join( + cell.ljust(col_widths[i]) for i, cell in enumerate(header) + ) + + " |" + ) + result_lines.append(header_str) + + # Format separator row + separator = ( + "| " + + " | ".join("-" * col_widths[i] for i in range(num_cols)) + + " |" + ) + result_lines.append(separator) + + # Format data rows + for row in table_data[1:]: + row_str = ( + "| " + + " | ".join( + cell.ljust(col_widths[i]) for i, cell in enumerate(row) + ) + + " |" + ) + result_lines.append(row_str) + + idx = end # Skip to end of table region + else: + # Check if we're inside a table region (not at start) + in_table = False + for start, end in table_regions: + if start < idx < end: + in_table = True + break + + if not in_table: + # Non-table content + result_lines.append(info["text"]) + idx += 1 + + return "\n".join(result_lines) + + +def _extract_tables_from_words(page: Any) -> list[list[list[str]]]: + """ + Extract tables from a PDF page by analyzing word positions. + This handles borderless tables where words are aligned in columns. + + This function is designed for structured tabular data (like invoices), + not for multi-column text layouts in scientific documents. 
+ """ + words = page.extract_words(keep_blank_chars=True, x_tolerance=3, y_tolerance=3) + if not words: + return [] + + # Group words by their Y position (rows) + y_tolerance = 5 + rows_by_y: dict[float, list[dict]] = {} + for word in words: + y_key = round(word["top"] / y_tolerance) * y_tolerance + if y_key not in rows_by_y: + rows_by_y[y_key] = [] + rows_by_y[y_key].append(word) + + # Sort rows by Y position + sorted_y_keys = sorted(rows_by_y.keys()) + + # Find potential column boundaries by analyzing x positions across all rows + all_x_positions = [] + for words_in_row in rows_by_y.values(): + for word in words_in_row: + all_x_positions.append(word["x0"]) + + if not all_x_positions: + return [] + + # Cluster x positions to find column starts + all_x_positions.sort() + x_tolerance_col = 20 + column_starts: list[float] = [] + for x in all_x_positions: + if not column_starts or x - column_starts[-1] > x_tolerance_col: + column_starts.append(x) + + # Need at least 3 columns but not too many (likely text layout, not table) + if len(column_starts) < 3 or len(column_starts) > 10: + return [] + + # Find rows that span multiple columns (potential table rows) + table_rows = [] + for y_key in sorted_y_keys: + words_in_row = sorted(rows_by_y[y_key], key=lambda w: w["x0"]) + + # Assign words to columns + row_data = [""] * len(column_starts) + for word in words_in_row: + # Find the closest column + best_col = 0 + min_dist = float("inf") + for i, col_x in enumerate(column_starts): + dist = abs(word["x0"] - col_x) + if dist < min_dist: + min_dist = dist + best_col = i + + if row_data[best_col]: + row_data[best_col] += " " + word["text"] + else: + row_data[best_col] = word["text"] + + # Only include rows that have content in multiple columns + non_empty = sum(1 for cell in row_data if cell.strip()) + if non_empty >= 2: + table_rows.append(row_data) + + # Validate table quality - tables should have: + # 1. Enough rows (at least 3 including header) + # 2. 
Short cell content (tables have concise data, not paragraphs) + # 3. Consistent structure across rows + if len(table_rows) < 3: + return [] + + # Check if cells contain short, structured data (not long text) + long_cell_count = 0 + total_cell_count = 0 + for row in table_rows: + for cell in row: + if cell.strip(): + total_cell_count += 1 + # If cell has more than 30 chars, it's likely prose text + if len(cell.strip()) > 30: + long_cell_count += 1 + + # If more than 30% of cells are long, this is probably not a table + if total_cell_count > 0 and long_cell_count / total_cell_count > 0.3: + return [] + + return [table_rows] + + class PdfConverter(DocumentConverter): """ - Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text. + Converts PDFs to Markdown. + Supports extracting tables into aligned Markdown format (via pdfplumber). + Falls back to pdfminer if pdfplumber is missing or fails. """ def accepts( self, file_stream: BinaryIO, stream_info: StreamInfo, - **kwargs: Any, # Options to pass to the converter + **kwargs: Any, ) -> bool: mimetype = (stream_info.mimetype or "").lower() extension = (stream_info.extension or "").lower() @@ -55,9 +409,8 @@ def convert( self, file_stream: BinaryIO, stream_info: StreamInfo, - **kwargs: Any, # Options to pass to the converter + **kwargs: Any, ) -> DocumentConverterResult: - # Check the dependencies if _dependency_exc_info is not None: raise MissingDependencyException( MISSING_DEPENDENCY_MESSAGE.format( @@ -65,13 +418,55 @@ def convert( extension=".pdf", feature="pdf", ) - ) from _dependency_exc_info[ - 1 - ].with_traceback( # type: ignore[union-attr] + ) from _dependency_exc_info[1].with_traceback( _dependency_exc_info[2] - ) + ) # type: ignore[union-attr] - assert isinstance(file_stream, io.IOBase) # for mypy - return DocumentConverterResult( - markdown=pdfminer.high_level.extract_text(file_stream), - ) + assert isinstance(file_stream, io.IOBase) + + markdown_chunks: 
list[str] = [] + + # Read file stream into BytesIO for compatibility with pdfplumber + pdf_bytes = io.BytesIO(file_stream.read()) + + try: + # Track how many pages are form-style vs plain text + form_pages = 0 + plain_pages = 0 + + with pdfplumber.open(pdf_bytes) as pdf: + for page in pdf.pages: + # Try form-style word position extraction + page_content = _extract_form_content_from_words(page) + + # If extraction returns None, this page is not form-style + if page_content is None: + plain_pages += 1 + # Extract text using pdfplumber's basic extraction for this page + text = page.extract_text() + if text and text.strip(): + markdown_chunks.append(text.strip()) + else: + form_pages += 1 + if page_content.strip(): + markdown_chunks.append(page_content) + + # If most pages are plain text, use pdfminer for better text handling + if plain_pages > form_pages and plain_pages > 0: + pdf_bytes.seek(0) + markdown = pdfminer.high_level.extract_text(pdf_bytes) + else: + # Build markdown from chunks + markdown = "\n\n".join(markdown_chunks).strip() + + except Exception: + # Fallback if pdfplumber fails + pdf_bytes.seek(0) + markdown = pdfminer.high_level.extract_text(pdf_bytes) + + # Fallback if still empty + if not markdown: + pdf_bytes.seek(0) + markdown = pdfminer.high_level.extract_text(pdf_bytes) + + return DocumentConverterResult(markdown=markdown) diff --git a/packages/markitdown/tests/test_files/MEDRPT-2024-PAT-3847_medical_report_scan.pdf b/packages/markitdown/tests/test_files/MEDRPT-2024-PAT-3847_medical_report_scan.pdf new file mode 100644 index 000000000..30e1960a0 Binary files /dev/null and b/packages/markitdown/tests/test_files/MEDRPT-2024-PAT-3847_medical_report_scan.pdf differ diff --git a/packages/markitdown/tests/test_files/RECEIPT-2024-TXN-98765_retail_purchase.pdf b/packages/markitdown/tests/test_files/RECEIPT-2024-TXN-98765_retail_purchase.pdf new file mode 100644 index 000000000..34842dc78 Binary files /dev/null and 
def validate_strings(result, expected_strings, exclude_strings=None):
    """Assert each expected string is present (and each excluded string
    absent) in the conversion result.

    Backslashes are stripped first so markdown escaping (e.g. "\$") cannot
    hide an otherwise-matching substring.
    """
    text_content = result.text_content.replace("\\", "")
    for string in expected_strings:
        assert string in text_content, f"Expected string not found: {string}"
    if exclude_strings:
        for string in exclude_strings:
            assert string not in text_content, f"Excluded string found: {string}"


def validate_markdown_table(result, expected_headers, expected_data_samples):
    """Validate that a markdown table exists with expected headers and data."""
    text_content = result.text_content

    # Check for markdown table structure (| header | header |)
    assert "|" in text_content, "No markdown table markers found"

    # Check headers are present
    for header in expected_headers:
        assert header in text_content, f"Expected table header not found: {header}"

    # Check some data values are present
    for data in expected_data_samples:
        assert data in text_content, f"Expected table data not found: {data}"


def extract_markdown_tables(text_content):
    """
    Extract all markdown tables from text content.
    Returns a list of tables, where each table is a list of rows,
    and each row is a list of cell values.
    """
    tables = []
    lines = text_content.split("\n")
    current_table = []
    in_table = False

    for line in lines:
        line = line.strip()
        if line.startswith("|") and line.endswith("|"):
            # Skip separator rows. Require at least one dash so that a data
            # row of entirely blank cells (e.g. "|   |   |") is kept as data
            # rather than misclassified as a separator.
            if re.match(r"^\|[\s\-|]+\|$", line) and "-" in line:
                continue
            # Parse cells from the row
            cells = [cell.strip() for cell in line.split("|")[1:-1]]
            current_table.append(cells)
            in_table = True
        else:
            if in_table and current_table:
                tables.append(current_table)
                current_table = []
            in_table = False

    # Don't forget the last table
    if current_table:
        tables.append(current_table)

    return tables


def validate_table_structure(table):
    """
    Validate that a table has consistent structure:
    - All rows have the same number of columns
    - Has at least a header row and one data row

    Returns a (bool, str) pair: (is_valid, human-readable reason).
    """
    if not table:
        return False, "Table is empty"

    if len(table) < 2:
        return False, "Table should have at least header and one data row"

    num_cols = len(table[0])
    if num_cols < 2:
        return False, f"Table should have at least 2 columns, found {num_cols}"

    for i, row in enumerate(table):
        if len(row) != num_cols:
            return False, f"Row {i} has {len(row)} columns, expected {num_cols}"

    return True, "Table structure is valid"
test_borderless_table_extraction(self, markitdown): + """Test extraction of borderless tables from SPARSE inventory PDF. + + Expected output structure: + - Header: INVENTORY RECONCILIATION REPORT with Report ID, Warehouse, Date, Prepared By + - Pipe-separated rows with inventory data + - Text section: Variance Analysis with Summary Statistics + - More pipe-separated rows with extended inventory review + - Footer: Recommendations section + """ + pdf_path = os.path.join( + TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + text_content = result.text_content + + # Validate document header content + expected_strings = [ + "INVENTORY RECONCILIATION REPORT", + "Report ID: SPARSE-2024-INV-1234", + "Warehouse: Distribution Center East", + "Report Date: 2024-11-15", + "Prepared By: Sarah Martinez", + ] + validate_strings(result, expected_strings) + + # Validate pipe-separated format is used + assert "|" in text_content, "Should have pipe separators for form-style data" + + # --- Validate First Table Data (Inventory Variance) --- + # Validate table headers are present + first_table_headers = [ + "Product Code", + "Location", + "Expected", + "Actual", + "Variance", + "Status", + ] + for header in first_table_headers: + assert header in text_content, f"Should contain header '{header}'" + + # Validate first table has all expected SKUs + first_table_skus = ["SKU-8847", "SKU-9201", "SKU-4563", "SKU-7728"] + for sku in first_table_skus: + assert sku in text_content, f"Should contain {sku}" + + # Validate first table has correct status values + expected_statuses = ["OK", "CRITICAL"] + for status in expected_statuses: + assert status in text_content, f"Should contain status '{status}'" + + # Validate first table has location codes + expected_locations = ["A-12", "B-07", "C-15", "D-22", "A-08"] + for loc in expected_locations: + assert loc in 
text_content, f"Should contain location '{loc}'" + + # --- Validate Second Table Data (Extended Inventory Review) --- + # Validate second table headers + second_table_headers = [ + "Category", + "Unit Cost", + "Total Value", + "Last Audit", + "Notes", + ] + for header in second_table_headers: + assert header in text_content, f"Should contain header '{header}'" + + # Validate second table has all expected SKUs (10 products) + second_table_skus = [ + "SKU-8847", + "SKU-9201", + "SKU-4563", + "SKU-7728", + "SKU-3345", + "SKU-5512", + "SKU-6678", + "SKU-7789", + "SKU-2234", + "SKU-1123", + ] + for sku in second_table_skus: + assert sku in text_content, f"Should contain {sku}" + + # Validate second table has categories + expected_categories = ["Electronics", "Hardware", "Software", "Accessories"] + for category in expected_categories: + assert category in text_content, f"Should contain category '{category}'" + + # Validate second table has cost values (spot check) + expected_costs = ["$45.00", "$32.50", "$120.00", "$15.75"] + for cost in expected_costs: + assert cost in text_content, f"Should contain cost '{cost}'" + + # Validate second table has note values + expected_notes = ["Verified", "Critical", "Pending"] + for note in expected_notes: + assert note in text_content, f"Should contain note '{note}'" + + # --- Validate Analysis Text Section --- + analysis_strings = [ + "Variance Analysis:", + "Summary Statistics:", + "Total Variance Cost: $4,287.50", + "Critical Items: 1", + "Overall Accuracy: 97.2%", + "Recommendations:", + ] + validate_strings(result, analysis_strings) + + # --- Validate Document Structure Order --- + # Verify sections appear in correct order + # Note: Using flexible patterns since column merging may occur based on gap detection + import re + + header_pos = text_content.find("INVENTORY RECONCILIATION REPORT") + # Look for Product Code header - may be in same column as Location or separate + first_table_match = re.search(r"\|\s*Product Code", 
text_content) + variance_pos = text_content.find("Variance Analysis:") + extended_review_pos = text_content.find("Extended Inventory Review:") + # Second table - look for SKU entries after extended review section + # The table may not have pipes on every row due to paragraph detection + second_table_pos = -1 + if extended_review_pos != -1: + # Look for either "| Product Code" or "Product Code" as table header + second_table_match = re.search( + r"Product Code.*Category", text_content[extended_review_pos:] + ) + if second_table_match: + # Adjust position to be relative to full text + second_table_pos = extended_review_pos + second_table_match.start() + recommendations_pos = text_content.find("Recommendations:") + + positions = { + "header": header_pos, + "first_table": first_table_match.start() if first_table_match else -1, + "variance_analysis": variance_pos, + "extended_review": extended_review_pos, + "second_table": second_table_pos, + "recommendations": recommendations_pos, + } + + # All sections should be found + for name, pos in positions.items(): + assert pos != -1, f"Section '{name}' not found in output" + + # Verify correct order + assert ( + positions["header"] < positions["first_table"] + ), "Header should come before first table" + assert ( + positions["first_table"] < positions["variance_analysis"] + ), "First table should come before Variance Analysis" + assert ( + positions["variance_analysis"] < positions["extended_review"] + ), "Variance Analysis should come before Extended Review" + assert ( + positions["extended_review"] < positions["second_table"] + ), "Extended Review should come before second table" + assert ( + positions["second_table"] < positions["recommendations"] + ), "Second table should come before Recommendations" + + def test_borderless_table_no_duplication(self, markitdown): + """Test that borderless table content is not duplicated excessively.""" + pdf_path = os.path.join( + TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf" 
+ ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + text_content = result.text_content + + # Count occurrences of unique table data - should not be excessively duplicated + # SKU-8847 appears in both tables, plus possibly once in summary text + sku_count = text_content.count("SKU-8847") + # Should appear at most 4 times (2 tables + minor text references), not more + assert ( + sku_count <= 4 + ), f"SKU-8847 appears too many times ({sku_count}), suggests duplication issue" + + def test_borderless_table_correct_position(self, markitdown): + """Test that tables appear in correct positions relative to text.""" + pdf_path = os.path.join( + TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + text_content = result.text_content + + # Verify content order - header should come before table content, which should come before analysis + header_pos = text_content.find("Prepared By: Sarah Martinez") + # Look for Product Code in any pipe-separated format + product_code_pos = text_content.find("Product Code") + variance_pos = text_content.find("Variance Analysis:") + + assert header_pos != -1, "Header should be found" + assert product_code_pos != -1, "Product Code should be found" + assert variance_pos != -1, "Variance Analysis should be found" + + assert ( + header_pos < product_code_pos < variance_pos + ), "Product data should appear between header and Variance Analysis" + + # Second table content should appear after "Extended Inventory Review" + extended_review_pos = text_content.find("Extended Inventory Review:") + # Look for Category header which is in second table + category_pos = text_content.find("Category") + recommendations_pos = text_content.find("Recommendations:") + + if ( + extended_review_pos != -1 + and category_pos != -1 + and 
recommendations_pos != -1 + ): + # Find Category position after Extended Inventory Review + category_after_review = text_content.find("Category", extended_review_pos) + if category_after_review != -1: + assert ( + extended_review_pos < category_after_review < recommendations_pos + ), "Extended review table should appear between Extended Inventory Review and Recommendations" + + def test_receipt_pdf_extraction(self, markitdown): + """Test extraction of receipt PDF (no tables, formatted text). + + Expected output structure: + - Store header: TECHMART ELECTRONICS with address + - Transaction info: Store #, date, TXN, Cashier, Register + - Line items: 6 products with prices and member discounts + - Totals: Subtotal, Member Discount, Sales Tax, Rewards, TOTAL + - Payment info: Visa Card, Auth, Ref + - Rewards member info: Name, ID, Points + - Return policy and footer + """ + pdf_path = os.path.join( + TEST_FILES_DIR, "RECEIPT-2024-TXN-98765_retail_purchase.pdf" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + text_content = result.text_content + + # --- Validate Store Header --- + store_header = [ + "TECHMART ELECTRONICS", + "4567 Innovation Blvd", + "San Francisco, CA 94103", + "(415) 555-0199", + ] + validate_strings(result, store_header) + + # --- Validate Transaction Info --- + transaction_info = [ + "Store #0342 - Downtown SF", + "11/23/2024", + "TXN: TXN-98765-2024", + "Cashier: Emily Rodriguez", + "Register: POS-07", + ] + validate_strings(result, transaction_info) + + # --- Validate Line Items (6 products) --- + line_items = [ + # Product 1: Headphones + "Wireless Noise-Cancelling", + "Headphones - Premium Black", + "AUDIO-5521", + "$349.99", + "$299.99", + # Product 2: USB-C Hub + "USB-C Hub 7-in-1 Adapter", + "ACC-8834", + "$79.99", + "$159.98", + # Product 3: Portable SSD + "Portable SSD 2TB", + "STOR-2241", + "$289.00", + "$260.00", + # Product 4: Wireless Mouse + 
"Ergonomic Wireless Mouse", + "ACC-9012", + "$59.99", + # Product 5: Screen Cleaning Kit + "Screen Cleaning Kit", + "CARE-1156", + "$12.99", + "$38.97", + # Product 6: HDMI Cable + "HDMI 2.1 Cable 6ft", + "CABLE-7789", + "$24.99", + "$44.98", + ] + validate_strings(result, line_items) + + # --- Validate Totals --- + totals = [ + "SUBTOTAL", + "$863.91", + "Member Discount", + "Sales Tax (8.5%)", + "$66.23", + "Rewards Applied", + "-$25.00", + "TOTAL", + "$821.14", + ] + validate_strings(result, totals) + + # --- Validate Payment Info --- + payment_info = [ + "PAYMENT METHOD", + "Visa Card ending in 4782", + "Auth: 847392", + "REF-20241123-98765", + ] + validate_strings(result, payment_info) + + # --- Validate Rewards Member Info --- + rewards_info = [ + "REWARDS MEMBER", + "Sarah Mitchell", + "ID: TM-447821", + "Points Earned: 821", + "Total Points: 3,247", + ] + validate_strings(result, rewards_info) + + # --- Validate Return Policy & Footer --- + footer_info = [ + "RETURN POLICY", + "Returns within 30 days", + "Receipt required", + "Thank you for shopping!", + "www.techmart.example.com", + ] + validate_strings(result, footer_info) + + # --- Validate Document Structure Order --- + positions = { + "store_header": text_content.find("TECHMART ELECTRONICS"), + "transaction": text_content.find("TXN: TXN-98765-2024"), + "first_item": text_content.find("Wireless Noise-Cancelling"), + "subtotal": text_content.find("SUBTOTAL"), + "total": text_content.find("TOTAL"), + "payment": text_content.find("PAYMENT METHOD"), + "rewards": text_content.find("REWARDS MEMBER"), + "return_policy": text_content.find("RETURN POLICY"), + } + + # All sections should be found + for name, pos in positions.items(): + assert pos != -1, f"Section '{name}' not found in output" + + # Verify correct order + assert ( + positions["store_header"] < positions["transaction"] + ), "Store header should come before transaction" + assert ( + positions["transaction"] < positions["first_item"] + ), 
"Transaction should come before items" + assert ( + positions["first_item"] < positions["subtotal"] + ), "Items should come before subtotal" + assert ( + positions["subtotal"] < positions["total"] + ), "Subtotal should come before total" + assert ( + positions["total"] < positions["payment"] + ), "Total should come before payment" + assert ( + positions["payment"] < positions["rewards"] + ), "Payment should come before rewards" + assert ( + positions["rewards"] < positions["return_policy"] + ), "Rewards should come before return policy" + + def test_multipage_invoice_extraction(self, markitdown): + """Test extraction of multipage invoice PDF with form-style layout. + + Expected output: Pipe-separated format with clear cell boundaries. + Form data should be extracted with pipes indicating column separations. + """ + pdf_path = os.path.join(TEST_FILES_DIR, "REPAIR-2022-INV-001_multipage.pdf") + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + text_content = result.text_content + + # Validate basic content is extracted + expected_strings = [ + "ZAVA AUTO REPAIR", + "Collision Repair", + "Redmond, WA", + "Gabriel Diaz", + "Jeep", + "Grand Cherokee", + "Parts", + "Body Labor", + "Paint Labor", + "GRAND TOTAL", + # Second page content + "Bruce Wayne", + "Batmobile", + ] + validate_strings(result, expected_strings) + + # Validate pipe-separated table format + # Form-style documents should use pipes to separate cells + assert "|" in text_content, "Form-style PDF should contain pipe separators" + + # Validate key form fields are properly separated + # These patterns check that label and value are in separate cells + # Note: cells may have padding spaces for column alignment + import re + + assert re.search( + r"\| Insured name\s*\|", text_content + ), "Insured name should be in its own cell" + assert re.search( + r"\| Gabriel Diaz\s*\|", text_content + ), "Gabriel Diaz should be in its own cell" 
+ assert re.search( + r"\| Year\s*\|", text_content + ), "Year label should be in its own cell" + assert re.search( + r"\| 2022\s*\|", text_content + ), "Year value should be in its own cell" + + # Validate table structure for estimate totals + assert ( + re.search(r"\| Hours\s*\|", text_content) or "Hours |" in text_content + ), "Hours column header should be present" + assert ( + re.search(r"\| Rate\s*\|", text_content) or "Rate |" in text_content + ), "Rate column header should be present" + assert ( + re.search(r"\| Cost\s*\|", text_content) or "Cost |" in text_content + ), "Cost column header should be present" + + # Validate numeric values are extracted + assert "2,100" in text_content, "Parts cost should be extracted" + assert "300" in text_content, "Body labor cost should be extracted" + assert "225" in text_content, "Paint labor cost should be extracted" + assert "5,738" in text_content, "Grand total should be extracted" + + # Validate second page content (Bruce Wayne invoice) + assert "Bruce Wayne" in text_content, "Second page customer name" + assert "Batmobile" in text_content, "Second page vehicle model" + assert "211,522" in text_content, "Second page grand total" + + # Validate disclaimer text is NOT in table format (long paragraph) + # The disclaimer should be extracted as plain text, not pipe-separated + assert ( + "preliminary estimate" in text_content.lower() + ), "Disclaimer text should be present" + + def test_academic_pdf_extraction(self, markitdown): + """Test extraction of academic paper PDF (scientific document). + + Expected output: Plain text without tables or pipe characters. + Scientific documents should be extracted as flowing text with proper spacing, + not misinterpreted as tables. 
+ """ + pdf_path = os.path.join(TEST_FILES_DIR, "test.pdf") + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + text_content = result.text_content + + # Validate academic paper content with proper spacing + expected_strings = [ + "Introduction", + "Large language models", # Should have proper spacing, not "Largelanguagemodels" + "agents", + "multi-agent", # Should be properly hyphenated + ] + validate_strings(result, expected_strings) + + # Validate proper text formatting (words separated by spaces) + assert "LLMs" in text_content, "Should contain 'LLMs' acronym" + assert "reasoning" in text_content, "Should contain 'reasoning'" + assert "observations" in text_content, "Should contain 'observations'" + + # Ensure content is not empty and has proper length + assert len(text_content) > 1000, "Academic PDF should have substantial content" + + # Scientific documents should NOT have tables or pipe characters + assert ( + "|" not in text_content + ), "Scientific document should not contain pipe characters (no tables)" + + # Verify no markdown tables were extracted + tables = extract_markdown_tables(text_content) + assert ( + len(tables) == 0 + ), f"Scientific document should have no tables, found {len(tables)}" + + # Verify text is properly formatted with spaces between words + # Check that common phrases are NOT joined together (which would indicate bad extraction) + assert ( + "Largelanguagemodels" not in text_content + ), "Text should have proper spacing, not joined words" + assert ( + "multiagentconversations" not in text_content.lower() + ), "Text should have proper spacing between words" + + def test_scanned_pdf_handling(self, markitdown): + """Test handling of scanned/image-based PDF (no text layer). + + Expected output: Empty - scanned PDFs without OCR have no text layer. 
+ """ + pdf_path = os.path.join( + TEST_FILES_DIR, "MEDRPT-2024-PAT-3847_medical_report_scan.pdf" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + + # Scanned PDFs without OCR have no text layer, so extraction should be empty + assert ( + result is not None + ), "Converter should return a result even for scanned PDFs" + assert result.text_content is not None, "text_content should not be None" + + # Verify extraction is empty (no text layer in scanned PDF) + assert ( + result.text_content.strip() == "" + ), f"Scanned PDF should have empty extraction, got: '{result.text_content[:100]}...'" + + +class TestPdfTableMarkdownFormat: + """Test that extracted tables have proper markdown formatting.""" + + @pytest.fixture + def markitdown(self): + """Create MarkItDown instance.""" + return MarkItDown() + + def test_markdown_table_has_pipe_format(self, markitdown): + """Test that form-style PDFs have pipe-separated format.""" + pdf_path = os.path.join( + TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + text_content = result.text_content + + # Find rows with pipes + lines = text_content.split("\n") + pipe_rows = [ + line for line in lines if line.startswith("|") and line.endswith("|") + ] + + assert len(pipe_rows) > 0, "Should have pipe-separated rows" + + # Check that Product Code appears in a pipe-separated row + product_code_found = any("Product Code" in row for row in pipe_rows) + assert product_code_found, "Product Code should be in pipe-separated format" + + def test_markdown_table_columns_have_pipes(self, markitdown): + """Test that form-style PDF columns are separated with pipes.""" + pdf_path = os.path.join( + TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not 
found: {pdf_path}") + + result = markitdown.convert(pdf_path) + text_content = result.text_content + + # Find table rows and verify column structure + lines = text_content.split("\n") + table_rows = [ + line for line in lines if line.startswith("|") and line.endswith("|") + ] + + assert len(table_rows) > 0, "Should have markdown table rows" + + # Check that at least some rows have multiple columns (pipes) + multi_col_rows = [row for row in table_rows if row.count("|") >= 3] + assert ( + len(multi_col_rows) > 5 + ), f"Should have rows with multiple columns, found {len(multi_col_rows)}" + + +class TestPdfTableStructureConsistency: + """Test that extracted tables have consistent structure across all PDF types.""" + + @pytest.fixture + def markitdown(self): + """Create MarkItDown instance.""" + return MarkItDown() + + def test_borderless_table_structure(self, markitdown): + """Test that borderless table PDF has pipe-separated structure.""" + pdf_path = os.path.join( + TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + text_content = result.text_content + + # Should have pipe-separated content + assert "|" in text_content, "Borderless table PDF should have pipe separators" + + # Check that key content is present + assert "Product Code" in text_content, "Should contain Product Code" + assert "SKU-8847" in text_content, "Should contain first SKU" + assert "SKU-9201" in text_content, "Should contain second SKU" + + def test_multipage_invoice_table_structure(self, markitdown): + """Test that multipage invoice PDF has pipe-separated format.""" + pdf_path = os.path.join(TEST_FILES_DIR, "REPAIR-2022-INV-001_multipage.pdf") + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + text_content = result.text_content + + # Should have pipe-separated content + assert 
"|" in text_content, "Invoice PDF should have pipe separators" + + # Find rows with pipes + lines = text_content.split("\n") + pipe_rows = [ + line for line in lines if line.startswith("|") and line.endswith("|") + ] + + assert ( + len(pipe_rows) > 10 + ), f"Should have multiple pipe-separated rows, found {len(pipe_rows)}" + + # Check that some rows have multiple columns + multi_col_rows = [row for row in pipe_rows if row.count("|") >= 4] + assert len(multi_col_rows) > 5, "Should have rows with 3+ columns" + + def test_receipt_has_no_tables(self, markitdown): + """Test that receipt PDF doesn't incorrectly extract tables from formatted text.""" + pdf_path = os.path.join( + TEST_FILES_DIR, "RECEIPT-2024-TXN-98765_retail_purchase.pdf" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + tables = extract_markdown_tables(result.text_content) + + # Receipt should not have markdown tables extracted + # (it's formatted text, not tabular data) + # If tables are extracted, they should be minimal/empty + total_table_rows = sum(len(t) for t in tables) + assert ( + total_table_rows < 5 + ), f"Receipt should not have significant tables, found {total_table_rows} rows" + + def test_scanned_pdf_no_tables(self, markitdown): + """Test that scanned PDF has empty extraction and no tables.""" + pdf_path = os.path.join( + TEST_FILES_DIR, "MEDRPT-2024-PAT-3847_medical_report_scan.pdf" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + + # Scanned PDF with no text layer should have empty extraction + assert ( + result.text_content.strip() == "" + ), "Scanned PDF should have empty extraction" + + tables = extract_markdown_tables(result.text_content) + + # Scanned PDF with no text layer should have no tables + assert len(tables) == 0, "Scanned PDF should have no extracted tables" + + def test_all_pdfs_table_rows_consistent(self, 
markitdown): + """Test that all PDF tables have rows with pipe-separated content. + + Note: With gap-based column detection, rows may have different column counts + depending on how content is spaced in the PDF. What's important is that each + row has pipe separators and the content is readable. + """ + pdf_files = [ + "SPARSE-2024-INV-1234_borderless_table.pdf", + "REPAIR-2022-INV-001_multipage.pdf", + "RECEIPT-2024-TXN-98765_retail_purchase.pdf", + "test.pdf", + ] + + for pdf_file in pdf_files: + pdf_path = os.path.join(TEST_FILES_DIR, pdf_file) + if not os.path.exists(pdf_path): + continue + + result = markitdown.convert(pdf_path) + tables = extract_markdown_tables(result.text_content) + + for table_idx, table in enumerate(tables): + if not table: + continue + + # Verify each row has at least one column (pipe-separated content) + for row_idx, row in enumerate(table): + assert ( + len(row) >= 1 + ), f"{pdf_file}: Table {table_idx}, row {row_idx} has no columns" + + # Verify the row has non-empty content + row_content = " ".join(cell.strip() for cell in row) + assert ( + len(row_content.strip()) > 0 + ), f"{pdf_file}: Table {table_idx}, row {row_idx} is empty" + + def test_borderless_table_data_integrity(self, markitdown): + """Test that borderless table extraction preserves data integrity.""" + pdf_path = os.path.join( + TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + tables = extract_markdown_tables(result.text_content) + + assert len(tables) >= 2, "Should have at least 2 tables" + + # Check first table has expected SKU data + first_table = tables[0] + table_text = str(first_table) + assert "SKU-8847" in table_text, "First table should contain SKU-8847" + assert "SKU-9201" in table_text, "First table should contain SKU-9201" + + # Check second table has expected category data + second_table = tables[1] + table_text = 
str(second_table) + assert "Electronics" in table_text, "Second table should contain Electronics" + assert "Hardware" in table_text, "Second table should contain Hardware"