|
| 1 | +#!/usr/bin/env python3 -m pytest |
| 2 | +"""Tests for MasterFormat-style partial numbering in PDF conversion.""" |
| 3 | + |
| 4 | +import os |
| 5 | +import re |
| 6 | +import pytest |
| 7 | + |
| 8 | +from markitdown import MarkItDown |
| 9 | +from markitdown.converters._pdf_converter import PARTIAL_NUMBERING_PATTERN |
| 10 | + |
# Fixture documents live in a "test_files" directory next to this module.
_THIS_DIR = os.path.dirname(__file__)
TEST_FILES_DIR = os.path.join(_THIS_DIR, "test_files")
| 12 | + |
| 13 | + |
class TestMasterFormatPartialNumbering:
    """Test handling of MasterFormat-style partial numbering (.1, .2, etc.).

    Every conversion-based test converts the same fixture PDF and inspects
    the resulting text for numberings that were split from their text.
    """

    # Fixture PDF shared by every conversion-based test in this class.
    _FIXTURE_NAME = "masterformat_partial_numbering.pdf"

    # Used with ``fullmatch``: a string that is nothing but a partial
    # numbering such as ".1" or ".10".
    _BARE_NUMBERING = re.compile(r"\.\d+")

    # Whitespace that stays on the current line.  Plain ``\s`` also matches
    # newlines, which would let an unmerged ".1\n\nThe intent" pass a check
    # that is supposed to prove the numbering and its text share a line.
    _SAME_LINE_WS = r"[^\S\r\n]+"

    # A partial numbering followed by text on the same line.  The lookbehind
    # rejects the tail of ordinary decimals or full numbering ("1.1 Intent").
    _NUMBERING_WITH_TEXT = re.compile(r"(?<![\w.])\.\d+" + _SAME_LINE_WS + r"[A-Za-z]")

    def _convert_fixture(self) -> str:
        """Convert the fixture PDF with MarkItDown and return its text content."""
        pdf_path = os.path.join(TEST_FILES_DIR, self._FIXTURE_NAME)
        return MarkItDown().convert(pdf_path).text_content

    @classmethod
    def _isolated_numberings(cls, text: str, *, ignore_pipes: bool = False) -> list:
        """Return the stripped lines of *text* that hold only a partial numbering.

        Args:
            text: Converted document text.
            ignore_pipes: When true, ``|`` characters are removed before the
                check so a numbering sitting alone in a table row is caught.

        Returns:
            The offending lines (stripped, pipes retained), in document order.
        """
        isolated = []
        for line in text.split("\n"):
            stripped = line.strip()
            candidate = stripped.replace("|", "").strip() if ignore_pipes else stripped
            if cls._BARE_NUMBERING.fullmatch(candidate):
                isolated.append(stripped)
        return isolated

    def test_partial_numbering_pattern_regex(self):
        """Test that the partial numbering regex pattern correctly matches."""
        # Should match partial numbering patterns
        for candidate in (".1", ".2", ".10", ".99"):
            assert PARTIAL_NUMBERING_PATTERN.match(candidate) is not None, candidate

        # Should NOT match other patterns
        for candidate in ("1.", "1.2", ".1.2", "text", ".a", ""):
            assert PARTIAL_NUMBERING_PATTERN.match(candidate) is None, candidate

    def test_masterformat_partial_numbering_not_split(self):
        """Test that MasterFormat partial numbering stays with associated text.

        MasterFormat documents use partial numbering like:
            .1 The intent of this Request for Proposal...
            .2 Available information relative to...

        These should NOT be split into separate table columns, but kept
        as coherent text lines with the number followed by its description.
        """
        text_content = self._convert_fixture()

        # A line holding only a numbering (ignoring table pipes) means the
        # parser split the numbering away from its text.
        isolated_numberings = self._isolated_numberings(text_content, ignore_pipes=True)
        assert not isolated_numberings, (
            "Partial numberings should not be isolated from their text. "
            f"Found isolated: {isolated_numberings}"
        )

        # Verify that partial numberings appear WITH following text on the
        # same line, e.g. ".1 The intent".
        partial_with_text = self._NUMBERING_WITH_TEXT.findall(text_content)
        assert (
            partial_with_text
        ), "Expected to find partial numberings followed by text on the same line"

    def test_masterformat_content_preserved(self):
        """Test that MasterFormat document content is fully preserved."""
        text_content = self._convert_fixture()

        # Verify key content from the MasterFormat document is preserved
        expected_content = [
            "RFP for Construction Management Services",
            "Section 00 00 43",
            "Instructions to Respondents",
            "Ken Sargent House",
            "INTENT",
            "Request for Proposal",
            "KEN SARGENT HOUSE",
            "GRANDE PRAIRIE, ALBERTA",
            "Section 00 00 45",
        ]

        for content in expected_content:
            assert (
                content in text_content
            ), f"Expected content '{content}' not found in extracted text"

        # .1 should be followed by "The intent" on the same line
        assert re.search(
            r"\.1" + self._SAME_LINE_WS + r"The intent", text_content
        ), "Partial numbering .1 should be followed by 'The intent' text"

        # .2 should be followed by "Available information" on the same line
        assert re.search(
            r"\.2" + self._SAME_LINE_WS + r"Available information", text_content
        ), "Partial numbering .2 should be followed by 'Available information' text"

        # Ensure text content is not empty and has reasonable length
        assert (
            len(text_content.strip()) > 100
        ), "MasterFormat document should have substantial text content"

    def test_merge_partial_numbering_with_empty_lines_between(self):
        """Test that partial numberings merge correctly even with empty lines between.

        When PDF extractors produce output like:
            .1

            The intent of this Request...

        The merge logic should still combine them properly.
        """
        lines = self._convert_fixture().split("\n")

        for i, line in enumerate(lines):
            stripped = line.strip()
            if not self._BARE_NUMBERING.fullmatch(stripped):
                continue

            # An isolated numbering with text within the next two lines
            # (i.e. at most one empty line in between) means the merge failed.
            for following in lines[i + 1 : i + 3]:
                if following.strip():
                    pytest.fail(
                        f"Partial numbering '{stripped}' on line {i} was not "
                        f"merged with following text '{following.strip()[:30]}...'"
                    )

    def test_multiple_partial_numberings_all_merged(self):
        """Test that all partial numberings in a document are properly merged."""
        text_content = self._convert_fixture()

        # Count merged partial numberings (number followed by text on the same line)
        merged_count = len(self._NUMBERING_WITH_TEXT.findall(text_content))

        # Count isolated partial numberings (number alone on a line)
        isolated_count = len(self._isolated_numberings(text_content))

        assert (
            merged_count >= 2
        ), f"Expected at least 2 merged partial numberings, found {merged_count}"
        assert (
            isolated_count == 0
        ), f"Found {isolated_count} isolated partial numberings that weren't merged"
0 commit comments