Fix: PDF parsing doesn't support partially numbered lists (#1525)

lesyk · web-flow · commit 7fdaefb724d0 · 2026-01-08T15:15:22.000-08:00
* Fix: PDF parsing doesn't support partially numbered lists

* Refactor: Move import of PARTIAL_NUMBERING_PATTERN to the top of the test file

* Refactor: Improve assertion formatting in partial numbering tests
diff --git a/packages/markitdown/src/markitdown/__about__.py b/packages/markitdown/src/markitdown/__about__.py
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.1.4"
+__version__ = "0.1.5b1"
diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -1,11 +1,62 @@
 import sys
 import io
+import re
 from typing import BinaryIO, Any
 
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 
+# Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10")
+PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$")
+
+
+def _merge_partial_numbering_lines(text: str) -> str:
+    """
+    Post-process extracted text to merge MasterFormat-style partial numbering
+    with the following text line.
+
+    MasterFormat documents use partial numbering like:
+        .1  The intent of this Request for Proposal...
+        .2  Available information relative to...
+
+    Some PDF extractors split these into separate lines:
+        .1
+        The intent of this Request for Proposal...
+
+    This function merges them back together.
+    """
+    lines = text.split("\n")
+    result_lines: list[str] = []
+    i = 0
+
+    while i < len(lines):
+        line = lines[i]
+        stripped = line.strip()
+
+        # Check if this line is ONLY a partial numbering
+        if PARTIAL_NUMBERING_PATTERN.match(stripped):
+            # Look for the next non-empty line to merge with
+            j = i + 1
+            while j < len(lines) and not lines[j].strip():
+                j += 1
+
+            if j < len(lines):
+                # Merge the partial numbering with the next line
+                next_line = lines[j].strip()
+                result_lines.append(f"{stripped} {next_line}")
+                i = j + 1  # Skip past the merged line
+            else:
+                # No next line to merge with, keep as is
+                result_lines.append(line)
+                i += 1
+        else:
+            result_lines.append(line)
+            i += 1
+
+    return "\n".join(result_lines)
+
+
 # Load dependencies
 _dependency_exc_info = None
 try:
@@ -117,6 +168,14 @@ def _extract_form_content_from_words(page: Any) -> str | None:
         # Determine row type
         is_paragraph = line_width > page_width * 0.55 and len(combined_text) > 60
 
+        # Check for MasterFormat-style partial numbering (e.g., ".1", ".2")
+        # These should be treated as list items, not table rows
+        has_partial_numbering = False
+        if row_words:
+            first_word = row_words[0]["text"].strip()
+            if PARTIAL_NUMBERING_PATTERN.match(first_word):
+                has_partial_numbering = True
+
         row_info.append(
             {
                 "y_key": y_key,
@@ -125,6 +184,7 @@ def _extract_form_content_from_words(page: Any) -> str | None:
                 "x_groups": x_groups,
                 "is_paragraph": is_paragraph,
                 "num_columns": len(x_groups),
+                "has_partial_numbering": has_partial_numbering,
             }
         )
 
@@ -156,6 +216,11 @@ def _extract_form_content_from_words(page: Any) -> str | None:
             info["is_table_row"] = False
             continue
 
+        # Rows with partial numbering (e.g., ".1", ".2") are list items, not table rows
+        if info["has_partial_numbering"]:
+            info["is_table_row"] = False
+            continue
+
         # Count how many global columns this row's words align with
         aligned_columns: set[int] = set()
         for word in info["words"]:
@@ -469,4 +534,7 @@ def convert(
             pdf_bytes.seek(0)
             markdown = pdfminer.high_level.extract_text(pdf_bytes)
 
+        # Post-process to merge MasterFormat-style partial numbering with following text
+        markdown = _merge_partial_numbering_lines(markdown)
+
         return DocumentConverterResult(markdown=markdown)
diff --git a/packages/markitdown/tests/test_files/masterformat_partial_numbering.pdf b/packages/markitdown/tests/test_files/masterformat_partial_numbering.pdf
diff --git a/packages/markitdown/tests/test_pdf_masterformat.py b/packages/markitdown/tests/test_pdf_masterformat.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python3 -m pytest
+"""Tests for MasterFormat-style partial numbering in PDF conversion."""
+
+import os
+import re
+
+import pytest
+
+from markitdown import MarkItDown
+from markitdown.converters._pdf_converter import (
+    PARTIAL_NUMBERING_PATTERN,
+    _merge_partial_numbering_lines,
+)
+
+TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
+
+
+class TestMasterFormatPartialNumbering:
+    """Test handling of MasterFormat-style partial numbering (.1, .2, etc.)."""
+
+    def test_partial_numbering_pattern_regex(self):
+        """Test that the partial numbering regex pattern correctly matches."""
+
+        # Should match partial numbering patterns
+        assert PARTIAL_NUMBERING_PATTERN.match(".1") is not None
+        assert PARTIAL_NUMBERING_PATTERN.match(".2") is not None
+        assert PARTIAL_NUMBERING_PATTERN.match(".10") is not None
+        assert PARTIAL_NUMBERING_PATTERN.match(".99") is not None
+
+        # Should NOT match other patterns
+        assert PARTIAL_NUMBERING_PATTERN.match("1.") is None
+        assert PARTIAL_NUMBERING_PATTERN.match("1.2") is None
+        assert PARTIAL_NUMBERING_PATTERN.match(".1.2") is None
+        assert PARTIAL_NUMBERING_PATTERN.match("text") is None
+        assert PARTIAL_NUMBERING_PATTERN.match(".a") is None
+        assert PARTIAL_NUMBERING_PATTERN.match("") is None
+
+    def test_masterformat_partial_numbering_not_split(self):
+        """Test that MasterFormat partial numbering stays with associated text.
+
+        MasterFormat documents use partial numbering like:
+            .1  The intent of this Request for Proposal...
+            .2  Available information relative to...
+
+        These should NOT be split into separate table columns, but kept
+        as coherent text lines with the number followed by its description.
+        """
+        pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
+
+        markitdown = MarkItDown()
+        result = markitdown.convert(pdf_path)
+        text_content = result.text_content
+
+        # Partial numberings should NOT appear isolated on their own lines
+        # If they're isolated, it means the parser incorrectly split them from their text
+        lines = text_content.split("\n")
+        isolated_numberings = []
+        for line in lines:
+            stripped = line.strip()
+            # Check if line contains ONLY a partial numbering (with possible whitespace/pipes)
+            cleaned = stripped.replace("|", "").strip()
+            if cleaned in [".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".10"]:
+                isolated_numberings.append(stripped)
+
+        assert len(isolated_numberings) == 0, (
+            f"Partial numberings should not be isolated from their text. "
+            f"Found isolated: {isolated_numberings}"
+        )
+
+        # Verify that partial numberings appear WITH following text on the same line
+        # Look for patterns like ".1 The intent" or ".1  Some text"
+        partial_with_text = re.findall(r"\.\d+\s+\w+", text_content)
+        assert (
+            len(partial_with_text) > 0
+        ), "Expected to find partial numberings followed by text on the same line"
+
+    def test_masterformat_content_preserved(self):
+        """Test that MasterFormat document content is fully preserved."""
+        pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
+
+        markitdown = MarkItDown()
+        result = markitdown.convert(pdf_path)
+        text_content = result.text_content
+
+        # Verify key content from the MasterFormat document is preserved
+        expected_content = [
+            "RFP for Construction Management Services",
+            "Section 00 00 43",
+            "Instructions to Respondents",
+            "Ken Sargent House",
+            "INTENT",
+            "Request for Proposal",
+            "KEN SARGENT HOUSE",
+            "GRANDE PRAIRIE, ALBERTA",
+            "Section 00 00 45",
+        ]
+
+        for content in expected_content:
+            assert (
+                content in text_content
+            ), f"Expected content '{content}' not found in extracted text"
+
+        # Verify partial numbering is followed by text on the same line
+        # .1 should be followed by "The intent" on the same line
+        assert re.search(
+            r"\.1\s+The intent", text_content
+        ), "Partial numbering .1 should be followed by 'The intent' text"
+
+        # .2 should be followed by "Available information" on the same line
+        assert re.search(
+            r"\.2\s+Available information", text_content
+        ), "Partial numbering .2 should be followed by 'Available information' text"
+
+        # Ensure text content is not empty and has reasonable length
+        assert (
+            len(text_content.strip()) > 100
+        ), "MasterFormat document should have substantial text content"
+
+    def test_merge_partial_numbering_with_empty_lines_between(self):
+        """Test that partial numberings merge correctly even with empty lines between.
+
+        When PDF extractors produce output like:
+            .1
+
+            The intent of this Request...
+
+        The merge logic should still combine them properly.
+        """
+        pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
+
+        markitdown = MarkItDown()
+        result = markitdown.convert(pdf_path)
+        text_content = result.text_content
+
+        # The merged result should have .1 and .2 followed by text
+        # Check that we don't have patterns like ".1\n\nThe intent" (unmerged)
+        lines = text_content.split("\n")
+
+        for i, line in enumerate(lines):
+            stripped = line.strip()
+            # If we find an isolated partial numbering, the merge failed
+            if stripped in [".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8"]:
+                # Check if next non-empty line exists and wasn't merged
+                for j in range(i + 1, min(i + 3, len(lines))):
+                    if lines[j].strip():
+                        pytest.fail(
+                            f"Partial numbering '{stripped}' on line {i} was not "
+                            f"merged with following text '{lines[j].strip()[:30]}...'"
+                        )
+                        break
+
+    def test_multiple_partial_numberings_all_merged(self):
+        """Test that all partial numberings in a document are properly merged."""
+        pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
+
+        markitdown = MarkItDown()
+        result = markitdown.convert(pdf_path)
+        text_content = result.text_content
+
+        # Count occurrences of merged partial numberings (number followed by text)
+        merged_count = len(re.findall(r"\.\d+\s+[A-Za-z]", text_content))
+
+        # Count isolated partial numberings (number alone on a line)
+        isolated_count = 0
+        for line in text_content.split("\n"):
+            stripped = line.strip()
+            if re.match(r"^\.\d+$", stripped):
+                isolated_count += 1
+
+        assert (
+            merged_count >= 2
+        ), f"Expected at least 2 merged partial numberings, found {merged_count}"
+        assert (
+            isolated_count == 0
+        ), f"Found {isolated_count} isolated partial numberings that weren't merged"

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`	`1`	`# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>`
`2`	`2`	`#`
`3`	`3`	`# SPDX-License-Identifier: MIT`
`4`		`-__version__ = "0.1.4"`
	`4`	`+__version__ = "0.1.5b1"`