Skip to content

Commit 7fdaefb

Browse files
authored
Fix: PDF parsing doesn't support partially numbered lists (#1525)
* Fix: PDF parsing doesn't support partially numbered lists * Refactor: Move import of PARTIAL_NUMBERING_PATTERN to the top of the test file * Refactor: Improve assertion formatting in partial numbering tests
1 parent 251dddc commit 7fdaefb

File tree

4 files changed

+240
-1
lines changed

4 files changed

+240
-1
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
22
#
33
# SPDX-License-Identifier: MIT
4-
__version__ = "0.1.4"
4+
__version__ = "0.1.5b1"

packages/markitdown/src/markitdown/converters/_pdf_converter.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,62 @@
11
import sys
22
import io
3+
import re
34
from typing import BinaryIO, Any
45

56
from .._base_converter import DocumentConverter, DocumentConverterResult
67
from .._stream_info import StreamInfo
78
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
89

10+
# Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10").
# Anchored on both ends so it only matches a token that is *nothing but* the
# numbering (".1.2", "1.2" and ".a" do not match).
PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$")


def _merge_partial_numbering_lines(text: str) -> str:
    """
    Merge MasterFormat-style partial numbering with the text line that follows.

    MasterFormat documents use partial numbering like:
        .1  The intent of this Request for Proposal...
        .2  Available information relative to...

    Some PDF extractors split these into separate lines:
        .1
        The intent of this Request for Proposal...

    This function merges them back together.

    Args:
        text: Extracted text, with lines separated by "\n".

    Returns:
        The text with every line consisting solely of a partial numbering
        joined (single space) to the next non-empty line. Blank lines between
        the numbering and the merged line are dropped; both parts are
        whitespace-stripped. A numbering is left untouched when there is no
        following non-empty line, or when the following non-empty line is
        itself a bare partial numbering (so that numbering can still be
        merged with its own text instead of being swallowed).
    """
    lines = text.split("\n")
    result_lines: list[str] = []
    i = 0

    while i < len(lines):
        line = lines[i]
        stripped = line.strip()

        # Only a line that is ONLY a partial numbering is a merge candidate.
        if PARTIAL_NUMBERING_PATTERN.match(stripped):
            # Look for the next non-empty line to merge with.
            j = i + 1
            while j < len(lines) and not lines[j].strip():
                j += 1

            if j < len(lines):
                next_line = lines[j].strip()
                # Never glue two numberings together (".1 .2"): doing so would
                # consume the second numbering and orphan the text after it.
                if not PARTIAL_NUMBERING_PATTERN.match(next_line):
                    result_lines.append(f"{stripped} {next_line}")
                    i = j + 1  # Skip past the blank lines and the merged line
                    continue

        # Ordinary line, or a numbering with nothing suitable to merge into:
        # keep it exactly as it was.
        result_lines.append(line)
        i += 1

    return "\n".join(result_lines)
58+
59+
960
# Load dependencies
1061
_dependency_exc_info = None
1162
try:
@@ -117,6 +168,14 @@ def _extract_form_content_from_words(page: Any) -> str | None:
117168
# Determine row type
118169
is_paragraph = line_width > page_width * 0.55 and len(combined_text) > 60
119170

171+
# Check for MasterFormat-style partial numbering (e.g., ".1", ".2")
172+
# These should be treated as list items, not table rows
173+
has_partial_numbering = False
174+
if row_words:
175+
first_word = row_words[0]["text"].strip()
176+
if PARTIAL_NUMBERING_PATTERN.match(first_word):
177+
has_partial_numbering = True
178+
120179
row_info.append(
121180
{
122181
"y_key": y_key,
@@ -125,6 +184,7 @@ def _extract_form_content_from_words(page: Any) -> str | None:
125184
"x_groups": x_groups,
126185
"is_paragraph": is_paragraph,
127186
"num_columns": len(x_groups),
187+
"has_partial_numbering": has_partial_numbering,
128188
}
129189
)
130190

@@ -156,6 +216,11 @@ def _extract_form_content_from_words(page: Any) -> str | None:
156216
info["is_table_row"] = False
157217
continue
158218

219+
# Rows with partial numbering (e.g., ".1", ".2") are list items, not table rows
220+
if info["has_partial_numbering"]:
221+
info["is_table_row"] = False
222+
continue
223+
159224
# Count how many global columns this row's words align with
160225
aligned_columns: set[int] = set()
161226
for word in info["words"]:
@@ -469,4 +534,7 @@ def convert(
469534
pdf_bytes.seek(0)
470535
markdown = pdfminer.high_level.extract_text(pdf_bytes)
471536

537+
# Post-process to merge MasterFormat-style partial numbering with following text
538+
markdown = _merge_partial_numbering_lines(markdown)
539+
472540
return DocumentConverterResult(markdown=markdown)
Binary file not shown.
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
#!/usr/bin/env python3 -m pytest
2+
"""Tests for MasterFormat-style partial numbering in PDF conversion."""
3+
4+
import os
5+
import re
6+
import pytest
7+
8+
from markitdown import MarkItDown
9+
from markitdown.converters._pdf_converter import PARTIAL_NUMBERING_PATTERN
10+
11+
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
12+
13+
14+
class TestMasterFormatPartialNumbering:
    """Test handling of MasterFormat-style partial numbering (.1, .2, etc.)."""

    # Sample document shared by every conversion-based test in this class.
    SAMPLE_PDF = "masterformat_partial_numbering.pdf"

    @classmethod
    def _convert_sample(cls) -> str:
        """Convert the sample PDF with MarkItDown and return its text content."""
        pdf_path = os.path.join(TEST_FILES_DIR, cls.SAMPLE_PDF)
        return MarkItDown().convert(pdf_path).text_content

    @staticmethod
    def _isolated_numberings(
        text: str, ignore_pipes: bool = False
    ) -> list[tuple[int, str]]:
        """Return (line index, stripped line) for lines that are only a numbering.

        Uses PARTIAL_NUMBERING_PATTERN so the tests agree with the converter on
        what counts as a partial numbering. With ``ignore_pipes`` set, "|"
        characters are removed first so a numbering alone in a table row
        (e.g. "| .1 |") is also reported.
        """
        found: list[tuple[int, str]] = []
        for index, line in enumerate(text.split("\n")):
            stripped = line.strip()
            candidate = stripped.replace("|", "").strip() if ignore_pipes else stripped
            if PARTIAL_NUMBERING_PATTERN.match(candidate):
                found.append((index, stripped))
        return found

    def test_partial_numbering_pattern_regex(self):
        """Test that the partial numbering regex pattern correctly matches."""

        # Should match partial numbering patterns
        for candidate in (".1", ".2", ".10", ".99"):
            assert PARTIAL_NUMBERING_PATTERN.match(candidate) is not None, candidate

        # Should NOT match other patterns
        for candidate in ("1.", "1.2", ".1.2", "text", ".a", ""):
            assert PARTIAL_NUMBERING_PATTERN.match(candidate) is None, candidate

    def test_masterformat_partial_numbering_not_split(self):
        """Test that MasterFormat partial numbering stays with associated text.

        MasterFormat documents use partial numbering like:
            .1  The intent of this Request for Proposal...
            .2  Available information relative to...

        These should NOT be split into separate table columns, but kept
        as coherent text lines with the number followed by its description.
        """
        text_content = self._convert_sample()

        # Partial numberings should NOT appear isolated on their own lines
        # (optionally wrapped in table pipes). If they do, the parser split
        # them from their text.
        isolated_numberings = [
            stripped
            for _, stripped in self._isolated_numberings(
                text_content, ignore_pipes=True
            )
        ]
        assert len(isolated_numberings) == 0, (
            "Partial numberings should not be isolated from their text. "
            f"Found isolated: {isolated_numberings}"
        )

        # Verify that partial numberings appear WITH following text on the same
        # line. "[ \t]+" is used instead of "\s+" because "\s" also matches
        # newlines and would accept unmerged output such as ".1\n\nThe intent".
        partial_with_text = re.findall(r"\.\d+[ \t]+\w+", text_content)
        assert (
            len(partial_with_text) > 0
        ), "Expected to find partial numberings followed by text on the same line"

    def test_masterformat_content_preserved(self):
        """Test that MasterFormat document content is fully preserved."""
        text_content = self._convert_sample()

        # Verify key content from the MasterFormat document is preserved
        expected_content = [
            "RFP for Construction Management Services",
            "Section 00 00 43",
            "Instructions to Respondents",
            "Ken Sargent House",
            "INTENT",
            "Request for Proposal",
            "KEN SARGENT HOUSE",
            "GRANDE PRAIRIE, ALBERTA",
            "Section 00 00 45",
        ]

        for content in expected_content:
            assert (
                content in text_content
            ), f"Expected content '{content}' not found in extracted text"

        # Verify partial numbering is followed by text on the same line.
        # Horizontal whitespace only: "\s+" would also span line breaks.
        # .1 should be followed by "The intent" on the same line
        assert re.search(
            r"\.1[ \t]+The intent", text_content
        ), "Partial numbering .1 should be followed by 'The intent' text"

        # .2 should be followed by "Available information" on the same line
        assert re.search(
            r"\.2[ \t]+Available information", text_content
        ), "Partial numbering .2 should be followed by 'Available information' text"

        # Ensure text content is not empty and has reasonable length
        assert (
            len(text_content.strip()) > 100
        ), "MasterFormat document should have substantial text content"

    def test_merge_partial_numbering_with_empty_lines_between(self):
        """Test that partial numberings merge correctly even with empty lines between.

        When PDF extractors produce output like:
            .1

            The intent of this Request...

        The merge logic should still combine them properly.
        """
        text_content = self._convert_sample()
        lines = text_content.split("\n")

        # An isolated numbering that still has text somewhere after it means
        # the merge failed. The search is not limited to a fixed window because
        # the merge step skips any number of blank lines.
        for index, stripped in self._isolated_numberings(text_content):
            following = next(
                (later.strip() for later in lines[index + 1 :] if later.strip()),
                None,
            )
            if following is not None:
                pytest.fail(
                    f"Partial numbering '{stripped}' on line {index} was not "
                    f"merged with following text '{following[:30]}...'"
                )

    def test_multiple_partial_numberings_all_merged(self):
        """Test that all partial numberings in a document are properly merged."""
        text_content = self._convert_sample()

        # Count merged partial numberings: number, horizontal whitespace, then
        # a letter, all on one line.
        merged_count = len(re.findall(r"\.\d+[ \t]+[A-Za-z]", text_content))

        # Count isolated partial numberings (number alone on a line)
        isolated_count = len(self._isolated_numberings(text_content))

        assert (
            merged_count >= 2
        ), f"Expected at least 2 merged partial numberings, found {merged_count}"
        assert (
            isolated_count == 0
        ), f"Found {isolated_count} isolated partial numberings that weren't merged"

0 commit comments

Comments
 (0)