Skip to content

Commit 09e0397

Browse files
authored
fix: patch pdfminer (#98)
Patches pdfminer.six, which fails to parse a keyword when the end of that keyword coincides with the end of a stream of PDF operations.
1 parent 854abdb commit 09e0397

File tree

6 files changed

+42
-4
lines changed

6 files changed

+42
-4
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.4.3
2+
3+
* Patch pdfminer.six to fix parsing bug
4+
15
## 0.4.2
26

37
* Output of table extraction is now stored in `text_as_html` property rather than `text` property

sample-docs/IRS-form-1987.pdf

305 KB
Binary file not shown.

test_unstructured_inference/inference/test_layout.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -433,9 +433,12 @@ def test_ocr_image(region, objects, ocr_strategy, expected):
433433
assert elements.needs_ocr(region, objects, ocr_strategy) is expected
434434

435435

436-
def test_load_pdf():
437-
layouts, images = layout.load_pdf("sample-docs/loremipsum.pdf")
436+
@pytest.mark.parametrize("filename", ["loremipsum.pdf", "IRS-form-1987.pdf"])
437+
def test_load_pdf(filename):
438+
layouts, images = layout.load_pdf(f"sample-docs/{filename}")
438439
assert len(layouts)
440+
for lo in layouts:
441+
assert len(lo)
439442
assert len(images)
440443
assert len(layouts) == len(images)
441444

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.4.2" # pragma: no cover
1+
__version__ = "0.4.3" # pragma: no cover

unstructured_inference/inference/layout.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
from typing import List, Optional, Tuple, Union, BinaryIO
55

66
import numpy as np
7-
import pdfplumber
87
import pdf2image
8+
from pdfminer import psparser
99
from PIL import Image
1010

1111
from unstructured_inference.inference.elements import (
@@ -17,6 +17,13 @@
1717
from unstructured_inference.logger import logger
1818
from unstructured_inference.models.base import get_model
1919
from unstructured_inference.models.unstructuredmodel import UnstructuredModel
20+
from unstructured_inference.patches.pdfminer import parse_keyword
21+
22+
# NOTE(alan): Patching this to fix a bug in pdfminer.six. Submitted this PR into pdfminer.six to fix
23+
# the bug: https://github.com/pdfminer/pdfminer.six/pull/885
24+
psparser.PSBaseParser._parse_keyword = parse_keyword # type: ignore
25+
26+
import pdfplumber # noqa
2027

2128
VALID_OCR_STRATEGIES = (
2229
"auto", # Use OCR when it looks like other methods have failed
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from typing import Union
2+
3+
from pdfminer.psparser import END_KEYWORD, PSKeyword, KWD, PSBaseParser
4+
5+
6+
def parse_keyword(self: PSBaseParser, s: bytes, i: int) -> int:
    """Replacement for the ``_parse_keyword`` method of pdfminer's ``PSBaseParser``.

    The changes are identical to https://github.com/pdfminer/pdfminer.six/pull/885.

    Bytes of ``s`` from index ``i`` up to the next ``END_KEYWORD`` match are appended to the
    current token; when there is no match, everything through the end of ``s`` is appended.
    The token is then added (``true``/``false`` as booleans, anything else as a keyword) and
    the parser is switched back to its main state.

    Returns the index in ``s`` where the keyword ended.
    """
    terminator = END_KEYWORD.search(s, i)
    # Without a terminator in the buffer, the keyword extends to the end of ``s``.
    end = terminator.start(0) if terminator else len(s)
    self._curtoken += s[i:end]

    raw = self._curtoken
    token: Union[bool, PSKeyword]
    if raw == b"true":
        token = True
    elif raw == b"false":
        token = False
    else:
        token = KWD(raw)

    self._add_token(token)
    self._parse1 = self._parse_main
    return end

0 commit comments

Comments
 (0)