fix: patch pdfminer (#98)

qued · web-flow · commit 09e03979de06 · 2023-05-04T14:43:12.000-05:00
Patches pdfminer.six, which fails to parse a keyword when the end of the keyword coincides with the end of a stream of PDF operations.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.4.3
+
+* Patch pdfminer.six to fix parsing bug
+
 ## 0.4.2
 
 * Output of table extraction is now stored in `text_as_html` property rather than `text` property
diff --git a/sample-docs/IRS-form-1987.pdf b/sample-docs/IRS-form-1987.pdf
diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py
@@ -433,9 +433,12 @@ def test_ocr_image(region, objects, ocr_strategy, expected):
     assert elements.needs_ocr(region, objects, ocr_strategy) is expected
 
 
-def test_load_pdf():
-    layouts, images = layout.load_pdf("sample-docs/loremipsum.pdf")
+@pytest.mark.parametrize("filename", ["loremipsum.pdf", "IRS-form-1987.pdf"])
+def test_load_pdf(filename):
+    layouts, images = layout.load_pdf(f"sample-docs/{filename}")
     assert len(layouts)
+    for lo in layouts:
+        assert len(lo)
     assert len(images)
     assert len(layouts) == len(images)
 
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.4.2"  # pragma: no cover
+__version__ = "0.4.3"  # pragma: no cover
diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
@@ -4,8 +4,8 @@
 from typing import List, Optional, Tuple, Union, BinaryIO
 
 import numpy as np
-import pdfplumber
 import pdf2image
+from pdfminer import psparser
 from PIL import Image
 
 from unstructured_inference.inference.elements import (
@@ -17,6 +17,13 @@
 from unstructured_inference.logger import logger
 from unstructured_inference.models.base import get_model
 from unstructured_inference.models.unstructuredmodel import UnstructuredModel
+from unstructured_inference.patches.pdfminer import parse_keyword
+
+# NOTE(alan): Patching this to fix a bug in pdfminer.six. Submitted this PR into pdfminer.six to fix
+# the bug: https://github.com/pdfminer/pdfminer.six/pull/885
+psparser.PSBaseParser._parse_keyword = parse_keyword  # type: ignore
+
+import pdfplumber  # noqa
 
 VALID_OCR_STRATEGIES = (
     "auto",  # Use OCR when it looks like other methods have failed
diff --git a/unstructured_inference/patches/pdfminer.py b/unstructured_inference/patches/pdfminer.py
@@ -0,0 +1,24 @@
+from typing import Union
+
+from pdfminer.psparser import END_KEYWORD, PSKeyword, KWD, PSBaseParser
+
+
+def parse_keyword(self: PSBaseParser, s: bytes, i: int) -> int:
+    """Patch for pdfminer method _parse_keyword of PSBaseParser. Changes are identical to the PR
+    https://github.com/pdfminer/pdfminer.six/pull/885."""
+    m = END_KEYWORD.search(s, i)
+    if not m:
+        j = len(s)
+        self._curtoken += s[i:]
+    else:
+        j = m.start(0)
+        self._curtoken += s[i:j]
+    if self._curtoken == b"true":
+        token: Union[bool, PSKeyword] = True
+    elif self._curtoken == b"false":
+        token = False
+    else:
+        token = KWD(self._curtoken)
+    self._add_token(token)
+    self._parse1 = self._parse_main
+    return j

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1 +1 @@` |
| `1` |  | `-__version__ = "0.4.2" # pragma: no cover` |
|  | `1` | `+__version__ = "0.4.3" # pragma: no cover` |