Skip to content

Commit 704d6e1

Browse files
authored
chore: Update PDFDocument to use from_file method (#35)
* update PDFDocument to use from_file method
* bump version
1 parent 2d5dba0 commit 704d6e1

File tree

5 files changed

+12
-14
lines changed

5 files changed

+12
-14
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
1-
## 0.2.1-dev8
1+
## 0.2.1-dev9
22

3+
* Update `PDFDocument` to use the `from_file` method
34
* Added staging brick for CSV format for ISD (Initial Structured Data) format.
45
* Added staging brick for separating text into attention window size chunks for `transformers`.
56
* Added staging brick for LabelBox.

test_unstructured/documents/test_pdf.py

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -100,15 +100,13 @@ def test_read_pdf(monkeypatch, mock_page_layout):
100100
images = [image, image]
101101

102102
layouts = Layout([mock_page_layout, mock_page_layout])
103-
page = PDFPage(number=0, image=image, layout=mock_page_layout)
104103

105104
monkeypatch.setattr(detectron2, "model", MockLayoutModel(mock_page_layout))
106105
monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
107106

108-
doc = PDFDocument("fake-file.pdf")
109-
110107
with patch.object(lp, "load_pdf", return_value=(layouts, images)):
111-
page.get_elements()
108+
doc = PDFDocument.from_file("fake-file.pdf")
109+
112110
assert str(doc).startswith("A Catchy Title")
113111
assert str(doc).count("A Catchy Title") == 2 # Once for each page
114112
assert str(doc).endswith("A very repetitive narrative. ")

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
__version__ = "0.2.1-dev8" # pragma: no cover
1+
__version__ = "0.2.1-dev9" # pragma: no cover

unstructured/documents/base.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
from __future__ import annotations
2-
from abc import ABC, abstractmethod
2+
from abc import ABC
33
from typing import List, Optional
44

55
from unstructured.documents.elements import Element, NarrativeText
@@ -15,7 +15,6 @@ def __init__(self):
1515
def __str__(self) -> str:
1616
return "\n\n".join([str(page) for page in self.pages])
1717

18-
@abstractmethod
1918
def _read(self) -> List[Page]: # pragma: no cover
2019
pass
2120

unstructured/documents/pdf.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@ class PDFDocument(Document):
1919
document image analysis (DIA) model detects the layout of the page prior to extracting
2020
element."""
2121

22-
def __init__(self, filename):
22+
def __init__(self):
2323
print(
2424
"""
2525
@@ -29,12 +29,12 @@ def __init__(self, filename):
2929
3030
"""
3131
)
32-
self.filename = filename
3332
super().__init__()
3433

35-
def _read(self) -> List[Page]:
36-
logger.info(f"Reading PDF for file: {self.filename} ...")
37-
layouts, images = lp.load_pdf(self.filename, load_images=True)
34+
@classmethod
35+
def from_file(cls, filename: str):
36+
logger.info(f"Reading PDF for file: {filename} ...")
37+
layouts, images = lp.load_pdf(filename, load_images=True)
3838
pages: List[Page] = list()
3939
for i, layout in enumerate(layouts):
4040
image = images[i]
@@ -43,7 +43,7 @@ def _read(self) -> List[Page]:
4343
page = PDFPage(number=i, image=image, layout=layout)
4444
page.get_elements()
4545
pages.append(page)
46-
return pages
46+
return cls.from_pages(pages)
4747

4848

4949
class PDFPage(Page):

0 commit comments

Comments
 (0)