chore: Update PDFDocument to use from_file method (#35)

MthwRobinson · web-flow · commit 704d6e11d182 · 2022-10-13T16:04:30.000Z
* update PDFDocument to use from_file method

* bump version
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,6 @@
-## 0.2.1-dev8
+## 0.2.1-dev9
 
+* Update `PDFDocument` to use the `from_file` method
 * Added staging brick for CSV format for ISD (Initial Structured Data) format.
 * Added staging brick for separating text into attention window size chunks for `transformers`.
 * Added staging brick for LabelBox.
diff --git a/test_unstructured/documents/test_pdf.py b/test_unstructured/documents/test_pdf.py
@@ -100,15 +100,13 @@ def test_read_pdf(monkeypatch, mock_page_layout):
     images = [image, image]
 
     layouts = Layout([mock_page_layout, mock_page_layout])
-    page = PDFPage(number=0, image=image, layout=mock_page_layout)
 
     monkeypatch.setattr(detectron2, "model", MockLayoutModel(mock_page_layout))
     monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
 
-    doc = PDFDocument("fake-file.pdf")
-
     with patch.object(lp, "load_pdf", return_value=(layouts, images)):
-        page.get_elements()
+        doc = PDFDocument.from_file("fake-file.pdf")
+
         assert str(doc).startswith("A Catchy Title")
         assert str(doc).count("A Catchy Title") == 2  # Once for each page
         assert str(doc).endswith("A very repetitive narrative. ")
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.2.1-dev8"  # pragma: no cover
+__version__ = "0.2.1-dev9"  # pragma: no cover
diff --git a/unstructured/documents/base.py b/unstructured/documents/base.py
@@ -1,5 +1,5 @@
 from __future__ import annotations
-from abc import ABC, abstractmethod
+from abc import ABC
 from typing import List, Optional
 
 from unstructured.documents.elements import Element, NarrativeText
@@ -15,7 +15,6 @@ def __init__(self):
     def __str__(self) -> str:
         return "\n\n".join([str(page) for page in self.pages])
 
-    @abstractmethod
     def _read(self) -> List[Page]:  # pragma: no cover
         pass
 
diff --git a/unstructured/documents/pdf.py b/unstructured/documents/pdf.py
@@ -19,7 +19,7 @@ class PDFDocument(Document):
     document image analysis (DIA) model detects the layout of the page prior to extracting
     element."""
 
-    def __init__(self, filename):
+    def __init__(self):
         print(
             """
 
@@ -29,12 +29,12 @@ def __init__(self, filename):
 
 """
         )
-        self.filename = filename
         super().__init__()
 
-    def _read(self) -> List[Page]:
-        logger.info(f"Reading PDF for file: {self.filename} ...")
-        layouts, images = lp.load_pdf(self.filename, load_images=True)
+    @classmethod
+    def from_file(cls, filename: str):
+        logger.info(f"Reading PDF for file: {filename} ...")
+        layouts, images = lp.load_pdf(filename, load_images=True)
         pages: List[Page] = list()
         for i, layout in enumerate(layouts):
             image = images[i]
@@ -43,7 +43,7 @@ def _read(self) -> List[Page]:
             page = PDFPage(number=i, image=image, layout=layout)
             page.get_elements()
             pages.append(page)
-        return pages
+        return cls.from_pages(pages)
 
 
 class PDFPage(Page):

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1 +1 @@` |
| `1` | | `-__version__ = "0.2.1-dev8" # pragma: no cover` |
| | `1` | `+__version__ = "0.2.1-dev9" # pragma: no cover` |