66from typing import (
77 TYPE_CHECKING ,
88 Any ,
9- Dict ,
109 Iterable ,
1110 Iterator ,
1211 Mapping ,
2322from langchain_community .document_loaders .blob_loaders import Blob
2423
2524if TYPE_CHECKING :
26- import fitz .fitz
27- import pdfminer .layout
28- import pdfplumber .page
29- import pypdf ._page
30- import pypdfium2 ._helpers .page
31- from pypdf import PageObject
25+ import fitz
26+ import pdfminer
27+ import pdfplumber
28+ import pypdf
29+ import pypdfium2
3230 from textractor .data .text_linearization_config import TextLinearizationConfig
3331
34-
3532_PDF_FILTER_WITH_LOSS = ["DCTDecode" , "DCT" , "JPXDecode" ]
3633_PDF_FILTER_WITHOUT_LOSS = [
3734 "LZWDecode" ,
@@ -90,7 +87,7 @@ def __init__(
9087 extract_images : bool = False ,
9188 * ,
9289 extraction_mode : str = "plain" ,
93- extraction_kwargs : Optional [Dict [str , Any ]] = None ,
90+ extraction_kwargs : Optional [dict [str , Any ]] = None ,
9491 ):
9592 self .password = password
9693 self .extract_images = extract_images
@@ -107,7 +104,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
107104 "`pip install pypdf`"
108105 )
109106
110- def _extract_text_from_page (page : " PageObject" ) -> str :
107+ def _extract_text_from_page (page : pypdf . PageObject ) -> str :
111108 """
112109 Extract text from image given the version of pypdf.
113110 """
@@ -126,12 +123,13 @@ def _extract_text_from_page(page: "PageObject") -> str:
126123 Document (
127124 page_content = _extract_text_from_page (page = page )
128125 + self ._extract_images_from_page (page ),
129- metadata = {"source" : blob .source , "page" : page_number }, # type: ignore[attr-defined]
126+ metadata = {"source" : blob .source , "page" : page_number },
127+ # type: ignore[attr-defined]
130128 )
131129 for page_number , page in enumerate (pdf_reader .pages )
132130 ]
133131
134- def _extract_images_from_page (self , page : pypdf ._page . PageObject ) -> str :
132+ def _extract_images_from_page (self , page : pypdf .PageObject ) -> str :
135133 """Extract images from page and get the text with RapidOCR."""
136134 if not self .extract_images or "/XObject" not in page ["/Resources" ].keys (): # type: ignore[attr-defined]
137135 return ""
@@ -307,9 +305,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
307305 for page in doc
308306 ]
309307
310- def _get_page_content (
311- self , doc : fitz .fitz .Document , page : fitz .fitz .Page , blob : Blob
312- ) -> str :
308+ def _get_page_content (self , doc : fitz .Document , page : fitz .Page , blob : Blob ) -> str :
313309 """
314310 Get the text of the page using PyMuPDF and RapidOCR and issue a warning
315311 if it is empty.
@@ -327,7 +323,7 @@ def _get_page_content(
327323 return content
328324
329325 def _extract_metadata (
330- self , doc : fitz .fitz . Document , page : fitz . fitz .Page , blob : Blob
326+ self , doc : fitz .Document , page : fitz .Page , blob : Blob
331327 ) -> dict :
332328 """Extract metadata from the document and page."""
333329 return dict (
@@ -344,9 +340,7 @@ def _extract_metadata(
344340 },
345341 )
346342
347- def _extract_images_from_page (
348- self , doc : fitz .fitz .Document , page : fitz .fitz .Page
349- ) -> str :
343+ def _extract_images_from_page (self , doc : fitz .Document , page : fitz .Page ) -> str :
350344 """Extract images from page and get the text with RapidOCR."""
351345 if not self .extract_images :
352346 return ""
@@ -558,7 +552,7 @@ def __init__(
558552 textract_features : Optional [Sequence [int ]] = None ,
559553 client : Optional [Any ] = None ,
560554 * ,
561- linearization_config : Optional [" TextLinearizationConfig" ] = None ,
555+ linearization_config : Optional [TextLinearizationConfig ] = None ,
562556 ) -> None :
563557 """Initializes the parser.
564558
0 commit comments