diff --git a/.gitignore b/.gitignore
index 45d553be4..93ddeafd0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,5 @@ __pycache__
 .mypy_cache_test
 .env
 .venv*
+.idea
+
diff --git a/libs/community/extended_testing_deps.txt b/libs/community/extended_testing_deps.txt
index 6ad4f43e8..2dd58f9e7 100644
--- a/libs/community/extended_testing_deps.txt
+++ b/libs/community/extended_testing_deps.txt
@@ -59,8 +59,8 @@ openapi-pydantic>=0.3.2,<0.4
 oracle-ads>=2.9.1,<3
 oracledb>=2.2.0,<3
 pandas>=2.0.1,<3
-pdfminer-six==20231228
-pdfplumber>=0.11
+pdfminer-six>=20250324
+pdfplumber>=0.11.6
 pgvector>=0.1.6,<0.2
 playwright>=1.48.0,<2
 praw>=7.7.1,<8
diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
index 6b3a0a065..fa151861e 100644
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -129,6 +129,7 @@ def _validate_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
     The standard keys are:
     - source
+    - page (if mode='page')
    - total_pages
    - creationdate
    - creator
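For orientation, the "standard" key set above translates into metadata of roughly the following shape when parsing in page mode. This is a sketch with illustrative values; the actual entries depend on the source PDF's document information dictionary:

```python
# Hypothetical metadata for one page parsed with metadata_format="standard"
# and mode="page" (values are illustrative, not taken from a real file).
doc_metadata = {
    "source": "./example_data/layout-parser-paper.pdf",
    "file_path": "./example_data/layout-parser-paper.pdf",
    "page": 0,  # 0-based; only present when mode="page"
    "total_pages": 16,
    "creationdate": "2021-06-22T01:27:10+00:00",
    "creator": "LaTeX with hyperref",
    "producer": "pdfTeX-1.40.21",
}
```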
+ - "text" = return the content as is + - "markdown-img" = wrap the content into an image markdown link, w/ link + pointing to (`![body)(#)`] + - "html-img" = wrap the content as the `alt` text of an tag and link to + (`{body}`) + extract_tables: Whether to extract images from the PDF in a specific + format, such as "csv", "markdown" or "html". text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()`` - dedupe: Avoiding the error of duplicate characters if `dedupe=True`. + dedupe: Avoiding the error of duplicate characters if `dedupe=True` + extract_tables_settings: Optional dictionary of settings for customizing + table extraction. + metadata_format: Use CamelCase keys with 'legacy' + and lower keys with 'standard'. + + Returns: + This method does not directly return data. Use the `parse` or `lazy_parse` + methods to retrieve parsed documents with content and metadata. + + Raises: + ValueError: If the `mode` is not "single" or "page". + ValueError: If the `extract_tables` is not "csv", "markdown" or "html". + """ + super().__init__() + if mode not in ["single", "page"]: + raise ValueError("mode must be single or page") + if extract_tables and extract_tables not in ["csv", "markdown", "html"]: + raise ValueError("mode must be csv, markdown or html") + if extract_images and not images_parser: + images_parser = RapidOCRBlobParser() + self.password = password + self.extract_images = extract_images + self.images_parser = images_parser + self.images_inner_format = images_inner_format + self.mode = mode + self.pages_delimiter = pages_delimiter + self.dedupe = dedupe + self.text_kwargs = text_kwargs or {} + self.extract_tables = extract_tables + self.extract_tables_settings = extract_tables_settings or { + "vertical_strategy": "lines", + "horizontal_strategy": "lines", + "snap_y_tolerance": 5, + "intersection_x_tolerance": 15, + } + if metadata_format == "legacy": + warnings.warn( + "The default value 'legacy' use some CamelCase keys. " + "It's will be deprecated in the next major version." + ) + + self.metadata_format = metadata_format + + def _validate_metadata(self, metadata: dict[str, Any]) -> dict[str, Any]: + if self.metadata_format == "legacy": + return metadata + else: + return _validate_metadata(metadata) + + def lazy_parse(self, blob: Blob) -> Iterator[Document]: + """Lazily parse the blob. + + Args: + blob: The blob to parse. + + Raises: + ImportError: If the `pypdf` package is not found. + + Yield: + An iterator over the parsed documents. """ try: - import PIL # noqa:F401 + import pdfplumber except ImportError: raise ImportError( - "pillow package not found, please install it with `pip install pillow`" + "pdfplumber package not found, please install it " + "with `pip install pdfplumber`" ) - self.text_kwargs = text_kwargs or {} - self.dedupe = dedupe - self.extract_images = extract_images - - def lazy_parse(self, blob: Blob) -> Iterator[Document]: - """Lazily parse the blob.""" - import pdfplumber with blob.as_bytes_io() as file_path: - doc = pdfplumber.open(file_path) # open document - - yield from [ - Document( - page_content=self._process_page_content(page) - + "\n" - + self._extract_images_from_page(page), - metadata=dict( - { + doc = pdfplumber.open(file_path, password=self.password) # open document + from pdfplumber.utils import geometry + + contents = [] + # The legacy version, use CreationDate, Creator, etc. + # The new 'standard' version must use lower case key. 
+ if self.metadata_format == "legacy": + doc_metadata = ( + { + "producer": "PDFPlumber", + "creator": "PDFPlumber", + "creationdate": "", + } + | doc.metadata # Add parser metdata + | { # with more keys + "source": blob.source, + "file_path": blob.source, + "total_pages": len(doc.pages), + } + ) + else: + doc_metadata = _purge_metadata( + ( + doc.metadata # Add parser metdata + | { # with more keys "source": blob.source, "file_path": blob.source, - "page": page.page_number - 1, "total_pages": len(doc.pages), - }, - **{ - k: doc.metadata[k] - for k in doc.metadata - if type(doc.metadata[k]) in [str, int] - }, - ), + } + ) + ) + + for page in doc.pages: + tables_bbox: list[tuple[float, float, float, float]] = ( + self._extract_tables_bbox_from_page(page) + ) + tables_content = self._extract_tables_from_page(page) + images_bbox = [geometry.obj_to_bbox(image) for image in page.images] + image_from_page = self._extract_images_from_page(page) + page_text = [] + extras = [] + for content in self._split_page_content( + page, + tables_bbox, + tables_content, + images_bbox, + image_from_page, + ): + if isinstance(content, str): # Text + page_text.append(content) + elif isinstance(content, list): # Table + page_text.append(_JOIN_TABLES + self._convert_table(content)) + else: # Image + if self.images_parser: + try: + from PIL import Image as Img + + Img.fromarray(content) # Check if image is valid + image_bytes = io.BytesIO() + numpy.save(image_bytes, content) + blob = Blob.from_data( + image_bytes.getvalue(), + mime_type="application/x-npy", + ) + text_from_image = next( + self.images_parser.lazy_parse(blob) + ).page_content + extras.append( + _format_inner_image( + blob, text_from_image, self.images_inner_format + ) + ) + except TypeError: + pass + except EOFError: + pass + + all_text = _merge_text_and_extras(extras, "".join(page_text).strip()) + + if self.mode == "page": + # For legacy compatibility, add the last '\n'_ + if not all_text.endswith("\n"): + all_text += "\n" + yield Document( + page_content=all_text, + metadata=self._validate_metadata( + doc_metadata + | { + "page": page.page_number - 1, + } + ), + ) + else: + contents.append(all_text) + if self.mode == "single": + yield Document( + page_content=self.pages_delimiter.join(contents), + metadata=self._validate_metadata(doc_metadata), ) - for page in doc.pages - ] def _process_page_content(self, page: pdfplumber.page.Page) -> str: - """Process the page content based on dedupe.""" + """Process the page content based on dedupe. + + Args: + page: The PDF page to process. + + Returns: + The extracted text from the page. + """ if self.dedupe: return page.dedupe_chars().extract_text(**self.text_kwargs) return page.extract_text(**self.text_kwargs) - def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str: - """Extract images from page and get the text with RapidOCR.""" + def _split_page_content( + self, + page: pdfplumber.page.Page, + tables_bbox: list[tuple[float, float, float, float]], + tables_content: list[list[list[Any]]], + images_bbox: list[tuple[float, float, float, float]], + images_content: list[np.ndarray], + **kwargs: Any, + ) -> Iterator[Union[str, list[list[str]], np.ndarray]]: + """Split the page content into text, tables, and images. + + Args: + page: The PDF page to process. + tables_bbox: Bounding boxes of tables on the page. + tables_content: Content of tables on the page. + images_bbox: Bounding boxes of images on the page. + images_content: Content of images on the page. 
 
     def _process_page_content(self, page: pdfplumber.page.Page) -> str:
-        """Process the page content based on dedupe."""
+        """Process the page content based on dedupe.
+
+        Args:
+            page: The PDF page to process.
+
+        Returns:
+            The extracted text from the page.
+        """
         if self.dedupe:
             return page.dedupe_chars().extract_text(**self.text_kwargs)
         return page.extract_text(**self.text_kwargs)
 
-    def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
-        """Extract images from page and get the text with RapidOCR."""
+    def _split_page_content(
+        self,
+        page: pdfplumber.page.Page,
+        tables_bbox: list[tuple[float, float, float, float]],
+        tables_content: list[list[list[Any]]],
+        images_bbox: list[tuple[float, float, float, float]],
+        images_content: list[np.ndarray],
+        **kwargs: Any,
+    ) -> Iterator[Union[str, list[list[str]], np.ndarray]]:
+        """Split the page content into text, tables, and images.
+
+        Args:
+            page: The PDF page to process.
+            tables_bbox: Bounding boxes of tables on the page.
+            tables_content: Content of tables on the page.
+            images_bbox: Bounding boxes of images on the page.
+            images_content: Content of images on the page.
+            **kwargs: Additional keyword arguments.
+
+        Yields:
+            An iterator over the split content (text, tables, images).
+        """
+        from pdfplumber.utils import (
+            geometry,
+            text,
+        )
+
+        # Iterate over the words. If a word falls inside a table, yield the
+        # text accumulated so far, then the table. If the word belongs to a
+        # previously seen table, ignore it. Finish with the accumulated text.
+        kwargs.update(
+            {
+                "keep_blank_chars": True,
+                # "use_text_flow": True,
+                "presorted": True,
+                "layout_bbox": kwargs.get("layout_bbox") or page.cropbox,
+            }
+        )
+        chars = page.dedupe_chars().objects["char"] if self.dedupe else page.chars
+
+        extractor = text.WordExtractor(
+            **{k: kwargs[k] for k in text.WORD_EXTRACTOR_KWARGS if k in kwargs}
+        )
+        wordmap = extractor.extract_wordmap(chars)
+        extract_wordmaps: list[Any] = []
+        used_arrays = [False] * len(tables_bbox)
+        for word, o in wordmap.tuples:
+            is_table = False
+            word_bbox = geometry.obj_to_bbox(word)
+            for i, table_bbox in enumerate(tables_bbox):
+                if geometry.get_bbox_overlap(word_bbox, table_bbox):
+                    # Found a word inside a table
+                    is_table = True
+                    if not used_arrays[i]:
+                        # First word seen in this table:
+                        # yield the previously accumulated text
+                        if extract_wordmaps:
+                            new_wordmap = text.WordMap(tuples=extract_wordmaps)
+                            new_textmap = new_wordmap.to_textmap(
+                                **{
+                                    k: kwargs[k]
+                                    for k in text.TEXTMAP_KWARGS
+                                    if k in kwargs
+                                }
+                            )
+                            yield new_textmap.to_string()
+                            extract_wordmaps.clear()
+                        # And yield the table
+                        used_arrays[i] = True
+                        yield tables_content[i]
+                    break
+            if not is_table:
+                extract_wordmaps.append((word, o))
+        if extract_wordmaps:
+            new_wordmap = text.WordMap(tuples=extract_wordmaps)
+            new_textmap = new_wordmap.to_textmap(
+                **{k: kwargs[k] for k in text.TEXTMAP_KWARGS if k in kwargs}
+            )
+            yield new_textmap.to_string()
+        # Add images
+        for content in images_content:
+            yield content
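The interleaving above hinges on pdfplumber's bounding-box helpers: boxes are `(x0, top, x1, bottom)` tuples, and `get_bbox_overlap()` returns the intersection box, or `None` when the regions are disjoint. A small sketch with made-up coordinates:

```python
from pdfplumber.utils import geometry

word_bbox = (100.0, 200.0, 150.0, 212.0)   # a word inside the table area
table_bbox = (90.0, 190.0, 400.0, 320.0)   # a detected table region
assert geometry.get_bbox_overlap(word_bbox, table_bbox) is not None
assert geometry.get_bbox_overlap((0.0, 0.0, 10.0, 10.0), table_bbox) is None
```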
+ """ + if not self.extract_tables: + return [] + from pdfplumber.table import TableSettings - return extract_from_images_with_rapidocr(images) + table_settings = self.extract_tables_settings + tset = TableSettings.resolve(table_settings) + return [table.bbox for table in page.find_tables(tset)] + + def _extract_tables_from_page( + self, + page: pdfplumber.page.Page, + ) -> list[list[list[Any]]]: + """Extract tables from a PDF page. + + Args: + page: The PDF page to extract tables from. + + Returns: + A list of tables, where each table is a list of rows, and each row is a + list of cell values. + """ + if not self.extract_tables: + return [] + table_settings = self.extract_tables_settings + tables_list = page.extract_tables(table_settings) + return tables_list + + def _convert_table(self, table: list[list[str]]) -> str: + """Convert a table to the specified format. + + Args: + table: The table to convert. + + Returns: + The table content as a string in the specified format. + """ + format = self.extract_tables + if format is None: + return "" + if format == "markdown": + return self._convert_table_to_markdown(table) + elif format == "html": + return self._convert_table_to_html(table) + elif format == "csv": + return self._convert_table_to_csv(table) + else: + raise ValueError(f"Unknown table format: {format}") + + def _convert_table_to_csv(self, table: list[list[str]]) -> str: + """Convert a table to CSV format. + + Args: + table: The table to convert. + + Returns: + The table content as a string in CSV format. + Replace "\n" with " ". + """ + if not table: + return "" + + output = ["\n\n"] + + # iterate over detail rows + for row in table: + line = "" + for i, cell in enumerate(row): + # output None cells with empty string + cell = "" if cell is None else cell.replace("\n", " ") + line += cell + "," + output.append(line) + return "\n".join(output) + "\n\n" + + def _convert_table_to_html(self, table: list[list[str]]) -> str: + """ + Convert table content as a string in HTML format. + If clean is true, markdown syntax is removed from cell content. + + Args: + table: The table to convert. + + Returns: + The table content as a string in HTML format. + """ + if not len(table): + return "" + output = "\n" + clean = True + + # iterate over detail rows + for row in table: + line = "" + for i, cell in enumerate(row): + # output None cells with empty string + cell = "" if cell is None else cell.replace("\n", " ") + if clean: # remove sensitive syntax + cell = html.escape(cell.replace("-", "-")) + line += "" + line += "\n" + output += line + return output + "
" + cell + "
\n" + + def _convert_table_to_markdown(self, table: list[list[str]]) -> str: + """Convert table content as a string in Github-markdown format. + + Args: + table: The table to convert. + + Returns: + The table content as a string in Markdown format. + Replace "-" to "-" and "\n" to " ". + """ + clean = False + if not table: + return "" + col_count = len(table[0]) + + output = "|" + "|".join("" for i in range(col_count)) + "|\n" + output += "|" + "|".join("---" for i in range(col_count)) + "|\n" + + # skip first row in details if header is part of the table + # iterate over detail rows + for row in table: + line = "|" + for i, cell in enumerate(row): + # output None cells with empty string + cell = "" if cell is None else cell.replace("\n", " ") + if clean: # remove sensitive syntax + cell = html.escape(cell.replace("-", "-")) + line += cell + "|" + line += "\n" + output += line + return output + "\n" class AmazonTextractPDFParser(BaseBlobParser): diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py index 6b51e481e..6f2b7c77b 100644 --- a/libs/community/langchain_community/document_loaders/pdf.py +++ b/libs/community/langchain_community/document_loaders/pdf.py @@ -1007,7 +1007,60 @@ def load(self) -> list[Document]: class PDFPlumberLoader(BasePDFLoader): - """Load `PDF` files using `pdfplumber`.""" + """Load and parse a PDF file using 'pdfplumber' library. + + This class provides methods to load and parse PDF documents, supporting various + configurations such as handling password-protected files, extracting images, and + defining extraction mode. It integrates the `pdfplumber` library for PDF processing + and offers both synchronous and asynchronous document loading. + + Examples: + Setup: + + .. code-block:: bash + + pip install -U langchain-community pdfplumber + + Instantiate the loader: + + .. code-block:: python + + from langchain_community.document_loaders import PDFPlumberLoader + + loader = PDFPlumberLoader( + file_path = "./example_data/layout-parser-paper.pdf", + # headers = None + # password = None, + mode = "single", + pages_delimiter = "\n\f", + images_inner_format = "text", + # extract_tables = None, + # extract_tables_settings = None, + # text_kwargs = {"use_text_flow": False, "keep_blank_chars": False}, + # dedupe = False, + metadata_format="standard", + ) + + Lazy load documents: + + .. code-block:: python + + docs = [] + docs_lazy = loader.lazy_load() + + for doc in docs_lazy: + docs.append(doc) + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + Load documents asynchronously: + + .. 
diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py
index 6b51e481e..6f2b7c77b 100644
--- a/libs/community/langchain_community/document_loaders/pdf.py
+++ b/libs/community/langchain_community/document_loaders/pdf.py
@@ -1007,7 +1007,60 @@ def load(self) -> list[Document]:
 
 
 class PDFPlumberLoader(BasePDFLoader):
-    """Load `PDF` files using `pdfplumber`."""
+    """Load and parse a PDF file using the `pdfplumber` library.
+
+    This class provides methods to load and parse PDF documents, supporting various
+    configurations such as handling password-protected files, extracting images, and
+    defining the extraction mode. It integrates the `pdfplumber` library for PDF
+    processing and offers both synchronous and asynchronous document loading.
+
+    Examples:
+        Setup:
+
+        .. code-block:: bash
+
+            pip install -U langchain-community pdfplumber
+
+        Instantiate the loader:
+
+        .. code-block:: python
+
+            from langchain_community.document_loaders import PDFPlumberLoader
+
+            loader = PDFPlumberLoader(
+                file_path = "./example_data/layout-parser-paper.pdf",
+                # headers = None
+                # password = None,
+                mode = "single",
+                pages_delimiter = "\n\f",
+                images_inner_format = "text",
+                # extract_tables = None,
+                # extract_tables_settings = None,
+                # text_kwargs = {"use_text_flow": False, "keep_blank_chars": False},
+                # dedupe = False,
+                metadata_format="standard",
+            )
+
+        Lazy load documents:
+
+        .. code-block:: python
+
+            docs = []
+            docs_lazy = loader.lazy_load()
+
+            for doc in docs_lazy:
+                docs.append(doc)
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        Load documents asynchronously:
+
+        .. code-block:: python
+
+            docs = await loader.aload()
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+    """
 
     def __init__(
         self,
@@ -1016,34 +1069,80 @@ def __init__(
         dedupe: bool = False,
         headers: Optional[dict] = None,
         extract_images: bool = False,
+        *,
+        password: Optional[str] = None,
+        mode: Literal["single", "page"] = "page",
+        images_parser: Optional[BaseImageBlobParser] = None,
+        images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
+        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
+        extract_tables: Optional[Literal["csv", "markdown", "html"]] = None,
+        extract_tables_settings: Optional[dict[str, Any]] = None,
+        metadata_format: Literal["legacy", "standard"] = "legacy",
     ) -> None:
-        """Initialize with a file path."""
-        try:
-            import pdfplumber  # noqa:F401
-        except ImportError:
-            raise ImportError(
-                "pdfplumber package not found, please install it with "
-                "`pip install pdfplumber`"
-            )
+        """Initialize with a file path.
 
-        super().__init__(file_path, headers=headers)
-        self.text_kwargs = text_kwargs or {}
-        self.dedupe = dedupe
-        self.extract_images = extract_images
+        Args:
+            file_path: The path to the PDF file to be loaded.
+            headers: Optional headers to use for GET request to download a file from
+                a web path.
+            password: Optional password for opening encrypted PDFs.
+            mode: The extraction mode, either "single" for the entire document or
+                "page" for page-wise extraction.
+            pages_delimiter: A string delimiter to separate pages in single-mode
+                extraction.
+            extract_images: Whether to extract images from the PDF.
+            images_parser: Optional image blob parser.
+            images_inner_format: The format for the parsed output.
+                - "text" = return the content as is
+                - "markdown-img" = wrap the content into an image markdown link,
+                  w/ link pointing to (`![body](#)`)
+                - "html-img" = wrap the content as the `alt` text of an `<img>` tag
+                  and link to (`<img alt="{body}" src="#"/>`)
+            extract_tables: Whether to extract tables in a specific format, such as
+                "csv", "markdown", or "html".
+            extract_tables_settings: Optional dictionary of settings for customizing
+                table extraction.
+            text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
+            dedupe: Avoid duplicate characters if `dedupe=True`.
+            metadata_format: Use CamelCase keys with 'legacy'
+                and lower-case keys with 'standard'.
 
-    def load(self) -> list[Document]:
-        """Load file."""
+        Returns:
+            This method does not directly return data. Use the `load`, `lazy_load`,
+            or `aload` methods to retrieve parsed documents with content and metadata.
 
-        parser = PDFPlumberParser(
-            text_kwargs=self.text_kwargs,
-            dedupe=self.dedupe,
-            extract_images=self.extract_images,
+        Raises:
+            ImportError: If the `pdfplumber` package is not installed.
+        """
+        super().__init__(file_path, headers=headers)
+        self.parser = PDFPlumberParser(
+            password=password,
+            mode=mode,
+            pages_delimiter=pages_delimiter,
+            extract_images=extract_images,
+            images_parser=images_parser,
+            images_inner_format=images_inner_format,
+            extract_tables=extract_tables,
+            text_kwargs=text_kwargs,
+            extract_tables_settings=extract_tables_settings,
+            dedupe=dedupe,
+            metadata_format=metadata_format,
         )
+
+    def lazy_load(
+        self,
+    ) -> Iterator[Document]:
+        """
+        Lazily load the given path as pages.
+        Insert image, if possible, between two paragraphs.
+        In this way, a paragraph can be continued on the next page.
+        """
         if self.web_path:
             blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path)
         else:
             blob = Blob.from_path(self.file_path)
-        return parser.parse(blob)
+        yield from self.parser.lazy_parse(blob)
 
 
 class AmazonTextractPDFLoader(BasePDFLoader):
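One pattern the reworked loader enables: parse in single mode with a distinctive delimiter, then split the text back into pages downstream. A sketch; the file path and delimiter are arbitrary:

```python
from langchain_community.document_loaders import PDFPlumberLoader

loader = PDFPlumberLoader(
    "./example_data/layout-parser-paper.pdf",
    mode="single",
    pages_delimiter="\n<<PAGE BREAK>>\n",
    metadata_format="standard",
)
doc = loader.load()[0]  # one Document for the whole file
pages = doc.page_content.split("\n<<PAGE BREAK>>\n")
```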
+ """ if self.web_path: blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) else: blob = Blob.from_path(self.file_path) - return parser.parse(blob) + yield from self.parser.lazy_parse(blob) class AmazonTextractPDFLoader(BasePDFLoader): diff --git a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py index 1137dd79f..439feaac8 100644 --- a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py +++ b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py @@ -2,16 +2,21 @@ import re from pathlib import Path -from typing import TYPE_CHECKING, Iterator +from typing import TYPE_CHECKING, Iterator, Type import pytest -import langchain_community.document_loaders.parsers as pdf_parsers from langchain_community.document_loaders.base import BaseBlobParser from langchain_community.document_loaders.blob_loaders import Blob from langchain_community.document_loaders.parsers import ( BaseImageBlobParser, +) +from langchain_community.document_loaders.parsers.pdf import ( + PDFMinerParser, PDFPlumberParser, + PyMuPDFParser, + PyPDFium2Parser, + PyPDFParser, ) if TYPE_CHECKING: @@ -95,13 +100,6 @@ def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False) assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0] -def test_pdfplumber_parser() -> None: - """Test PDFPlumber parser.""" - _assert_with_parser(PDFPlumberParser()) - _assert_with_duplicate_parser(PDFPlumberParser()) - _assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True) - - class EmptyImageBlobParser(BaseImageBlobParser): def _analyze_image(self, img: "Image") -> str: return "Hello world" @@ -112,24 +110,25 @@ def _analyze_image(self, img: "Image") -> str: [("single", EmptyImageBlobParser()), ("page", None)], ) @pytest.mark.parametrize( - "parser_factory,params", + "parser_class,params", [ - ("PDFMinerParser", {}), - ("PyMuPDFParser", {}), - ("PyPDFium2Parser", {}), - ("PyPDFParser", {"extraction_mode": "plain"}), - ("PyPDFParser", {"extraction_mode": "layout"}), + (PDFMinerParser, {}), + (PDFPlumberParser, {"metadata_format": "standard"}), + (PyMuPDFParser, {}), + (PyPDFium2Parser, {}), + (PyPDFParser, {"extraction_mode": "plain"}), + (PyPDFParser, {"extraction_mode": "layout"}), ], ) @pytest.mark.requires("pillow") def test_mode_and_extract_images_variations( - parser_factory: str, + parser_class: Type, params: dict, mode: str, image_parser: BaseImageBlobParser, ) -> None: _test_matrix( - parser_factory, + parser_class, params, mode, image_parser, @@ -142,18 +141,19 @@ def test_mode_and_extract_images_variations( ["text", "markdown-img", "html-img"], ) @pytest.mark.parametrize( - "parser_factory,params", + "parser_class,params", [ - ("PDFMinerParser", {}), - ("PyMuPDFParser", {}), - ("PyPDFium2Parser", {}), - ("PyPDFParser", {"extraction_mode": "plain"}), - ("PyPDFParser", {"extraction_mode": "layout"}), + (PDFMinerParser, {}), + (PDFPlumberParser, {"metadata_format": "standard"}), + (PyMuPDFParser, {}), + (PyPDFium2Parser, {}), + (PyPDFParser, {"extraction_mode": "plain"}), + (PyPDFParser, {"extraction_mode": "layout"}), ], ) @pytest.mark.requires("pillow") def test_mode_and_image_formats_variations( - parser_factory: str, + parser_class: Type, params: dict, images_inner_format: str, ) -> None: @@ -161,7 +161,7 @@ def test_mode_and_image_formats_variations( image_parser = EmptyImageBlobParser() 
diff --git a/libs/community/tests/integration_tests/document_loaders/test_pdf.py b/libs/community/tests/integration_tests/document_loaders/test_pdf.py
index 891308a04..b7e3a4696 100644
--- a/libs/community/tests/integration_tests/document_loaders/test_pdf.py
+++ b/libs/community/tests/integration_tests/document_loaders/test_pdf.py
@@ -1,14 +1,18 @@
 import os
 from pathlib import Path
-from typing import Sequence, Union
+from typing import Sequence, Type, Union
 
 import pytest
 
-import langchain_community.document_loaders as pdf_loaders
-from langchain_community.document_loaders import (
+from langchain_community.document_loaders.pdf import (
     AmazonTextractPDFLoader,
     MathpixPDFLoader,
+    PDFMinerLoader,
     PDFMinerPDFasHTMLLoader,
+    PDFPlumberLoader,
+    PyMuPDFLoader,
+    PyPDFium2Loader,
+    PyPDFLoader,
     UnstructuredPDFLoader,
 )
 
@@ -164,20 +168,19 @@ def test_amazontextract_loader_failures() -> None:
 
 
 @pytest.mark.parametrize(
-    "parser_factory,params",
+    "loader_class,params",
     [
-        ("PDFMinerLoader", {}),
-        ("PyMuPDFLoader", {}),
-        ("PyPDFium2Loader", {}),
-        ("PyPDFLoader", {}),
+        (PDFMinerLoader, {}),
+        (PDFPlumberLoader, {"metadata_format": "standard"}),
+        (PyMuPDFLoader, {}),
+        (PyPDFium2Loader, {}),
+        (PyPDFLoader, {}),
     ],
 )
 def test_standard_parameters(
-    parser_factory: str,
+    loader_class: Type,
     params: dict,
 ) -> None:
-    loader_class = getattr(pdf_loaders, parser_factory)
-
     file_path = Path(__file__).parent.parent / "examples/hello.pdf"
     loader = loader_class(file_path)
     docs = loader.load()
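For reference, the invariants `test_standard_parameters` relies on reduce to checks of roughly this shape. This is a sketch mirroring the shared expectations, not the exact test body:

```python
from langchain_core.documents import Document


def assert_standard_docs(docs: list[Document]) -> None:
    # Every standard-compliant loader should produce at least one Document
    # whose metadata carries the normalized keys.
    assert len(docs) > 0
    for doc in docs:
        assert doc.metadata["source"].endswith("hello.pdf")
        assert "total_pages" in doc.metadata
        assert isinstance(doc.page_content, str)
```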