diff --git a/.gitignore b/.gitignore
index 45d553be4..93ddeafd0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,5 @@ __pycache__
.mypy_cache_test
.env
.venv*
+.idea
+
diff --git a/libs/community/extended_testing_deps.txt b/libs/community/extended_testing_deps.txt
index 6ad4f43e8..2dd58f9e7 100644
--- a/libs/community/extended_testing_deps.txt
+++ b/libs/community/extended_testing_deps.txt
@@ -59,8 +59,8 @@ openapi-pydantic>=0.3.2,<0.4
oracle-ads>=2.9.1,<3
oracledb>=2.2.0,<3
pandas>=2.0.1,<3
-pdfminer-six==20231228
-pdfplumber>=0.11
+pdfminer-six>=20250324
+pdfplumber>=0.11.6
pgvector>=0.1.6,<0.2
playwright>=1.48.0,<2
praw>=7.7.1,<8
diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
index 6b3a0a065..fa151861e 100644
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -129,6 +129,7 @@ def _validate_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
The standard keys are:
- source
+ - page (if mode='page')
- total_page
- creationdate
- creator
@@ -1386,97 +1387,534 @@ def _extract_images_from_page(self, page: pypdfium2._helpers.page.PdfPage) -> st
class PDFPlumberParser(BaseBlobParser):
- """Parse `PDF` with `PDFPlumber`."""
+ """Parse a blob from a PDF using `pdfplumber` library.
+
+ This class provides methods to parse a blob from a PDF document, supporting various
+ configurations such as handling password-protected PDFs, extracting images, and
+ defining the extraction mode.
+ It integrates the 'pdfplumber' library for PDF processing and offers synchronous
+ blob parsing.
+
+ Examples:
+ Setup:
+
+ .. code-block:: bash
+
+ pip install -U langchain-community pdfplumber
+
+ Load a blob from a PDF file:
+
+ .. code-block:: python
+
+ from langchain_core.documents.base import Blob
+
+ blob = Blob.from_path("./example_data/layout-parser-paper.pdf")
+
+ Instantiate the parser:
+
+ .. code-block:: python
+
+ from langchain_community.document_loaders.parsers import PDFPlumberParser
+
+ parser = PDFPlumberParser(
+ # password=None,
+ mode="single",
+ pages_delimiter="\n\f",
+ # extract_tables="markdown",
+ metadata_format="standard",
+ )
+
+ Lazily parse the blob:
+
+ .. code-block:: python
+
+ docs = []
+ docs_lazy = parser.lazy_parse(blob)
+
+ for doc in docs_lazy:
+ docs.append(doc)
+ print(docs[0].page_content[:100])
+ print(docs[0].metadata)
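+
+ Extract images as text (a sketch; assumes the optional OCR dependencies
+ for `RapidOCRBlobParser`, e.g. `rapidocr-onnxruntime`, are installed):
+
+ .. code-block:: python
+
+     from langchain_community.document_loaders.parsers import PDFPlumberParser
+     from langchain_community.document_loaders.parsers.images import (
+         RapidOCRBlobParser,
+     )
+
+     parser = PDFPlumberParser(
+         mode="page",
+         images_parser=RapidOCRBlobParser(),
+         images_inner_format="markdown-img",
+     )
+     docs = parser.parse(blob)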
+ """
def __init__(
self,
text_kwargs: Optional[Mapping[str, Any]] = None,
dedupe: bool = False,
extract_images: bool = False,
+ *,
+ password: Optional[str] = None,
+ mode: Literal["single", "page"] = "page",
+ pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
+ images_parser: Optional[BaseImageBlobParser] = None,
+ images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
+ extract_tables: Optional[Literal["csv", "markdown", "html"]] = None,
+ extract_tables_settings: Optional[dict[str, Any]] = None,
+ metadata_format: Literal["legacy", "standard"] = "legacy",
) -> None:
"""Initialize the parser.
Args:
+ password: Optional password for opening encrypted PDFs.
+ mode: The extraction mode, either "single" for the entire document or "page"
+ for page-wise extraction.
+ pages_delimiter: A string delimiter to separate pages in single-mode
+ extraction.
+ extract_images: Whether to extract images from the PDF.
+ images_parser: Optional image blob parser.
+ images_inner_format: The format for the parsed output.
+ - "text" = return the content as is
+ - "markdown-img" = wrap the content into an image markdown link, with
+ the link pointing to (`![body](#)`)
+ - "html-img" = wrap the content as the `alt` text of an `<img>` tag,
+ with the link pointing to (`<img alt="{body}" src="#"/>`)
+ extract_tables: Whether to extract tables from the PDF in a specific
+ format, such as "csv", "markdown" or "html".
text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
- dedupe: Avoiding the error of duplicate characters if `dedupe=True`.
+ dedupe: Avoid duplicate characters if `dedupe=True`.
+ extract_tables_settings: Optional dictionary of settings for customizing
+ table extraction.
+ metadata_format: With 'legacy', use CamelCase metadata keys;
+ with 'standard', use lowercase keys.
+
+ Returns:
+ This method does not directly return data. Use the `parse` or `lazy_parse`
+ methods to retrieve parsed documents with content and metadata.
+
+ Raises:
+ ValueError: If the `mode` is not "single" or "page".
+ ValueError: If the `extract_tables` is not "csv", "markdown" or "html".
+ """
+ super().__init__()
+ if mode not in ["single", "page"]:
+ raise ValueError("mode must be single or page")
+ if extract_tables and extract_tables not in ["csv", "markdown", "html"]:
+ raise ValueError("mode must be csv, markdown or html")
+ if extract_images and not images_parser:
+ images_parser = RapidOCRBlobParser()
+ self.password = password
+ self.extract_images = extract_images
+ self.images_parser = images_parser
+ self.images_inner_format = images_inner_format
+ self.mode = mode
+ self.pages_delimiter = pages_delimiter
+ self.dedupe = dedupe
+ self.text_kwargs = text_kwargs or {}
+ self.extract_tables = extract_tables
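+ # Default pdfplumber table-detection settings: locate cells from ruled
+ # lines, with tolerances to snap nearly-aligned rulings and intersections.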
+ self.extract_tables_settings = extract_tables_settings or {
+ "vertical_strategy": "lines",
+ "horizontal_strategy": "lines",
+ "snap_y_tolerance": 5,
+ "intersection_x_tolerance": 15,
+ }
+ if metadata_format == "legacy":
+ warnings.warn(
+ "The default value 'legacy' use some CamelCase keys. "
+ "It's will be deprecated in the next major version."
+ )
+
+ self.metadata_format = metadata_format
+
+ def _validate_metadata(self, metadata: dict[str, Any]) -> dict[str, Any]:
+ if self.metadata_format == "legacy":
+ return metadata
+ else:
+ return _validate_metadata(metadata)
+
+ def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+ """Lazily parse the blob.
+
+ Args:
+ blob: The blob to parse.
+
+ Raises:
+ ImportError: If the `pdfplumber` package is not found.
+
+ Yields:
+ An iterator over the parsed documents.
"""
try:
- import PIL # noqa:F401
+ import pdfplumber
except ImportError:
raise ImportError(
- "pillow package not found, please install it with `pip install pillow`"
+ "pdfplumber package not found, please install it "
+ "with `pip install pdfplumber`"
)
- self.text_kwargs = text_kwargs or {}
- self.dedupe = dedupe
- self.extract_images = extract_images
-
- def lazy_parse(self, blob: Blob) -> Iterator[Document]:
- """Lazily parse the blob."""
- import pdfplumber
with blob.as_bytes_io() as file_path:
- doc = pdfplumber.open(file_path) # open document
-
- yield from [
- Document(
- page_content=self._process_page_content(page)
- + "\n"
- + self._extract_images_from_page(page),
- metadata=dict(
- {
+ doc = pdfplumber.open(file_path, password=self.password) # open document
+ from pdfplumber.utils import geometry
+
+ contents = []
+ # The legacy version uses CreationDate, Creator, etc.
+ # The new 'standard' version uses lowercase keys.
+ if self.metadata_format == "legacy":
+ doc_metadata = (
+ {
+ "producer": "PDFPlumber",
+ "creator": "PDFPlumber",
+ "creationdate": "",
+ }
+ | doc.metadata # Add parser metadata
+ | { # with more keys
+ "source": blob.source,
+ "file_path": blob.source,
+ "total_pages": len(doc.pages),
+ }
+ )
+ else:
+ doc_metadata = _purge_metadata(
+ (
+ doc.metadata # Add parser metadata
+ | { # with more keys
"source": blob.source,
"file_path": blob.source,
- "page": page.page_number - 1,
"total_pages": len(doc.pages),
- },
- **{
- k: doc.metadata[k]
- for k in doc.metadata
- if type(doc.metadata[k]) in [str, int]
- },
- ),
+ }
+ )
+ )
+
+ for page in doc.pages:
+ tables_bbox: list[tuple[float, float, float, float]] = (
+ self._extract_tables_bbox_from_page(page)
+ )
+ tables_content = self._extract_tables_from_page(page)
+ images_bbox = [geometry.obj_to_bbox(image) for image in page.images]
+ image_from_page = self._extract_images_from_page(page)
+ page_text = []
+ extras = []
+ for content in self._split_page_content(
+ page,
+ tables_bbox,
+ tables_content,
+ images_bbox,
+ image_from_page,
+ ):
+ if isinstance(content, str): # Text
+ page_text.append(content)
+ elif isinstance(content, list): # Table
+ page_text.append(_JOIN_TABLES + self._convert_table(content))
+ else: # Image
+ if self.images_parser:
+ try:
+ from PIL import Image as Img
+
+ Img.fromarray(content) # Check if image is valid
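+ # Hand the raw array to the image parser as a NumPy-serialized
+ # (.npy) blob rather than re-encoding it as an image file.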
+ image_bytes = io.BytesIO()
+ np.save(image_bytes, content)
+ blob = Blob.from_data(
+ image_bytes.getvalue(),
+ mime_type="application/x-npy",
+ )
+ text_from_image = next(
+ self.images_parser.lazy_parse(blob)
+ ).page_content
+ extras.append(
+ _format_inner_image(
+ blob, text_from_image, self.images_inner_format
+ )
+ )
+ except (TypeError, EOFError):
+ pass
+
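+ # Merge the image-derived text (extras) into the accumulated page text.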
+ all_text = _merge_text_and_extras(extras, "".join(page_text).strip())
+
+ if self.mode == "page":
+ # For legacy compatibility, add the last '\n'
+ if not all_text.endswith("\n"):
+ all_text += "\n"
+ yield Document(
+ page_content=all_text,
+ metadata=self._validate_metadata(
+ doc_metadata
+ | {
+ "page": page.page_number - 1,
+ }
+ ),
+ )
+ else:
+ contents.append(all_text)
+ if self.mode == "single":
+ yield Document(
+ page_content=self.pages_delimiter.join(contents),
+ metadata=self._validate_metadata(doc_metadata),
)
- for page in doc.pages
- ]
def _process_page_content(self, page: pdfplumber.page.Page) -> str:
- """Process the page content based on dedupe."""
+ """Process the page content based on dedupe.
+
+ Args:
+ page: The PDF page to process.
+
+ Returns:
+ The extracted text from the page.
+ """
if self.dedupe:
return page.dedupe_chars().extract_text(**self.text_kwargs)
return page.extract_text(**self.text_kwargs)
- def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
- """Extract images from page and get the text with RapidOCR."""
+ def _split_page_content(
+ self,
+ page: pdfplumber.page.Page,
+ tables_bbox: list[tuple[float, float, float, float]],
+ tables_content: list[list[list[Any]]],
+ images_bbox: list[tuple[float, float, float, float]],
+ images_content: list[np.ndarray],
+ **kwargs: Any,
+ ) -> Iterator[Union[str, list[list[str]], np.ndarray]]:
+ """Split the page content into text, tables, and images.
+
+ Args:
+ page: The PDF page to process.
+ tables_bbox: Bounding boxes of tables on the page.
+ tables_content: Content of tables on the page.
+ images_bbox: Bounding boxes of images on the page.
+ images_content: Content of images on the page.
+ **kwargs: Additional keyword arguments.
+
+ Yields:
+ An iterator over the split content (text, tables, images).
+ """
+ from pdfplumber.utils import (
+ geometry,
+ text,
+ )
+
+ # Iterate over words. If a word is in a table,
+ # yield the accumulated text, and the table.
+ # If the word is in a previously seen table, ignore it.
+ # Finish with the accumulated text.
+ kwargs.update(
+ {
+ "keep_blank_chars": True,
+ # "use_text_flow": True,
+ "presorted": True,
+ "layout_bbox": kwargs.get("layout_bbox") or page.cropbox,
+ }
+ )
+ chars = page.dedupe_chars().objects["char"] if self.dedupe else page.chars
+
+ extractor = text.WordExtractor(
+ **{k: kwargs[k] for k in text.WORD_EXTRACTOR_KWARGS if k in kwargs}
+ )
+ wordmap = extractor.extract_wordmap(chars)
+ extract_wordmaps: list[Any] = []
+ used_arrays = [False] * len(tables_bbox)
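+ # used_arrays[i] becomes True once table i has been yielded; later words
+ # inside that table's bbox are skipped instead of re-emitted.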
+ for word, o in wordmap.tuples:
+ is_table = False
+ word_bbox = geometry.obj_to_bbox(word)
+ for i, table_bbox in enumerate(tables_bbox):
+ if geometry.get_bbox_overlap(word_bbox, table_bbox):
+ # Found a word in a table
+ is_table = True
+ if not used_arrays[i]:
+ # First time a word is seen in this table
+ # Yield the previous part
+ if extract_wordmaps:
+ new_wordmap = text.WordMap(tuples=extract_wordmaps)
+ new_textmap = new_wordmap.to_textmap(
+ **{
+ k: kwargs[k]
+ for k in text.TEXTMAP_KWARGS
+ if k in kwargs
+ }
+ )
+ yield new_textmap.to_string()
+ extract_wordmaps.clear()
+ # And yield the table
+ used_arrays[i] = True
+ yield tables_content[i]
+ break
+ if not is_table:
+ extract_wordmaps.append((word, o))
+ if extract_wordmaps:
+ new_wordmap = text.WordMap(tuples=extract_wordmaps)
+ new_textmap = new_wordmap.to_textmap(
+ **{k: kwargs[k] for k in text.TEXTMAP_KWARGS if k in kwargs}
+ )
+ yield new_textmap.to_string()
+ # Add images
+ for content in images_content:
+ yield content
+
+ def _extract_images_from_page(self, page: pdfplumber.page.Page) -> list[np.ndarray]:
+ """Extract images from a PDF page.
+
+ Args:
+ page: The PDF page to extract images from.
+
+ Returns:
+ A list of extracted images as numpy arrays.
+ """
from PIL import Image
- if not self.extract_images:
- return ""
+ if not self.images_parser:
+ return []
images = []
for img in page.images:
- if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
- if img["stream"]["BitsPerComponent"] == 1:
- images.append(
- np.array(
- Image.frombytes(
- "1",
- (img["stream"]["Width"], img["stream"]["Height"]),
- img["stream"].get_data(),
- ).convert("L")
- )
- )
- else:
+ if "Filter" in img["stream"]:
+ if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS:
images.append(
np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape(
img["stream"]["Height"], img["stream"]["Width"], -1
)
)
- elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS:
- images.append(img["stream"].get_data())
- else:
- warnings.warn("Unknown PDF Filter!")
+ elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS:
+ buf = np.frombuffer(img["stream"].get_data(), dtype=np.uint8)
+ images.append(np.array(Image.open(io.BytesIO(buf.tobytes()))))
+ else:
+ logger.warning("Unknown PDF Filter!")
+
+ return images
+
+ def _extract_tables_bbox_from_page(
+ self,
+ page: pdfplumber.page.Page,
+ ) -> list[tuple[float, float, float, float]]:
+ """Extract bounding boxes of tables from a PDF page.
+
+ Args:
+ page: The PDF page to extract table bounding boxes from.
+
+ Returns:
+ A list of bounding boxes for tables on the page.
+ """
+ if not self.extract_tables:
+ return []
+ from pdfplumber.table import TableSettings
- return extract_from_images_with_rapidocr(images)
+ table_settings = self.extract_tables_settings
+ tset = TableSettings.resolve(table_settings)
+ return [table.bbox for table in page.find_tables(tset)]
+
+ def _extract_tables_from_page(
+ self,
+ page: pdfplumber.page.Page,
+ ) -> list[list[list[Any]]]:
+ """Extract tables from a PDF page.
+
+ Args:
+ page: The PDF page to extract tables from.
+
+ Returns:
+ A list of tables, where each table is a list of rows, and each row is a
+ list of cell values.
+ """
+ if not self.extract_tables:
+ return []
+ table_settings = self.extract_tables_settings
+ tables_list = page.extract_tables(table_settings)
+ return tables_list
+
+ def _convert_table(self, table: list[list[str]]) -> str:
+ """Convert a table to the specified format.
+
+ Args:
+ table: The table to convert.
+
+ Returns:
+ The table content as a string in the specified format.
+ """
+ fmt = self.extract_tables
+ if fmt is None:
+ return ""
+ if fmt == "markdown":
+ return self._convert_table_to_markdown(table)
+ elif fmt == "html":
+ return self._convert_table_to_html(table)
+ elif fmt == "csv":
+ return self._convert_table_to_csv(table)
+ else:
+ raise ValueError(f"Unknown table format: {fmt}")
+
+ def _convert_table_to_csv(self, table: list[list[str]]) -> str:
+ """Convert a table to CSV format.
+
+ Args:
+ table: The table to convert.
+
+ Returns:
+ The table content as a string in CSV format.
+ Newlines ("\n") in cell values are replaced with spaces.
+ """
+ if not table:
+ return ""
+
+ output = ["\n\n"]
+
+ # iterate over detail rows
+ for row in table:
+ line = ""
+ for cell in row:
+ # output None cells with empty string
+ cell = "" if cell is None else cell.replace("\n", " ")
+ line += cell + ","
+ output.append(line)
+ return "\n".join(output) + "\n\n"
+
+ def _convert_table_to_html(self, table: list[list[str]]) -> str:
+ """
+ Convert a table to a string in HTML format.
+
+ Args:
+ table: The table to convert.
+
+ Returns:
+ The table content as a string in HTML format.
+ """
+ if not table:
+ return ""
+ output = "
| " + cell + " | " + line += "