Commit 3d15d39

Fix one bug, update some typos, and style doc strings while reading

1 parent: 5d4a256

3 files changed: 50 additions, 76 deletions

libs/community/langchain_community/document_loaders/parsers/images.py

Lines changed: 20 additions & 45 deletions
@@ -22,21 +22,14 @@


 class BaseImageBlobParser(BaseBlobParser):
-    """
-    Abstract base class for parsing image blobs into text.
-
-    Attributes:
-        format (Literal["text", "markdown-img", "html-img"]):
-            Output format of the parsed text.
-    """
+    """Abstract base class for parsing image blobs into text."""

     def __init__(
         self,
         *,
         format: Union[Literal["text", "markdown-img", "html-img"], str] = "text",
-    ):
-        """
-        Initializes the BaseImageBlobParser.
+    ) -> None:
+        """Initializes the BaseImageBlobParser.

         Args:
             format (Literal["text", "markdown-img", "html-img"]|str):
@@ -52,28 +45,21 @@ def __init__(

     @abstractmethod
     def _analyze_image(self, img: "Image", format: str) -> str:
-        """
-        Abstract method to analyze an image and extract textual content.
+        """Abstract method to analyze an image and extract textual content.

         Args:
-            img (Image):
-                The image to be analyzed.
-            format (str):
-                The format to use if it's possible
+            img: The image to be analyzed.
+            format: The format to use if it's possible

         Returns:
-            str:
-                The extracted text content.
+            The extracted text content.
         """
-        pass

     def lazy_parse(self, blob: Blob) -> Iterator[Document]:
-        """
-        Lazily parses a blob and yields Document objects containing the parsed content.
+        """Lazily parse a blob and yields Documents containing the parsed content.

         Args:
-            blob (Blob):
-                The blob to be parsed.
+            blob (Blob): The blob to be parsed.

         Yields:
             Document:
@@ -116,8 +102,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:


 class RapidOCRBlobParser(BaseImageBlobParser):
-    """
-    Parser for extracting text from images using the RapidOCR library.
+    """Parser for extracting text from images using the RapidOCR library.

     Attributes:
         ocr:
@@ -183,8 +168,7 @@ def _analyze_image(self, img: "Image", format: str) -> str:


 class TesseractBlobParser(BaseImageBlobParser):
-    """
-    Parser for extracting text from images using the Tesseract OCR library.
+    """Parse for extracting text from images using the Tesseract OCR library.

     Attributes:
         format (Literal["text", "markdown-img", "html-img"]):
@@ -204,8 +188,7 @@ def __init__(
         format: Literal["text", "markdown-img", "html-img"] = "text",
         langs: Iterable[str] = ("eng",),
     ):
-        """
-        Initializes the TesseractBlobParser.
+        """Initialize the TesseractBlobParser.

         Args:
             format (Literal["text", "markdown-img", "html-img"]):
@@ -222,14 +205,11 @@ def __init__(
         self.langs = list(langs)

     def _analyze_image(self, img: "Image", format: str) -> str:
-        """
-        Analyzes an image and extracts text using Tesseract OCR.
+        """Analyze an image and extracts text using Tesseract OCR.

         Args:
-            img (Image):
-                The image to be analyzed.
-            format (str):
-                The format to use if it's possible
+            img: The image to be analyzed.
+            format: The format to use if it's possible

         Returns:
             str: The extracted text content.
@@ -257,8 +237,7 @@ def _analyze_image(self, img: "Image", format: str) -> str:


 class LLMImageBlobParser(BaseImageBlobParser):
-    """
-    Parser for analyzing images using a language model (LLM).
+    """Parser for analyzing images using a language model (LLM).

     Attributes:
         format (Literal["text", "markdown-img", "html-img"]):
@@ -285,8 +264,7 @@ def __init__(
         model: BaseChatModel,
         prompt: BasePromptTemplate = _PROMPT_IMAGES_TO_DESCRIPTION,
     ):
-        """
-        Initializes the LLMImageBlobParser.
+        """Initializes the LLMImageBlobParser.

         Args:
             format (Literal["text", "markdown", "html"]):
@@ -301,16 +279,13 @@ def __init__(
         self.prompt = prompt

     def _analyze_image(self, img: "Image", format: str) -> str:
-        """
-        Analyzes an image using the provided language model.
+        """Analyze an image using the provided language model.

         Args:
-            img (Image):
-                The image to be analyzed.
+            img: The image to be analyzed.

         Returns:
-            str: *
-                The extracted textual content.
+            The extracted textual content.
         """
         image_bytes = io.BytesIO()
         img.save(image_bytes, format="PNG")
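For orientation, here is a minimal usage sketch (not part of the commit) of the image parsers whose docstrings are reworked above. It assumes the import path matches the file location shown in this diff, that Blob.from_path is available in langchain_core, and that a local image file ./sample.png exists (hypothetical name):

from langchain_core.documents.base import Blob

from langchain_community.document_loaders.parsers.images import (
    RapidOCRBlobParser,
    TesseractBlobParser,
)

blob = Blob.from_path("./sample.png")  # hypothetical local image file

# OCR the image with RapidOCR, emitting plain text Documents.
ocr_parser = RapidOCRBlobParser()
for doc in ocr_parser.lazy_parse(blob):
    print(doc.page_content)

# OCR with Tesseract, keeping the result in the "markdown-img" output format
# declared in the signatures above.
md_parser = TesseractBlobParser(format="markdown-img", langs=["eng"])
docs = list(md_parser.lazy_parse(blob))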

libs/community/langchain_community/document_loaders/parsers/pdf.py

Lines changed: 20 additions & 16 deletions
@@ -93,19 +93,22 @@ def extract_from_images_with_rapidocr(
 _FORMAT_IMAGE_STR = "\n\n{image_text}\n\n"
 _JOIN_IMAGES = "\n"
 _JOIN_TABLES = "\n"
-_DEFAULT_PAGE_DELIMITOR = "\n\f"
+_DEFAULT_PAGES_DELIMITER = "\n\f"

 _STD_METADATA_KEYS = {"source", "total_pages", "creationdate", "creator", "producer"}


 def _validate_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
-    """Validates the presence of at least the following keys:
+    """Validate that the metadata has all the standard keys and the page is an integer.
+
+    The standard keys are:
     - source
-    - page (if mode='page')
     - total_page
     - creationdate
     - creator
     - producer
+
+    Validate that page is an integer if it is present.
     """
     if not _STD_METADATA_KEYS.issubset(metadata.keys()):
         raise ValueError("The PDF parser must valorize the standard metadata.")
@@ -142,7 +145,7 @@ def _purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
             except ValueError:
                 new_metadata[k] = v
         elif k in map_key:
-            # Normaliaze key with others PDF parser
+            # Normalize key with others PDF parser
             new_metadata[map_key[k]] = v
             new_metadata[k] = v
         elif isinstance(v, str):
@@ -152,7 +155,7 @@ def _purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
     return new_metadata


-_PARAGRAPH_DELIMITOR = [
+_PARAGRAPH_DELIMITER = [
     "\n\n\n",
     "\n\n",
 ]  # To insert images or table in the middle of the page.
@@ -174,7 +177,7 @@ def _recurs_merge_text_and_extras(
     extras: list[str], text_from_page: str, recurs: bool
 ) -> Optional[str]:
     if extras:
-        for delim in _PARAGRAPH_DELIMITOR:
+        for delim in _PARAGRAPH_DELIMITER:
             pos = text_from_page.rfind(delim)
             if pos != -1:
                 # search penultimate, to bypass an error in footer
@@ -205,7 +208,7 @@ def _recurs_merge_text_and_extras(
         all_extras = ""
         str_extras = "\n\n".join(filter(lambda x: x, extras))
         if str_extras:
-            all_extras = _PARAGRAPH_DELIMITOR[-1] + str_extras
+            all_extras = _PARAGRAPH_DELIMITER[-1] + str_extras
     all_text = text_from_page + all_extras

     return all_text
@@ -470,7 +473,7 @@ def __init__(
         *,
         password: Optional[str] = None,
         mode: Literal["single", "page"] = "page",
-        pages_delimitor: str = _DEFAULT_PAGE_DELIMITOR,
+        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
         images_parser: Optional[BaseImageBlobParser] = None,
         extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
         extract_tables_settings: Optional[dict[str, Any]] = None,
@@ -481,16 +484,14 @@ def __init__(
             password: Optional password for opening encrypted PDFs.
             mode: The extraction mode, either "single" for the entire document or "page"
                 for page-wise extraction.
-            pages_delimitor: A string delimiter to separate pages in single-mode
+            pages_delimiter: A string delimiter to separate pages in single-mode
                 extraction.
             extract_images: Whether to extract images from the PDF.
             images_parser: Optional image blob parser.
             extract_tables: Whether to extract tables in a specific format, such as
                 "csv", "markdown", or "html".
             extract_tables_settings: Optional dictionary of settings for customizing
                 table extraction.
-            **kwargs: Additional keyword arguments for customizing text extraction
-                behavior.

         Returns:
             This method does not directly return data. Use the `parse` or `lazy_parse`
@@ -508,7 +509,7 @@ def __init__(
             raise ValueError("mode must be markdown")

         self.mode = mode
-        self.pages_delimitor = pages_delimitor
+        self.pages_delimiter = pages_delimiter
         self.password = password
         self.text_kwargs = text_kwargs or {}
         if extract_images and not images_parser:
@@ -526,14 +527,18 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-ty
     def _lazy_parse(
         self,
         blob: Blob,
-        text_kwargs: Optional[dict[str, Any]] = None,  # deprectaed
+        # text-kwargs is present for backwards compatibility.
+        # Users should not use it directly.
+        text_kwargs: Optional[dict[str, Any]] = None,
     ) -> Iterator[Document]:  # type: ignore[valid-type]
         """Lazily parse the blob.
         Insert image, if possible, between two paragraphs.
         In this way, a paragraph can be continued on the next page.

         Args:
             blob: The blob to parse.
+            text_kwargs: Optional keyword arguments to pass to the `get_text` method.
+                If provided at run time, it will override the default text_kwargs.

         Raises:
             ImportError: If the `pypdf` package is not found.
@@ -544,8 +549,7 @@ def _lazy_parse(
         try:
             import pymupdf

-            if not text_kwargs:
-                text_kwargs = {}
+            text_kwargs = text_kwargs or self.text_kwargs
             if not self.extract_tables_settings:
                 from pymupdf.table import (
                     DEFAULT_JOIN_TOLERANCE,
@@ -609,7 +613,7 @@ def _lazy_parse(

         if self.mode == "single":
             yield Document(
-                page_content=self.pages_delimitor.join(full_content),
+                page_content=self.pages_delimiter.join(full_content),
                 metadata=_validate_metadata(doc_metadata),
             )
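To illustrate the renamed keyword at the parser level, here is a minimal sketch (not part of the commit). It assumes PyMuPDFParser is importable from the file shown above, that Blob.from_path is available in langchain_core, and that ./example.pdf is a hypothetical local file:

from langchain_core.documents.base import Blob

from langchain_community.document_loaders.parsers.images import TesseractBlobParser
from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser

# After this commit the keyword is pages_delimiter (previously pages_delimitor).
parser = PyMuPDFParser(
    mode="single",
    pages_delimiter="\n\f",  # inserted between pages in "single" mode
    images_parser=TesseractBlobParser(),
    extract_tables="markdown",
)

blob = Blob.from_path("./example.pdf")  # hypothetical local PDF
for doc in parser.lazy_parse(blob):
    print(doc.metadata, len(doc.page_content))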

libs/community/langchain_community/document_loaders/pdf.py

Lines changed: 10 additions & 15 deletions
@@ -30,7 +30,7 @@
 from langchain_community.document_loaders.dedoc import DedocBaseLoader
 from langchain_community.document_loaders.parsers.images import BaseImageBlobParser
 from langchain_community.document_loaders.parsers.pdf import (
-    _DEFAULT_PAGE_DELIMITOR,
+    _DEFAULT_PAGES_DELIMITER,
     AmazonTextractPDFParser,
     DocumentIntelligenceParser,
     PDFMinerParser,
@@ -458,7 +458,7 @@ class PyMuPDFLoader(BasePDFLoader):
            # headers = None
            # password = None,
            mode = "single",
-            pages_delimitor = "\n\f",
+            pages_delimiter = "\n\f",
            # extract_images = True,
            # images_parser = TesseractBlobParser(),
            # extract_tables = "markdown",
@@ -492,7 +492,7 @@ def __init__(
         *,
         password: Optional[str] = None,
         mode: Literal["single", "page"] = "page",
-        pages_delimitor: str = _DEFAULT_PAGE_DELIMITOR,
+        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
         extract_images: bool = False,
         images_parser: Optional[BaseImageBlobParser] = None,
         extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
@@ -509,7 +509,7 @@ def __init__(
             password: Optional password for opening encrypted PDFs.
             mode: The extraction mode, either "single" for the entire document or "page"
                 for page-wise extraction.
-            pages_delimitor: A string delimiter to separate pages in single-mode
+            pages_delimiter: A string delimiter to separate pages in single-mode
                 extraction.
             extract_images: Whether to extract images from the PDF.
             images_parser: Optional image blob parser.
@@ -533,7 +533,7 @@ def __init__(
         self.parser = PyMuPDFParser(
             password=password,
             mode=mode,
-            pages_delimitor=pages_delimitor,
+            pages_delimiter=pages_delimiter,
             text_kwargs=kwargs,
             extract_images=extract_images,
             images_parser=images_parser,
@@ -862,8 +862,8 @@ def lazy_load(
     ) -> Iterator[Document]:
         """Lazy load documents"""
         # the self.file_path is local, but the blob has to include
-        # the S3 location if the file originated from S3 for multi-page documents
-        # raises ValueError when multi-page and not on S3"""
+        # the S3 location if the file originated from S3 for multipage documents
+        # raises ValueError when multipage and not on S3"""

         if self.web_path and self._is_s3_url(self.web_path):
             blob = Blob(path=self.web_path)  # type: ignore[call-arg] # type: ignore[misc]
@@ -1059,7 +1059,7 @@ class ZeroxPDFLoader(BasePDFLoader):
     """Document loader utilizing Zerox library:
     https://github.com/getomni-ai/zerox

-    Zerox converts PDF document to serties of images (page-wise) and
+    Zerox converts PDF document to series of images (page-wise) and
     uses vision-capable LLM model to generate Markdown representation.

     Zerox utilizes anyc operations. Therefore when using this loader
@@ -1079,7 +1079,7 @@ def __init__(
     ) -> None:
         super().__init__(file_path=file_path)
         """Initialize the parser with arguments to be passed to the zerox function.
-        Make sure to set necessary environmnet variables such as API key, endpoint, etc.
+        Make sure to set necessary environment variables such as API key, endpoint, etc.
         Check zerox documentation for list of necessary environment variables for
         any given model.

@@ -1100,12 +1100,7 @@ def __init__(
         self.model = model

     def lazy_load(self) -> Iterator[Document]:
-        """Loads documnts from pdf utilizing zerox library:
-        https://github.com/getomni-ai/zerox
-
-        Returns:
-            Iterator[Document]: An iterator over parsed Document instances.
-        """
+        """Lazily load pages."""
         import asyncio

         from pyzerox import zerox
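And the same rename seen from the loader side, as a minimal sketch (not part of the commit). It assumes PyMuPDFLoader is exported from langchain_community.document_loaders and that ./example.pdf is a hypothetical local file; the metadata keys printed are the standard ones listed in _STD_METADATA_KEYS above:

from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders.parsers.images import RapidOCRBlobParser

# The loader forwards pages_delimiter (renamed in this commit) to PyMuPDFParser.
loader = PyMuPDFLoader(
    "./example.pdf",  # hypothetical local PDF
    mode="single",
    pages_delimiter="\n\f",
    extract_images=True,
    images_parser=RapidOCRBlobParser(),
)

for doc in loader.lazy_load():
    print(doc.metadata.get("source"), doc.metadata.get("total_pages"))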
