Commit 3d15d39

Fix one bug, update some typos, and style doc strings while reading

1 parent: 5d4a256

3 files changed: 50 additions, 76 deletions

libs/community/langchain_community/document_loaders/parsers/images.py

Lines changed: 20 additions & 45 deletions
@@ -22,21 +22,14 @@


 class BaseImageBlobParser(BaseBlobParser):
-    """
-    Abstract base class for parsing image blobs into text.
-
-    Attributes:
-        format (Literal["text", "markdown-img", "html-img"]):
-            Output format of the parsed text.
-    """
+    """Abstract base class for parsing image blobs into text."""

     def __init__(
         self,
         *,
         format: Union[Literal["text", "markdown-img", "html-img"], str] = "text",
-    ):
-        """
-        Initializes the BaseImageBlobParser.
+    ) -> None:
+        """Initializes the BaseImageBlobParser.

         Args:
             format (Literal["text", "markdown-img", "html-img"]|str):
@@ -52,28 +45,21 @@ def __init__(

     @abstractmethod
     def _analyze_image(self, img: "Image", format: str) -> str:
-        """
-        Abstract method to analyze an image and extract textual content.
+        """Abstract method to analyze an image and extract textual content.

         Args:
-            img (Image):
-                The image to be analyzed.
-            format (str):
-                The format to use if it's possible
+            img: The image to be analyzed.
+            format: The format to use if it's possible

         Returns:
-            str:
-                The extracted text content.
+            The extracted text content.
         """
-        pass

     def lazy_parse(self, blob: Blob) -> Iterator[Document]:
-        """
-        Lazily parses a blob and yields Document objects containing the parsed content.
+        """Lazily parse a blob and yields Documents containing the parsed content.

         Args:
-            blob (Blob):
-                The blob to be parsed.
+            blob (Blob): The blob to be parsed.

         Yields:
             Document:
@@ -116,8 +102,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:


 class RapidOCRBlobParser(BaseImageBlobParser):
-    """
-    Parser for extracting text from images using the RapidOCR library.
+    """Parser for extracting text from images using the RapidOCR library.

     Attributes:
         ocr:
@@ -183,8 +168,7 @@ def _analyze_image(self, img: "Image", format: str) -> str:


 class TesseractBlobParser(BaseImageBlobParser):
-    """
-    Parser for extracting text from images using the Tesseract OCR library.
+    """Parse for extracting text from images using the Tesseract OCR library.

     Attributes:
         format (Literal["text", "markdown-img", "html-img"]):
@@ -204,8 +188,7 @@ def __init__(
         format: Literal["text", "markdown-img", "html-img"] = "text",
         langs: Iterable[str] = ("eng",),
     ):
-        """
-        Initializes the TesseractBlobParser.
+        """Initialize the TesseractBlobParser.

         Args:
             format (Literal["text", "markdown-img", "html-img"]):
@@ -222,14 +205,11 @@ def __init__(
         self.langs = list(langs)

     def _analyze_image(self, img: "Image", format: str) -> str:
-        """
-        Analyzes an image and extracts text using Tesseract OCR.
+        """Analyze an image and extracts text using Tesseract OCR.

         Args:
-            img (Image):
-                The image to be analyzed.
-            format (str):
-                The format to use if it's possible
+            img: The image to be analyzed.
+            format: The format to use if it's possible

         Returns:
             str: The extracted text content.
@@ -257,8 +237,7 @@ def _analyze_image(self, img: "Image", format: str) -> str:


 class LLMImageBlobParser(BaseImageBlobParser):
-    """
-    Parser for analyzing images using a language model (LLM).
+    """Parser for analyzing images using a language model (LLM).

     Attributes:
         format (Literal["text", "markdown-img", "html-img"]):
@@ -285,8 +264,7 @@ def __init__(
         model: BaseChatModel,
         prompt: BasePromptTemplate = _PROMPT_IMAGES_TO_DESCRIPTION,
     ):
-        """
-        Initializes the LLMImageBlobParser.
+        """Initializes the LLMImageBlobParser.

         Args:
             format (Literal["text", "markdown", "html"]):
@@ -301,16 +279,13 @@ def __init__(
         self.prompt = prompt

     def _analyze_image(self, img: "Image", format: str) -> str:
-        """
-        Analyzes an image using the provided language model.
+        """Analyze an image using the provided language model.

         Args:
-            img (Image):
-                The image to be analyzed.
+            img: The image to be analyzed.

         Returns:
-            str: *
-                The extracted textual content.
+            The extracted textual content.
         """
         image_bytes = io.BytesIO()
         img.save(image_bytes, format="PNG")
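For orientation, here is a minimal usage sketch (not part of the commit) of the image parsers whose docstrings are reworked above. It assumes the import path matches the file location shown in this diff, that Blob.from_path is available in langchain_core, and that a local image file ./sample.png exists (hypothetical name):

from langchain_core.documents.base import Blob

from langchain_community.document_loaders.parsers.images import (
    RapidOCRBlobParser,
    TesseractBlobParser,
)

blob = Blob.from_path("./sample.png")  # hypothetical local image file

# OCR the image with RapidOCR, emitting plain text Documents.
ocr_parser = RapidOCRBlobParser()
for doc in ocr_parser.lazy_parse(blob):
    print(doc.page_content)

# OCR with Tesseract, keeping the result in the "markdown-img" output format
# declared in the signatures above.
md_parser = TesseractBlobParser(format="markdown-img", langs=["eng"])
docs = list(md_parser.lazy_parse(blob))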

libs/community/langchain_community/document_loaders/parsers/pdf.py

Lines changed: 20 additions & 16 deletions
@@ -93,19 +93,22 @@ def extract_from_images_with_rapidocr(
 _FORMAT_IMAGE_STR = "\n\n{image_text}\n\n"
 _JOIN_IMAGES = "\n"
 _JOIN_TABLES = "\n"
-_DEFAULT_PAGE_DELIMITOR = "\n\f"
+_DEFAULT_PAGES_DELIMITER = "\n\f"

 _STD_METADATA_KEYS = {"source", "total_pages", "creationdate", "creator", "producer"}


 def _validate_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
-    """Validates the presence of at least the following keys:
+    """Validate that the metadata has all the standard keys and the page is an integer.
+
+    The standard keys are:
     - source
-    - page (if mode='page')
     - total_page
     - creationdate
     - creator
     - producer
+
+    Validate that page is an integer if it is present.
     """
     if not _STD_METADATA_KEYS.issubset(metadata.keys()):
         raise ValueError("The PDF parser must valorize the standard metadata.")
@@ -142,7 +145,7 @@ def _purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
             except ValueError:
                 new_metadata[k] = v
         elif k in map_key:
-            # Normaliaze key with others PDF parser
+            # Normalize key with others PDF parser
             new_metadata[map_key[k]] = v
             new_metadata[k] = v
         elif isinstance(v, str):
@@ -152,7 +155,7 @@ def _purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
     return new_metadata


-_PARAGRAPH_DELIMITOR = [
+_PARAGRAPH_DELIMITER = [
     "\n\n\n",
     "\n\n",
 ]  # To insert images or table in the middle of the page.
@@ -174,7 +177,7 @@ def _recurs_merge_text_and_extras(
     extras: list[str], text_from_page: str, recurs: bool
 ) -> Optional[str]:
     if extras:
-        for delim in _PARAGRAPH_DELIMITOR:
+        for delim in _PARAGRAPH_DELIMITER:
             pos = text_from_page.rfind(delim)
             if pos != -1:
                 # search penultimate, to bypass an error in footer
@@ -205,7 +208,7 @@ def _recurs_merge_text_and_extras(
         all_extras = ""
         str_extras = "\n\n".join(filter(lambda x: x, extras))
         if str_extras:
-            all_extras = _PARAGRAPH_DELIMITOR[-1] + str_extras
+            all_extras = _PARAGRAPH_DELIMITER[-1] + str_extras
     all_text = text_from_page + all_extras

     return all_text
@@ -470,7 +473,7 @@ def __init__(
         *,
         password: Optional[str] = None,
         mode: Literal["single", "page"] = "page",
-        pages_delimitor: str = _DEFAULT_PAGE_DELIMITOR,
+        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
         images_parser: Optional[BaseImageBlobParser] = None,
         extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
         extract_tables_settings: Optional[dict[str, Any]] = None,
@@ -481,16 +484,14 @@ def __init__(
             password: Optional password for opening encrypted PDFs.
             mode: The extraction mode, either "single" for the entire document or "page"
                 for page-wise extraction.
-            pages_delimitor: A string delimiter to separate pages in single-mode
+            pages_delimiter: A string delimiter to separate pages in single-mode
                 extraction.
             extract_images: Whether to extract images from the PDF.
             images_parser: Optional image blob parser.
             extract_tables: Whether to extract tables in a specific format, such as
                 "csv", "markdown", or "html".
             extract_tables_settings: Optional dictionary of settings for customizing
                 table extraction.
-            **kwargs: Additional keyword arguments for customizing text extraction
-                behavior.

         Returns:
             This method does not directly return data. Use the `parse` or `lazy_parse`
@@ -508,7 +509,7 @@ def __init__(
             raise ValueError("mode must be markdown")

         self.mode = mode
-        self.pages_delimitor = pages_delimitor
+        self.pages_delimiter = pages_delimiter
         self.password = password
         self.text_kwargs = text_kwargs or {}
         if extract_images and not images_parser:
@@ -526,14 +527,18 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-ty
     def _lazy_parse(
         self,
         blob: Blob,
-        text_kwargs: Optional[dict[str, Any]] = None,  # deprectaed
+        # text-kwargs is present for backwards compatibility.
+        # Users should not use it directly.
+        text_kwargs: Optional[dict[str, Any]] = None,
     ) -> Iterator[Document]:  # type: ignore[valid-type]
         """Lazily parse the blob.
         Insert image, if possible, between two paragraphs.
         In this way, a paragraph can be continued on the next page.

         Args:
             blob: The blob to parse.
+            text_kwargs: Optional keyword arguments to pass to the `get_text` method.
+                If provided at run time, it will override the default text_kwargs.

         Raises:
             ImportError: If the `pypdf` package is not found.
@@ -544,8 +549,7 @@ def _lazy_parse(
         try:
             import pymupdf

-            if not text_kwargs:
-                text_kwargs = {}
+            text_kwargs = text_kwargs or self.text_kwargs
             if not self.extract_tables_settings:
                 from pymupdf.table import (
                     DEFAULT_JOIN_TOLERANCE,
@@ -609,7 +613,7 @@ def _lazy_parse(

         if self.mode == "single":
             yield Document(
-                page_content=self.pages_delimitor.join(full_content),
+                page_content=self.pages_delimiter.join(full_content),
                 metadata=_validate_metadata(doc_metadata),
             )
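To illustrate the renamed keyword at the parser level, here is a minimal sketch (not part of the commit). It assumes PyMuPDFParser is importable from the file shown above, that Blob.from_path is available in langchain_core, and that ./example.pdf is a hypothetical local file:

from langchain_core.documents.base import Blob

from langchain_community.document_loaders.parsers.images import TesseractBlobParser
from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser

# After this commit the keyword is pages_delimiter (previously pages_delimitor).
parser = PyMuPDFParser(
    mode="single",
    pages_delimiter="\n\f",  # inserted between pages in "single" mode
    images_parser=TesseractBlobParser(),
    extract_tables="markdown",
)

blob = Blob.from_path("./example.pdf")  # hypothetical local PDF
for doc in parser.lazy_parse(blob):
    print(doc.metadata, len(doc.page_content))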

libs/community/langchain_community/document_loaders/pdf.py

Lines changed: 10 additions & 15 deletions
@@ -30,7 +30,7 @@
 from langchain_community.document_loaders.dedoc import DedocBaseLoader
 from langchain_community.document_loaders.parsers.images import BaseImageBlobParser
 from langchain_community.document_loaders.parsers.pdf import (
-    _DEFAULT_PAGE_DELIMITOR,
+    _DEFAULT_PAGES_DELIMITER,
     AmazonTextractPDFParser,
     DocumentIntelligenceParser,
     PDFMinerParser,
@@ -458,7 +458,7 @@ class PyMuPDFLoader(BasePDFLoader):
            # headers = None
            # password = None,
            mode = "single",
-            pages_delimitor = "\n\f",
+            pages_delimiter = "\n\f",
            # extract_images = True,
            # images_parser = TesseractBlobParser(),
            # extract_tables = "markdown",
@@ -492,7 +492,7 @@ def __init__(
         *,
         password: Optional[str] = None,
         mode: Literal["single", "page"] = "page",
-        pages_delimitor: str = _DEFAULT_PAGE_DELIMITOR,
+        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
         extract_images: bool = False,
         images_parser: Optional[BaseImageBlobParser] = None,
         extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
@@ -509,7 +509,7 @@ def __init__(
             password: Optional password for opening encrypted PDFs.
             mode: The extraction mode, either "single" for the entire document or "page"
                 for page-wise extraction.
-            pages_delimitor: A string delimiter to separate pages in single-mode
+            pages_delimiter: A string delimiter to separate pages in single-mode
                 extraction.
             extract_images: Whether to extract images from the PDF.
             images_parser: Optional image blob parser.
@@ -533,7 +533,7 @@ def __init__(
         self.parser = PyMuPDFParser(
             password=password,
             mode=mode,
-            pages_delimitor=pages_delimitor,
+            pages_delimiter=pages_delimiter,
             text_kwargs=kwargs,
             extract_images=extract_images,
             images_parser=images_parser,
@@ -862,8 +862,8 @@ def lazy_load(
     ) -> Iterator[Document]:
         """Lazy load documents"""
         # the self.file_path is local, but the blob has to include
-        # the S3 location if the file originated from S3 for multi-page documents
-        # raises ValueError when multi-page and not on S3"""
+        # the S3 location if the file originated from S3 for multipage documents
+        # raises ValueError when multipage and not on S3"""

         if self.web_path and self._is_s3_url(self.web_path):
             blob = Blob(path=self.web_path)  # type: ignore[call-arg] # type: ignore[misc]
@@ -1059,7 +1059,7 @@ class ZeroxPDFLoader(BasePDFLoader):
     """Document loader utilizing Zerox library:
     https://github.com/getomni-ai/zerox

-    Zerox converts PDF document to serties of images (page-wise) and
+    Zerox converts PDF document to series of images (page-wise) and
     uses vision-capable LLM model to generate Markdown representation.

     Zerox utilizes anyc operations. Therefore when using this loader
@@ -1079,7 +1079,7 @@ def __init__(
     ) -> None:
         super().__init__(file_path=file_path)
         """Initialize the parser with arguments to be passed to the zerox function.
-        Make sure to set necessary environmnet variables such as API key, endpoint, etc.
+        Make sure to set necessary environment variables such as API key, endpoint, etc.
         Check zerox documentation for list of necessary environment variables for
         any given model.

@@ -1100,12 +1100,7 @@ def __init__(
         self.model = model

     def lazy_load(self) -> Iterator[Document]:
-        """Loads documnts from pdf utilizing zerox library:
-        https://github.com/getomni-ai/zerox
-
-        Returns:
-            Iterator[Document]: An iterator over parsed Document instances.
-        """
+        """Lazily load pages."""
         import asyncio

         from pyzerox import zerox
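And the same rename seen from the loader side, as a minimal sketch (not part of the commit). It assumes PyMuPDFLoader is exported from langchain_community.document_loaders and that ./example.pdf is a hypothetical local file; the metadata keys printed are the standard ones listed in _STD_METADATA_KEYS above:

from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders.parsers.images import RapidOCRBlobParser

# The loader forwards pages_delimiter (renamed in this commit) to PyMuPDFParser.
loader = PyMuPDFLoader(
    "./example.pdf",  # hypothetical local PDF
    mode="single",
    pages_delimiter="\n\f",
    extract_images=True,
    images_parser=RapidOCRBlobParser(),
)

for doc in loader.lazy_load():
    print(doc.metadata.get("source"), doc.metadata.get("total_pages"))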
