
Commit 743a83e

Fix review - step 1
1 parent 3beda82 commit 743a83e

4 files changed: +105 -116 lines changed


libs/community/langchain_community/document_loaders/parsers/pdf.py

Lines changed: 84 additions & 85 deletions
@@ -60,21 +60,37 @@
 
 logger = logging.getLogger(__name__)
 
-_format_image_str = "\n\n{image_text}\n\n"
-_join_images = "\n"
-_join_tables = "\n"
-_default_page_delimitor = "\n\f"
+_FORMAT_IMAGE_STR = "\n\n{image_text}\n\n"
+_JOIN_IMAGES = "\n"
+_JOIN_TABLES = "\n"
+_DEFAULT_PAGE_DELIMITOR = "\n\f"
+
+_STD_METADATA_KEYS={"source", "total_pages", "creationdate", "creator", "producer"}
+
+def _validate_metadata(metadata: dict[str, Any]) -> dict[str,Any]:
+    """Validates the presence of at least the following keys:
+    - source
+    - page (if mode='page')
+    - total_page
+    - creationdate
+    - creator
+    - producer
+    """
+    if not _STD_METADATA_KEYS.issubset(metadata.keys()):
+        raise ValueError("The PDF parser must valorize the standard metadata.")
+    if not isinstance(metadata.get("page",0), int):
+        raise ValueError("The PDF metadata page must be a integer.")
+    return metadata
 
 
-def purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
-    """
-    Purge metadata from unwanted keys and normalize key names.
+def _purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
+    """Purge metadata from unwanted keys and normalize key names.
 
     Args:
         metadata: The original metadata dictionary.
 
     Returns:
-        The cleaned and normalized metadata dictionary.
+        The cleaned and normalized the key format of metadata dictionary.
     """
     new_metadata: dict[str, Any] = {}
     map_key = {
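
For context, the new _validate_metadata helper can be exercised directly. A minimal sketch, assuming this branch of langchain_community is installed; the metadata values below are invented:

    # Sketch of the validation added above: all standard keys must be present,
    # and "page", when given, must be an int. Sample values are illustrative.
    from langchain_community.document_loaders.parsers.pdf import _validate_metadata

    metadata = {
        "source": "example.pdf",
        "total_pages": 2,
        "creationdate": "2024-01-01T00:00:00",
        "creator": "LibreOffice",
        "producer": "LibreOffice",
        "page": 0,
    }
    print(_validate_metadata(metadata))  # returned unchanged

    try:
        _validate_metadata({"source": "example.pdf"})  # missing standard keys
    except ValueError as err:
        print(err)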
@@ -95,7 +111,7 @@ def purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
             except ValueError:
                 new_metadata[k] = v
         elif k in map_key:
-            # Normliaze key with others PDF parser
+            # Normaliaze key with others PDF parser
             new_metadata[map_key[k]] = v
             new_metadata[k] = v
         elif isinstance(v, str):
@@ -105,53 +121,11 @@ def purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
     return new_metadata
 
 
-_delim = ["\n\n\n", "\n\n"]  # To insert images or table in the middle of the page.
-
-
-def __merge_text_and_extras(
-    extras: list[str], text_from_page: str, recurs: bool
-) -> Optional[str]:
-    """
-    Insert extras such as image/table in a text between two paragraphs if possible.
-    Recursive version.
-
-    Args:
-        extras: List of extra content (images/tables) to insert.
-        text_from_page: The text content from the page.
-        recurs: Flag to indicate if the function should recurse.
-
-    Returns:
-        The merged text with extras inserted, or None if no insertion point is found.
-    """
-    if extras:
-        for delim in _delim:
-            pos = text_from_page.rfind(delim)
-            if pos != -1:
-                # search penultimate, to bypass an error in footer
-                previous_text = None
-                if recurs:
-                    previous_text = __merge_text_and_extras(
-                        extras, text_from_page[:pos], False
-                    )
-                if previous_text:
-                    all_text = previous_text + text_from_page[pos:]
-                else:
-                    all_extras = ""
-                    str_extras = "\n\n".join(filter(lambda x: x, extras))
-                    if str_extras:
-                        all_extras = delim + str_extras
-                    all_text = text_from_page[:pos] + all_extras + text_from_page[pos:]
-                break
-        else:
-            all_text = None
-    else:
-        all_text = text_from_page
-    return all_text
+_PARAGRAPH_DELIMITOR = ["\n\n\n", "\n\n"]  # To insert images or table in the middle of the page.
 
 
 def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
-    """
-    Insert extras such as image/table in a text between two paragraphs if possible,
+    """Insert extras such as image/table in a text between two paragraphs if possible,
     else at the end of the text.
 
     Args:
@@ -161,12 +135,42 @@ def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
     Returns:
         The merged text with extras inserted.
     """
-    all_text = __merge_text_and_extras(extras, text_from_page, True)
+
+    def _recurs_merge_text_and_extras(
+        extras: list[str], text_from_page: str, recurs: bool
+    ) -> Optional[str]:
+        if extras:
+            for delim in _PARAGRAPH_DELIMITOR:
+                pos = text_from_page.rfind(delim)
+                if pos != -1:
+                    # search penultimate, to bypass an error in footer
+                    previous_text = None
+                    if recurs:
+                        previous_text = _recurs_merge_text_and_extras(
+                            extras, text_from_page[:pos], False
+                        )
+                    if previous_text:
+                        all_text = previous_text + text_from_page[pos:]
+                    else:
+                        all_extras = ""
+                        str_extras = "\n\n".join(filter(lambda x: x, extras))
+                        if str_extras:
+                            all_extras = delim + str_extras
+                        all_text = text_from_page[:pos] + all_extras + text_from_page[
+                            pos:]
+                    break
+            else:
+                all_text = None
+        else:
+            all_text = text_from_page
+        return all_text
+
+    all_text = _recurs_merge_text_and_extras(extras, text_from_page, True)
     if not all_text:
         all_extras = ""
         str_extras = "\n\n".join(filter(lambda x: x, extras))
         if str_extras:
-            all_extras = _delim[-1] + str_extras
+            all_extras = _PARAGRAPH_DELIMITOR[-1] + str_extras
         all_text = text_from_page + all_extras
 
     return all_text
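
To illustrate what _merge_text_and_extras does with the nested helper above, a minimal sketch; it assumes this branch is importable, and the page text and extras are invented:

    # Extras (OCR text, tables) are inserted before the last paragraph break
    # instead of being appended, so a paragraph that continues on the next
    # page is not interrupted. Strings below are illustrative only.
    from langchain_community.document_loaders.parsers.pdf import _merge_text_and_extras

    page_text = "First paragraph.\n\nSecond paragraph, continued on the next page"
    extras = ["[image: a chart]", "[table: 2 rows]"]
    print(_merge_text_and_extras(extras, page_text))
    # First paragraph.
    #
    # [image: a chart]
    #
    # [table: 2 rows]
    #
    # Second paragraph, continued on the next page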
@@ -212,8 +216,7 @@ def convert_images_to_text_with_rapidocr(
     *,
     format: Literal["text", "markdown", "html"] = "text",
 ) -> CONVERT_IMAGE_TO_TEXT:
-    """
-    Return a function to convert images to text using RapidOCR.
+    """Return a function to convert images to text using RapidOCR.
 
     Note: RapidOCR is compatible english and chinese languages.
 
@@ -258,8 +261,7 @@ def convert_images_to_text_with_tesseract(
     format: Literal["text", "markdown", "html"] = "text",
     langs: list[str] = ["eng"],
 ) -> CONVERT_IMAGE_TO_TEXT:
-    """
-    Return a function to convert images to text using Tesseract.
+    """Return a function to convert images to text using Tesseract.
     Args:
         format: Format of the output text. Either "text" or "markdown".
         langs: Array of langs for Tesseract
@@ -291,22 +293,20 @@ def _convert_images_to_text(images: Iterable[np.ndarray]) -> Iterator[str]:
     return _convert_images_to_text
 
 
-_prompt_images_to_description = PromptTemplate.from_template(
-    """You are an assistant tasked with summarizing images for retrieval. \
+_prompt_images_to_description = """You are an assistant tasked with summarizing \
+images for retrieval. \
 These summaries will be embedded and used to retrieve the raw image. \
 Give a concise summary of the image that is well optimized for retrieval \
 and extract all the text from the image."""
-)
 
 
 def convert_images_to_description(
     model: BaseChatModel,
     *,
-    prompt: BasePromptTemplate = _prompt_images_to_description,
+    prompt: str = _prompt_images_to_description,
     format: Literal["text", "markdown", "html"] = "markdown",
 ) -> CONVERT_IMAGE_TO_TEXT:
-    """
-    Return a function to convert images to text using a multimodal model.
+    """Return a function to convert images to text using a multimodal model.
 
     Args:
         model: Multimodal model to use to describe the images.
@@ -326,16 +326,15 @@ def _convert_images_to_description(
             raise ImportError(
                 "`PIL` package not found, please install it with `pip install pillow`"
             )
-        chat = model
         for image in images:
             image_bytes = io.BytesIO()
             Image.fromarray(image).save(image_bytes, format="PNG")
             img_base64 = base64.b64encode(image_bytes.getvalue()).decode("utf-8")
-            msg = chat.invoke(
+            msg = model.invoke(
                 [
                     HumanMessage(
                         content=[
-                            {"type": "text", "text": prompt.format()},
+                            {"type": "text", "text": prompt},
                             {
                                 "type": "image_url",
                                 "image_url": {
@@ -416,8 +415,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
         )
 
         def _extract_text_from_page(page: pypdf.PageObject) -> str:
-            """
-            Extract text from image given the version of pypdf.
+            """Extract text from image given the version of pypdf.
             """
             if pypdf.__version__.startswith("3"):
                 return page.extract_text()
@@ -646,7 +644,7 @@ def __init__(
         *,
         password: Optional[str] = None,
         mode: Literal["single", "page"] = "page",
-        pages_delimitor: str = _default_page_delimitor,
+        pages_delimitor: str = _DEFAULT_PAGE_DELIMITOR,
         images_to_text: CONVERT_IMAGE_TO_TEXT = None,
         extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
         extract_tables_settings: Optional[dict[str, Any]] = None,
@@ -693,8 +691,7 @@ def __init__(
         self.extract_tables_settings = extract_tables_settings
 
     def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
-        """
-        Lazily parse the blob.
+        """Lazily parse the blob.
         Insert image, if possible, between two paragraphs.
         In this way, a paragraph can be continued on the next page.
 
@@ -719,6 +716,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
             )
 
         self.extract_tables_settings = {
+            # See https://pymupdf.readthedocs.io/en/latest/page.html#Page.find_tables
             "clip": None,
             "vertical_strategy": "lines",
             "horizontal_strategy": "lines",
761759
for page in doc:
762760
all_text = self._get_page_content(doc, page, blob).strip()
763761
if self.mode == "page":
762+
764763
yield Document(
765764
page_content=all_text,
766-
metadata=(doc_metadata | {"page": page.number}),
765+
metadata=_validate_metadata(doc_metadata |
766+
{"page": page.number}),
767767
)
768768
else:
769769
full_content.append(all_text)
770770

771771
if self.mode == "single":
772772
yield Document(
773773
page_content=self.pages_delimitor.join(full_content),
774-
metadata=doc_metadata,
774+
metadata=_validate_metadata(doc_metadata),
775775
)
776776

777777
def _get_page_content(
778778
self, doc: pymupdf.Document, page: pymupdf.Page, blob: Blob
779779
) -> str:
780-
"""
781-
Get the text of the page using PyMuPDF and RapidOCR and issue a warning
780+
"""Get the text of the page using PyMuPDF and RapidOCR and issue a warning
782781
if it is empty.
783782
784783
Args:
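
A usage sketch of the parser with this validation in place; the import paths and the mode/pages_delimitor parameters reflect this branch, and example.pdf is a placeholder:

    # In "page" mode each yielded Document carries page-level metadata that now
    # passes through _validate_metadata(); "single" mode joins pages with
    # pages_delimitor. Requires pymupdf; the file path is a placeholder.
    from langchain_core.documents.base import Blob
    from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser

    parser = PyMuPDFParser(mode="page")
    blob = Blob.from_path("example.pdf")
    for doc in parser.lazy_parse(blob):
        print(doc.metadata["page"], doc.metadata["source"])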
@@ -819,7 +818,7 @@ def _extract_metadata(self, doc: pymupdf.Document, blob: Blob) -> dict:
         Returns:
             dict: The extracted metadata.
         """
-        return purge_metadata(
+        return _purge_metadata(
             dict(
                 {
                     "source": blob.source,  # type: ignore[attr-defined]
@@ -860,12 +859,12 @@ def _extract_images_from_page(
                     pix.height, pix.width, -1
                 )
             )
-        _format_image_str.format(
-            image_text=_join_images.join(self.convert_image_to_text(images))
+        _FORMAT_IMAGE_STR.format(
+            image_text=_JOIN_IMAGES.join(self.convert_image_to_text(images))
         )
 
-        return _format_image_str.format(
-            image_text=_join_images.join(self.convert_image_to_text(images))
+        return _FORMAT_IMAGE_STR.format(
+            image_text=_JOIN_IMAGES.join(self.convert_image_to_text(images))
         )
 
     def _extract_tables_from_page(self, page: pymupdf.Page) -> str:
@@ -886,9 +885,9 @@ def _extract_tables_from_page(self, page: pymupdf.Page) -> str:
         )
         if tables_list:
             if self.extract_tables == "markdown":
-                return _join_tables.join([table.to_markdown() for table in tables_list])
+                return _JOIN_TABLES.join([table.to_markdown() for table in tables_list])
             elif self.extract_tables == "html":
-                return _join_tables.join(
+                return _JOIN_TABLES.join(
                     [
                         table.to_pandas().to_html(
                             header=False,
@@ -899,7 +898,7 @@ def _extract_tables_from_page(self, page: pymupdf.Page) -> str:
                     ]
                 )
             elif self.extract_tables == "csv":
-                return _join_tables.join(
+                return _JOIN_TABLES.join(
                     [
                         table.to_pandas().to_csv(
                             header=False,

0 commit comments