6060
6161logger = logging .getLogger (__name__ )
6262
63- _format_image_str = "\n \n {image_text}\n \n "
64- _join_images = "\n "
65- _join_tables = "\n "
66- _default_page_delimitor = "\n \f "
63+ _FORMAT_IMAGE_STR = "\n \n {image_text}\n \n "
64+ _JOIN_IMAGES = "\n "
65+ _JOIN_TABLES = "\n "
66+ _DEFAULT_PAGE_DELIMITOR = "\n \f "
67+
68+ _STD_METADATA_KEYS = {"source" , "total_pages" , "creationdate" , "creator" , "producer" }
69+
70+ def _validate_metadata (metadata : dict [str , Any ]) -> dict [str ,Any ]:
71+ """Validates the presence of at least the following keys:
72+ - source
73+ - page (if mode='page')
74+ - total_pages
75+ - creationdate
76+ - creator
77+ - producer
78+ """
79+ if not _STD_METADATA_KEYS .issubset (metadata .keys ()):
80+ raise ValueError ("The PDF parser must valorize the standard metadata." )
81+ if not isinstance (metadata .get ("page" ,0 ), int ):
82+ raise ValueError ("The PDF metadata page must be a integer." )
83+ return metadata
6784
6885
69- def purge_metadata (metadata : dict [str , Any ]) -> dict [str , Any ]:
70- """
71- Purge metadata from unwanted keys and normalize key names.
86+ def _purge_metadata (metadata : dict [str , Any ]) -> dict [str , Any ]:
87+ """Purge metadata from unwanted keys and normalize key names.
7288
7389 Args:
7490 metadata: The original metadata dictionary.
7591
7692 Returns:
77- The cleaned and normalized metadata dictionary.
93+ The cleaned metadata dictionary with normalized key names.
7894 """
7995 new_metadata : dict [str , Any ] = {}
8096 map_key = {
@@ -95,7 +111,7 @@ def purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
95111 except ValueError :
96112 new_metadata [k ] = v
97113 elif k in map_key :
98- # Normliaze key with others PDF parser
114+ # Normalize key with other PDF parsers
99115 new_metadata [map_key [k ]] = v
100116 new_metadata [k ] = v
101117 elif isinstance (v , str ):
@@ -105,53 +121,11 @@ def purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
105121 return new_metadata
106122
107123
108- _delim = ["\n \n \n " , "\n \n " ] # To insert images or table in the middle of the page.
109-
110-
111- def __merge_text_and_extras (
112- extras : list [str ], text_from_page : str , recurs : bool
113- ) -> Optional [str ]:
114- """
115- Insert extras such as image/table in a text between two paragraphs if possible.
116- Recursive version.
117-
118- Args:
119- extras: List of extra content (images/tables) to insert.
120- text_from_page: The text content from the page.
121- recurs: Flag to indicate if the function should recurse.
122-
123- Returns:
124- The merged text with extras inserted, or None if no insertion point is found.
125- """
126- if extras :
127- for delim in _delim :
128- pos = text_from_page .rfind (delim )
129- if pos != - 1 :
130- # search penultimate, to bypass an error in footer
131- previous_text = None
132- if recurs :
133- previous_text = __merge_text_and_extras (
134- extras , text_from_page [:pos ], False
135- )
136- if previous_text :
137- all_text = previous_text + text_from_page [pos :]
138- else :
139- all_extras = ""
140- str_extras = "\n \n " .join (filter (lambda x : x , extras ))
141- if str_extras :
142- all_extras = delim + str_extras
143- all_text = text_from_page [:pos ] + all_extras + text_from_page [pos :]
144- break
145- else :
146- all_text = None
147- else :
148- all_text = text_from_page
149- return all_text
124+ _PARAGRAPH_DELIMITOR = ["\n \n \n " , "\n \n " ] # To insert images or table in the middle of the page.
150125
151126
152127def _merge_text_and_extras (extras : list [str ], text_from_page : str ) -> str :
153- """
154- Insert extras such as image/table in a text between two paragraphs if possible,
128+ """Insert extras such as image/table in a text between two paragraphs if possible,
155129 else at the end of the text.
156130
157131 Args:
@@ -161,12 +135,42 @@ def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
161135 Returns:
162136 The merged text with extras inserted.
163137 """
164- all_text = __merge_text_and_extras (extras , text_from_page , True )
138+
139+ def _recurs_merge_text_and_extras (
140+ extras : list [str ], text_from_page : str , recurs : bool
141+ ) -> Optional [str ]:
142+ if extras :
143+ for delim in _PARAGRAPH_DELIMITOR :
144+ pos = text_from_page .rfind (delim )
145+ if pos != - 1 :
146+ # search penultimate, to bypass an error in footer
147+ previous_text = None
148+ if recurs :
149+ previous_text = _recurs_merge_text_and_extras (
150+ extras , text_from_page [:pos ], False
151+ )
152+ if previous_text :
153+ all_text = previous_text + text_from_page [pos :]
154+ else :
155+ all_extras = ""
156+ str_extras = "\n \n " .join (filter (lambda x : x , extras ))
157+ if str_extras :
158+ all_extras = delim + str_extras
159+ all_text = text_from_page [:pos ] + all_extras + text_from_page [
160+ pos :]
161+ break
162+ else :
163+ all_text = None
164+ else :
165+ all_text = text_from_page
166+ return all_text
167+
168+ all_text = _recurs_merge_text_and_extras (extras , text_from_page , True )
165169 if not all_text :
166170 all_extras = ""
167171 str_extras = "\n \n " .join (filter (lambda x : x , extras ))
168172 if str_extras :
169- all_extras = _delim [- 1 ] + str_extras
173+ all_extras = _PARAGRAPH_DELIMITOR [- 1 ] + str_extras
170174 all_text = text_from_page + all_extras
171175
172176 return all_text
@@ -212,8 +216,7 @@ def convert_images_to_text_with_rapidocr(
212216 * ,
213217 format : Literal ["text" , "markdown" , "html" ] = "text" ,
214218) -> CONVERT_IMAGE_TO_TEXT :
215- """
216- Return a function to convert images to text using RapidOCR.
219+ """Return a function to convert images to text using RapidOCR.
217220
218221 Note: RapidOCR is compatible with English and Chinese languages.
219222
@@ -258,8 +261,7 @@ def convert_images_to_text_with_tesseract(
258261 format : Literal ["text" , "markdown" , "html" ] = "text" ,
259262 langs : list [str ] = ["eng" ],
260263) -> CONVERT_IMAGE_TO_TEXT :
261- """
262- Return a function to convert images to text using Tesseract.
264+ """Return a function to convert images to text using Tesseract.
263265 Args:
264266 format: Format of the output text. Either "text" or "markdown".
265267 langs: Array of langs for Tesseract
@@ -291,22 +293,20 @@ def _convert_images_to_text(images: Iterable[np.ndarray]) -> Iterator[str]:
291293 return _convert_images_to_text
292294
293295
294- _prompt_images_to_description = PromptTemplate . from_template (
295- """You are an assistant tasked with summarizing images for retrieval. \
296+ _prompt_images_to_description = """You are an assistant tasked with summarizing \
297+ images for retrieval. \
296298 These summaries will be embedded and used to retrieve the raw image. \
297299 Give a concise summary of the image that is well optimized for retrieval \
298300 and extract all the text from the image."""
299- )
300301
301302
302303def convert_images_to_description (
303304 model : BaseChatModel ,
304305 * ,
305- prompt : BasePromptTemplate = _prompt_images_to_description ,
306+ prompt : str = _prompt_images_to_description ,
306307 format : Literal ["text" , "markdown" , "html" ] = "markdown" ,
307308) -> CONVERT_IMAGE_TO_TEXT :
308- """
309- Return a function to convert images to text using a multimodal model.
309+ """Return a function to convert images to text using a multimodal model.
310310
311311 Args:
312312 model: Multimodal model to use to describe the images.
@@ -326,16 +326,15 @@ def _convert_images_to_description(
326326 raise ImportError (
327327 "`PIL` package not found, please install it with `pip install pillow`"
328328 )
329- chat = model
330329 for image in images :
331330 image_bytes = io .BytesIO ()
332331 Image .fromarray (image ).save (image_bytes , format = "PNG" )
333332 img_base64 = base64 .b64encode (image_bytes .getvalue ()).decode ("utf-8" )
334- msg = chat .invoke (
333+ msg = model .invoke (
335334 [
336335 HumanMessage (
337336 content = [
338- {"type" : "text" , "text" : prompt . format () },
337+ {"type" : "text" , "text" : prompt },
339338 {
340339 "type" : "image_url" ,
341340 "image_url" : {
@@ -416,8 +415,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
416415 )
417416
418417 def _extract_text_from_page (page : pypdf .PageObject ) -> str :
419- """
420- Extract text from image given the version of pypdf.
418+ """Extract text from page given the version of pypdf.
421419 """
422420 if pypdf .__version__ .startswith ("3" ):
423421 return page .extract_text ()
@@ -646,7 +644,7 @@ def __init__(
646644 * ,
647645 password : Optional [str ] = None ,
648646 mode : Literal ["single" , "page" ] = "page" ,
649- pages_delimitor : str = _default_page_delimitor ,
647+ pages_delimitor : str = _DEFAULT_PAGE_DELIMITOR ,
650648 images_to_text : CONVERT_IMAGE_TO_TEXT = None ,
651649 extract_tables : Union [Literal ["csv" , "markdown" , "html" ], None ] = None ,
652650 extract_tables_settings : Optional [dict [str , Any ]] = None ,
@@ -693,8 +691,7 @@ def __init__(
693691 self .extract_tables_settings = extract_tables_settings
694692
695693 def lazy_parse (self , blob : Blob ) -> Iterator [Document ]: # type: ignore[valid-type]
696- """
697- Lazily parse the blob.
694+ """Lazily parse the blob.
698695 Insert image, if possible, between two paragraphs.
699696 In this way, a paragraph can be continued on the next page.
700697
@@ -719,6 +716,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
719716 )
720717
721718 self .extract_tables_settings = {
719+ # See https://pymupdf.readthedocs.io/en/latest/page.html#Page.find_tables
722720 "clip" : None ,
723721 "vertical_strategy" : "lines" ,
724722 "horizontal_strategy" : "lines" ,
@@ -761,24 +759,25 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
761759 for page in doc :
762760 all_text = self ._get_page_content (doc , page , blob ).strip ()
763761 if self .mode == "page" :
762+
764763 yield Document (
765764 page_content = all_text ,
766- metadata = (doc_metadata | {"page" : page .number }),
765+ metadata = _validate_metadata (doc_metadata |
766+ {"page" : page .number }),
767767 )
768768 else :
769769 full_content .append (all_text )
770770
771771 if self .mode == "single" :
772772 yield Document (
773773 page_content = self .pages_delimitor .join (full_content ),
774- metadata = doc_metadata ,
774+ metadata = _validate_metadata ( doc_metadata ) ,
775775 )
776776
777777 def _get_page_content (
778778 self , doc : pymupdf .Document , page : pymupdf .Page , blob : Blob
779779 ) -> str :
780- """
781- Get the text of the page using PyMuPDF and RapidOCR and issue a warning
780+ """Get the text of the page using PyMuPDF and RapidOCR and issue a warning
782781 if it is empty.
783782
784783 Args:
@@ -819,7 +818,7 @@ def _extract_metadata(self, doc: pymupdf.Document, blob: Blob) -> dict:
819818 Returns:
820819 dict: The extracted metadata.
821820 """
822- return purge_metadata (
821+ return _purge_metadata (
823822 dict (
824823 {
825824 "source" : blob .source , # type: ignore[attr-defined]
@@ -860,12 +859,12 @@ def _extract_images_from_page(
860859 pix .height , pix .width , - 1
861860 )
862861 )
863- _format_image_str .format (
864- image_text = _join_images .join (self .convert_image_to_text (images ))
862+ _FORMAT_IMAGE_STR .format (
863+ image_text = _JOIN_IMAGES .join (self .convert_image_to_text (images ))
865864 )
866865
867- return _format_image_str .format (
868- image_text = _join_images .join (self .convert_image_to_text (images ))
866+ return _FORMAT_IMAGE_STR .format (
867+ image_text = _JOIN_IMAGES .join (self .convert_image_to_text (images ))
869868 )
870869
871870 def _extract_tables_from_page (self , page : pymupdf .Page ) -> str :
@@ -886,9 +885,9 @@ def _extract_tables_from_page(self, page: pymupdf.Page) -> str:
886885 )
887886 if tables_list :
888887 if self .extract_tables == "markdown" :
889- return _join_tables .join ([table .to_markdown () for table in tables_list ])
888+ return _JOIN_TABLES .join ([table .to_markdown () for table in tables_list ])
890889 elif self .extract_tables == "html" :
891- return _join_tables .join (
890+ return _JOIN_TABLES .join (
892891 [
893892 table .to_pandas ().to_html (
894893 header = False ,
@@ -899,7 +898,7 @@ def _extract_tables_from_page(self, page: pymupdf.Page) -> str:
899898 ]
900899 )
901900 elif self .extract_tables == "csv" :
902- return _join_tables .join (
901+ return _JOIN_TABLES .join (
903902 [
904903 table .to_pandas ().to_csv (
905904 header = False ,
0 commit comments