22
33from __future__ import annotations
44
5- import html
65import io
76import logging
87import threading
98import warnings
109from datetime import datetime
11- from urllib .parse import urlparse
12-
13- import numpy as np
1410from typing import (
1511 TYPE_CHECKING ,
1612 Any ,
13+ Iterable ,
1714 Iterator ,
1815 Literal ,
1916 Mapping ,
2017 Optional ,
2118 Sequence ,
2219 Union ,
2320)
21+ from urllib .parse import urlparse
22+
23+ import numpy as np
24+ from langchain_core .documents import Document
2425
2526from langchain_community .document_loaders .base import BaseBlobParser
2627from langchain_community .document_loaders .blob_loaders import Blob
27- from langchain_community .document_loaders .parsers .images import ImageBlobParser , \
28- RapidOCRBlobParser
29- from langchain_core .documents import Document
28+ from langchain_community .document_loaders .parsers .images import (
29+ ImageBlobParser ,
30+ RapidOCRBlobParser ,
31+ )
3032
3133if TYPE_CHECKING :
3234 import pdfminer
5355 "JBIG2Decode" ,
5456]
5557
58+
def extract_from_images_with_rapidocr(
    images: Sequence[Union[Iterable[np.ndarray], bytes]],
) -> str:
    """Extract text from images with RapidOCR.

    Each image is passed to a single shared ``RapidOCR`` instance. For every
    image that yields a non-empty result, the second element of each result
    entry is taken as the recognized text and the entries are joined with a
    newline. The per-image texts are then concatenated directly, with no
    separator inserted between consecutive images.

    Args:
        images: Images to extract text from.

    Returns:
        Text extracted from images. Empty string when ``images`` is empty or
        no image produced a result.

    Raises:
        ImportError: If `rapidocr-onnxruntime` package is not installed.
    """
    # Imported lazily so the module stays importable without the optional
    # OCR dependency.
    try:
        from rapidocr_onnxruntime import RapidOCR
    except ImportError as err:
        # Chain the original error so the real import failure stays visible.
        raise ImportError(
            "`rapidocr-onnxruntime` package not found, please install it with "
            "`pip install rapidocr-onnxruntime`"
        ) from err

    ocr = RapidOCR()
    per_image_text: list[str] = []
    for img in images:
        result, _ = ocr(img)
        if result:
            # entry[1] is the text field of each OCR result entry.
            per_image_text.append("\n".join(entry[1] for entry in result))
    # No separator between images: matches the historical output exactly.
    return "".join(per_image_text)
88+
89+
5690logger = logging .getLogger (__name__ )
5791
5892_FORMAT_IMAGE_STR = "\n \n {image_text}\n \n "
5993_JOIN_IMAGES = "\n "
6094_JOIN_TABLES = "\n "
6195_DEFAULT_PAGE_DELIMITOR = "\n \f "
6296
63- _STD_METADATA_KEYS = {"source" , "total_pages" , "creationdate" , "creator" , "producer" }
97+ _STD_METADATA_KEYS = {"source" , "total_pages" , "creationdate" , "creator" , "producer" }
98+
6499
65- def _validate_metadata (metadata : dict [str , Any ]) -> dict [str ,Any ]:
100+ def _validate_metadata (metadata : dict [str , Any ]) -> dict [str , Any ]:
66101 """Validates the presence of at least the following keys:
67102 - source
68103 - page (if mode='page')
@@ -73,7 +108,7 @@ def _validate_metadata(metadata: dict[str, Any]) -> dict[str,Any]:
73108 """
74109 if not _STD_METADATA_KEYS .issubset (metadata .keys ()):
75110 raise ValueError ("The PDF parser must valorize the standard metadata." )
76- if not isinstance (metadata .get ("page" ,0 ), int ):
111+ if not isinstance (metadata .get ("page" , 0 ), int ):
77112 raise ValueError ("The PDF metadata page must be a integer." )
78113 return metadata
79114
@@ -116,7 +151,10 @@ def _purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
116151 return new_metadata
117152
118153
119- _PARAGRAPH_DELIMITOR = ["\n \n \n " , "\n \n " ] # To insert images or table in the middle of the page.
154+ _PARAGRAPH_DELIMITOR = [
155+ "\n \n \n " ,
156+ "\n \n " ,
157+ ] # To insert images or table in the middle of the page.
120158
121159
122160def _merge_text_and_extras (extras : list [str ], text_from_page : str ) -> str :
@@ -132,7 +170,7 @@ def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
132170 """
133171
134172 def _recurs_merge_text_and_extras (
135- extras : list [str ], text_from_page : str , recurs : bool
173+ extras : list [str ], text_from_page : str , recurs : bool
136174 ) -> Optional [str ]:
137175 if extras :
138176 for delim in _PARAGRAPH_DELIMITOR :
@@ -151,8 +189,9 @@ def _recurs_merge_text_and_extras(
151189 str_extras = "\n \n " .join (filter (lambda x : x , extras ))
152190 if str_extras :
153191 all_extras = delim + str_extras
154- all_text = text_from_page [:pos ] + all_extras + text_from_page [
155- pos :]
192+ all_text = (
193+ text_from_page [:pos ] + all_extras + text_from_page [pos :]
194+ )
156195 break
157196 else :
158197 all_text = None
@@ -171,7 +210,6 @@ def _recurs_merge_text_and_extras(
171210 return all_text
172211
173212
174-
175213class ImagesPdfParser (BaseBlobParser ):
176214 """Abstract interface for blob parsers with images_to_text."""
177215
@@ -218,8 +256,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
218256 )
219257
220258 def _extract_text_from_page (page : pypdf .PageObject ) -> str :
221- """Extract text from image given the version of pypdf.
222- """
259+ """Extract text from image given the version of pypdf."""
223260 if pypdf .__version__ .startswith ("3" ):
224261 return page .extract_text ()
225262 else :
@@ -561,11 +598,11 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
561598 for page in doc :
562599 all_text = self ._get_page_content (doc , page , blob ).strip ()
563600 if self .mode == "page" :
564-
565601 yield Document (
566602 page_content = all_text ,
567- metadata = _validate_metadata (doc_metadata |
568- {"page" : page .number }),
603+ metadata = _validate_metadata (
604+ doc_metadata | {"page" : page .number }
605+ ),
569606 )
570607 else :
571608 full_content .append (all_text )
@@ -658,17 +695,16 @@ def _extract_images_from_page(
658695 if self .images_parser :
659696 xref = img [0 ]
660697 pix = pymupdf .Pixmap (doc , xref )
661- image = np .frombuffer (pix .samples , dtype = np .uint8 ).reshape (
662- pix .height , pix .width , - 1
663- )
698+ image = np .frombuffer (pix .samples , dtype = np .uint8 ).reshape (
699+ pix .height , pix .width , - 1
700+ )
664701 image_bytes = io .BytesIO ()
665702 Image .fromarray (image ).save (image_bytes , format = "PNG" )
666- blob = Blob .from_data (image_bytes .getvalue (), mime_type = "image/png" )
703+ blob = Blob .from_data (image_bytes .getvalue (), mime_type = "image/png" )
667704 images .append (next (self .images_parser .lazy_parse (blob )).page_content )
668705 return _FORMAT_IMAGE_STR .format (
669- image_text = _JOIN_IMAGES .join (filter (None ,images ))
670- )
671-
706+ image_text = _JOIN_IMAGES .join (filter (None , images ))
707+ )
672708
673709 def _extract_tables_from_page (self , page : pymupdf .Page ) -> str :
674710 """Extract tables from a PDF page.
0 commit comments