55from abc import abstractmethod
66from typing import TYPE_CHECKING , Iterator , Literal
77
8+ import numpy
89import numpy as np
910from langchain_core .documents import Document
1011from langchain_core .language_models import BaseChatModel
1920logger = logging .getLogger (__name__ )
2021
2122
class BaseImageBlobParser(BaseBlobParser):
    """
    Abstract base class for parsing image blobs into text.

    Subclasses implement `_analyze_image`; `lazy_parse` decodes the blob into
    a PIL image, delegates the analysis, and wraps the extracted text in the
    requested output format.

    Attributes:
        format (Literal["text", "markdown", "html"]):
            Output format of the parsed text.
    """

    def __init__(
        self,
        *,
        format: Literal["text", "markdown", "html"] = "text",
    ):
        """
        Initializes the BaseImageBlobParser.

        Args:
            format (Literal["text", "markdown", "html"]):
                The format for the parsed output.
        """
        self.format = format

    @abstractmethod
    def _analyze_image(self, img: "Image") -> str:
        """
        Abstract method to analyze an image and extract textual content.

        Args:
            img (Image):
                The image to be analyzed.

        Returns:
            str:
                The extracted text content.
        """

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """
        Lazily parses a blob and yields Document objects containing the parsed content.

        Args:
            blob (Blob):
                The blob to be parsed.

        Yields:
            Document:
                A document containing the parsed content and metadata.
                Nothing is yielded when the analysis returns empty content.

        Raises:
            ImportError:
                If the Pillow package is not installed.
        """
        # Guard only the Pillow import: an ImportError raised by a subclass's
        # `_analyze_image` (e.g. a missing OCR backend) must surface with its
        # own message instead of being reported as a missing Pillow install.
        try:
            from PIL import Image as Img
        except ImportError as err:
            raise ImportError(
                "`Pillow` package not found, please install it with "
                "`pip install Pillow`"
            ) from err

        with blob.as_bytes_io() as buf:
            if blob.mimetype == "application/x-npy":
                # Serialized numpy array; pickled payloads are refused.
                img = Img.fromarray(numpy.load(buf, allow_pickle=False))
            else:
                img = Img.open(buf)
            # The image may still read from `buf`, so analyze it while the
            # buffer is open.
            content = self._analyze_image(img)

        if not content:
            return

        # Link target used by the markdown/html wrappers when the blob has
        # no source of its own.
        source = blob.source or "#"
        if self.format == "markdown":
            # Escape `]` so the extracted text cannot close the alt text early.
            alt_text = content.replace("]", r"\]")
            content = f"![{alt_text}]({source})"
        elif self.format == "html":
            content = (
                f'<img alt="{html.escape(content, quote=True)}" '
                f'src="{html.escape(source, quote=True)}" />'
            )
        logger.debug("Image text: %s", content.replace("\n", "\\n"))
        yield Document(
            page_content=content,
            # The blob's own source always wins over a "source" metadata key.
            metadata={**blob.metadata, "source": blob.source},
        )
57102
58103
59- class RapidOCRBlobParser (ImageBlobParser ):
104+ class RapidOCRBlobParser (BaseImageBlobParser ):
105+ """
106+ Parser for extracting text from images using the RapidOCR library.
107+
108+ Attributes:
109+ ocr:
110+ The RapidOCR instance for performing OCR.
111+ """
112+
60113 def __init__ (
61114 self ,
62115 * ,
63116 format : Literal ["text" , "markdown" , "html" ] = "text" ,
64117 ):
118+ """
119+ Initializes the RapidOCRBlobParser.
120+
121+ Args:
122+ format (Literal["text", "markdown", "html"]):
123+ The format for the parsed output.
124+ """
65125 super ().__init__ (format = format )
66126 self .ocr = None
67127
68128 def _analyze_image (self , img : "Image" ) -> str :
129+ """
130+ Analyzes an image and extracts text using RapidOCR.
131+
132+ Args:
133+ img (Image):
134+ The image to be analyzed.
135+
136+ Returns:
137+ str:
138+ The extracted text content.
139+ """
69140 if not self .ocr :
70141 try :
71142 from rapidocr_onnxruntime import RapidOCR
@@ -83,17 +154,44 @@ def _analyze_image(self, img: "Image") -> str:
83154 return content
84155
85156
86- class TesseractBlobParser (ImageBlobParser ):
157+ class TesseractBlobParser (BaseImageBlobParser ):
158+ """
159+ Parser for extracting text from images using the Tesseract OCR library.
160+
161+ Attributes:
162+ langs (list[str]):
163+ The languages to use for OCR.
164+ """
165+
87166 def __init__ (
88167 self ,
89168 * ,
90169 format : Literal ["text" , "markdown" , "html" ] = "text" ,
91- langs : list [str ] = [ "eng" ] ,
170+ langs : list [str ] = ( "eng" ,) ,
92171 ):
172+ """
173+ Initializes the TesseractBlobParser.
174+
175+ Args:
176+ format (Literal["text", "markdown", "html"]):
177+ The format for the parsed output.
178+ langs (list[str]):
179+ The languages to use for OCR.
180+ """
93181 super ().__init__ (format = format )
94182 self .langs = langs
95183
96184 def _analyze_image (self , img : "Image" ) -> str :
185+ """
186+ Analyzes an image and extracts text using Tesseract OCR.
187+
188+ Args:
189+ img (Image):
190+ The image to be analyzed.
191+
192+ Returns:
193+ str: The extracted text content.
194+ """
97195 try :
98196 import pytesseract
99197 except ImportError :
@@ -104,7 +202,7 @@ def _analyze_image(self, img: "Image") -> str:
104202 return pytesseract .image_to_string (img , lang = "+" .join (self .langs )).strip ()
105203
106204
107- _prompt_images_to_description = (
205+ _PROMPT_IMAGES_TO_DESCRIPTION = (
108206 "You are an assistant tasked with summarizing "
109207 "images for retrieval. "
110208 "These summaries will be embedded and used to retrieve the raw image. "
@@ -113,19 +211,51 @@ def _analyze_image(self, img: "Image") -> str:
113211)
114212
115213
116- class MultimodalBlobParser (ImageBlobParser ):
214+ class LLMImageBlobParser (BaseImageBlobParser ):
215+ """
216+ Parser for analyzing images using a language model (LLM).
217+
218+ Attributes:
219+ model (BaseChatModel):
220+ The language model to use for analysis.
221+ prompt (str):
222+ The prompt to provide to the language model.
223+ """
224+
117225 def __init__ (
118226 self ,
119227 * ,
120228 format : Literal ["text" , "markdown" , "html" ] = "text" ,
121229 model : BaseChatModel ,
122- prompt : str = _prompt_images_to_description ,
230+ prompt : str = _PROMPT_IMAGES_TO_DESCRIPTION ,
123231 ):
232+ """
233+ Initializes the LLMImageBlobParser.
234+
235+ Args:
236+ format (Literal["text", "markdown", "html"]):
237+ The format for the parsed output.
238+ model (BaseChatModel):
239+ The language model to use for analysis.
240+ prompt (str):
241+ The prompt to provide to the language model.
242+ """
124243 super ().__init__ (format = format )
125244 self .model = model
126245 self .prompt = prompt
127246
128247 def _analyze_image (self , img : "Image" ) -> str :
248+ """
249+ Analyzes an image using the provided language model.
250+
251+ Args:
252+ img (Image):
253+ The image to be analyzed.
254+
255+ Returns:
256+ str:
257+ The extracted textual content.
258+ """
129259 image_bytes = io .BytesIO ()
130260 img .save (image_bytes , format = "PNG" )
131261 img_base64 = base64 .b64encode (image_bytes .getvalue ()).decode ("utf-8" )
0 commit comments