11import base64
2- import html
32import io
43import logging
54from abc import abstractmethod
109from langchain_core .documents import Document
1110from langchain_core .language_models import BaseChatModel
1211from langchain_core .messages import HumanMessage
13- from langchain_core .prompts import BasePromptTemplate , PromptTemplate
1412
1513from langchain_community .document_loaders .base import BaseBlobParser
1614from langchain_community .document_loaders .blob_loaders import Blob
@@ -44,12 +42,11 @@ def __init__(
4442 self .format = format
4543
4644 @abstractmethod
47- def _analyze_image (self , img : "Image" , format : str ) -> str :
45+ def _analyze_image (self , img : "Image" ) -> str :
4846 """Abstract method to analyze an image and extract textual content.
4947
5048 Args:
5149 img: The image to be analyzed.
52- format: The format to use if it's possible
5350
5451 Returns:
5552 The extracted text content.
@@ -73,22 +70,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:
7370 img = Img .fromarray (numpy .load (buf ))
7471 else :
7572 img = Img .open (buf )
76- format = (
77- "text"
78- if self .format in ("markdown-img" , "html-img" )
79- else self .format
80- )
81- content = self ._analyze_image (img , format )
82- if content :
83- source = blob .source or "#"
84- if self .format == "markdown-img" :
85- content = content .replace ("]" , r"\\]" )
86- content = f""
87- elif self .format == "html-img" :
88- content = (
89- f'<img alt="{ html .escape (content , quote = True )} '
90- f'src="{ source } " />'
91- )
73+ content = self ._analyze_image (img )
9274 logger .debug ("Image text: %s" , content .replace ("\n " , "\\ n" ))
9375 yield Document (
9476 page_content = content ,
@@ -107,44 +89,24 @@ class RapidOCRBlobParser(BaseImageBlobParser):
10789 Attributes:
10890 ocr:
10991 The RapidOCR instance for performing OCR.
110- format (Literal["text", "markdown-img", "html-img"]):
111- The format for the parsed output.
112- - "text" = return the content as is
113- - "markdown-img" = wrap the content into an image markdown link, w/ link
114- pointing to (`![body)(#)`]
115- - "html-img" = wrap the content as the `alt` text of an tag and link to
116- (`<img alt="{body}" src="#"/>`)
11792 """
11893
11994 def __init__ (
12095 self ,
121- * ,
122- format : Literal ["text" , "markdown-img" , "html-img" ] = "text" ,
123- ):
96+ ) -> None :
12497 """
12598 Initializes the RapidOCRBlobParser.
126-
127- Args:
128- format (Literal["text", "markdown-img", "html-img"]):
129- The format for the parsed output.
130- - "text" = return the content as is
131- - "markdown-img" = wrap the content into an image markdown link, w/ link
132- pointing to (`![body)(#)`]
133- - "html-img" = wrap the content as the `alt` text of an tag and link to
134- (`<img alt="{body}" src="#"/>`)
13599 """
136- super ().__init__ (format = format )
100+ super ().__init__ ()
137101 self .ocr = None
138102
139- def _analyze_image (self , img : "Image" , format : str ) -> str :
103+ def _analyze_image (self , img : "Image" ) -> str :
140104 """
141105 Analyzes an image and extracts text using RapidOCR.
142106
143107 Args:
144108 img (Image):
145109 The image to be analyzed.
146- format (str):
147- The format to use if it's possible
148110
149111 Returns:
150112 str:
@@ -168,48 +130,27 @@ def _analyze_image(self, img: "Image", format: str) -> str:
168130
169131
170132class TesseractBlobParser (BaseImageBlobParser ):
171- """Parse for extracting text from images using the Tesseract OCR library.
172-
173- Attributes:
174- format (Literal["text", "markdown-img", "html-img"]):
175- The format for the parsed output.
176- - "text" = return the content as is
177- - "markdown-img" = wrap the content into an image markdown link, w/ link
178- pointing to (`![body)(#)`]
179- - "html-img" = wrap the content as the `alt` text of an tag and link to
180- (`<img alt="{body}" src="#"/>`)
181- langs (list[str]):
182- The languages to use for OCR.
183- """
133+ """Parser for extracting text from images using the Tesseract OCR library."""
184134
185135 def __init__ (
186136 self ,
187137 * ,
188- format : Literal ["text" , "markdown-img" , "html-img" ] = "text" ,
189138 langs : Iterable [str ] = ("eng" ,),
190139 ):
191140 """Initialize the TesseractBlobParser.
192141
193142 Args:
194- format (Literal["text", "markdown-img", "html-img"]):
195- The format for the parsed output.
196- - "text" = return the content as is
197- - "markdown-img" = wrap the content into an image markdown link, w/ link
198- pointing to (`![body)(#)`]
199- - "html-img" = wrap the content as the `alt` text of an tag and link to
200- (`<img alt="{body}" src="#"/>`)
201143 langs (list[str]):
202144 The languages to use for OCR.
203145 """
204- super ().__init__ (format = format )
146+ super ().__init__ ()
205147 self .langs = list (langs )
206148
207- def _analyze_image (self , img : "Image" , format : str ) -> str :
149+ def _analyze_image (self , img : "Image" ) -> str :
208150 """Analyze an image and extract text using Tesseract OCR.
209151
210152 Args:
211153 img: The image to be analyzed.
212- format: The format to use if it's possible
213154
214155 Returns:
215156 str: The extracted text content.
@@ -224,31 +165,21 @@ def _analyze_image(self, img: "Image", format: str) -> str:
224165 return pytesseract .image_to_string (img , lang = "+" .join (self .langs )).strip ()
225166
226167
227- _PROMPT_IMAGES_TO_DESCRIPTION : BasePromptTemplate = PromptTemplate . from_template (
168+ _PROMPT_IMAGES_TO_DESCRIPTION : str = (
228169 "You are an assistant tasked with summarizing images for retrieval. "
229170 "1. These summaries will be embedded and used to retrieve the raw image. "
230171 "Give a concise summary of the image that is well optimized for retrieval\n "
231172 "2. extract all the text from the image. "
232173 "Do not exclude any content from the page.\n "
233- "Format answer in {format} without explanatory text "
174+ "Format answer in markdown without explanatory text "
234175 "and without markdown delimiter ``` at the beginning. "
235- "Respects the start of the format."
236176)
237177
238178
239179class LLMImageBlobParser (BaseImageBlobParser ):
240180 """Parser for analyzing images using a language model (LLM).
241181
242182 Attributes:
243- format (Literal["text", "markdown-img", "html-img"]):
244- The format for the parsed output.
245- - "text" = return the content as is
246- - "markdown-img" = wrap the content into an image markdown link, w/ link
247- pointing to (`![body)(#)`]
248- - "html-img" = wrap the content as the `alt` text of an tag and link to
249- (`<img alt="{body}" src="#"/>`)
250- - "markdown" = return markdown content
251- - "html" = return html content
252183 model (BaseChatModel):
253184 The language model to use for analysis.
254185 prompt (str):
@@ -258,27 +189,22 @@ class LLMImageBlobParser(BaseImageBlobParser):
258189 def __init__ (
259190 self ,
260191 * ,
261- format : Literal [
262- "text" , "markdown-img" , "html-img" , "markdown" , "html"
263- ] = "text" ,
264192 model : BaseChatModel ,
265- prompt : BasePromptTemplate = _PROMPT_IMAGES_TO_DESCRIPTION ,
193+ prompt : str = _PROMPT_IMAGES_TO_DESCRIPTION ,
266194 ):
267195 """Initializes the LLMImageBlobParser.
268196
269197 Args:
270- format (Literal["text", "markdown", "html"]):
271- The format for the parsed output.
272198 model (BaseChatModel):
273199 The language model to use for analysis.
274200 prompt (str):
275201 The prompt to provide to the language model.
276202 """
277- super ().__init__ (format = format )
203+ super ().__init__ ()
278204 self .model = model
279205 self .prompt = prompt
280206
281- def _analyze_image (self , img : "Image" , format : str ) -> str :
207+ def _analyze_image (self , img : "Image" ) -> str :
282208 """Analyze an image using the provided language model.
283209
284210 Args:
0 commit comments