21
21
from openai import AsyncOpenAI
22
22
23
23
from .mediadescriber import MediaDescriber , ContentUnderstandingDescriber , MultimodalModelDescriber
24
- from .page import Page
24
+ from .page import Page , ImageOnPage
25
25
from .parser import Parser
26
26
27
27
logger = logging .getLogger ("scripts" )
@@ -50,6 +50,8 @@ class MediaDescriptionStrategy(Enum):
50
50
OPENAI = "openai"
51
51
CONTENTUNDERSTANDING = "content_understanding"
52
52
53
+
54
+
53
55
class DocumentAnalysisParser (Parser ):
54
56
"""
55
57
Concrete parser backed by Azure AI Document Intelligence that can parse many document formats into pages
@@ -68,6 +70,7 @@ def __init__(
68
70
openai_deployment : Optional [str ] = None ,
69
71
# If using Content Understanding, this is the endpoint for the service
70
72
content_understanding_endpoint : Union [str , None ] = None ,
73
+ # TODO: decide whether this constructor should also accept the blob storage info
71
74
):
72
75
self .model_id = model_id
73
76
self .endpoint = endpoint
@@ -137,6 +140,7 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
137
140
analyze_result : AnalyzeResult = await poller .result ()
138
141
139
142
offset = 0
143
+
140
144
for page in analyze_result .pages :
141
145
tables_on_page = [
142
146
table
@@ -150,6 +154,7 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
150
154
for figure in (analyze_result .figures or [])
151
155
if figure .bounding_regions and figure .bounding_regions [0 ].page_number == page .page_number
152
156
]
157
+ page_images : list [ImageOnPage ] = []
153
158
154
159
class ObjectType (Enum ):
155
160
NONE = - 1
@@ -195,24 +200,25 @@ class ObjectType(Enum):
195
200
if object_idx is None :
196
201
raise ValueError ("Expected object_idx to be set" )
197
202
if mask_char not in added_objects :
198
- figure_html = await DocumentAnalysisParser .figure_to_html (
203
+ image_on_page = await DocumentAnalysisParser .process_figure (
199
204
doc_for_pymupdf , figures_on_page [object_idx ], media_describer
200
205
)
201
- page_text += figure_html
206
+ page_images .append (image_on_page )
207
+ page_text += image_on_page .description
202
208
added_objects .add (mask_char )
203
209
# We remove these comments since they are not needed and skew the page numbers
204
210
page_text = page_text .replace ("<!-- PageBreak -->" , "" )
205
211
# We remove excess newlines at the beginning and end of the page
206
212
page_text = page_text .strip ()
207
- yield Page (page_num = page .page_number - 1 , offset = offset , text = page_text )
213
+ yield Page (page_num = page .page_number - 1 , offset = offset , text = page_text , images = page_images )
208
214
offset += len (page_text )
209
215
210
216
@staticmethod
211
- async def figure_to_html (
217
+ async def process_figure (
212
218
doc : pymupdf .Document , figure : DocumentFigure , media_describer : MediaDescriber
213
219
) -> str :
214
220
figure_title = (figure .caption and figure .caption .content ) or ""
215
- logger .info ("Describing figure %s with title '%s'" , figure .id , figure_title )
221
+ logger .info ("Describing figure %s with title '%s' using %s " , figure .id , figure_title , type ( media_describer ). __name__ )
216
222
if not figure .bounding_regions :
217
223
return f"<figure><figcaption>{ figure_title } </figcaption></figure>"
218
224
if len (figure .bounding_regions ) > 1 :
@@ -228,7 +234,12 @@ async def figure_to_html(
228
234
page_number = first_region ["pageNumber" ] # 1-indexed
229
235
cropped_img = DocumentAnalysisParser .crop_image_from_pdf_page (doc , page_number - 1 , bounding_box )
230
236
figure_description = await media_describer .describe_image (cropped_img )
231
- return f"<figure><figcaption>{ figure_title } <br>{ figure_description } </figcaption></figure>"
237
+ return ImageOnPage (
238
+ bytes = cropped_img ,
239
+ filename = f"page_{ page_number } _figure_{ figure .id } .png" ,
240
+ bbox = bounding_box ,
241
+ description = f"<figure><figcaption>{ figure_title } <br>{ figure_description } </figcaption></figure>"
242
+ )
232
243
233
244
@staticmethod
234
245
def table_to_html (table : DocumentTable ):
@@ -274,10 +285,6 @@ def crop_image_from_pdf_page(
274
285
pix = page .get_pixmap (matrix = pymupdf .Matrix (page_dpi / bbox_dpi , page_dpi / bbox_dpi ), clip = rect )
275
286
276
287
img = Image .frombytes ("RGB" , (pix .width , pix .height ), pix .samples )
277
- # print out the number of pixels
278
- print (f"Cropped image size: { img .size } pixels" )
279
288
bytes_io = io .BytesIO ()
280
289
img .save (bytes_io , format = "PNG" )
281
- with open (f"cropped_page_{ page_number + 1 } .png" , "wb" ) as f :
282
- f .write (bytes_io .getvalue ())
283
290
return bytes_io .getvalue ()
0 commit comments