11import html
22import io
33import logging
4- import os
4+ from enum import Enum
55from typing import IO , AsyncGenerator , Union
66
77import pymupdf
88from azure .ai .documentintelligence .aio import DocumentIntelligenceClient
9- from azure .ai .documentintelligence .models import AnalyzeDocumentRequest
10- from azure .ai .documentintelligence .models import DocumentTable
9+ from azure .ai .documentintelligence .models import (
10+ AnalyzeDocumentRequest ,
11+ DocumentFigure ,
12+ DocumentTable ,
13+ )
1114from azure .core .credentials import AzureKeyCredential
1215from azure .core .credentials_async import AsyncTokenCredential
1316from PIL import Image
1417from pypdf import PdfReader
1518
19+ from .cu_image import ContentUnderstandingManager
1620from .page import Page
1721from .parser import Parser
18- from .cu_image import ContentUnderstandingManager
1922
2023logger = logging .getLogger ("scripts" )
2124
@@ -71,11 +74,11 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
7174 poller = await document_intelligence_client .begin_analyze_document (
7275 model_id = "prebuilt-layout" ,
7376 analyze_request = AnalyzeDocumentRequest (bytes_source = content_bytes ),
74- # content_type="application/octet-stream",
7577 output = ["figures" ],
7678 features = ["ocrHighResolution" ],
7779 output_content_format = "markdown" ,
7880 )
81+ doc_for_pymupdf = pymupdf .open (stream = io .BytesIO (content_bytes ))
7982 else :
8083 poller = await document_intelligence_client .begin_analyze_document (
8184 model_id = self .model_id , analyze_request = content , content_type = "application/octet-stream"
@@ -89,81 +92,74 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
8992 for table in (form_recognizer_results .tables or [])
9093 if table .bounding_regions and table .bounding_regions [0 ].page_number == page_num + 1
9194 ]
95+ figures_on_page = [
96+ figure
97+ for figure in (form_recognizer_results .figures or [])
98+ if figure .bounding_regions and figure .bounding_regions [0 ].page_number == page_num + 1
99+ ]
100+
101+ class ObjectType (Enum ):
102+ NONE = - 1
103+ TABLE = 0
104+ FIGURE = 1
92105
93106 # mark all positions of the table spans in the page
94107 page_offset = page .spans [0 ].offset
95108 page_length = page .spans [0 ].length
96- table_chars = [- 1 ] * page_length
97- for table_id , table in enumerate (tables_on_page ):
109+ mask_chars = [( ObjectType . NONE , None ) ] * page_length
110+ for table_idx , table in enumerate (tables_on_page ):
98111 for span in table .spans :
99112 # tag every character covered by a table span with (ObjectType.TABLE, table_idx) in mask_chars
100113 for i in range (span .length ):
101114 idx = span .offset - page_offset + i
102115 if idx >= 0 and idx < page_length :
103- table_chars [idx ] = table_id
116+ mask_chars [idx ] = (ObjectType .TABLE , table_idx )
117+ for figure_idx , figure in enumerate (figures_on_page ):
118+ for span in figure .spans :
119+ # tag every character covered by a figure span with (ObjectType.FIGURE, figure_idx) in mask_chars
120+ for i in range (span .length ):
121+ idx = span .offset - page_offset + i
122+ if idx >= 0 and idx < page_length :
123+ mask_chars [idx ] = (ObjectType .FIGURE , figure_idx )
104124
105125 # build page text by replacing characters in table spans with table html
106126 page_text = ""
107- added_tables = set ()
108- for idx , table_id in enumerate (table_chars ):
109- if table_id == - 1 :
127+ added_objects = set () # set of object types todo mypy
128+ for idx , mask_char in enumerate (mask_chars ):
129+ object_type , object_idx = mask_char
130+ if object_type == ObjectType .NONE :
110131 page_text += form_recognizer_results .content [page_offset + idx ]
111- elif table_id not in added_tables :
112- page_text += DocumentAnalysisParser .table_to_html (tables_on_page [table_id ])
113- added_tables .add (table_id )
114-
132+ elif object_type == ObjectType .TABLE :
133+ if mask_char not in added_objects :
134+ page_text += DocumentAnalysisParser .table_to_html (tables_on_page [object_idx ])
135+ added_objects .add (mask_char )
136+ elif object_type == ObjectType .FIGURE :
137+ if mask_char not in added_objects :
138+ page_text += await DocumentAnalysisParser .figure_to_html (
139+ doc_for_pymupdf , cu_manager , figures_on_page [object_idx ]
140+ )
141+ added_objects .add (mask_char )
142+ # TODO: reset page numbers based on the mask
115143 yield Page (page_num = page_num , offset = offset , text = page_text )
116144 offset += len (page_text )
117145
118- figure_results = {}
119- if form_recognizer_results .figures :
120- doc = pymupdf .open (stream = io .BytesIO (content_bytes ))
121- for figures_idx , figure in enumerate (form_recognizer_results .figures ):
122- for region in figure .bounding_regions :
123- print (f"\t Figure body bounding regions: { region } " )
124- # To learn more about bounding regions, see https://aka.ms/bounding-region
125- bounding_box = (
126- region .polygon [0 ], # x0 (left)
127- region .polygon [1 ], # y0 (top
128- region .polygon [4 ], # x1 (right)
129- region .polygon [5 ], # y1 (bottom)
130- )
131- page_number = figure .bounding_regions [0 ]["pageNumber" ]
132- cropped_img = DocumentAnalysisParser .crop_image_from_pdf_page (doc , page_number - 1 , bounding_box )
133-
134- # Save the figure
135- bytes_io = io .BytesIO ()
136- cropped_img .save (bytes_io , format = "PNG" )
137- image_fields = await cu_manager .run_cu_image (bytes_io .getvalue ())
138- figure_results [figure .id ] = image_fields
139-
140- md_content = analyze_result .content
141- page_to_figure = {}
142- for figure in analyze_result .figures :
143- # Parse figure id
144- # https://learn.microsoft.com/azure/ai-services/document-intelligence/concept/analyze-document-response?view=doc-intel-4.0.0#figures
145- figure_id = figure .id .split ("." ) # 3.1 where 3 is the page number and 1 is the figure number, 1-indexed
146- page = int (figure_id [0 ])
147- if page not in page_to_figure :
148- page_to_figure [page ] = []
149- page_to_figure [page ].append (figure .id )
150- for page in form_recognizer_results .pages :
151- # Use the text span to extract the markdown on the page
152- span = page .spans [0 ]
153- page_md_content = md_content [span .offset : span .offset + span .length ]
154- if page .page_number in page_to_figure :
155- page_figures = page_to_figure [page .page_number ]
156- # split the content on the figure tag
157- parts = page_md_content .split ("\n <figure>\n " )
158- for i , figure_id in enumerate (page_figures ):
159- with open (
160- os .path .join (figures_directory , f"figure_imagecrop_{ figure_id } _verbalized.json" ), "r"
161- ) as f :
162- figure_content = json .dumps (json .load (f )["result" ]["contents" ][0 ])
163- parts [i ] = parts [i ] + f'<!-- FigureContent="{ figure_content } " -->'
164- page_md_content = "\n " .join (parts )
165- with open (os .path .join (pages_md_directory , f"page_{ page .page_number } .md" ), "w" , encoding = "utf-8" ) as f :
166- f .write (page_md_content )
@staticmethod
async def figure_to_html(
    doc: pymupdf.Document, cu_manager: ContentUnderstandingManager, figure: DocumentFigure
) -> str:
    """Crop a figure out of the PDF, verbalize it, and wrap the result in HTML.

    Args:
        doc: The opened PDF document the figure was detected in.
        cu_manager: Produces a description of the cropped image through
            ``verbalize_figure``.
        figure: The detected figure. Its first bounding region supplies both
            the page number and the crop rectangle.

    Returns:
        A ``<figure>`` element whose ``<figcaption>`` holds the description,
        or the same element with an empty caption when the figure carries no
        bounding region.
    """
    regions = figure.bounding_regions
    if not regions:
        # Without a bounding region there is nothing to crop or describe.
        return "<figure><figcaption></figcaption></figure>"

    # Take the page and the rectangle from the SAME region. Previously the
    # rectangle came from the last region while the page came from the first,
    # which mis-cropped any figure that has more than one region.
    region = regions[0]
    # To learn more about bounding regions, see https://aka.ms/bounding-region
    polygon = region.polygon
    bounding_box = (
        polygon[0],  # x0 (left)
        polygon[1],  # y0 (top)
        polygon[4],  # x1 (right)
        polygon[5],  # y1 (bottom)
    )
    page_number = region.page_number  # 1-indexed
    cropped_img = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number - 1, bounding_box)
    figure_description = await cu_manager.verbalize_figure(cropped_img)
    # TODO: add DI's original figcaption to this caption - figure.caption.content
    # NOTE(review): the description is inserted without HTML escaping — confirm
    # whether verbalize_figure can return markup that should be escaped here.
    return f"<figure><figcaption>{figure_description}</figcaption></figure>"
167163
168164 @staticmethod
169165 def table_to_html (table : DocumentTable ):
@@ -187,7 +183,7 @@ def table_to_html(table: DocumentTable):
187183 return table_html
188184
189185 @staticmethod
190- def crop_image_from_pdf_page (doc : pymupdf .Document , page_number , bounding_box ):
186+ def crop_image_from_pdf_page (doc : pymupdf .Document , page_number , bounding_box ) -> bytes :
191187 """
192188 Crops a region from a given page in a PDF and returns it as an image.
193189
@@ -205,4 +201,7 @@ def crop_image_from_pdf_page(doc: pymupdf.Document, page_number, bounding_box):
205201 # PDF user space is 72 units per inch, so a 300/72 scale renders the clip at 300 DPI
206202 pix = page .get_pixmap (matrix = pymupdf .Matrix (300 / 72 , 300 / 72 ), clip = rect )
207203
208- return Image .frombytes ("RGB" , [pix .width , pix .height ], pix .samples )
204+ img = Image .frombytes ("RGB" , [pix .width , pix .height ], pix .samples )
205+ bytes_io = io .BytesIO ()
206+ img .save (bytes_io , format = "PNG" )
207+ return bytes_io .getvalue ()
0 commit comments