66
77import pymupdf
88from azure .ai .documentintelligence .aio import DocumentIntelligenceClient
9+ from azure .ai .documentintelligence .models import AnalyzeDocumentRequest
910from azure .ai .documentintelligence .models import DocumentTable
1011from azure .core .credentials import AzureKeyCredential
1112from azure .core .credentials_async import AsyncTokenCredential
1415
1516from .page import Page
1617from .parser import Parser
18+ from .cu_image import ContentUnderstandingManager
1719
1820logger = logging .getLogger ("scripts" )
1921
@@ -48,24 +50,28 @@ def __init__(
4850 credential : Union [AsyncTokenCredential , AzureKeyCredential ],
4951 model_id = "prebuilt-layout" ,
5052 use_content_understanding = True ,
53+ content_understanding_endpoint : str = None ,
5154 ):
5255 self .model_id = model_id
5356 self .endpoint = endpoint
5457 self .credential = credential
5558 self .use_content_understanding = use_content_understanding
59+ self .content_understanding_endpoint = content_understanding_endpoint
5660
5761 async def parse (self , content : IO ) -> AsyncGenerator [Page , None ]:
5862 logger .info ("Extracting text from '%s' using Azure Document Intelligence" , content .name )
5963
60- # TODO: do we also need output=figures on the client itself? seems odd.
64+ cu_manager = ContentUnderstandingManager ( self . content_understanding_endpoint , self . credential )
6165 async with DocumentIntelligenceClient (
62- endpoint = self .endpoint , credential = self .credential , output = "figures"
66+ endpoint = self .endpoint , credential = self .credential
6367 ) as document_intelligence_client :
68+ # turn content into bytes
69+ content_bytes = content .read ()
6470 if self .use_content_understanding :
6571 poller = await document_intelligence_client .begin_analyze_document (
6672 model_id = "prebuilt-layout" ,
67- analyze_request = content ,
68- content_type = "application/octet-stream" ,
73+ analyze_request = AnalyzeDocumentRequest ( bytes_source = content_bytes ) ,
74+ # content_type="application/octet-stream",
6975 output = ["figures" ],
7076 features = ["ocrHighResolution" ],
7177 output_content_format = "markdown" ,
@@ -109,7 +115,9 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
109115 yield Page (page_num = page_num , offset = offset , text = page_text )
110116 offset += len (page_text )
111117
118+ figure_results = {}
112119 if form_recognizer_results .figures :
120+ doc = pymupdf .open (stream = io .BytesIO (content_bytes ))
113121 for figures_idx , figure in enumerate (form_recognizer_results .figures ):
114122 for region in figure .bounding_regions :
115123 print (f"\t Figure body bounding regions: { region } " )
@@ -121,28 +129,44 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
121129 region .polygon [5 ], # y1 (bottom)
122130 )
123131 page_number = figure .bounding_regions [0 ]["pageNumber" ]
124- cropped_img = DocumentAnalysisParser .crop_image_from_pdf_page (
125- content , page_number - 1 , bounding_box
126- )
127-
128- os .makedirs ("figures" , exist_ok = True )
129-
130- filename = "figure_imagecrop" + str (figures_idx ) + ".png"
131- # Full path for the file
132- filepath = os .path .join ("figures" , filename )
132+ cropped_img = DocumentAnalysisParser .crop_image_from_pdf_page (doc , page_number - 1 , bounding_box )
133133
134134 # Save the figure
135- cropped_img .save (filepath )
136135 bytes_io = io .BytesIO ()
137136 cropped_img .save (bytes_io , format = "PNG" )
138- cropped_img = bytes_io .getvalue ()
139- # _ , figure_description = run_cu_image(analyzer_name, filepath)
140-
141- # md_content = replace_figure_description(md_content, figure_description, figures_idx+1)
142- # figure_content.append(figure_description)
143-
144- @classmethod
145- def table_to_html (cls , table : DocumentTable ):
137+ image_fields = await cu_manager .run_cu_image (bytes_io .getvalue ())
138+ figure_results [figure .id ] = image_fields
139+
140+ md_content = analyze_result .content
141+ page_to_figure = {}
142+ for figure in analyze_result .figures :
143+ # Parse figure id
144+ # https://learn.microsoft.com/azure/ai-services/document-intelligence/concept/analyze-document-response?view=doc-intel-4.0.0#figures
145+ figure_id = figure .id .split ("." ) # 3.1 where 3 is the page number and 1 is the figure number, 1-indexed
146+ page = int (figure_id [0 ])
147+ if page not in page_to_figure :
148+ page_to_figure [page ] = []
149+ page_to_figure [page ].append (figure .id )
150+ for page in form_recognizer_results .pages :
151+ # Use the text span to extract the markdown on the page
152+ span = page .spans [0 ]
153+ page_md_content = md_content [span .offset : span .offset + span .length ]
154+ if page .page_number in page_to_figure :
155+ page_figures = page_to_figure [page .page_number ]
156+ # split the content on the figure tag
157+ parts = page_md_content .split ("\n <figure>\n " )
158+ for i , figure_id in enumerate (page_figures ):
159+ with open (
160+ os .path .join (figures_directory , f"figure_imagecrop_{ figure_id } _verbalized.json" ), "r"
161+ ) as f :
162+ figure_content = json .dumps (json .load (f )["result" ]["contents" ][0 ])
163+ parts [i ] = parts [i ] + f'<!-- FigureContent="{ figure_content } " -->'
164+ page_md_content = "\n " .join (parts )
165+ with open (os .path .join (pages_md_directory , f"page_{ page .page_number } .md" ), "w" , encoding = "utf-8" ) as f :
166+ f .write (page_md_content )
167+
168+ @staticmethod
169+ def table_to_html (table : DocumentTable ):
146170 table_html = "<table>"
147171 rows = [
148172 sorted ([cell for cell in table .cells if cell .row_index == i ], key = lambda cell : cell .column_index )
@@ -162,8 +186,8 @@ def table_to_html(cls, table: DocumentTable):
162186 table_html += "</table>"
163187 return table_html
164188
165- @classmethod
166- def crop_image_from_pdf_page (pdf_path , page_number , bounding_box ):
189+ @staticmethod
190+ def crop_image_from_pdf_page (doc : pymupdf . Document , page_number , bounding_box ):
167191 """
168192 Crops a region from a given page in a PDF and returns it as an image.
169193
@@ -172,16 +196,13 @@ def crop_image_from_pdf_page(pdf_path, page_number, bounding_box):
172196 :param bounding_box: A tuple of (x0, y0, x1, y1) coordinates for the bounding box.
173197 :return: A PIL Image of the cropped area.
174198 """
175- doc = pymupdf . open ( pdf_path )
199+ logger . info ( f"Cropping image from PDF page { page_number } with bounding box { bounding_box } " )
176200 page = doc .load_page (page_number )
177201
178202 # Cropping the page. The rect requires the coordinates in the format (x0, y0, x1, y1).
179203 bbx = [x * 72 for x in bounding_box ]
180204 rect = pymupdf .Rect (bbx )
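205+ # The bounding box is scaled by 72 above because PDF user space is 72 points per inch (Document Intelligence appears to report PDF coordinates in inches - TODO confirm); the 300 / 72 matrix below then renders the clipped region at 300 DPI.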
181206 pix = page .get_pixmap (matrix = pymupdf .Matrix (300 / 72 , 300 / 72 ), clip = rect )
182207
183- img = Image .frombytes ("RGB" , [pix .width , pix .height ], pix .samples )
184-
185- doc .close ()
186-
187- return img
208+ return Image .frombytes ("RGB" , [pix .width , pix .height ], pix .samples )
0 commit comments