11import html
2+ import io
23import logging
4+ import os
35from typing import IO , AsyncGenerator , Union
46
7+ import pymupdf
58from azure .ai .documentintelligence .aio import DocumentIntelligenceClient
69from azure .ai .documentintelligence .models import DocumentTable
710from azure .core .credentials import AzureKeyCredential
811from azure .core .credentials_async import AsyncTokenCredential
12+ from PIL import Image
913from pypdf import PdfReader
1014
1115from .page import Page
@@ -39,21 +43,37 @@ class DocumentAnalysisParser(Parser):
3943 """
4044
def __init__(
    self,
    endpoint: str,
    credential: Union[AsyncTokenCredential, AzureKeyCredential],
    model_id: str = "prebuilt-layout",
    use_content_understanding: bool = True,
):
    """
    Store the connection settings used later by ``parse``.

    :param endpoint: Endpoint URL of the Azure Document Intelligence resource.
    :param credential: Token or key credential passed to the client.
    :param model_id: Analysis model to use when ``use_content_understanding``
        is false. NOTE(review): ``parse`` hard-codes "prebuilt-layout" on the
        content-understanding branch, so this value is ignored there.
    :param use_content_understanding: When true, ``parse`` additionally requests
        figure output, high-resolution OCR and markdown-formatted content.
    """
    self.model_id = model_id
    self.endpoint = endpoint
    self.credential = credential
    self.use_content_understanding = use_content_understanding
4756
4857 async def parse (self , content : IO ) -> AsyncGenerator [Page , None ]:
4958 logger .info ("Extracting text from '%s' using Azure Document Intelligence" , content .name )
5059
60+ # TODO: do we also need output=figures on the client itself? seems odd.
5161 async with DocumentIntelligenceClient (
52- endpoint = self .endpoint , credential = self .credential
62+ endpoint = self .endpoint , credential = self .credential , output = "figures"
5363 ) as document_intelligence_client :
54- poller = await document_intelligence_client .begin_analyze_document (
55- model_id = self .model_id , analyze_request = content , content_type = "application/octet-stream"
56- )
64+ if self .use_content_understanding :
65+ poller = await document_intelligence_client .begin_analyze_document (
66+ model_id = "prebuilt-layout" ,
67+ analyze_request = content ,
68+ content_type = "application/octet-stream" ,
69+ output = ["figures" ],
70+ features = ["ocrHighResolution" ],
71+ output_content_format = "markdown" ,
72+ )
73+ else :
74+ poller = await document_intelligence_client .begin_analyze_document (
75+ model_id = self .model_id , analyze_request = content , content_type = "application/octet-stream"
76+ )
5777 form_recognizer_results = await poller .result ()
5878
5979 offset = 0
@@ -89,6 +109,38 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
89109 yield Page (page_num = page_num , offset = offset , text = page_text )
90110 offset += len (page_text )
91111
112+ if form_recognizer_results .figures :
113+ for figures_idx , figure in enumerate (form_recognizer_results .figures ):
114+ for region in figure .bounding_regions :
115+ print (f"\t Figure body bounding regions: { region } " )
116+ # To learn more about bounding regions, see https://aka.ms/bounding-region
117+ bounding_box = (
118+ region .polygon [0 ], # x0 (left)
119+ region .polygon [1 ], # y0 (top
120+ region .polygon [4 ], # x1 (right)
121+ region .polygon [5 ], # y1 (bottom)
122+ )
123+ page_number = figure .bounding_regions [0 ]["pageNumber" ]
124+ cropped_img = DocumentAnalysisParser .crop_image_from_pdf_page (
125+ content , page_number - 1 , bounding_box
126+ )
127+
128+ os .makedirs ("figures" , exist_ok = True )
129+
130+ filename = "figure_imagecrop" + str (figures_idx ) + ".png"
131+ # Full path for the file
132+ filepath = os .path .join ("figures" , filename )
133+
134+ # Save the figure
135+ cropped_img .save (filepath )
136+ bytes_io = io .BytesIO ()
137+ cropped_img .save (bytes_io , format = "PNG" )
138+ cropped_img = bytes_io .getvalue ()
139+ # _ , figure_description = run_cu_image(analyzer_name, filepath)
140+
141+ # md_content = replace_figure_description(md_content, figure_description, figures_idx+1)
142+ # figure_content.append(figure_description)
143+
92144 @classmethod
93145 def table_to_html (cls , table : DocumentTable ):
94146 table_html = "<table>"
@@ -109,3 +161,27 @@ def table_to_html(cls, table: DocumentTable):
109161 table_html += "</tr>"
110162 table_html += "</table>"
111163 return table_html
164+
@staticmethod
def crop_image_from_pdf_page(pdf_path, page_number, bounding_box):
    """
    Crop a region from a given page in a PDF and return it as an image.

    Declared as a static method: the function takes no ``cls`` argument, so
    with ``@classmethod`` the class itself was bound to ``pdf_path`` and every
    three-argument call raised ``TypeError``.

    :param pdf_path: Path to the PDF file, or an open binary file-like object
        containing the PDF. A seekable stream is rewound to the start before
        it is read, and is left positioned at its end afterwards.
    :param page_number: The page number to crop from (0-indexed).
    :param bounding_box: A tuple of (x0, y0, x1, y1) coordinates for the bounding box.
        Each value is multiplied by 72 before use, i.e. presumably given in
        inches and converted to PDF points -- TODO confirm against the caller.
    :return: A PIL Image of the cropped area.
    """
    if hasattr(pdf_path, "read"):
        # The caller hands over the same stream it already uploaded, so the
        # read position may be at EOF; rewind when the stream allows it.
        seekable = getattr(pdf_path, "seekable", None)
        if callable(seekable) and seekable():
            pdf_path.seek(0)
        doc = pymupdf.open(stream=pdf_path.read(), filetype="pdf")
    else:
        doc = pymupdf.open(pdf_path)

    try:
        page = doc.load_page(page_number)

        # The rect requires the coordinates in the format (x0, y0, x1, y1).
        rect = pymupdf.Rect([coord * 72 for coord in bounding_box])

        # Scale the default 72 DPI rendering up to 300 DPI, clipped to the region.
        pix = page.get_pixmap(matrix=pymupdf.Matrix(300 / 72, 300 / 72), clip=rect)

        return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    finally:
        # Always release the document, even if loading or rendering fails.
        doc.close()
0 commit comments