Skip to content

Commit c19a9f3

Browse files
committed
First pass
1 parent 194876a commit c19a9f3

File tree

1 file changed

+81
-5
lines changed

1 file changed

+81
-5
lines changed

app/backend/prepdocslib/pdfparser.py

Lines changed: 81 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
import html
2+
import io
23
import logging
4+
import os
35
from typing import IO, AsyncGenerator, Union
46

7+
import pymupdf
58
from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
69
from azure.ai.documentintelligence.models import DocumentTable
710
from azure.core.credentials import AzureKeyCredential
811
from azure.core.credentials_async import AsyncTokenCredential
12+
from PIL import Image
913
from pypdf import PdfReader
1014

1115
from .page import Page
@@ -39,21 +43,37 @@ class DocumentAnalysisParser(Parser):
3943
"""
4044

4145
def __init__(
42-
self, endpoint: str, credential: Union[AsyncTokenCredential, AzureKeyCredential], model_id="prebuilt-layout"
46+
self,
47+
endpoint: str,
48+
credential: Union[AsyncTokenCredential, AzureKeyCredential],
49+
model_id="prebuilt-layout",
50+
use_content_understanding=True,
4351
):
4452
self.model_id = model_id
4553
self.endpoint = endpoint
4654
self.credential = credential
55+
self.use_content_understanding = use_content_understanding
4756

4857
async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
4958
logger.info("Extracting text from '%s' using Azure Document Intelligence", content.name)
5059

60+
# TODO: confirm whether output="figures" is also required on the client constructor; it is already passed per request to begin_analyze_document below.
5161
async with DocumentIntelligenceClient(
52-
endpoint=self.endpoint, credential=self.credential
62+
endpoint=self.endpoint, credential=self.credential, output="figures"
5363
) as document_intelligence_client:
54-
poller = await document_intelligence_client.begin_analyze_document(
55-
model_id=self.model_id, analyze_request=content, content_type="application/octet-stream"
56-
)
64+
if self.use_content_understanding:
65+
poller = await document_intelligence_client.begin_analyze_document(
66+
model_id="prebuilt-layout",
67+
analyze_request=content,
68+
content_type="application/octet-stream",
69+
output=["figures"],
70+
features=["ocrHighResolution"],
71+
output_content_format="markdown",
72+
)
73+
else:
74+
poller = await document_intelligence_client.begin_analyze_document(
75+
model_id=self.model_id, analyze_request=content, content_type="application/octet-stream"
76+
)
5777
form_recognizer_results = await poller.result()
5878

5979
offset = 0
@@ -89,6 +109,38 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
89109
yield Page(page_num=page_num, offset=offset, text=page_text)
90110
offset += len(page_text)
91111

112+
if form_recognizer_results.figures:
113+
for figures_idx, figure in enumerate(form_recognizer_results.figures):
114+
for region in figure.bounding_regions:
115+
print(f"\tFigure body bounding regions: {region}")
116+
# To learn more about bounding regions, see https://aka.ms/bounding-region
117+
bounding_box = (
118+
region.polygon[0], # x0 (left)
119+
region.polygon[1], # y0 (top)
120+
region.polygon[4], # x1 (right)
121+
region.polygon[5], # y1 (bottom)
122+
)
123+
page_number = figure.bounding_regions[0]["pageNumber"]
124+
cropped_img = DocumentAnalysisParser.crop_image_from_pdf_page(
125+
content, page_number - 1, bounding_box
126+
)
127+
128+
os.makedirs("figures", exist_ok=True)
129+
130+
filename = "figure_imagecrop" + str(figures_idx) + ".png"
131+
# Full path for the file
132+
filepath = os.path.join("figures", filename)
133+
134+
# Save the figure
135+
cropped_img.save(filepath)
136+
bytes_io = io.BytesIO()
137+
cropped_img.save(bytes_io, format="PNG")
138+
cropped_img = bytes_io.getvalue()
139+
# _ , figure_description = run_cu_image(analyzer_name, filepath)
140+
141+
# md_content = replace_figure_description(md_content, figure_description, figures_idx+1)
142+
# figure_content.append(figure_description)
143+
92144
@classmethod
93145
def table_to_html(cls, table: DocumentTable):
94146
table_html = "<table>"
@@ -109,3 +161,27 @@ def table_to_html(cls, table: DocumentTable):
109161
table_html += "</tr>"
110162
table_html += "</table>"
111163
return table_html
164+
165+
@classmethod
def crop_image_from_pdf_page(cls, pdf_path, page_number, bounding_box, dpi=300):
    """
    Crop a region from one page of a PDF and return it as an image.

    :param pdf_path: Path to the PDF file, or an open, seekable binary stream
        containing the PDF. A stream is read in full and its position is
        restored afterwards.
    :param page_number: The page number to crop from (0-indexed).
    :param bounding_box: A tuple of (x0, y0, x1, y1) coordinates for the bounding box.
        Each value is multiplied by 72 to obtain PDF points, so the values are
        presumably expressed in inches (TODO confirm against the analysis result).
    :param dpi: Resolution, in dots per inch, at which the region is rendered.
    :return: A PIL Image of the cropped area.
    """
    # PDF user space is measured in points; there are 72 points per inch.
    points_per_inch = 72

    if hasattr(pdf_path, "read"):
        # File-like input: the stream may already have been consumed by an
        # earlier reader, so rewind before reading and put the position back.
        position = pdf_path.tell()
        pdf_path.seek(0)
        try:
            pdf_bytes = pdf_path.read()
        finally:
            pdf_path.seek(position)
        doc = pymupdf.open(stream=pdf_bytes, filetype="pdf")
    else:
        doc = pymupdf.open(pdf_path)

    try:
        page = doc.load_page(page_number)

        # Cropping the page. The rect requires the coordinates in the format (x0, y0, x1, y1).
        rect = pymupdf.Rect([coord * points_per_inch for coord in bounding_box])

        # Scale from 72 points per inch up to the requested resolution.
        zoom = dpi / points_per_inch
        # alpha=False guarantees three samples per pixel, matching the "RGB" mode below.
        pix = page.get_pixmap(matrix=pymupdf.Matrix(zoom, zoom), clip=rect, alpha=False)

        # frombytes copies the pixel data, so the image stays valid after the document closes.
        return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    finally:
        # Release the document even when page loading or rendering fails.
        doc.close()

0 commit comments

Comments
 (0)