Skip to content

Commit 7b52dac

Browse files
committed
CU kinda working
1 parent c19a9f3 commit 7b52dac

14 files changed

+213
-35
lines changed

app/backend/prepdocs.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,8 @@ def setup_file_processors(
158158
local_pdf_parser: bool = False,
159159
local_html_parser: bool = False,
160160
search_images: bool = False,
161+
use_content_understanding: bool = False,
162+
content_understanding_endpoint: Union[str, None] = None,
161163
):
162164
sentence_text_splitter = SentenceTextSplitter(has_image_embeddings=search_images)
163165

@@ -170,6 +172,8 @@ def setup_file_processors(
170172
doc_int_parser = DocumentAnalysisParser(
171173
endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/",
172174
credential=documentintelligence_creds,
175+
use_content_understanding=use_content_understanding,
176+
content_understanding_endpoint=content_understanding_endpoint,
173177
)
174178

175179
pdf_parser: Optional[Parser] = None
@@ -298,14 +302,15 @@ async def main(strategy: Strategy, setup_index: bool = True):
298302
logging.basicConfig(format="%(message)s")
299303
# We only set the level to INFO for our logger,
300304
# to avoid seeing the noisy INFO level logs from the Azure SDKs
301-
logger.setLevel(logging.INFO)
305+
logger.setLevel(logging.DEBUG)
302306

303307
load_azd_env()
304308

305309
use_int_vectorization = os.getenv("USE_FEATURE_INT_VECTORIZATION", "").lower() == "true"
306310
use_gptvision = os.getenv("USE_GPT4V", "").lower() == "true"
307311
use_acls = os.getenv("AZURE_ADLS_GEN2_STORAGE_ACCOUNT") is not None
308312
dont_use_vectors = os.getenv("USE_VECTORS", "").lower() == "false"
313+
use_content_understanding = os.getenv("USE_CONTENT_UNDERSTANDING", "").lower() == "true"
309314

310315
# Use the current user identity to connect to Azure services. See infra/main.bicep for role assignments.
311316
if tenant_id := os.getenv("AZURE_TENANT_ID"):
@@ -403,6 +408,8 @@ async def main(strategy: Strategy, setup_index: bool = True):
403408
local_pdf_parser=os.getenv("USE_LOCAL_PDF_PARSER") == "true",
404409
local_html_parser=os.getenv("USE_LOCAL_HTML_PARSER") == "true",
405410
search_images=use_gptvision,
411+
use_content_understanding=use_content_understanding,
412+
content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"),
406413
)
407414
image_embeddings_service = setup_image_embeddings_service(
408415
azure_credential=azd_credential,
@@ -421,6 +428,8 @@ async def main(strategy: Strategy, setup_index: bool = True):
421428
search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"),
422429
use_acls=use_acls,
423430
category=args.category,
431+
use_content_understanding=use_content_understanding,
432+
content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"),
424433
)
425434

426435
loop.run_until_complete(main(ingestion_strategy, setup_index=not args.remove and not args.removeall))
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
from typing import Union
2+
import logging
3+
4+
import aiohttp
5+
from azure.core.credentials_async import AsyncTokenCredential
6+
from tenacity import retry, stop_after_attempt, wait_fixed
7+
from tenacity import retry_if_exception_type
8+
9+
from azure.identity.aio import get_bearer_token_provider
10+
11+
12+
# Same logger name ("scripts") that the other prepdocslib modules in this
# change request, so output is governed by the level set in prepdocs.py.
logger = logging.getLogger("scripts")

# Value sent as the "api-version" query parameter on Content Understanding requests.
CU_API_VERSION = "2024-12-01-preview"

# Analyzer management path templates (relative to the service base path).
PATH_ANALYZER_MANAGEMENT = "/analyzers/{analyzerId}"
PATH_ANALYZER_MANAGEMENT_OPERATION = "/analyzers/{analyzerId}/operations/{operationId}"

# Analyzer inference path templates.
PATH_ANALYZER_INFERENCE = "/analyzers/{analyzerId}:analyze"
PATH_ANALYZER_INFERENCE_GET_IMAGE = "/analyzers/{analyzerId}/results/{operationId}/images/{imageId}"

# Identifier of the custom analyzer; also used as the "analyzerId" of the schema below.
analyzer_name = "image_schema_analyzer"

# Analyzer definition submitted when the analyzer is created. It extends the
# "prebuilt-image" base analyzer with three fields: a title, a classified
# image type, and a markdown description of the image.
image_schema = {
    "analyzerId": analyzer_name,
    "name": "Image understanding",
    "description": "Extract detailed structured information from images extracted from documents.",
    "baseAnalyzerId": "prebuilt-image",
    "scenario": "image",
    "config": {"returnDetails": False},
    "fieldSchema": {
        "name": "ImageInformation",
        # NOTE(review): this key is spelled "descriptions" (plural) while every
        # other object in this schema uses "description" - confirm which
        # spelling the service expects before changing it.
        "descriptions": "Structured information from images.",
        "fields": {
            "Title": {
                "type": "string",
                "description": "Title for the image (either taken from the image directly or a good short title based off content)",
            },
            "ImageType": {
                "type": "string",
                "description": "The type of image.",
                "kind": "classify",
                "enum": [
                    "chart",
                    "diagram",
                    "table",
                    "figure",
                    "photo",
                    "screenshot",
                    "logo",
                    "icon",
                    "map",
                    "infographic",
                    "other",
                ],
            },
            "MarkdownDescription": {
                "type": "string",
                "description": "Description of the image in markdown format. Start with a 2-sentence summary. If the image is a chart, diagram, or table, include the underlying data in tabular markdown format, with valid syntax and accurate numbers. If the image is a chart, describe any axis or legends.",
            },
        },
    },
}
64+
65+
66+
class ContentUnderstandingManager:
    """Minimal async client for the Azure AI Content Understanding REST API.

    Creates the image analyzer described by the module-level ``image_schema``
    and runs that analyzer on raw image bytes. Both operations are
    long-running on the service side: the initial request returns an
    ``Operation-Location`` URL which is polled until the operation finishes.
    """

    # Scope requested when exchanging a token credential for a bearer token.
    _TOKEN_SCOPE = "https://cognitiveservices.azure.com/.default"
    # Polling budget: 60 attempts, 2 seconds apart.
    _POLL_ATTEMPTS = 60
    _POLL_INTERVAL_SECONDS = 2

    class _OperationPending(Exception):
        """Internal signal that a polled operation has not reached a terminal state."""

    def __init__(self, endpoint: str, credential: Union[AsyncTokenCredential, str]):
        """Store the connection settings; no network access happens here.

        :param endpoint: Base URL of the Azure AI services resource.
        :param credential: An async token credential, or an API key given as a string.
        """
        self.endpoint = endpoint
        self.credential = credential

    def _analyzer_url(self, analyzer_id: str, suffix: str = "") -> str:
        """Build the URL of an analyzer resource, tolerating a trailing slash on the endpoint."""
        if not self.endpoint:
            raise ValueError("A Content Understanding endpoint is required but none was configured.")
        return f"{self.endpoint.rstrip('/')}/contentunderstanding/analyzers/{analyzer_id}{suffix}"

    async def _auth_headers(self) -> dict:
        """Return the authentication header for the configured credential.

        A string credential is sent as a resource key; anything else is
        treated as an async token credential and exchanged for a bearer token.
        """
        if isinstance(self.credential, str):
            return {"Ocp-Apim-Subscription-Key": self.credential}
        token = await self.credential.get_token(self._TOKEN_SCOPE)
        return {"Authorization": f"Bearer {token.token}"}

    async def _poll_operation(self, session: aiohttp.ClientSession, poll_url: str, headers: dict) -> dict:
        """Poll ``poll_url`` until the operation succeeds and return the final JSON body.

        Only the "still pending" signal is retried. HTTP errors and a
        "Failed" status are terminal and propagate immediately rather than
        being retried for the whole polling budget.

        :raises RuntimeError: if the service reports the operation as failed.
        :raises tenacity.RetryError: if the operation is still pending after the last attempt.
        :raises aiohttp.ClientResponseError: if a poll request returns an HTTP error.
        """
        pending = self._OperationPending

        @retry(
            stop=stop_after_attempt(self._POLL_ATTEMPTS),
            wait=wait_fixed(self._POLL_INTERVAL_SECONDS),
            retry=retry_if_exception_type(pending),
        )
        async def poll() -> dict:
            async with session.get(poll_url, headers=headers) as response:
                response.raise_for_status()
                body = await response.json()
            status = str(body.get("status", "")).lower()
            if status == "succeeded":
                return body
            if status == "failed":
                raise RuntimeError(f"Content Understanding operation failed: {body}")
            # Any other status (e.g. "Running", "NotStarted") means: not done yet.
            raise pending(status)

        return await poll()

    async def create_analyzer(self):
        """Create the analyzer defined by ``image_schema`` and wait until it is ready.

        Returns without error when the service answers 409, i.e. the analyzer
        already exists.

        :raises aiohttp.ClientResponseError: if the create request is rejected.
        :raises RuntimeError: if the service reports that creation failed.
        """
        analyzer_id = image_schema["analyzerId"]
        headers = await self._auth_headers()
        headers["Content-Type"] = "application/json"
        params = {"api-version": CU_API_VERSION}

        async with aiohttp.ClientSession() as session:
            async with session.put(
                url=self._analyzer_url(analyzer_id), params=params, headers=headers, json=image_schema
            ) as response:
                if response.status == 409:
                    logger.info("Analyzer '%s' already exists.", analyzer_id)
                    return
                if response.status >= 400:
                    logger.error(
                        "Creating analyzer '%s' failed with HTTP %s: %s",
                        analyzer_id,
                        response.status,
                        await response.text(),
                    )
                    response.raise_for_status()
                poll_url = response.headers.get("Operation-Location")

            if not poll_url:
                # Nothing to wait for: the service accepted the request without
                # handing back an operation to track.
                logger.info("Analyzer '%s' request accepted (HTTP %s).", analyzer_id, response.status)
                return

            result = await self._poll_operation(session, poll_url, headers)
            logger.debug("Analyzer '%s' created: %s", analyzer_id, result)

    async def run_cu_image(self, image_bytes):
        """Analyze one image with the ``analyzer_name`` analyzer.

        :param image_bytes: Raw bytes of the image to analyze.
        :return: The ``fields`` mapping of the first content item in the analysis result.
        :raises aiohttp.ClientResponseError: if the analyze or poll request is rejected.
        :raises RuntimeError: if the service reports that the analysis failed.
        """
        headers = await self._auth_headers()
        params = {"api-version": CU_API_VERSION}

        async with aiohttp.ClientSession() as session:
            async with session.post(
                url=self._analyzer_url(analyzer_name, ":analyze"),
                params=params,
                headers=headers,
                data=image_bytes,
            ) as response:
                if response.status >= 400:
                    logger.error(
                        "Image analysis request failed with HTTP %s: %s", response.status, await response.text()
                    )
                    response.raise_for_status()
                poll_url = response.headers["Operation-Location"]

            result = await self._poll_operation(session, poll_url, headers)

        logger.debug("Image analysis result: %s", result)
        return result["result"]["contents"][0]["fields"]

app/backend/prepdocslib/filestrategy.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from .listfilestrategy import File, ListFileStrategy
88
from .searchmanager import SearchManager, Section
99
from .strategy import DocumentAction, SearchInfo, Strategy
10+
from .cu_image import ContentUnderstandingManager
1011

1112
logger = logging.getLogger("scripts")
1213

@@ -50,6 +51,8 @@ def __init__(
5051
search_analyzer_name: Optional[str] = None,
5152
use_acls: bool = False,
5253
category: Optional[str] = None,
54+
use_content_understanding: bool = False,
55+
content_understanding_endpoint: Optional[str] = None,
5356
):
5457
self.list_file_strategy = list_file_strategy
5558
self.blob_manager = blob_manager
@@ -61,6 +64,8 @@ def __init__(
6164
self.search_info = search_info
6265
self.use_acls = use_acls
6366
self.category = category
67+
self.use_content_understanding = use_content_understanding
68+
self.content_understanding_endpoint = content_understanding_endpoint
6469

6570
async def setup(self):
6671
search_manager = SearchManager(
@@ -73,6 +78,10 @@ async def setup(self):
7378
)
7479
await search_manager.create_index()
7580

81+
if self.use_content_understanding:
82+
cu_manager = ContentUnderstandingManager(self.content_understanding_endpoint, self.search_info.credential)
83+
await cu_manager.create_analyzer()
84+
7685
async def run(self):
7786
search_manager = SearchManager(
7887
self.search_info, self.search_analyzer_name, self.use_acls, False, self.embeddings

app/backend/prepdocslib/pdfparser.py

Lines changed: 51 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import pymupdf
88
from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
9+
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
910
from azure.ai.documentintelligence.models import DocumentTable
1011
from azure.core.credentials import AzureKeyCredential
1112
from azure.core.credentials_async import AsyncTokenCredential
@@ -14,6 +15,7 @@
1415

1516
from .page import Page
1617
from .parser import Parser
18+
from .cu_image import ContentUnderstandingManager
1719

1820
logger = logging.getLogger("scripts")
1921

@@ -48,24 +50,28 @@ def __init__(
4850
credential: Union[AsyncTokenCredential, AzureKeyCredential],
4951
model_id="prebuilt-layout",
5052
use_content_understanding=True,
53+
content_understanding_endpoint: str = None,
5154
):
5255
self.model_id = model_id
5356
self.endpoint = endpoint
5457
self.credential = credential
5558
self.use_content_understanding = use_content_understanding
59+
self.content_understanding_endpoint = content_understanding_endpoint
5660

5761
async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
5862
logger.info("Extracting text from '%s' using Azure Document Intelligence", content.name)
5963

60-
# TODO: do we also need output=figures on the client itself? seems odd.
64+
cu_manager = ContentUnderstandingManager(self.content_understanding_endpoint, self.credential)
6165
async with DocumentIntelligenceClient(
62-
endpoint=self.endpoint, credential=self.credential, output="figures"
66+
endpoint=self.endpoint, credential=self.credential
6367
) as document_intelligence_client:
68+
# turn content into bytes
69+
content_bytes = content.read()
6470
if self.use_content_understanding:
6571
poller = await document_intelligence_client.begin_analyze_document(
6672
model_id="prebuilt-layout",
67-
analyze_request=content,
68-
content_type="application/octet-stream",
73+
analyze_request=AnalyzeDocumentRequest(bytes_source=content_bytes),
74+
# content_type="application/octet-stream",
6975
output=["figures"],
7076
features=["ocrHighResolution"],
7177
output_content_format="markdown",
@@ -109,7 +115,9 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
109115
yield Page(page_num=page_num, offset=offset, text=page_text)
110116
offset += len(page_text)
111117

118+
figure_results = {}
112119
if form_recognizer_results.figures:
120+
doc = pymupdf.open(stream=io.BytesIO(content_bytes))
113121
for figures_idx, figure in enumerate(form_recognizer_results.figures):
114122
for region in figure.bounding_regions:
115123
print(f"\tFigure body bounding regions: {region}")
@@ -121,28 +129,44 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
121129
region.polygon[5], # y1 (bottom)
122130
)
123131
page_number = figure.bounding_regions[0]["pageNumber"]
124-
cropped_img = DocumentAnalysisParser.crop_image_from_pdf_page(
125-
content, page_number - 1, bounding_box
126-
)
127-
128-
os.makedirs("figures", exist_ok=True)
129-
130-
filename = "figure_imagecrop" + str(figures_idx) + ".png"
131-
# Full path for the file
132-
filepath = os.path.join("figures", filename)
132+
cropped_img = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number - 1, bounding_box)
133133

134134
# Save the figure
135-
cropped_img.save(filepath)
136135
bytes_io = io.BytesIO()
137136
cropped_img.save(bytes_io, format="PNG")
138-
cropped_img = bytes_io.getvalue()
139-
# _ , figure_description = run_cu_image(analyzer_name, filepath)
140-
141-
# md_content = replace_figure_description(md_content, figure_description, figures_idx+1)
142-
# figure_content.append(figure_description)
143-
144-
@classmethod
145-
def table_to_html(cls, table: DocumentTable):
137+
image_fields = await cu_manager.run_cu_image(bytes_io.getvalue())
138+
figure_results[figure.id] = image_fields
139+
140+
md_content = analyze_result.content
141+
page_to_figure = {}
142+
for figure in analyze_result.figures:
143+
# Parse figure id
144+
# https://learn.microsoft.com/azure/ai-services/document-intelligence/concept/analyze-document-response?view=doc-intel-4.0.0#figures
145+
figure_id = figure.id.split(".") # 3.1 where 3 is the page number and 1 is the figure number, 1-indexed
146+
page = int(figure_id[0])
147+
if page not in page_to_figure:
148+
page_to_figure[page] = []
149+
page_to_figure[page].append(figure.id)
150+
for page in form_recognizer_results.pages:
151+
# Use the text span to extract the markdown on the page
152+
span = page.spans[0]
153+
page_md_content = md_content[span.offset : span.offset + span.length]
154+
if page.page_number in page_to_figure:
155+
page_figures = page_to_figure[page.page_number]
156+
# split the content on the figure tag
157+
parts = page_md_content.split("\n<figure>\n")
158+
for i, figure_id in enumerate(page_figures):
159+
with open(
160+
os.path.join(figures_directory, f"figure_imagecrop_{figure_id}_verbalized.json"), "r"
161+
) as f:
162+
figure_content = json.dumps(json.load(f)["result"]["contents"][0])
163+
parts[i] = parts[i] + f'<!-- FigureContent="{figure_content}" -->'
164+
page_md_content = "\n".join(parts)
165+
with open(os.path.join(pages_md_directory, f"page_{page.page_number}.md"), "w", encoding="utf-8") as f:
166+
f.write(page_md_content)
167+
168+
@staticmethod
169+
def table_to_html(table: DocumentTable):
146170
table_html = "<table>"
147171
rows = [
148172
sorted([cell for cell in table.cells if cell.row_index == i], key=lambda cell: cell.column_index)
@@ -162,8 +186,8 @@ def table_to_html(cls, table: DocumentTable):
162186
table_html += "</table>"
163187
return table_html
164188

165-
@classmethod
166-
def crop_image_from_pdf_page(pdf_path, page_number, bounding_box):
189+
@staticmethod
190+
def crop_image_from_pdf_page(doc: pymupdf.Document, page_number, bounding_box):
167191
"""
168192
Crops a region from a given page in a PDF and returns it as an image.
169193
@@ -172,16 +196,13 @@ def crop_image_from_pdf_page(pdf_path, page_number, bounding_box):
172196
:param bounding_box: A tuple of (x0, y0, x1, y1) coordinates for the bounding box.
173197
:return: A PIL Image of the cropped area.
174198
"""
175-
doc = pymupdf.open(pdf_path)
199+
logger.info(f"Cropping image from PDF page {page_number} with bounding box {bounding_box}")
176200
page = doc.load_page(page_number)
177201

178202
# Cropping the page. The rect requires the coordinates in the format (x0, y0, x1, y1).
179203
bbx = [x * 72 for x in bounding_box]
180204
rect = pymupdf.Rect(bbx)
205+
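# Document Intelligence reports PDF polygon coordinates in inches, while PyMuPDF works in points (72 per inch), hence the "* 72" above. The 300 / 72 matrix below renders the clipped region at 300 DPI.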
181206
pix = page.get_pixmap(matrix=pymupdf.Matrix(300 / 72, 300 / 72), clip=rect)
182207

183-
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
184-
185-
doc.close()
186-
187-
return img
208+
return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

app/backend/requirements.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ openai>=1.3.7
55
numpy>=1,<2.1.0 # Used by openai embeddings.create to optimize embeddings (but not required)
66
tiktoken
77
tenacity
8-
azure-ai-documentintelligence
8+
azure-ai-documentintelligence==1.0.0b4
99
azure-cognitiveservices-speech
1010
azure-cosmos
1111
azure-search-documents==11.6.0b6

0 commit comments

Comments
 (0)