Skip to content

Commit 65e5616

Browse files
committed
CU integration
1 parent 7b52dac commit 65e5616

File tree

5 files changed

+84
-79
lines changed

5 files changed

+84
-79
lines changed

app/backend/prepdocslib/cu_image.py

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,10 @@
1-
from typing import Union
21
import logging
2+
from typing import Union
33

44
import aiohttp
55
from azure.core.credentials_async import AsyncTokenCredential
6-
from tenacity import retry, stop_after_attempt, wait_fixed
7-
from tenacity import retry_if_exception_type
8-
96
from azure.identity.aio import get_bearer_token_provider
10-
7+
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
118

129
logger = logging.getLogger("scripts")
1310

@@ -97,7 +94,6 @@ async def poll():
9794
response_json = await response.json()
9895
if response_json["status"] != "Succeeded":
9996
raise ValueError("Retry")
100-
print(response_json)
10197

10298
await poll()
10399

@@ -107,7 +103,7 @@ def run_cu_image(self, analyzer_name, image):
107103
model_output_raw = str(model_output)
108104
return model_output, model_output_raw
109105

110-
async def run_cu_image(self, image_bytes):
106+
async def verbalize_figure(self, image_bytes) -> str:
111107
async with aiohttp.ClientSession() as session:
112108
token = await self.credential.get_token("https://cognitiveservices.azure.com/.default")
113109
headers = {"Authorization": "Bearer " + token.token}
@@ -119,8 +115,7 @@ async def run_cu_image(self, image_bytes):
119115
headers=headers,
120116
data=image_bytes,
121117
) as response:
122-
result = await response.json()
123-
print(result)
118+
response.raise_for_status()
124119
poll_url = response.headers["Operation-Location"]
125120

126121
@retry(stop=stop_after_attempt(60), wait=wait_fixed(2), retry=retry_if_exception_type(ValueError))
@@ -129,11 +124,13 @@ async def poll():
129124
response.raise_for_status()
130125
response_json = await response.json()
131126
print(response_json)
127+
# TODO: use rich.print to display the polling progress in a more readable form
132128
if response_json["status"] == "Failed":
133129
raise Exception("Failed")
134130
if response_json["status"] == "Running":
135131
raise ValueError("Running")
136132
return response_json
137133

138-
response = await poll()
139-
return response["result"]["contents"][0]["fields"]
134+
results = await poll()
135+
fields = results["result"]["contents"][0]["fields"]
136+
return f"Title: {fields['Title']['valueString']}\n\nType: {fields['ImageType']['valueString']}\n\nDescription: {fields['MarkdownDescription']['valueString']}"

app/backend/prepdocslib/filestrategy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@
22
from typing import List, Optional
33

44
from .blobmanager import BlobManager
5+
from .cu_image import ContentUnderstandingManager
56
from .embeddings import ImageEmbeddings, OpenAIEmbeddings
67
from .fileprocessor import FileProcessor
78
from .listfilestrategy import File, ListFileStrategy
89
from .searchmanager import SearchManager, Section
910
from .strategy import DocumentAction, SearchInfo, Strategy
10-
from .cu_image import ContentUnderstandingManager
1111

1212
logger = logging.getLogger("scripts")
1313

app/backend/prepdocslib/pdfparser.py

Lines changed: 65 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,24 @@
11
import html
22
import io
33
import logging
4-
import os
4+
from enum import Enum
55
from typing import IO, AsyncGenerator, Union
66

77
import pymupdf
88
from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
9-
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
10-
from azure.ai.documentintelligence.models import DocumentTable
9+
from azure.ai.documentintelligence.models import (
10+
AnalyzeDocumentRequest,
11+
DocumentFigure,
12+
DocumentTable,
13+
)
1114
from azure.core.credentials import AzureKeyCredential
1215
from azure.core.credentials_async import AsyncTokenCredential
1316
from PIL import Image
1417
from pypdf import PdfReader
1518

19+
from .cu_image import ContentUnderstandingManager
1620
from .page import Page
1721
from .parser import Parser
18-
from .cu_image import ContentUnderstandingManager
1922

2023
logger = logging.getLogger("scripts")
2124

@@ -71,11 +74,11 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
7174
poller = await document_intelligence_client.begin_analyze_document(
7275
model_id="prebuilt-layout",
7376
analyze_request=AnalyzeDocumentRequest(bytes_source=content_bytes),
74-
# content_type="application/octet-stream",
7577
output=["figures"],
7678
features=["ocrHighResolution"],
7779
output_content_format="markdown",
7880
)
81+
doc_for_pymupdf = pymupdf.open(stream=io.BytesIO(content_bytes))
7982
else:
8083
poller = await document_intelligence_client.begin_analyze_document(
8184
model_id=self.model_id, analyze_request=content, content_type="application/octet-stream"
@@ -89,81 +92,74 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
8992
for table in (form_recognizer_results.tables or [])
9093
if table.bounding_regions and table.bounding_regions[0].page_number == page_num + 1
9194
]
95+
figures_on_page = [
96+
figure
97+
for figure in (form_recognizer_results.figures or [])
98+
if figure.bounding_regions and figure.bounding_regions[0].page_number == page_num + 1
99+
]
100+
101+
class ObjectType(Enum):
102+
NONE = -1
103+
TABLE = 0
104+
FIGURE = 1
92105

93106
# mark all positions of the table and figure spans in the page
94107
page_offset = page.spans[0].offset
95108
page_length = page.spans[0].length
96-
table_chars = [-1] * page_length
97-
for table_id, table in enumerate(tables_on_page):
109+
mask_chars = [(ObjectType.NONE, None)] * page_length
110+
for table_idx, table in enumerate(tables_on_page):
98111
for span in table.spans:
99112
# replace all table spans with (ObjectType.TABLE, table_idx) in the mask_chars array
100113
for i in range(span.length):
101114
idx = span.offset - page_offset + i
102115
if idx >= 0 and idx < page_length:
103-
table_chars[idx] = table_id
116+
mask_chars[idx] = (ObjectType.TABLE, table_idx)
117+
for figure_idx, figure in enumerate(figures_on_page):
118+
for span in figure.spans:
119+
# replace all figure spans with (ObjectType.FIGURE, figure_idx) in the mask_chars array
120+
for i in range(span.length):
121+
idx = span.offset - page_offset + i
122+
if idx >= 0 and idx < page_length:
123+
mask_chars[idx] = (ObjectType.FIGURE, figure_idx)
104124

105125
# build page text by replacing characters in table and figure spans with their HTML
106126
page_text = ""
107-
added_tables = set()
108-
for idx, table_id in enumerate(table_chars):
109-
if table_id == -1:
127+
added_objects = set() # (ObjectType, index) tuples already emitted into page_text; TODO: add a type annotation for mypy
128+
for idx, mask_char in enumerate(mask_chars):
129+
object_type, object_idx = mask_char
130+
if object_type == ObjectType.NONE:
110131
page_text += form_recognizer_results.content[page_offset + idx]
111-
elif table_id not in added_tables:
112-
page_text += DocumentAnalysisParser.table_to_html(tables_on_page[table_id])
113-
added_tables.add(table_id)
114-
132+
elif object_type == ObjectType.TABLE:
133+
if mask_char not in added_objects:
134+
page_text += DocumentAnalysisParser.table_to_html(tables_on_page[object_idx])
135+
added_objects.add(mask_char)
136+
elif object_type == ObjectType.FIGURE:
137+
if mask_char not in added_objects:
138+
page_text += await DocumentAnalysisParser.figure_to_html(
139+
doc_for_pymupdf, cu_manager, figures_on_page[object_idx]
140+
)
141+
added_objects.add(mask_char)
142+
# TODO: reset page numbers based on the mask
115143
yield Page(page_num=page_num, offset=offset, text=page_text)
116144
offset += len(page_text)
117145

118-
figure_results = {}
119-
if form_recognizer_results.figures:
120-
doc = pymupdf.open(stream=io.BytesIO(content_bytes))
121-
for figures_idx, figure in enumerate(form_recognizer_results.figures):
122-
for region in figure.bounding_regions:
123-
print(f"\tFigure body bounding regions: {region}")
124-
# To learn more about bounding regions, see https://aka.ms/bounding-region
125-
bounding_box = (
126-
region.polygon[0], # x0 (left)
127-
region.polygon[1], # y0 (top
128-
region.polygon[4], # x1 (right)
129-
region.polygon[5], # y1 (bottom)
130-
)
131-
page_number = figure.bounding_regions[0]["pageNumber"]
132-
cropped_img = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number - 1, bounding_box)
133-
134-
# Save the figure
135-
bytes_io = io.BytesIO()
136-
cropped_img.save(bytes_io, format="PNG")
137-
image_fields = await cu_manager.run_cu_image(bytes_io.getvalue())
138-
figure_results[figure.id] = image_fields
139-
140-
md_content = analyze_result.content
141-
page_to_figure = {}
142-
for figure in analyze_result.figures:
143-
# Parse figure id
144-
# https://learn.microsoft.com/azure/ai-services/document-intelligence/concept/analyze-document-response?view=doc-intel-4.0.0#figures
145-
figure_id = figure.id.split(".") # 3.1 where 3 is the page number and 1 is the figure number, 1-indexed
146-
page = int(figure_id[0])
147-
if page not in page_to_figure:
148-
page_to_figure[page] = []
149-
page_to_figure[page].append(figure.id)
150-
for page in form_recognizer_results.pages:
151-
# Use the text span to extract the markdown on the page
152-
span = page.spans[0]
153-
page_md_content = md_content[span.offset : span.offset + span.length]
154-
if page.page_number in page_to_figure:
155-
page_figures = page_to_figure[page.page_number]
156-
# split the content on the figure tag
157-
parts = page_md_content.split("\n<figure>\n")
158-
for i, figure_id in enumerate(page_figures):
159-
with open(
160-
os.path.join(figures_directory, f"figure_imagecrop_{figure_id}_verbalized.json"), "r"
161-
) as f:
162-
figure_content = json.dumps(json.load(f)["result"]["contents"][0])
163-
parts[i] = parts[i] + f'<!-- FigureContent="{figure_content}" -->'
164-
page_md_content = "\n".join(parts)
165-
with open(os.path.join(pages_md_directory, f"page_{page.page_number}.md"), "w", encoding="utf-8") as f:
166-
f.write(page_md_content)
146+
@staticmethod
147+
async def figure_to_html(
148+
doc: pymupdf.Document, cu_manager: ContentUnderstandingManager, figure: DocumentFigure
149+
) -> str:
150+
for region in figure.bounding_regions:
151+
# To learn more about bounding regions, see https://aka.ms/bounding-region
152+
bounding_box = (
153+
region.polygon[0], # x0 (left)
154+
region.polygon[1], # y0 (top)
155+
region.polygon[4], # x1 (right)
156+
region.polygon[5], # y1 (bottom)
157+
)
158+
page_number = figure.bounding_regions[0]["pageNumber"] # 1-indexed
159+
cropped_img = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number - 1, bounding_box)
160+
figure_description = await cu_manager.verbalize_figure(cropped_img)
161+
# TODO: add DI's original figcaption to this caption - figure.caption.content
162+
return f"<figure><figcaption>{figure_description}</figcaption></figure>"
167163

168164
@staticmethod
169165
def table_to_html(table: DocumentTable):
@@ -187,7 +183,7 @@ def table_to_html(table: DocumentTable):
187183
return table_html
188184

189185
@staticmethod
190-
def crop_image_from_pdf_page(doc: pymupdf.Document, page_number, bounding_box):
186+
def crop_image_from_pdf_page(doc: pymupdf.Document, page_number, bounding_box) -> bytes:
191187
"""
192188
Crops a region from a given page in a PDF and returns it as an image.
193189
@@ -205,4 +201,7 @@ def crop_image_from_pdf_page(doc: pymupdf.Document, page_number, bounding_box):
205201
# PDF page coordinates are in points (72 per inch), so scaling by 300 / 72 renders the clipped region at 300 DPI
206202
pix = page.get_pixmap(matrix=pymupdf.Matrix(300 / 72, 300 / 72), clip=rect)
207203

208-
return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
204+
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
205+
bytes_io = io.BytesIO()
206+
img.save(bytes_io, format="PNG")
207+
return bytes_io.getvalue()

app/backend/prepdocslib/textsplitter.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ def find_page(offset):
193193
yield from self.split_page_by_max_tokens(page_num=find_page(start), text=section_text)
194194

195195
last_table_start = section_text.rfind("<table")
196+
last_figure_start = section_text.rfind("<figure")
196197
if last_table_start > 2 * self.sentence_search_limit and last_table_start > section_text.rfind("</table"):
197198
# If the section ends with an unclosed table, we need to start the next section with the table.
198199
# If table starts inside sentence_search_limit, we ignore it, as that will cause an infinite loop for tables longer than MAX_SECTION_LENGTH
@@ -201,6 +202,14 @@ def find_page(offset):
201202
f"Section ends with unclosed table, starting next section with the table at page {find_page(start)} offset {start} table start {last_table_start}"
202203
)
203204
start = min(end - self.section_overlap, start + last_table_start)
205+
elif last_figure_start > 2 * self.sentence_search_limit and last_figure_start > section_text.rfind(
206+
"</figure"
207+
):
208+
# If the section ends with an unclosed figure, we need to start the next section with the figure.
209+
logger.info(
210+
f"Section ends with unclosed figure, starting next section with the figure at page {find_page(start)} offset {start} figure start {last_figure_start}"
211+
)
212+
start = min(end - self.section_overlap, start + last_figure_start)
204213
else:
205214
start = end - self.section_overlap
206215

scripts/prepdocs.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/bin/sh
22

3-
#. ./scripts/load_python_env.sh
3+
. ./scripts/load_python_env.sh
44

55
echo 'Running "prepdocs.py"'
66

0 commit comments

Comments
 (0)