@@ -90,20 +90,20 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
90
90
poller = await document_intelligence_client .begin_analyze_document (
91
91
model_id = self .model_id , analyze_request = content , content_type = "application/octet-stream"
92
92
)
93
- form_recognizer_results : AnalyzeResult = await poller .result ()
93
+ analyze_result : AnalyzeResult = await poller .result ()
94
94
95
95
offset = 0
96
- for page in form_recognizer_results .pages :
96
+ for page in analyze_result .pages :
97
97
tables_on_page = [
98
98
table
99
- for table in (form_recognizer_results .tables or [])
99
+ for table in (analyze_result .tables or [])
100
100
if table .bounding_regions and table .bounding_regions [0 ].page_number == page .page_number
101
101
]
102
102
figures_on_page = []
103
103
if self .use_content_understanding :
104
104
figures_on_page = [
105
105
figure
106
- for figure in (form_recognizer_results .figures or [])
106
+ for figure in (analyze_result .figures or [])
107
107
if figure .bounding_regions and figure .bounding_regions [0 ].page_number == page .page_number
108
108
]
109
109
@@ -112,17 +112,18 @@ class ObjectType(Enum):
112
112
TABLE = 0
113
113
FIGURE = 1
114
114
115
- # mark all positions of the table spans in the page
116
115
page_offset = page .spans [0 ].offset
117
116
page_length = page .spans [0 ].length
118
117
mask_chars : list [tuple [ObjectType , Union [int , None ]]] = [(ObjectType .NONE , None )] * page_length
118
+ # mark all positions of the table spans in the page
119
119
for table_idx , table in enumerate (tables_on_page ):
120
120
for span in table .spans :
121
121
# mark every character covered by this table span in mask_chars as (ObjectType.TABLE, table_idx)
122
122
for i in range (span .length ):
123
123
idx = span .offset - page_offset + i
124
124
if idx >= 0 and idx < page_length :
125
125
mask_chars [idx ] = (ObjectType .TABLE , table_idx )
126
+ # mark all positions of the figure spans in the page
126
127
for figure_idx , figure in enumerate (figures_on_page ):
127
128
for span in figure .spans :
128
129
# replace all figure spans with "figure_id" in figure_chars array
@@ -137,7 +138,7 @@ class ObjectType(Enum):
137
138
for idx , mask_char in enumerate (mask_chars ):
138
139
object_type , object_idx = mask_char
139
140
if object_type == ObjectType .NONE :
140
- page_text += form_recognizer_results .content [page_offset + idx ]
141
+ page_text += analyze_result .content [page_offset + idx ]
141
142
elif object_type == ObjectType .TABLE :
142
143
if object_idx is None :
143
144
raise ValueError ("Expected object_idx to be set" )
@@ -151,7 +152,7 @@ class ObjectType(Enum):
151
152
raise ValueError ("Expected object_idx to be set" )
152
153
if mask_char not in added_objects :
153
154
figure_html = await DocumentAnalysisParser .figure_to_html (
154
- doc_for_pymupdf , cu_describer , figures_on_page [object_idx ]
155
+ doc_for_pymupdf , figures_on_page [object_idx ], cu_describer
155
156
)
156
157
page_text += figure_html
157
158
added_objects .add (mask_char )
@@ -164,21 +165,23 @@ class ObjectType(Enum):
164
165
165
166
@staticmethod
async def figure_to_html(
    doc: pymupdf.Document, figure: DocumentFigure, cu_describer: ContentUnderstandingDescriber
) -> str:
    """Render a detected figure as an HTML ``<figure>`` element with a generated description.

    The figure's first bounding region is cropped out of its PDF page, the
    cropped image is handed to ``cu_describer.describe_image``, and the
    returned description is placed after the caption inside ``<figcaption>``.

    Args:
        doc: The PDF document the figure image is cropped from.
        figure: The figure to render; its id, caption and bounding regions are read.
        cu_describer: Describer whose ``describe_image`` coroutine is awaited
            with the cropped image.

    Returns:
        The ``<figure>`` markup. When the figure has no bounding regions, only
        the caption (possibly empty) is included and no image is described.
    """
    figure_title = (figure.caption and figure.caption.content) or ""
    logger.info("Describing figure %s with title '%s'", figure.id, figure_title)
    if not figure.bounding_regions:
        # Nothing to crop, so fall back to a caption-only element.
        return f"<figure><figcaption>{ figure_title } </figcaption></figure>"
    if len(figure.bounding_regions) > 1:
        logger.warning("Figure %s has more than one bounding region, using the first one", figure.id)
    first_region = figure.bounding_regions[0]
    # To learn more about bounding regions, see https://aka.ms/bounding-region
    bounding_box = (
        first_region.polygon[0],  # x0 (left)
        first_region.polygon[1],  # y0 (top)
        first_region.polygon[4],  # x1 (right)
        first_region.polygon[5],  # y1 (bottom)
    )
    # Attribute access, matching how bounding regions are read elsewhere in this file.
    page_number = first_region.page_number  # 1-indexed
    # The crop helper is given a 0-indexed page, hence the -1.
    cropped_img = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number - 1, bounding_box)
    figure_description = await cu_describer.describe_image(cropped_img)
    return f"<figure><figcaption>{ figure_title } <br>{ figure_description } </figcaption></figure>"
@@ -205,7 +208,7 @@ def table_to_html(table: DocumentTable):
205
208
return table_html
206
209
207
210
@staticmethod
208
- def crop_image_from_pdf_page (doc : pymupdf .Document , page_number , bounding_box ) -> bytes :
211
+ def crop_image_from_pdf_page (doc : pymupdf .Document , page_number : int , bounding_box : tuple [ float ] ) -> bytes :
209
212
"""
210
213
Crops a region from a given page in a PDF and returns it as an image.
211
214
0 commit comments