@@ -6,7 +6,7 @@ author: laujan
6
6
manager : nitinme
7
7
ms.service : azure-ai-document-intelligence
8
8
ms.topic : include
9
- ms.date : 12/18/2023
9
+ ms.date : 01/29/2024
10
10
ms.author : lajanuar
11
11
---
12
12
<!-- markdownlint-disable MD025 -->
@@ -135,11 +135,30 @@ Extract text, selection marks, text styles, table structures, and bounding regio
135
135
import os
136
136
from azure.core.credentials import AzureKeyCredential
137
137
from azure.ai.documentintelligence import DocumentIntelligenceClient
138
+ from azure.ai.documentintelligence.models import AnalyzeResult
138
139
139
140
# Set `endpoint` and `key` to the values shown for your resource in the Azure portal.
# Replace each placeholder in full; the literals carry no surrounding whitespace,
# so the pasted value is used exactly as written.
endpoint = "<your-endpoint>"
key = "<your-key>"
142
143
144
+ # helper functions
145
+
146
def get_words(page, line):
    """Return the words of ``page`` that fall inside one of ``line``'s spans.

    Args:
        page: Object exposing ``words``; each word has a ``span`` with
            ``offset`` and ``length`` attributes.
        line: Object exposing ``spans``; each span has ``offset`` and
            ``length`` attributes.

    Returns:
        A list of the matching words in the order they appear in
        ``page.words``. The list is empty when ``page.words`` is ``None``
        or empty.
    """
    # `page.words` is an optional collection, like `page.lines` and
    # `page.selection_marks`; treat a missing value as "no words" instead of
    # raising TypeError when iterating None.
    return [word for word in page.words or [] if _in_span(word, line.spans)]
152
+
153
+
154
+ def _in_span (word , spans ):
155
+ for span in spans:
156
+ if word.span.offset >= span.offset and (
157
+ word.span.offset + word.span.length
158
+ ) <= (span.offset + span.length):
159
+ return True
160
+ return False
161
+
143
162
144
163
def analyze_layout ():
145
164
# sample document
@@ -152,9 +171,10 @@ def analyze_layout():
152
171
poller = document_intelligence_client.begin_analyze_document_from_url(
153
172
" prebuilt-layout" , formUrl
154
173
)
155
- result = poller.result()
156
174
157
- if any ([style.is_handwritten for style in result.styles]):
175
+ result: AnalyzeResult = poller.result()
176
+
177
+ if result.styles and any ([style.is_handwritten for style in result.styles]):
158
178
print (" Document contains handwritten content" )
159
179
else :
160
180
print (" Document does not contain handwritten content" )
@@ -165,49 +185,53 @@ def analyze_layout():
165
185
f " Page has width: { page.width} and height: { page.height} , measured with unit: { page.unit} "
166
186
)
167
187
168
- for line_idx, line in enumerate (page.lines):
169
- words = get_words(page, line)
170
- print (
171
- f " ...Line # { line_idx} has word count { len (words)} and text ' { line.content} ' "
172
- f " within bounding polygon ' { line.polygon} ' "
173
- )
174
-
175
- for word in words:
188
+ if page.lines:
189
+ for line_idx, line in enumerate (page.lines):
190
+ words = get_words(page, line)
176
191
print (
177
- f " ......Word ' { word.content} ' has a confidence of { word.confidence} "
192
+ f " ...Line # { line_idx} has word count { len (words)} and text ' { line.content} ' "
193
+ f " within bounding polygon ' { line.polygon} ' "
178
194
)
179
195
180
- for selection_mark in page.selection_marks:
181
- print (
182
- f " Selection mark is ' { selection_mark.state} ' within bounding polygon "
183
- f " ' { selection_mark.polygon} ' and has a confidence of { selection_mark.confidence} "
184
- )
196
+ for word in words:
197
+ print (
198
+ f " ......Word ' { word.content} ' has a confidence of { word.confidence} "
199
+ )
185
200
186
- for table_idx, table in enumerate (result.tables):
187
- print (
188
- f " Table # { table_idx} has { table.row_count} rows and "
189
- f " { table.column_count} columns "
190
- )
191
- for region in table.bounding_regions:
192
- print (
193
- f " Table # { table_idx} location on page: { region.page_number} is { region.polygon} "
194
- )
195
- for cell in table.cells:
201
+ if page.selection_marks:
202
+ for selection_mark in page.selection_marks:
203
+ print (
204
+ f " Selection mark is ' { selection_mark.state} ' within bounding polygon "
205
+ f " ' { selection_mark.polygon} ' and has a confidence of { selection_mark.confidence} "
206
+ )
207
+
208
+ if result.tables:
209
+ for table_idx, table in enumerate (result.tables):
196
210
print (
197
- f " ...Cell[ { cell.row_index} ][ { cell.column_index} ] has text ' { cell.content} ' "
211
+ f " Table # { table_idx} has { table.row_count} rows and "
212
+ f " { table.column_count} columns "
198
213
)
199
- for region in cell.bounding_regions:
214
+ if table.bounding_regions:
215
+ for region in table.bounding_regions:
216
+ print (
217
+ f " Table # { table_idx} location on page: { region.page_number} is { region.polygon} "
218
+ )
219
+ for cell in table.cells:
200
220
print (
201
- f " ...content on page { region.page_number } is within bounding polygon ' { region.polygon } ' "
221
+ f " ...Cell[ { cell.row_index } ][ { cell.column_index } ] has text ' { cell.content } ' "
202
222
)
223
+ if cell.bounding_regions:
224
+ for region in cell.bounding_regions:
225
+ print (
226
+ f " ...content on page { region.page_number} is within bounding polygon ' { region.polygon} ' "
227
+ )
203
228
204
229
print (" ----------------------------------------" )
205
230
206
231
207
232
# Run the sample only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    analyze_layout()
209
234
210
-
211
235
```
212
236
213
237
**Run the application**
0 commit comments