Skip to content

Commit 15c50b7

Browse files
author
Yalin Li
authored
[DI] Fix layout extracting issues (#35913)
1 parent 48590d5 commit 15c50b7

14 files changed

+268
-80
lines changed

sdk/documentintelligence/azure-ai-documentintelligence/README.md

Lines changed: 37 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -237,30 +237,47 @@ for page in result.pages:
237237
words = get_words(page, line)
238238
print(
239239
f"...Line # {line_idx} has word count {len(words)} and text '{line.content}' "
240-
f"within bounding polygon '{line.polygon}'"
240+
f"within bounding polygon '{format_polygon(line.polygon)}'"
241241
)
242242

243-
for word in words:
244-
print(f"......Word '{word.content}' has a confidence of {word.confidence}")
243+
if page.words:
244+
for word in page.words:
245+
print(f"......Word '{word.content}' has a confidence of {word.confidence}")
245246

246247
if page.selection_marks:
247248
for selection_mark in page.selection_marks:
248249
print(
249250
f"Selection mark is '{selection_mark.state}' within bounding polygon "
250-
f"'{selection_mark.polygon}' and has a confidence of {selection_mark.confidence}"
251+
f"'{format_polygon(selection_mark.polygon)}' and has a confidence of {selection_mark.confidence}"
251252
)
252253

254+
if result.paragraphs:
255+
print(f"----Detected #{len(result.paragraphs)} paragraphs in the document----")
256+
# Sort each paragraph's spans by offset (in place), then order the paragraphs by their first span's offset so they print in reading order.
257+
result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset))
258+
print("-----Print sorted paragraphs-----")
259+
for paragraph in result.paragraphs:
260+
print(
261+
f"Found paragraph with role: '{paragraph.role}' within {format_bounding_region(paragraph.bounding_regions)} bounding region"
262+
)
263+
print(f"...with content: '{paragraph.content}'")
264+
print(f"...with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}")
265+
253266
if result.tables:
254267
for table_idx, table in enumerate(result.tables):
255268
print(f"Table # {table_idx} has {table.row_count} rows and " f"{table.column_count} columns")
256269
if table.bounding_regions:
257270
for region in table.bounding_regions:
258-
print(f"Table # {table_idx} location on page: {region.page_number} is {region.polygon}")
271+
print(
272+
f"Table # {table_idx} location on page: {region.page_number} is {format_polygon(region.polygon)}"
273+
)
259274
for cell in table.cells:
260275
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
261276
if cell.bounding_regions:
262277
for region in cell.bounding_regions:
263-
print(f"...content on page {region.page_number} is within bounding polygon '{region.polygon}'")
278+
print(
279+
f"...content on page {region.page_number} is within bounding polygon '{format_polygon(region.polygon)}'"
280+
)
264281

265282
print("----------------------------------------")
266283
```
@@ -302,11 +319,14 @@ print("----Key-value pairs found in document----")
302319
if result.key_value_pairs:
303320
for kv_pair in result.key_value_pairs:
304321
if kv_pair.key:
305-
print(f"Key '{kv_pair.key.content}' found within " f"'{kv_pair.key.bounding_regions}' bounding regions")
322+
print(
323+
f"Key '{kv_pair.key.content}' found within "
324+
f"'{format_bounding_region(kv_pair.key.bounding_regions)}' bounding regions"
325+
)
306326
if kv_pair.value:
307327
print(
308328
f"Value '{kv_pair.value.content}' found within "
309-
f"'{kv_pair.value.bounding_regions}' bounding regions\n"
329+
f"'{format_bounding_region(kv_pair.value.bounding_regions)}' bounding regions\n"
310330
)
311331

312332
for page in result.pages:
@@ -318,17 +338,18 @@ for page in result.pages:
318338
words = get_words(page.words, line)
319339
print(
320340
f"...Line #{line_idx} has {len(words)} words and text '{line.content}' within "
321-
f"bounding polygon '{line.polygon}'"
341+
f"bounding polygon '{format_polygon(line.polygon)}'"
322342
)
323343

324-
for word in words:
325-
print(f"......Word '{word.content}' has a confidence of {word.confidence}")
344+
if page.words:
345+
for word in page.words:
346+
print(f"......Word '{word.content}' has a confidence of {word.confidence}")
326347

327348
if page.selection_marks:
328349
for selection_mark in page.selection_marks:
329350
print(
330351
f"Selection mark is '{selection_mark.state}' within bounding polygon "
331-
f"'{selection_mark.polygon}' and has a confidence of "
352+
f"'{format_polygon(selection_mark.polygon)}' and has a confidence of "
332353
f"{selection_mark.confidence}"
333354
)
334355

@@ -337,13 +358,15 @@ if result.tables:
337358
print(f"Table # {table_idx} has {table.row_count} rows and {table.column_count} columns")
338359
if table.bounding_regions:
339360
for region in table.bounding_regions:
340-
print(f"Table # {table_idx} location on page: {region.page_number} is {region.polygon}")
361+
print(
362+
f"Table # {table_idx} location on page: {region.page_number} is {format_polygon(region.polygon)}"
363+
)
341364
for cell in table.cells:
342365
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
343366
if cell.bounding_regions:
344367
for region in cell.bounding_regions:
345368
print(
346-
f"...content on page {region.page_number} is within bounding polygon '{region.polygon}'\n"
369+
f"...content on page {region.page_number} is within bounding polygon '{format_polygon(region.polygon)}'\n"
347370
)
348371
print("----------------------------------------")
349372
```

sdk/documentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_addon_barcodes_async.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,12 @@
4343
import os
4444

4545

46+
def format_polygon(polygon):
    """Format a flat list of coordinates as a string of "[a, b]" pairs.

    Consecutive values are grouped two at a time, so ``[1, 2, 3, 4]`` becomes
    ``"[1, 2], [3, 4]"``.

    :param polygon: Flat, indexable sequence of coordinate values, or ``None``.
    :return: The pairs joined by ", ", or ``"N/A"`` when ``polygon`` is
        empty/``None`` or does not contain a single complete pair.
    """
    if not polygon:
        return "N/A"
    # Stop one short of the end so an odd-length list drops its unpaired
    # trailing value instead of raising IndexError on polygon[i + 1].
    points = [f"[{polygon[i]}, {polygon[i + 1]}]" for i in range(0, len(polygon) - 1, 2)]
    return ", ".join(points) if points else "N/A"
50+
51+
4652
async def analyze_barcodes():
4753
path_to_sample_documents = os.path.abspath(
4854
os.path.join(
@@ -82,7 +88,7 @@ async def analyze_barcodes():
8288
print(f"- Barcode #{barcode_idx}: {barcode.value}")
8389
print(f" Kind: {barcode.kind}")
8490
print(f" Confidence: {barcode.confidence}")
85-
print(f" Bounding regions: {barcode.polygon}")
91+
print(f" Bounding regions: {format_polygon(barcode.polygon)}")
8692

8793
print("----------------------------------------")
8894
# [END analyze_barcodes]

sdk/documentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_addon_formulas_async.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,12 @@
4343
import os
4444

4545

46+
def format_polygon(polygon):
    """Format a flat list of coordinates as a string of "[a, b]" pairs.

    Consecutive values are grouped two at a time, so ``[1, 2, 3, 4]`` becomes
    ``"[1, 2], [3, 4]"``.

    :param polygon: Flat, indexable sequence of coordinate values, or ``None``.
    :return: The pairs joined by ", ", or ``"N/A"`` when ``polygon`` is
        empty/``None`` or does not contain a single complete pair.
    """
    if not polygon:
        return "N/A"
    # Stop one short of the end so an odd-length list drops its unpaired
    # trailing value instead of raising IndexError on polygon[i + 1].
    points = [f"[{polygon[i]}, {polygon[i + 1]}]" for i in range(0, len(polygon) - 1, 2)]
    return ", ".join(points) if points else "N/A"
50+
51+
4652
async def analyze_formulas():
4753
path_to_sample_documents = os.path.abspath(
4854
os.path.join(
@@ -85,13 +91,13 @@ async def analyze_formulas():
8591
for formula_idx, formula in enumerate(inline_formulas):
8692
print(f"- Inline #{formula_idx}: {formula.value}")
8793
print(f" Confidence: {formula.confidence}")
88-
print(f" Bounding regions: {formula.polygon}")
94+
print(f" Bounding regions: {format_polygon(formula.polygon)}")
8995

9096
print(f"\nDetected {len(display_formulas)} display formulas.")
9197
for formula_idx, formula in enumerate(display_formulas):
9298
print(f"- Display #{formula_idx}: {formula.value}")
9399
print(f" Confidence: {formula.confidence}")
94-
print(f" Bounding regions: {formula.polygon}")
100+
print(f" Bounding regions: {format_polygon(formula.polygon)}")
95101

96102
print("----------------------------------------")
97103
# [END analyze_formulas]

sdk/documentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_addon_highres_async.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,12 @@ def _in_span(word, spans):
5858
return False
5959

6060

61+
def format_polygon(polygon):
    """Format a flat list of coordinates as a string of "[a, b]" pairs.

    Consecutive values are grouped two at a time, so ``[1, 2, 3, 4]`` becomes
    ``"[1, 2], [3, 4]"``.

    :param polygon: Flat, indexable sequence of coordinate values, or ``None``.
    :return: The pairs joined by ", ", or ``"N/A"`` when ``polygon`` is
        empty/``None`` or does not contain a single complete pair.
    """
    if not polygon:
        return "N/A"
    # Stop one short of the end so an odd-length list drops its unpaired
    # trailing value instead of raising IndexError on polygon[i + 1].
    points = [f"[{polygon[i]}, {polygon[i + 1]}]" for i in range(0, len(polygon) - 1, 2)]
    return ", ".join(points) if points else "N/A"
65+
66+
6167
async def analyze_with_highres():
6268
path_to_sample_documents = os.path.abspath(
6369
os.path.join(
@@ -102,30 +108,35 @@ async def analyze_with_highres():
102108
words = get_words(page, line)
103109
print(
104110
f"...Line # {line_idx} has word count {len(words)} and text '{line.content}' "
105-
f"within bounding polygon '{line.polygon}'"
111+
f"within bounding polygon '{format_polygon(line.polygon)}'"
106112
)
107113

108-
for word in words:
109-
print(f"......Word '{word.content}' has a confidence of {word.confidence}")
114+
if page.words:
115+
for word in page.words:
116+
print(f"......Word '{word.content}' has a confidence of {word.confidence}")
110117

111118
if page.selection_marks:
112119
for selection_mark in page.selection_marks:
113120
print(
114121
f"Selection mark is '{selection_mark.state}' within bounding polygon "
115-
f"'{selection_mark.polygon}' and has a confidence of {selection_mark.confidence}"
122+
f"'{format_polygon(selection_mark.polygon)}' and has a confidence of {selection_mark.confidence}"
116123
)
117124

118125
if result.tables:
119126
for table_idx, table in enumerate(result.tables):
120127
print(f"Table # {table_idx} has {table.row_count} rows and " f"{table.column_count} columns")
121128
if table.bounding_regions:
122129
for region in table.bounding_regions:
123-
print(f"Table # {table_idx} location on page: {region.page_number} is {region.polygon}")
130+
print(
131+
f"Table # {table_idx} location on page: {region.page_number} is {format_polygon(region.polygon)}"
132+
)
124133
for cell in table.cells:
125134
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
126135
if cell.bounding_regions:
127136
for region in cell.bounding_regions:
128-
print(f"...content on page {region.page_number} is within bounding polygon '{region.polygon}'")
137+
print(
138+
f"...content on page {region.page_number} is within bounding polygon '{format_polygon(region.polygon)}'"
139+
)
129140

130141
print("----------------------------------------")
131142
# [END analyze_with_highres]

sdk/documentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_general_documents_async.py

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,18 @@ def _in_span(word, spans):
4040
return False
4141

4242

43+
def format_bounding_region(bounding_regions):
    """Describe each bounding region as "Page #<page_number>: <polygon>".

    The per-region descriptions are joined with ", ". Returns "N/A" when
    ``bounding_regions`` is empty or ``None``.
    """
    if bounding_regions:
        described = [
            f"Page #{entry.page_number}: {format_polygon(entry.polygon)}" for entry in bounding_regions
        ]
        return ", ".join(described)
    return "N/A"
47+
48+
49+
def format_polygon(polygon):
    """Format a flat list of coordinates as a string of "[a, b]" pairs.

    Consecutive values are grouped two at a time, so ``[1, 2, 3, 4]`` becomes
    ``"[1, 2], [3, 4]"``.

    :param polygon: Flat, indexable sequence of coordinate values, or ``None``.
    :return: The pairs joined by ", ", or ``"N/A"`` when ``polygon`` is
        empty/``None`` or does not contain a single complete pair.
    """
    if not polygon:
        return "N/A"
    # Stop one short of the end so an odd-length list drops its unpaired
    # trailing value instead of raising IndexError on polygon[i + 1].
    points = [f"[{polygon[i]}, {polygon[i + 1]}]" for i in range(0, len(polygon) - 1, 2)]
    return ", ".join(points) if points else "N/A"
53+
54+
4355
async def analyze_general_documents():
4456
path_to_sample_documents = os.path.abspath(
4557
os.path.join(
@@ -79,11 +91,14 @@ async def analyze_general_documents():
7991
if result.key_value_pairs:
8092
for kv_pair in result.key_value_pairs:
8193
if kv_pair.key:
82-
print(f"Key '{kv_pair.key.content}' found within " f"'{kv_pair.key.bounding_regions}' bounding regions")
94+
print(
95+
f"Key '{kv_pair.key.content}' found within "
96+
f"'{format_bounding_region(kv_pair.key.bounding_regions)}' bounding regions"
97+
)
8398
if kv_pair.value:
8499
print(
85100
f"Value '{kv_pair.value.content}' found within "
86-
f"'{kv_pair.value.bounding_regions}' bounding regions\n"
101+
f"'{format_bounding_region(kv_pair.value.bounding_regions)}' bounding regions\n"
87102
)
88103

89104
for page in result.pages:
@@ -95,17 +110,18 @@ async def analyze_general_documents():
95110
words = get_words(page.words, line)
96111
print(
97112
f"...Line #{line_idx} has {len(words)} words and text '{line.content}' within "
98-
f"bounding polygon '{line.polygon}'"
113+
f"bounding polygon '{format_polygon(line.polygon)}'"
99114
)
100115

101-
for word in words:
102-
print(f"......Word '{word.content}' has a confidence of {word.confidence}")
116+
if page.words:
117+
for word in page.words:
118+
print(f"......Word '{word.content}' has a confidence of {word.confidence}")
103119

104120
if page.selection_marks:
105121
for selection_mark in page.selection_marks:
106122
print(
107123
f"Selection mark is '{selection_mark.state}' within bounding polygon "
108-
f"'{selection_mark.polygon}' and has a confidence of "
124+
f"'{format_polygon(selection_mark.polygon)}' and has a confidence of "
109125
f"{selection_mark.confidence}"
110126
)
111127

@@ -114,13 +130,15 @@ async def analyze_general_documents():
114130
print(f"Table # {table_idx} has {table.row_count} rows and {table.column_count} columns")
115131
if table.bounding_regions:
116132
for region in table.bounding_regions:
117-
print(f"Table # {table_idx} location on page: {region.page_number} is {region.polygon}")
133+
print(
134+
f"Table # {table_idx} location on page: {region.page_number} is {format_polygon(region.polygon)}"
135+
)
118136
for cell in table.cells:
119137
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
120138
if cell.bounding_regions:
121139
for region in cell.bounding_regions:
122140
print(
123-
f"...content on page {region.page_number} is within bounding polygon '{region.polygon}'\n"
141+
f"...content on page {region.page_number} is within bounding polygon '{format_polygon(region.polygon)}'\n"
124142
)
125143
print("----------------------------------------")
126144
# [END analyze_general_documents]

sdk/documentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_layout_async.py

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -44,13 +44,25 @@ def _in_span(word, spans):
4444
return False
4545

4646

47+
def format_bounding_region(bounding_regions):
    """Describe each bounding region as "Page #<page_number>: <polygon>".

    The per-region descriptions are joined with ", ". Returns "N/A" when
    ``bounding_regions`` is empty or ``None``.
    """
    if bounding_regions:
        described = [
            f"Page #{entry.page_number}: {format_polygon(entry.polygon)}" for entry in bounding_regions
        ]
        return ", ".join(described)
    return "N/A"
51+
52+
53+
def format_polygon(polygon):
    """Format a flat list of coordinates as a string of "[a, b]" pairs.

    Consecutive values are grouped two at a time, so ``[1, 2, 3, 4]`` becomes
    ``"[1, 2], [3, 4]"``.

    :param polygon: Flat, indexable sequence of coordinate values, or ``None``.
    :return: The pairs joined by ", ", or ``"N/A"`` when ``polygon`` is
        empty/``None`` or does not contain a single complete pair.
    """
    if not polygon:
        return "N/A"
    # Stop one short of the end so an odd-length list drops its unpaired
    # trailing value instead of raising IndexError on polygon[i + 1].
    points = [f"[{polygon[i]}, {polygon[i + 1]}]" for i in range(0, len(polygon) - 1, 2)]
    return ", ".join(points) if points else "N/A"
57+
58+
4759
async def analyze_layout():
4860
path_to_sample_documents = os.path.abspath(
4961
os.path.join(
5062
os.path.abspath(__file__),
5163
"..",
5264
"..",
53-
"./sample_forms/forms/form_selection_mark.png",
65+
"./sample_forms/forms/tabular_and_general_data.docx",
5466
)
5567
)
5668

@@ -84,30 +96,47 @@ async def analyze_layout():
8496
words = get_words(page, line)
8597
print(
8698
f"...Line # {line_idx} has word count {len(words)} and text '{line.content}' "
87-
f"within bounding polygon '{line.polygon}'"
99+
f"within bounding polygon '{format_polygon(line.polygon)}'"
88100
)
89101

90-
for word in words:
91-
print(f"......Word '{word.content}' has a confidence of {word.confidence}")
102+
if page.words:
103+
for word in page.words:
104+
print(f"......Word '{word.content}' has a confidence of {word.confidence}")
92105

93106
if page.selection_marks:
94107
for selection_mark in page.selection_marks:
95108
print(
96109
f"Selection mark is '{selection_mark.state}' within bounding polygon "
97-
f"'{selection_mark.polygon}' and has a confidence of {selection_mark.confidence}"
110+
f"'{format_polygon(selection_mark.polygon)}' and has a confidence of {selection_mark.confidence}"
98111
)
99112

113+
if result.paragraphs:
114+
print(f"----Detected #{len(result.paragraphs)} paragraphs in the document----")
115+
# Sort each paragraph's spans by offset (in place), then order the paragraphs by their first span's offset so they print in reading order.
116+
result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset))
117+
print("-----Print sorted paragraphs-----")
118+
for paragraph in result.paragraphs:
119+
print(
120+
f"Found paragraph with role: '{paragraph.role}' within {format_bounding_region(paragraph.bounding_regions)} bounding region"
121+
)
122+
print(f"...with content: '{paragraph.content}'")
123+
print(f"...with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}")
124+
100125
if result.tables:
101126
for table_idx, table in enumerate(result.tables):
102127
print(f"Table # {table_idx} has {table.row_count} rows and " f"{table.column_count} columns")
103128
if table.bounding_regions:
104129
for region in table.bounding_regions:
105-
print(f"Table # {table_idx} location on page: {region.page_number} is {region.polygon}")
130+
print(
131+
f"Table # {table_idx} location on page: {region.page_number} is {format_polygon(region.polygon)}"
132+
)
106133
for cell in table.cells:
107134
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
108135
if cell.bounding_regions:
109136
for region in cell.bounding_regions:
110-
print(f"...content on page {region.page_number} is within bounding polygon '{region.polygon}'")
137+
print(
138+
f"...content on page {region.page_number} is within bounding polygon '{format_polygon(region.polygon)}'"
139+
)
111140

112141
print("----------------------------------------")
113142
# [END extract_layout]

0 commit comments

Comments
 (0)