Skip to content

Commit b5c8353

Browse files
authored
add sample code about extracting table cell values (#34686)
1 parent ed7cbc7 commit b5c8353

File tree

5 files changed

+165
-75
lines changed

5 files changed

+165
-75
lines changed

sdk/documentintelligence/azure-ai-documentintelligence/README.md

Lines changed: 34 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -436,31 +436,40 @@ if result.documents:
436436
f"......found field of type '{field.type}' with value '{field_value}' and with confidence {field.confidence}"
437437
)
438438

439-
# iterate over tables, lines, and selection marks on each page
440-
for page in result.pages:
441-
print(f"\nLines found on page {page.page_number}")
442-
if page.lines:
443-
for line in page.lines:
444-
print(f"...Line '{line.content}'")
445-
if page.words:
446-
for word in page.words:
447-
print(f"...Word '{word.content}' has a confidence of {word.confidence}")
448-
if page.selection_marks:
449-
print(f"\nSelection marks found on page {page.page_number}")
450-
for selection_mark in page.selection_marks:
451-
print(
452-
f"...Selection mark is '{selection_mark.state}' and has a confidence of {selection_mark.confidence}"
453-
)
454-
455-
if result.tables:
456-
for i, table in enumerate(result.tables):
457-
print(f"\nTable {i + 1} can be found on page:")
458-
if table.bounding_regions:
459-
for region in table.bounding_regions:
460-
print(f"...{region.page_number}")
461-
for cell in table.cells:
462-
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
463-
print("-----------------------------------")
439+
# Extract table cell values
440+
SYMBOL_OF_TABLE_TYPE = "array"
441+
KEY_OF_VALUE_OBJECT = "valueObject"
442+
KEY_OF_CELL_CONTENT = "content"
443+
444+
for doc in result.documents:
445+
if not doc.fields is None:
446+
for field_name, field_value in doc.fields.items():
447+
# "MaintenanceLog" is the name of the table field you labeled. Table cell information is stored as an array in the document field.
448+
if (
449+
field_name == "MaintenanceLog"
450+
and field_value.type == SYMBOL_OF_TABLE_TYPE
451+
and field_value.value_array
452+
):
453+
col_names = []
454+
sample_obj = field_value.value_array[0]
455+
if KEY_OF_VALUE_OBJECT in sample_obj:
456+
col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys())
457+
print("----Extracting Table Cell Values----")
458+
table_rows = []
459+
for obj in field_value.value_array:
460+
if KEY_OF_VALUE_OBJECT in obj:
461+
value_obj = obj[KEY_OF_VALUE_OBJECT]
462+
extract_value_by_col_name = lambda key: (
463+
value_obj[key].get(KEY_OF_CELL_CONTENT)
464+
if key in value_obj
465+
and KEY_OF_CELL_CONTENT in value_obj[key]
466+
else "None"
467+
)
468+
row_data = list(map(extract_value_by_col_name, col_names))
469+
table_rows.append(row_data)
470+
print_table(col_names, table_rows)
471+
472+
print("------------------------------------")
464473
```
465474

466475
<!-- END SNIPPET -->

sdk/documentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_custom_documents_async.py

Lines changed: 60 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,33 @@
3030
import os
3131
import asyncio
3232

33+
def print_table(header_names, table_data):
    """Print column headers and rows as a left-aligned text table.

    Every header and cell is converted with ``str()`` before measuring and
    printing, so non-string values (``None``, numbers, lists, ...) are
    rendered the same way they are measured. Each column is as wide as its
    longest entry plus four spaces of padding.

    Args:
        header_names: A sequence of column header names,
            e.g. ["Name", "Gender", "Age"].
        table_data: A sequence of rows, each a sequence of cell values,
            e.g. [["Mike", "M", 25], ["John", "M", 19], ["Lily", "F", 23]].
            Rows shorter than the header are padded with empty cells; cells
            beyond the last header column are ignored.

    Returns:
        None. The table is written to standard output, e.g.

            Name    Gender    Age
            Mike    M         25
            John    M         19
            Lily    F         23
    """
    headers = [str(name) for name in header_names]
    column_count = len(headers)

    # Normalize every row to exactly one string per header column so that
    # width computation and formatting can never disagree or index past
    # the end of a short row.
    rows = []
    for row in table_data:
        cells = [str(cell) for cell in row][:column_count]
        cells.extend([""] * (column_count - len(cells)))
        rows.append(cells)

    lines = [headers, *rows]

    # zip(*lines) walks the table column by column; all lines have the same
    # length after normalization, so no cell is dropped.
    widths = [max(map(len, column)) + 4 for column in zip(*lines)]

    for cells in lines:
        print("".join(f"{cell:<{width}}" for cell, width in zip(cells, widths)))
59+
3360

3461
async def analyze_custom_documents(custom_model_id):
3562
path_to_sample_documents = os.path.abspath(
@@ -65,31 +92,39 @@ async def analyze_custom_documents(custom_model_id):
6592
print(
6693
f"......found field of type '{field.type}' with value '{field_value}' and with confidence {field.confidence}"
6794
)
68-
69-
# iterate over tables, lines, and selection marks on each page
70-
for page in result.pages:
71-
print(f"\nLines found on page {page.page_number}")
72-
if page.lines:
73-
for line in page.lines:
74-
print(f"...Line '{line.content}'")
75-
if page.words:
76-
for word in page.words:
77-
print(f"...Word '{word.content}' has a confidence of {word.confidence}")
78-
if page.selection_marks:
79-
print(f"\nSelection marks found on page {page.page_number}")
80-
for selection_mark in page.selection_marks:
81-
print(
82-
f"...Selection mark is '{selection_mark.state}' and has a confidence of {selection_mark.confidence}"
83-
)
84-
85-
if result.tables:
86-
for i, table in enumerate(result.tables):
87-
print(f"\nTable {i + 1} can be found on page:")
88-
if table.bounding_regions:
89-
for region in table.bounding_regions:
90-
print(f"...{region.page_number}")
91-
for cell in table.cells:
92-
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
95+
96+
# Extract table cell values
97+
SYMBOL_OF_TABLE_TYPE = "array"
98+
KEY_OF_VALUE_OBJECT = "valueObject"
99+
KEY_OF_CELL_CONTENT = "content"
100+
101+
for doc in result.documents:
102+
if not doc.fields is None:
103+
for field_name, field_value in doc.fields.items():
104+
# "MaintenanceLog" is the name of the table field you labeled. Table cell information is stored as an array in the document field.
105+
if (
106+
field_name == "MaintenanceLog"
107+
and field_value.type == SYMBOL_OF_TABLE_TYPE
108+
and field_value.value_array
109+
):
110+
col_names = []
111+
sample_obj = field_value.value_array[0]
112+
if KEY_OF_VALUE_OBJECT in sample_obj:
113+
col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys())
114+
print("----Extracting Table Cell Values----")
115+
table_rows = []
116+
for obj in field_value.value_array:
117+
if KEY_OF_VALUE_OBJECT in obj:
118+
value_obj = obj[KEY_OF_VALUE_OBJECT]
119+
extract_value_by_col_name = lambda key: (
120+
value_obj[key].get(KEY_OF_CELL_CONTENT)
121+
if key in value_obj
122+
and KEY_OF_CELL_CONTENT in value_obj[key]
123+
else "None"
124+
)
125+
row_data = list(map(extract_value_by_col_name, col_names))
126+
table_rows.append(row_data)
127+
print_table(col_names, table_rows)
93128
print("-----------------------------------")
94129
# [END analyze_custom_documents]
95130

sdk/documentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_read_async.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,11 @@ async def analyze_read():
109109
for paragraph in result.paragraphs:
110110
print(f"Found paragraph with role: '{paragraph.role}' within {paragraph.bounding_regions} bounding region")
111111
print(f"...with content: '{paragraph.content}'")
112+
113+
result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset))
114+
print("-----Print sorted paragraphs-----")
115+
for idx, paragraph in enumerate(result.paragraphs):
116+
print(f"...paragraph:{idx} with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}")
112117

113118
print("----------------------------------------")
114119

sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_custom_documents.py

Lines changed: 61 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,33 @@
2929

3030
import os
3131

32+
def print_table(header_names, table_data):
    """Print column headers and rows as a left-aligned text table.

    Every header and cell is converted with ``str()`` before measuring and
    printing, so non-string values (``None``, numbers, lists, ...) are
    rendered the same way they are measured. Each column is as wide as its
    longest entry plus four spaces of padding.

    Args:
        header_names: A sequence of column header names,
            e.g. ["Name", "Gender", "Age"].
        table_data: A sequence of rows, each a sequence of cell values,
            e.g. [["Mike", "M", 25], ["John", "M", 19], ["Lily", "F", 23]].
            Rows shorter than the header are padded with empty cells; cells
            beyond the last header column are ignored.

    Returns:
        None. The table is written to standard output, e.g.

            Name    Gender    Age
            Mike    M         25
            John    M         19
            Lily    F         23
    """
    headers = [str(name) for name in header_names]
    column_count = len(headers)

    # Normalize every row to exactly one string per header column so that
    # width computation and formatting can never disagree or index past
    # the end of a short row.
    rows = []
    for row in table_data:
        cells = [str(cell) for cell in row][:column_count]
        cells.extend([""] * (column_count - len(cells)))
        rows.append(cells)

    lines = [headers, *rows]

    # zip(*lines) walks the table column by column; all lines have the same
    # length after normalization, so no cell is dropped.
    widths = [max(map(len, column)) + 4 for column in zip(*lines)]

    for cells in lines:
        print("".join(f"{cell:<{width}}" for cell, width in zip(cells, widths)))
58+
3259

3360
def analyze_custom_documents(custom_model_id):
3461
path_to_sample_documents = os.path.abspath(
@@ -65,31 +92,40 @@ def analyze_custom_documents(custom_model_id):
6592
f"......found field of type '{field.type}' with value '{field_value}' and with confidence {field.confidence}"
6693
)
6794

68-
# iterate over tables, lines, and selection marks on each page
69-
for page in result.pages:
70-
print(f"\nLines found on page {page.page_number}")
71-
if page.lines:
72-
for line in page.lines:
73-
print(f"...Line '{line.content}'")
74-
if page.words:
75-
for word in page.words:
76-
print(f"...Word '{word.content}' has a confidence of {word.confidence}")
77-
if page.selection_marks:
78-
print(f"\nSelection marks found on page {page.page_number}")
79-
for selection_mark in page.selection_marks:
80-
print(
81-
f"...Selection mark is '{selection_mark.state}' and has a confidence of {selection_mark.confidence}"
82-
)
83-
84-
if result.tables:
85-
for i, table in enumerate(result.tables):
86-
print(f"\nTable {i + 1} can be found on page:")
87-
if table.bounding_regions:
88-
for region in table.bounding_regions:
89-
print(f"...{region.page_number}")
90-
for cell in table.cells:
91-
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
92-
print("-----------------------------------")
95+
# Extract table cell values
96+
SYMBOL_OF_TABLE_TYPE = "array"
97+
KEY_OF_VALUE_OBJECT = "valueObject"
98+
KEY_OF_CELL_CONTENT = "content"
99+
100+
for doc in result.documents:
101+
if not doc.fields is None:
102+
for field_name, field_value in doc.fields.items():
103+
# "MaintenanceLog" is the name of the table field you labeled. Table cell information is stored as an array in the document field.
104+
if (
105+
field_name == "MaintenanceLog"
106+
and field_value.type == SYMBOL_OF_TABLE_TYPE
107+
and field_value.value_array
108+
):
109+
col_names = []
110+
sample_obj = field_value.value_array[0]
111+
if KEY_OF_VALUE_OBJECT in sample_obj:
112+
col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys())
113+
print("----Extracting Table Cell Values----")
114+
table_rows = []
115+
for obj in field_value.value_array:
116+
if KEY_OF_VALUE_OBJECT in obj:
117+
value_obj = obj[KEY_OF_VALUE_OBJECT]
118+
extract_value_by_col_name = lambda key: (
119+
value_obj[key].get(KEY_OF_CELL_CONTENT)
120+
if key in value_obj
121+
and KEY_OF_CELL_CONTENT in value_obj[key]
122+
else "None"
123+
)
124+
row_data = list(map(extract_value_by_col_name, col_names))
125+
table_rows.append(row_data)
126+
print_table(col_names, table_rows)
127+
128+
print("------------------------------------")
93129
# [END analyze_custom_documents]
94130

95131

sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_read.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,11 @@ def analyze_read():
106106
for paragraph in result.paragraphs:
107107
print(f"Found paragraph with role: '{paragraph.role}' within {paragraph.bounding_regions} bounding region")
108108
print(f"...with content: '{paragraph.content}'")
109+
110+
result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset))
111+
print("-----Print sorted paragraphs-----")
112+
for idx, paragraph in enumerate(result.paragraphs):
113+
print(f"...paragraph:{idx} with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}")
109114

110115
print("----------------------------------------")
111116

0 commit comments

Comments
 (0)