Skip to content

Commit f418774

Browse files
authored
[Samples]Add sample of extracting fixed table (#35275)
1 parent 3a69b7c commit f418774

File tree

3 files changed

+150
-15
lines changed

3 files changed

+150
-15
lines changed

sdk/documentintelligence/azure-ai-documentintelligence/README.md

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -530,23 +530,23 @@ if result.documents:
530530

531531
# Extract table cell values
532532
SYMBOL_OF_TABLE_TYPE = "array"
533+
SYMBOL_OF_OBJECT_TYPE = "object"
533534
KEY_OF_VALUE_OBJECT = "valueObject"
534535
KEY_OF_CELL_CONTENT = "content"
535536

536537
for doc in result.documents:
537538
if not doc.fields is None:
538539
for field_name, field_value in doc.fields.items():
539-
# "MaintenanceLog" is the table field name which you labeled. Table cell information store as array in document field.
540+
# Dynamic table cell information is stored as an array in the document field.
540541
if (
541-
field_name == "MaintenanceLog"
542-
and field_value.type == SYMBOL_OF_TABLE_TYPE
542+
field_value.type == SYMBOL_OF_TABLE_TYPE
543543
and field_value.value_array
544544
):
545545
col_names = []
546546
sample_obj = field_value.value_array[0]
547547
if KEY_OF_VALUE_OBJECT in sample_obj:
548548
col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys())
549-
print("----Extracting Table Cell Values----")
549+
print("----Extracting Dynamic Table Cell Values----")
550550
table_rows = []
551551
for obj in field_value.value_array:
552552
if KEY_OF_VALUE_OBJECT in obj:
@@ -559,6 +559,44 @@ if result.documents:
559559
row_data = list(map(extract_value_by_col_name, col_names))
560560
table_rows.append(row_data)
561561
print_table(col_names, table_rows)
562+
563+
elif (
564+
field_value.type == SYMBOL_OF_OBJECT_TYPE
565+
and KEY_OF_VALUE_OBJECT in field_value
566+
and field_value[KEY_OF_VALUE_OBJECT] is not None
567+
):
568+
rows_by_columns = list(field_value[KEY_OF_VALUE_OBJECT].values())
569+
is_fixed_table = all(
570+
(
571+
rows_of_column["type"] == SYMBOL_OF_OBJECT_TYPE
572+
and Counter(
573+
list(rows_by_columns[0][KEY_OF_VALUE_OBJECT].keys())
574+
)
575+
== Counter(list(rows_of_column[KEY_OF_VALUE_OBJECT].keys()))
576+
)
577+
for rows_of_column in rows_by_columns
578+
)
579+
580+
# Fixed table cell information is stored as an object in the document field.
581+
if is_fixed_table:
582+
print("----Extracting Fixed Table Cell Values----")
583+
col_names = list(field_value[KEY_OF_VALUE_OBJECT].keys())
584+
row_dict: dict = {}
585+
for rows_of_column in rows_by_columns:
586+
rows = rows_of_column[KEY_OF_VALUE_OBJECT]
587+
for row_key in list(rows.keys()):
588+
if row_key in row_dict:
589+
row_dict[row_key].append(
590+
rows[row_key].get(KEY_OF_CELL_CONTENT)
591+
)
592+
else:
593+
row_dict[row_key] = [
594+
row_key,
595+
rows[row_key].get(KEY_OF_CELL_CONTENT),
596+
]
597+
598+
col_names.insert(0, "")
599+
print_table(col_names, list(row_dict.values()))
562600

563601
print("------------------------------------")
564602
```

sdk/documentintelligence/azure-ai-documentintelligence/samples/aio/sample_analyze_custom_documents_async.py

Lines changed: 54 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929

3030
import os
3131
import asyncio
32+
from collections import Counter
3233

3334

3435
def print_table(header_names, table_data):
@@ -60,6 +61,13 @@ def print_table(header_names, table_data):
6061

6162

6263
async def analyze_custom_documents(custom_model_id):
64+
# Form_1.jpg should be a test file from the training dataset, which is stored at the Azure Blob Storage path
65+
# combined by DOCUMENTINTELLIGENCE_STORAGE_CONTAINER_SAS_URL and DOCUMENTINTELLIGENCE_STORAGE_PREFIX,
66+
# or it can also be a test file with the format similar to the training dataset.
67+
# Put it here locally just for presenting documents visually in sample.
68+
69+
# Before analyzing a custom document, you should upload the related training dataset to Azure Blob Storage and
70+
# train a model. For more information, see https://aka.ms/build-a-custom-model.
6371
path_to_sample_documents = os.path.abspath(
6472
os.path.join(os.path.abspath(__file__), "..", "..", "./sample_forms/forms/Form_1.jpg")
6573
)
@@ -96,23 +104,23 @@ async def analyze_custom_documents(custom_model_id):
96104

97105
# Extract table cell values
98106
SYMBOL_OF_TABLE_TYPE = "array"
107+
SYMBOL_OF_OBJECT_TYPE = "object"
99108
KEY_OF_VALUE_OBJECT = "valueObject"
100109
KEY_OF_CELL_CONTENT = "content"
101110

102111
for doc in result.documents:
103112
if not doc.fields is None:
104113
for field_name, field_value in doc.fields.items():
105-
# "MaintenanceLog" is the table field name which you labeled. Table cell information store as array in document field.
114+
# Dynamic table cell information is stored as an array in the document field.
106115
if (
107-
field_name == "MaintenanceLog"
108-
and field_value.type == SYMBOL_OF_TABLE_TYPE
116+
field_value.type == SYMBOL_OF_TABLE_TYPE
109117
and field_value.value_array
110118
):
111119
col_names = []
112120
sample_obj = field_value.value_array[0]
113121
if KEY_OF_VALUE_OBJECT in sample_obj:
114122
col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys())
115-
print("----Extracting Table Cell Values----")
123+
print("----Extracting Dynamic Table Cell Values----")
116124
table_rows = []
117125
for obj in field_value.value_array:
118126
if KEY_OF_VALUE_OBJECT in obj:
@@ -125,6 +133,44 @@ async def analyze_custom_documents(custom_model_id):
125133
row_data = list(map(extract_value_by_col_name, col_names))
126134
table_rows.append(row_data)
127135
print_table(col_names, table_rows)
136+
137+
# Fixed table cell information is stored as an object in the document field.
138+
elif (
139+
field_value.type == SYMBOL_OF_OBJECT_TYPE
140+
and KEY_OF_VALUE_OBJECT in field_value
141+
and field_value[KEY_OF_VALUE_OBJECT] is not None
142+
):
143+
rows_by_columns = list(field_value[KEY_OF_VALUE_OBJECT].values())
144+
is_fixed_table = all(
145+
(
146+
rows_of_column["type"] == SYMBOL_OF_OBJECT_TYPE
147+
and Counter(
148+
list(rows_by_columns[0][KEY_OF_VALUE_OBJECT].keys())
149+
)
150+
== Counter(list(rows_of_column[KEY_OF_VALUE_OBJECT].keys()))
151+
)
152+
for rows_of_column in rows_by_columns
153+
)
154+
155+
if is_fixed_table:
156+
print("----Extracting Fixed Table Cell Values----")
157+
col_names = list(field_value[KEY_OF_VALUE_OBJECT].keys())
158+
row_dict: dict = {}
159+
for rows_of_column in rows_by_columns:
160+
rows = rows_of_column[KEY_OF_VALUE_OBJECT]
161+
for row_key in list(rows.keys()):
162+
if row_key in row_dict:
163+
row_dict[row_key].append(
164+
rows[row_key].get(KEY_OF_CELL_CONTENT)
165+
)
166+
else:
167+
row_dict[row_key] = [
168+
row_key,
169+
rows[row_key].get(KEY_OF_CELL_CONTENT)
170+
]
171+
172+
col_names.insert(0, "")
173+
print_table(col_names, list(row_dict.values()))
128174
print("-----------------------------------")
129175
# [END analyze_custom_documents]
130176

@@ -148,11 +194,14 @@ async def main():
148194
raise ValueError("Please provide endpoint and API key to run the samples.")
149195

150196
blob_container_sas_url = os.getenv("DOCUMENTINTELLIGENCE_STORAGE_CONTAINER_SAS_URL")
197+
blob_prefix = os.getenv("DOCUMENTINTELLIGENCE_STORAGE_PREFIX")
151198
if blob_container_sas_url is not None:
152199
request = BuildDocumentModelRequest(
153200
model_id=str(uuid.uuid4()),
154201
build_mode=DocumentBuildMode.TEMPLATE,
155-
azure_blob_source=AzureBlobContentSource(container_url=blob_container_sas_url),
202+
azure_blob_source=AzureBlobContentSource(
203+
container_url=blob_container_sas_url, prefix=blob_prefix
204+
),
156205
)
157206
document_intelligence_admin_client = DocumentIntelligenceAdministrationClient(
158207
endpoint=endpoint, credential=AzureKeyCredential(key)

sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_custom_documents.py

Lines changed: 54 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
"""
2929

3030
import os
31-
31+
from collections import Counter
3232

3333
def print_table(header_names, table_data):
3434
"""Print a two-dimensional array like a table.
@@ -59,6 +59,13 @@ def print_table(header_names, table_data):
5959

6060

6161
def analyze_custom_documents(custom_model_id):
62+
# Form_1.jpg should be a test file from the training dataset, which is stored at the Azure Blob Storage path
63+
# combined by DOCUMENTINTELLIGENCE_STORAGE_CONTAINER_SAS_URL and DOCUMENTINTELLIGENCE_STORAGE_PREFIX,
64+
# or it can also be a test file with the format similar to the training dataset.
65+
# Put it here locally just for presenting documents visually in sample.
66+
67+
# Before analyzing a custom document, you should upload the related training dataset to Azure Blob Storage and
68+
# train a model. For more information, see https://aka.ms/build-a-custom-model.
6269
path_to_sample_documents = os.path.abspath(
6370
os.path.join(os.path.abspath(__file__), "..", "./sample_forms/forms/Form_1.jpg")
6471
)
@@ -95,23 +102,23 @@ def analyze_custom_documents(custom_model_id):
95102

96103
# Extract table cell values
97104
SYMBOL_OF_TABLE_TYPE = "array"
105+
SYMBOL_OF_OBJECT_TYPE = "object"
98106
KEY_OF_VALUE_OBJECT = "valueObject"
99107
KEY_OF_CELL_CONTENT = "content"
100108

101109
for doc in result.documents:
102110
if not doc.fields is None:
103111
for field_name, field_value in doc.fields.items():
104-
# "MaintenanceLog" is the table field name which you labeled. Table cell information store as array in document field.
112+
# Dynamic table cell information is stored as an array in the document field.
105113
if (
106-
field_name == "MaintenanceLog"
107-
and field_value.type == SYMBOL_OF_TABLE_TYPE
114+
field_value.type == SYMBOL_OF_TABLE_TYPE
108115
and field_value.value_array
109116
):
110117
col_names = []
111118
sample_obj = field_value.value_array[0]
112119
if KEY_OF_VALUE_OBJECT in sample_obj:
113120
col_names = list(sample_obj[KEY_OF_VALUE_OBJECT].keys())
114-
print("----Extracting Table Cell Values----")
121+
print("----Extracting Dynamic Table Cell Values----")
115122
table_rows = []
116123
for obj in field_value.value_array:
117124
if KEY_OF_VALUE_OBJECT in obj:
@@ -124,6 +131,44 @@ def analyze_custom_documents(custom_model_id):
124131
row_data = list(map(extract_value_by_col_name, col_names))
125132
table_rows.append(row_data)
126133
print_table(col_names, table_rows)
134+
135+
elif (
136+
field_value.type == SYMBOL_OF_OBJECT_TYPE
137+
and KEY_OF_VALUE_OBJECT in field_value
138+
and field_value[KEY_OF_VALUE_OBJECT] is not None
139+
):
140+
rows_by_columns = list(field_value[KEY_OF_VALUE_OBJECT].values())
141+
is_fixed_table = all(
142+
(
143+
rows_of_column["type"] == SYMBOL_OF_OBJECT_TYPE
144+
and Counter(
145+
list(rows_by_columns[0][KEY_OF_VALUE_OBJECT].keys())
146+
)
147+
== Counter(list(rows_of_column[KEY_OF_VALUE_OBJECT].keys()))
148+
)
149+
for rows_of_column in rows_by_columns
150+
)
151+
152+
# Fixed table cell information is stored as an object in the document field.
153+
if is_fixed_table:
154+
print("----Extracting Fixed Table Cell Values----")
155+
col_names = list(field_value[KEY_OF_VALUE_OBJECT].keys())
156+
row_dict: dict = {}
157+
for rows_of_column in rows_by_columns:
158+
rows = rows_of_column[KEY_OF_VALUE_OBJECT]
159+
for row_key in list(rows.keys()):
160+
if row_key in row_dict:
161+
row_dict[row_key].append(
162+
rows[row_key].get(KEY_OF_CELL_CONTENT)
163+
)
164+
else:
165+
row_dict[row_key] = [
166+
row_key,
167+
rows[row_key].get(KEY_OF_CELL_CONTENT),
168+
]
169+
170+
col_names.insert(0, "")
171+
print_table(col_names, list(row_dict.values()))
127172

128173
print("------------------------------------")
129174
# [END analyze_custom_documents]
@@ -156,11 +201,14 @@ def analyze_custom_documents(custom_model_id):
156201
endpoint=endpoint, credential=AzureKeyCredential(key)
157202
)
158203
blob_container_sas_url = os.getenv("DOCUMENTINTELLIGENCE_STORAGE_CONTAINER_SAS_URL")
204+
blob_prefix = os.getenv("DOCUMENTINTELLIGENCE_STORAGE_PREFIX")
159205
if blob_container_sas_url is not None:
160206
request = BuildDocumentModelRequest(
161207
model_id=str(uuid.uuid4()),
162208
build_mode=DocumentBuildMode.TEMPLATE,
163-
azure_blob_source=AzureBlobContentSource(container_url=blob_container_sas_url),
209+
azure_blob_source=AzureBlobContentSource(
210+
container_url=blob_container_sas_url, prefix=blob_prefix
211+
),
164212
)
165213
model = document_intelligence_admin_client.begin_build_document_model(request).result()
166214
model_id = model.model_id

0 commit comments

Comments
 (0)