forked from GoogleCloudPlatform/document-ai-samples
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocai_utils.py
More file actions
137 lines (108 loc) · 4.5 KB
/
docai_utils.py
File metadata and controls
137 lines (108 loc) · 4.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Document AI Utility Functions"""
from typing import Tuple
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1 as documentai
from consts import (
CLASSIFIER_PROCESSOR_TYPES,
DEFAULT_MIME_TYPE,
DOCAI_ACTIVE_PROCESSORS,
DOCAI_PROCESSOR_LOCATION,
DOCAI_PROJECT_ID,
DOCUMENT_SUPPORTED_PROCESSOR_TYPES,
)
# Point the client at the "<location>-documentai.googleapis.com" endpoint for
# the processor location configured in consts (DOCAI_PROCESSOR_LOCATION).
client_options = ClientOptions(
api_endpoint=f"{DOCAI_PROCESSOR_LOCATION}-documentai.googleapis.com"
)
# Instantiate one module-level client at import time; process_document_bytes
# uses it both to build processor resource names and to send requests.
documentai_client = documentai.DocumentProcessorServiceClient(
client_options=client_options
)
def process_document_bytes(
    project_id: str,
    location: str,
    processor_id: str,
    file_content: bytes,
    mime_type: str = DEFAULT_MIME_TYPE,
) -> documentai.Document:
    """
    Run one in-memory document through a Document AI processor.

    The caller passes the raw file bytes instead of a file path, so nothing
    is read from disk here.

    Args:
        project_id: Project segment of the processor resource name.
        location: Location segment of the processor resource name.
        processor_id: ID of the processor to call. Processors must be
            created in the Cloud Console beforehand.
        file_content: Raw bytes of the document to process.
        mime_type: MIME type sent along with ``file_content``; defaults to
            ``DEFAULT_MIME_TYPE``.

    Returns:
        The ``document`` field of the processing response.
    """
    # NOTE: the request goes through the shared module-level client, whose
    # endpoint is built from DOCAI_PROCESSOR_LOCATION, not from `location`.
    response = documentai_client.process_document(
        request=documentai.ProcessRequest(
            # Full resource name of the processor, e.g.:
            # projects/project-id/locations/location/processors/processor-id
            name=documentai_client.processor_path(
                project_id, location, processor_id
            ),
            # Wrap the binary payload and its MIME type for the API
            raw_document=documentai.RawDocument(
                content=file_content, mime_type=mime_type
            ),
        )
    )
    return response.document
def extract_document_entities(document: documentai.Document) -> dict:
    """
    Get all entities from a document and output as a dictionary

    Format: entity.type_: entity.normalized_value.text when that text is
    non-empty, otherwise entity.mention_text.

    When several entities share the same type, the last one in
    ``document.entities`` overwrites the earlier ones.

    Args:
        document: Processed document whose ``entities`` are read.

    Returns:
        Dictionary mapping entity type to extracted text; empty when the
        document has no entities.
    """
    document_entities = {}

    for entity in document.entities:
        # Fields detected. For a full list of fields for each processor see
        # the processor documentation:
        # https://cloud.google.com/document-ai/docs/processors-list
        normalized_value = getattr(entity, "normalized_value", None)

        # Use the enriched (normalized) text only when it actually carries
        # text. A normalized value can be present with an empty `text`, and
        # in that case the raw mention text must not be thrown away.
        normalized_text = getattr(normalized_value, "text", None)

        document_entities[entity.type_] = normalized_text or entity.mention_text

    return document_entities
def select_processor_from_classification(
    document_classification: str = "other",
) -> Tuple[str, str]:
    """
    Pick the parser processor to use for a document classification.

    Args:
        document_classification: Classification label of the document;
            defaults to "other".

    Returns:
        A ``(processor_type, processor_id)`` pair. The type falls back to
        "FORM_PARSER_PROCESSOR" when the classification has no entry in
        DOCUMENT_SUPPORTED_PROCESSOR_TYPES. The ID is looked up in
        DOCAI_ACTIVE_PROCESSORS and is ``None`` when that type has no entry.
    """
    # Classification -> parser type, with the form parser as the catch-all
    parser_type = DOCUMENT_SUPPORTED_PROCESSOR_TYPES.get(
        document_classification, "FORM_PARSER_PROCESSOR"
    )

    # Parser type -> concrete processor ID
    return parser_type, DOCAI_ACTIVE_PROCESSORS.get(parser_type)
def classify_document_bytes(file_content: bytes, mime_type: str) -> str:
    """
    Classify a single document with all available specialized processors

    Classifier processor types are tried in the order given by
    CLASSIFIER_PROCESSOR_TYPES; types without an entry in
    DOCAI_ACTIVE_PROCESSORS are skipped. The first classification that is
    not "other" is returned.

    Args:
        file_content: Raw bytes of the document to classify.
        mime_type: MIME type of ``file_content``.

    Returns:
        The type of the first entity returned by the first classifier that
        recognizes the document, or "other" when no classifier is
        configured, a classifier returns no entities, or every classifier
        answers "other".
    """
    # Cycle through all possible classifier Processor Types
    for classifier_processor_type in CLASSIFIER_PROCESSOR_TYPES:
        # Get Specific Processor ID for this Classifier Type
        classifier_processor_id = DOCAI_ACTIVE_PROCESSORS.get(classifier_processor_type)

        if classifier_processor_id is None:
            continue

        # Classify Document
        classification_document_proto = process_document_bytes(
            DOCAI_PROJECT_ID,
            DOCAI_PROCESSOR_LOCATION,
            classifier_processor_id,
            file_content,
            mime_type,
        )

        # A response without entities carries no classification; move on to
        # the next classifier instead of raising IndexError on entities[0].
        entities = classification_document_proto.entities
        if not entities:
            continue

        # Translate Classification Output to Processor Type
        document_classification = entities[0].type_

        # Specialized Classifiers return "other"
        # if it could not classify to a known type
        if document_classification != "other":
            return document_classification

    # No classifier recognized the document: return the same "other" label
    # the classifiers use, so the declared `str` return type always holds.
    return "other"