document-ai-samples/tax-processing-pipeline-python/docai_utils.py at main · jiya-zhang/document-ai-samples · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Document AI Utility Functions"""

from typing import Tuple

from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1 as documentai

from consts import (
    CLASSIFIER_PROCESSOR_TYPES,
    DEFAULT_MIME_TYPE,
    DOCAI_ACTIVE_PROCESSORS,
    DOCAI_PROCESSOR_LOCATION,
    DOCAI_PROJECT_ID,
    DOCUMENT_SUPPORTED_PROCESSOR_TYPES,
)

# Build the API endpoint host name from the configured processor location
# so that requests are sent to "<location>-documentai.googleapis.com".
client_options = ClientOptions(
    api_endpoint=f"{DOCAI_PROCESSOR_LOCATION}-documentai.googleapis.com"
)

# Instantiates a single module-level client at import time; the functions
# in this module use it for every Document AI call.
documentai_client = documentai.DocumentProcessorServiceClient(
    client_options=client_options
)


def process_document_bytes(
    project_id: str,
    location: str,
    processor_id: str,
    file_content: bytes,
    mime_type: str = DEFAULT_MIME_TYPE,
) -> documentai.Document:
    """
    Send in-memory document bytes to a Document AI processor.

    The content is passed directly as bytes (already read by the caller)
    rather than as a file path.

    Args:
        project_id: Project that owns the processor.
        location: Location of the processor.
        processor_id: ID of the processor; it must already exist
            (create it in the Cloud Console first).
        file_content: Raw bytes of the document.
        mime_type: MIME type of ``file_content``.

    Returns:
        The ``document`` field of the processing response.
    """

    request = documentai.ProcessRequest(
        # Full resource name of the processor, e.g.:
        # projects/project-id/locations/location/processors/processor-id
        name=documentai_client.processor_path(project_id, location, processor_id),
        # Wrap the binary payload in a RawDocument for inline processing
        raw_document=documentai.RawDocument(
            content=file_content, mime_type=mime_type
        ),
    )

    # Only the parsed document is of interest to callers
    return documentai_client.process_document(request=request).document


def extract_document_entities(document: documentai.Document) -> dict:
    """
    Get all entities from a document and output as a dictionary
    Format: entity.type_: entity.normalized_value.text OR entity.mention_text

    The normalized text is preferred. The raw mention text is used when the
    entity has no normalized value, or when its normalized value carries no
    text, so a detected value is never replaced by an empty string.

    Args:
        document: Processed document whose ``entities`` are read.

    Returns:
        Mapping of entity type to its text value. When several entities
        share the same type, the last one in ``document.entities`` wins.
    """
    document_entities = {}
    for entity in document.entities:
        # Fields detected. For a full list of fields for each processor see
        # the processor documentation:
        # https://cloud.google.com/document-ai/docs/processors-list

        # Use EKG Enriched Data if available
        normalized_value = getattr(entity, "normalized_value", None)
        normalized_text = normalized_value.text if normalized_value else ""

        # The normalized text is optional even when a normalized value is
        # present, so fall back to the text as it appears in the document.
        document_entities[entity.type_] = normalized_text or entity.mention_text

    return document_entities


def select_processor_from_classification(
    document_classification: str = "other",
) -> Tuple[str, str]:
    """
    Pick the parser processor to use for a classified document.

    Args:
        document_classification: Classification label of the document.

    Returns:
        A ``(processor_type, processor_id)`` pair. The processor ID is
        ``None`` when no active processor is registered for that type.
    """

    # Classifications without a dedicated parser go to the generic form parser
    parser_type = DOCUMENT_SUPPORTED_PROCESSOR_TYPES.get(
        document_classification, "FORM_PARSER_PROCESSOR"
    )

    # Resolve the parser type to the processor configured for it
    return parser_type, DOCAI_ACTIVE_PROCESSORS.get(parser_type)


def classify_document_bytes(file_content: bytes, mime_type: str) -> str:
    """
    Classify a single document with all available specialized processors

    Every classifier type in CLASSIFIER_PROCESSOR_TYPES that has an active
    processor is tried in order. The first classification that is not
    "other" is returned immediately, so a later classifier cannot overwrite
    an earlier successful result.

    Args:
        file_content: Raw bytes of the document to classify.
        mime_type: MIME type of ``file_content``.

    Returns:
        The type of the first entity reported by the first classifier that
        recognises the document, or "other" when no classifier is active,
        a classifier returns no entities, or none recognises the document.
    """

    # Cycle through all possible classifier Processor Types
    for classifier_processor_type in CLASSIFIER_PROCESSOR_TYPES:
        # Get Specific Processor ID for this Classifier Type
        classifier_processor_id = DOCAI_ACTIVE_PROCESSORS.get(classifier_processor_type)
        if classifier_processor_id is None:
            continue

        # Classify Document
        classification_document_proto = process_document_bytes(
            DOCAI_PROJECT_ID,
            DOCAI_PROCESSOR_LOCATION,
            classifier_processor_id,
            file_content,
            mime_type,
        )

        # A classifier that reports nothing cannot decide; try the next one
        entities = classification_document_proto.entities
        if not entities:
            continue

        # Translate Classification Output to Processor Type
        document_classification = entities[0].type_

        # Specialized Classifiers return "other"
        # if it could not classify to a known type,
        # so only a different label is a final answer
        if document_classification != "other":
            return document_classification

    # No classifier was active or none recognised the document
    return "other"