diff --git a/src/.funcignore b/src/.funcignore new file mode 100644 index 0000000..9966315 --- /dev/null +++ b/src/.funcignore @@ -0,0 +1,8 @@ +.git* +.vscode +__azurite_db*__.json +__blobstorage__ +__queuestorage__ +local.settings.json +test +.venv \ No newline at end of file diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000..7685fc4 --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,135 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don’t work, or not +# install all needed dependencies. 
+#Pipfile.lock + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Azure Functions artifacts +bin +obj +appsettings.json +local.settings.json + +# Azurite artifacts +__blobstorage__ +__queuestorage__ +__azurite_db*__.json +.python_packages \ No newline at end of file diff --git a/src/.vscode/extensions.json b/src/.vscode/extensions.json new file mode 100644 index 0000000..3f63eb9 --- /dev/null +++ b/src/.vscode/extensions.json @@ -0,0 +1,6 @@ +{ + "recommendations": [ + "ms-azuretools.vscode-azurefunctions", + "ms-python.python" + ] +} \ No newline at end of file diff --git a/src/.vscode/launch.json b/src/.vscode/launch.json new file mode 100644 index 0000000..9a24428 --- /dev/null +++ b/src/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Attach to Python Functions", + "type": "debugpy", + "request": "attach", + "connect": { + "host": "localhost", + "port": 9091 + }, + "preLaunchTask": "func: host start" + } + ] +} \ No newline at end of file diff --git a/src/.vscode/settings.json b/src/.vscode/settings.json new file mode 100644 index 0000000..60e70c2 --- /dev/null +++ b/src/.vscode/settings.json @@ -0,0 +1,9 @@ +{ + "azureFunctions.deploySubpath": ".", + "azureFunctions.scmDoBuildDuringDeployment": true, + "azureFunctions.pythonVenv": ".venv", + "azureFunctions.projectLanguage": "Python", + "azureFunctions.projectRuntime": "~4", + "debug.internalConsoleOptions": "neverOpen", + "azureFunctions.projectLanguageModel": 2 +} \ No newline at end of file diff --git a/src/.vscode/tasks.json b/src/.vscode/tasks.json new file mode 100644 index 0000000..ba75962 --- /dev/null +++ b/src/.vscode/tasks.json 
@@ -0,0 +1,27 @@
+{
+  "version": "2.0.0",
+  "tasks": [
+    {
+      "type": "func",
+      "label": "func: host start",
+      "command": "host start",
+      "problemMatcher": "$func-python-watch",
+      "isBackground": true,
+      "dependsOn": "pip install (functions)"
+    },
+    {
+      "label": "pip install (functions)",
+      "type": "shell",
+      "osx": {
+        "command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt"
+      },
+      "windows": {
+        "command": "${config:azureFunctions.pythonVenv}\\Scripts\\python -m pip install -r requirements.txt"
+      },
+      "linux": {
+        "command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt"
+      },
+      "problemMatcher": []
+    }
+  ]
+}
\ No newline at end of file
diff --git a/src/function_app.py b/src/function_app.py
new file mode 100644
index 0000000..53a836c
--- /dev/null
+++ b/src/function_app.py
@@ -0,0 +1,212 @@
+"""Azure Functions app that stores fields extracted from PDF invoices.
+
+A blob trigger on ``pdfinvoices`` sends each uploaded PDF to the Document
+Intelligence ``prebuilt-invoice`` model and upserts the result into Cosmos DB.
+"""
+import logging
+import azure.functions as func
+from azure.ai.formrecognizer import DocumentAnalysisClient
+from azure.core.credentials import AzureKeyCredential
+from azure.cosmos import CosmosClient, PartitionKey, exceptions
+from azure.identity import DefaultAzureCredential
+import os
+import uuid
+
+app = func.FunctionApp(http_auth_level=func.AuthLevel.FUNCTION)
+
+## DEFINITIONS
+def initialize_form_recognizer_client():
+    """Create the Document Intelligence client from environment settings.
+
+    Returns:
+        A DocumentAnalysisClient for FORM_RECOGNIZER_ENDPOINT, authenticated
+        with FORM_RECOGNIZER_KEY.
+
+    Raises:
+        ValueError: If either setting is missing or empty.
+    """
+    endpoint = os.getenv("FORM_RECOGNIZER_ENDPOINT")
+    key = os.getenv("FORM_RECOGNIZER_KEY")
+    # Validate both settings up front so a misconfigured app reports which
+    # variable is missing instead of failing inside the SDK.
+    if not endpoint:
+        raise ValueError("FORM_RECOGNIZER_ENDPOINT must be a non-empty string")
+    if not key:
+        raise ValueError("FORM_RECOGNIZER_KEY must be a non-empty string")
+    logging.info(f"Form Recognizer endpoint: {endpoint}")
+    return DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key))
+
+def read_pdf_content(myblob):
+    """Return the full content of the triggering blob as bytes."""
+    logging.info(f"Reading PDF content from blob: {myblob.name}")
+    return myblob.read()
+
+def analyze_pdf(form_recognizer_client, pdf_bytes):
+    """Run the ``prebuilt-invoice`` model on the PDF and wait for the result."""
+    logging.info("Starting PDF analysis.")
+    poller = form_recognizer_client.begin_analyze_document(
+        model_id="prebuilt-invoice",
+        document=pdf_bytes
+    )
+    logging.info("PDF analysis in progress.")
+    return poller.result()
+
+def extract_invoice_data(result):
+    """Flatten an invoice analysis result into the document stored in Cosmos DB.
+
+    Args:
+        result: Analysis result whose ``documents`` each expose a ``fields``
+            mapping from field name to an object with a ``value`` attribute.
+
+    Returns:
+        A dict with a new UUID ``id``, customer and company details as strings,
+        and a ``rentals`` list with one entry per line item. Fields that are
+        absent or have no value are stored as empty strings.
+    """
+    logging.info("Extracting invoice data from analysis result.")
+    invoice_data = {
+        "id": str(uuid.uuid4()),
+        "customer_name": "",
+        "customer_email": "",
+        "customer_address": "",
+        "company_name": "",
+        "company_phone": "",
+        "company_address": "",
+        "rentals": []
+    }
+
+    def serialize_field(field):
+        # A recognised field can still have no parsed value; str(None) would
+        # store the literal text "None", so treat that like a missing field.
+        if field is None or field.value is None:
+            return ""
+        return str(field.value)  # Convert to string
+
+    # If the result holds several documents, the header fields of the last one
+    # win while line items from all of them are collected.
+    for document in result.documents:
+        fields = document.fields
+        invoice_data["customer_name"] = serialize_field(fields.get("CustomerName"))
+        invoice_data["customer_email"] = serialize_field(fields.get("CustomerEmail"))
+        invoice_data["customer_address"] = serialize_field(fields.get("CustomerAddress"))
+        invoice_data["company_name"] = serialize_field(fields.get("VendorName"))
+        invoice_data["company_phone"] = serialize_field(fields.get("VendorPhoneNumber"))
+        invoice_data["company_address"] = serialize_field(fields.get("VendorAddress"))
+
+        items_field = fields.get("Items")
+        # Guard against both a missing Items field and one without a value.
+        items = (items_field.value if items_field is not None else None) or []
+        for item in items:
+            item_value = item.value or {}
+            # The prebuilt invoice model reports the line total as "Amount";
+            # "TotalPrice" is kept as a fallback for models that emit that name.
+            total_field = item_value.get("Amount", item_value.get("TotalPrice"))
+            description = serialize_field(item_value.get("Description"))
+            invoice_data["rentals"].append({
+                "rental_date": serialize_field(item_value.get("Date")),
+                # No separate title is read; both keys carry the description.
+                "title": description,
+                "description": description,
+                "quantity": serialize_field(item_value.get("Quantity")),
+                "total_price": serialize_field(total_field)
+            })
+
+    # Log a summary only: the document contains customer names and addresses.
+    logging.info(
+        f"Successfully extracted invoice data: id={invoice_data['id']}, "
+        f"rentals={len(invoice_data['rentals'])}"
+    )
+    return invoice_data
+
+def save_invoice_data_to_cosmos(invoice_data):
+    """Upsert an invoice document into the ``Invoices`` container.
+
+    The database and container are created when they do not exist yet.
+    Failures are raised rather than logged and swallowed, so the caller never
+    reports a successful save for data that was not written.
+
+    Args:
+        invoice_data: Document to store, as built by extract_invoice_data.
+
+    Raises:
+        ValueError: If COSMOS_DB_ENDPOINT is missing or empty.
+        Exception: Any error raised by the credential or the Cosmos DB SDK.
+    """
+    endpoint = os.getenv("COSMOS_DB_ENDPOINT")
+    if not endpoint:
+        raise ValueError("COSMOS_DB_ENDPOINT must be a non-empty string")
+
+    database_name = "ContosoDBDocIntellig"
+    container_name = "Invoices"
+
+    aad_credentials = DefaultAzureCredential()
+    client = CosmosClient(endpoint, credential=aad_credentials, consistency_level='Session')
+    logging.info("Created Cosmos DB client using AAD default credential")
+
+    # NOTE(review): creating databases/containers with an AAD identity may be
+    # rejected by Cosmos DB role-based access control - confirm the resources
+    # are pre-provisioned or that this identity is allowed to create them.
+    database = client.create_database_if_not_exists(database_name)
+    logging.info(f"Database '{database_name}' is available.")
+
+    container = database.create_container_if_not_exists(
+        id=container_name,
+        partition_key=PartitionKey(path="/transactionId"),
+        offer_throughput=400
+    )
+    logging.info(f"Container '{container_name}' is available.")
+
+    # The container is partitioned on /transactionId, which the extracted
+    # document does not carry. Default it to the document id (on a copy, so the
+    # caller's dict is untouched) instead of leaving the partition key undefined.
+    item = dict(invoice_data)
+    if "transactionId" not in item and "id" in item:
+        item["transactionId"] = item["id"]
+
+    container.upsert_item(item)
+    logging.info(f"Saved processed invoice data to Cosmos DB: id={item.get('id')}")
+
+## MAIN
+@app.blob_trigger(arg_name="myblob", path="pdfinvoices/{name}",
+                  connection="invoicecontosostorage_STORAGE")
+def BlobTriggerContosoPDFInvoicesDocIntelligence(myblob: func.InputStream):
+    """Process one uploaded PDF invoice: read, analyze, extract and store it.
+
+    Each stage logs its failure with a traceback and stops the run; errors are
+    not re-raised to the Functions host.
+    """
+    logging.info(f"Python blob trigger function processed blob\n"
+                 f"Name: {myblob.name}\n"
+                 f"Blob Size: {myblob.length} bytes")
+
+    try:
+        form_recognizer_client = initialize_form_recognizer_client()
+    except Exception:
+        logging.exception("Error creating Form Recognizer client")
+        return
+
+    try:
+        pdf_bytes = read_pdf_content(myblob)
+        logging.info("Successfully read PDF content from blob.")
+    except Exception:
+        logging.exception("Error reading PDF")
+        return
+
+    try:
+        result = analyze_pdf(form_recognizer_client, pdf_bytes)
+        logging.info("Successfully analyzed PDF using Document Intelligence.")
+    except Exception:
+        logging.exception("Error analyzing PDF")
+        return
+
+    try:
+        invoice_data = extract_invoice_data(result)
+    except Exception:
+        logging.exception("Error extracting invoice data")
+        return
+
+    try:
+        save_invoice_data_to_cosmos(invoice_data)
+        logging.info("Successfully saved invoice data to Cosmos DB.")
+    except Exception:
+        logging.exception("Error saving invoice data to Cosmos DB")
\ No newline at end of file
diff --git a/src/host.json b/src/host.json
new file mode 100644
index 0000000..9df9136
--- /dev/null
+++ b/src/host.json
@@ -0,0 +1,15 @@
+{
+  "version": "2.0",
+  "logging": {
+    "applicationInsights": {
+      "samplingSettings": {
+        "isEnabled": true,
+        "excludedTypes": "Request"
+      }
+    }
+  },
+  "extensionBundle": {
+    "id": "Microsoft.Azure.Functions.ExtensionBundle",
+    "version": "[4.*, 5.0.0)"
+  }
+}
\ No newline at end of file
diff --git a/src/requirements.txt b/src/requirements.txt
new file mode 100644
index 0000000..d3cd3d3
--- /dev/null
+++ b/src/requirements.txt
@@ -0,0 +1,9 @@
+# DO NOT include azure-functions-worker in this file
+# The Python Worker is managed by Azure Functions platform
+# Manually managing azure-functions-worker may cause unexpected issues
+
+azure-functions
+azure-ai-formrecognizer
+azure-core
+azure-cosmos==4.3.0
+azure-identity==1.7.0