Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/.funcignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
.git*
.vscode
__azurite_db*__.json
__blobstorage__
__queuestorage__
local.settings.json
test
.venv
135 changes: 135 additions & 0 deletions src/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don’t work, or not
# install all needed dependencies.
#Pipfile.lock

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# Azure Functions artifacts
bin
obj
appsettings.json
local.settings.json

# Azurite artifacts
__blobstorage__
__queuestorage__
__azurite_db*__.json

# Azure Functions locally installed Python packages
.python_packages
6 changes: 6 additions & 0 deletions src/.vscode/extensions.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"recommendations": [
"ms-azuretools.vscode-azurefunctions",
"ms-python.python"
]
}
15 changes: 15 additions & 0 deletions src/.vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "Attach to Python Functions",
"type": "debugpy",
"request": "attach",
"connect": {
"host": "localhost",
"port": 9091
},
"preLaunchTask": "func: host start"
}
]
}
9 changes: 9 additions & 0 deletions src/.vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"azureFunctions.deploySubpath": ".",
"azureFunctions.scmDoBuildDuringDeployment": true,
"azureFunctions.pythonVenv": ".venv",
"azureFunctions.projectLanguage": "Python",
"azureFunctions.projectRuntime": "~4",
"debug.internalConsoleOptions": "neverOpen",
"azureFunctions.projectLanguageModel": 2
}
27 changes: 27 additions & 0 deletions src/.vscode/tasks.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"version": "2.0.0",
"tasks": [
{
"type": "func",
"label": "func: host start",
"command": "host start",
"problemMatcher": "$func-python-watch",
"isBackground": true,
"dependsOn": "pip install (functions)"
},
{
"label": "pip install (functions)",
"type": "shell",
"osx": {
"command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt"
},
"windows": {
"command": "${config:azureFunctions.pythonVenv}\\Scripts\\python -m pip install -r requirements.txt"
},
"linux": {
"command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt"
},
"problemMatcher": []
}
]
}
159 changes: 159 additions & 0 deletions src/function_app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
import logging
import azure.functions as func
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from azure.cosmos import CosmosClient, PartitionKey, exceptions
from azure.identity import DefaultAzureCredential
import os
import uuid

# Function app object that the trigger functions in this module register on
# via decorators; its default HTTP authorization level is FUNCTION.
app = func.FunctionApp(http_auth_level=func.AuthLevel.FUNCTION)

## DEFINITIONS
def initialize_form_recognizer_client():
    """Build a document analysis client from environment settings.

    Reads ``FORM_RECOGNIZER_ENDPOINT`` and ``FORM_RECOGNIZER_KEY`` from the
    process environment and validates both before constructing the client.

    Returns:
        A ``DocumentAnalysisClient`` created with the configured endpoint and
        an ``AzureKeyCredential`` wrapping the configured key.

    Raises:
        ValueError: If ``FORM_RECOGNIZER_KEY`` is not set, or if
            ``FORM_RECOGNIZER_ENDPOINT`` is not set or is empty.
    """
    endpoint = os.getenv("FORM_RECOGNIZER_ENDPOINT")
    key = os.getenv("FORM_RECOGNIZER_KEY")
    if not isinstance(key, str):
        raise ValueError("FORM_RECOGNIZER_KEY must be a string")
    # Fail fast with a clear message rather than handing the SDK a None/empty
    # endpoint and surfacing an unrelated error later.
    if not endpoint:
        raise ValueError("FORM_RECOGNIZER_ENDPOINT must be set")
    logging.info(f"Form Recognizer endpoint: {endpoint}")
    return DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key))

def read_pdf_content(myblob):
    """Log the blob's name, then return whatever ``myblob.read()`` yields."""
    logging.info(f"Reading PDF content from blob: {myblob.name}")
    content = myblob.read()
    return content

def analyze_pdf(form_recognizer_client, pdf_bytes):
    """Submit *pdf_bytes* for analysis with the ``prebuilt-invoice`` model.

    Calls ``begin_analyze_document`` on *form_recognizer_client* and returns
    the value produced by the returned poller's ``result()``.
    """
    logging.info("Starting PDF analysis.")
    request_kwargs = {"model_id": "prebuilt-invoice", "document": pdf_bytes}
    operation = form_recognizer_client.begin_analyze_document(**request_kwargs)
    logging.info("PDF analysis in progress.")
    analysis = operation.result()
    return analysis

def extract_invoice_data(result):
    """Flatten an invoice analysis result into a plain dict of strings.

    Args:
        result: Object exposing ``documents``; each document exposes a
            ``fields`` mapping whose entries carry a ``value`` attribute.

    Returns:
        A dict with a freshly generated ``id`` (UUID4 string), the customer
        and company fields as strings (``""`` when a field is absent or has
        no value), and ``rentals``: one dict per entry of the ``Items`` field.
        When *result* holds several documents, the scalar fields reflect the
        last document while rentals accumulate across all of them.
    """
    logging.info("Extracting invoice data from analysis result.")
    invoice_data = {
        "id": str(uuid.uuid4()),
        "customer_name": "",
        "customer_email": "",
        "customer_address": "",
        "company_name": "",
        "company_phone": "",
        "company_address": "",
        "rentals": [],
    }

    def serialize_field(field):
        """Return the field's value as a string, or "" when there is none."""
        if not field:
            return ""
        value = field.value
        # A recognised field can still carry no parsed value; store "" rather
        # than the literal text "None".
        if value is None:
            return ""
        return str(value)

    for document in result.documents:
        fields = document.fields
        invoice_data["customer_name"] = serialize_field(fields.get("CustomerName"))
        invoice_data["customer_email"] = serialize_field(fields.get("CustomerEmail"))
        invoice_data["customer_address"] = serialize_field(fields.get("CustomerAddress"))
        invoice_data["company_name"] = serialize_field(fields.get("VendorName"))
        invoice_data["company_phone"] = serialize_field(fields.get("VendorPhoneNumber"))
        invoice_data["company_address"] = serialize_field(fields.get("VendorAddress"))

        items_field = fields.get("Items")
        # Guard against both a missing Items field and one whose value is None.
        items = (items_field.value if items_field else None) or []
        for item in items:
            item_value = item.value if item.value else {}
            # NOTE(review): "title" and "description" are both taken from the
            # item's Description field — confirm this duplication is intended.
            item_description = serialize_field(item_value.get("Description"))
            invoice_data["rentals"].append({
                "rental_date": serialize_field(item_value.get("Date")),
                "title": item_description,
                "description": item_description,
                "quantity": serialize_field(item_value.get("Quantity")),
                "total_price": serialize_field(item_value.get("TotalPrice")),
            })

    logging.info(f"Successfully extracted invoice data: {invoice_data}")
    return invoice_data

def save_invoice_data_to_cosmos(invoice_data):
    """Upsert *invoice_data* into the ``Invoices`` container in Cosmos DB.

    Connects to the account at ``COSMOS_DB_ENDPOINT`` using
    ``DefaultAzureCredential``, makes sure the ``ContosoDBDocIntellig``
    database and the ``Invoices`` container exist, then upserts the item.

    Args:
        invoice_data: JSON-serialisable dict to store; it is passed unchanged
            to ``upsert_item``.

    Raises:
        ValueError: If ``COSMOS_DB_ENDPOINT`` is not set or is empty.
        Exception: Any error raised while connecting, preparing the database
            or container, or upserting is logged and then re-raised so the
            caller does not report a save that never happened.
    """
    database_name = "ContosoDBDocIntellig"
    container_name = "Invoices"

    endpoint = os.getenv("COSMOS_DB_ENDPOINT")
    if not endpoint:
        raise ValueError("COSMOS_DB_ENDPOINT must be set")

    try:
        aad_credentials = DefaultAzureCredential()
        client = CosmosClient(endpoint, credential=aad_credentials, consistency_level='Session')
        logging.info("Successfully connected to Cosmos DB using AAD default credential")
    except Exception as e:
        logging.error(f"Error connecting to Cosmos DB: {e}")
        raise

    # The *_if_not_exists helpers return the existing resource when there is
    # one, so no "already exists" error branch is needed here.
    database = client.create_database_if_not_exists(database_name)
    logging.info(f"Database '{database_name}' is available.")

    # NOTE(review): the partition key path is /transactionId, but the invoice
    # dict built by extract_invoice_data in this module has no "transactionId"
    # property — confirm the intended partition key.
    container = database.create_container_if_not_exists(
        id=container_name,
        partition_key=PartitionKey(path="/transactionId"),
        offer_throughput=400,
    )
    logging.info(f"Container '{container_name}' is available.")

    try:
        response = container.upsert_item(invoice_data)
    except Exception as e:
        logging.error(f"Error inserting item into Cosmos DB: {e}")
        raise
    logging.info(f"Saved processed invoice data to Cosmos DB: {response}")

## MAIN
@app.blob_trigger(arg_name="myblob", path="pdfinvoices/{name}",
                  connection="invoicecontosostorage_STORAGE")
def BlobTriggerContosoPDFInvoicesDocIntelligence(myblob: func.InputStream):
    """Process a blob from ``pdfinvoices/``: analyse it and store the result.

    Stages, in order: create the analysis client, read the blob, analyse the
    PDF, extract the invoice fields, and save them to Cosmos DB. A failure in
    any stage is logged with its traceback and ends processing for this blob;
    the exception is not propagated.

    NOTE(review): because failures are swallowed, the invocation returns
    normally even when processing failed — confirm that is intended.

    Args:
        myblob: Input stream for the triggering blob.
    """
    logging.info(f"Python blob trigger function processed blob\n"
                 f"Name: {myblob.name}\n"
                 f"Blob Size: {myblob.length} bytes")

    # Kept separate from the read step so a configuration problem is not
    # reported as a PDF read error.
    try:
        form_recognizer_client = initialize_form_recognizer_client()
    except Exception as e:
        logging.exception(f"Error initializing Form Recognizer client: {e}")
        return

    try:
        pdf_bytes = read_pdf_content(myblob)
        logging.info("Successfully read PDF content from blob.")
    except Exception as e:
        logging.exception(f"Error reading PDF: {e}")
        return

    try:
        result = analyze_pdf(form_recognizer_client, pdf_bytes)
        logging.info("Successfully analyzed PDF using Document Intelligence.")
    except Exception as e:
        logging.exception(f"Error analyzing PDF: {e}")
        return

    try:
        invoice_data = extract_invoice_data(result)
        logging.info(f"Extracted invoice data: {invoice_data}")
    except Exception as e:
        logging.exception(f"Error extracting invoice data: {e}")
        return

    try:
        save_invoice_data_to_cosmos(invoice_data)
        logging.info("Successfully saved invoice data to Cosmos DB.")
    except Exception as e:
        logging.exception(f"Error saving invoice data to Cosmos DB: {e}")
15 changes: 15 additions & 0 deletions src/host.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"version": "2.0",
"logging": {
"applicationInsights": {
"samplingSettings": {
"isEnabled": true,
"excludedTypes": "Request"
}
}
},
"extensionBundle": {
"id": "Microsoft.Azure.Functions.ExtensionBundle",
"version": "[4.*, 5.0.0)"
}
}
9 changes: 9 additions & 0 deletions src/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# DO NOT include azure-functions-worker in this file
# The Python Worker is managed by Azure Functions platform
# Manually managing azure-functions-worker may cause unexpected issues

azure-functions
azure-ai-formrecognizer
azure-core
azure-cosmos==4.3.0
azure-identity==1.7.0