Skip to content

Commit 54b01a0

Browse files
committed
layout template updated + some struct in logs
1 parent 3425216 commit 54b01a0

File tree

2 files changed

+189
-6
lines changed

2 files changed

+189
-6
lines changed

src/function_app.py

Lines changed: 185 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,190 @@
1-
import azure.functions as func
21
import logging
2+
import azure.functions as func
3+
from azure.ai.formrecognizer import DocumentAnalysisClient
4+
from azure.core.credentials import AzureKeyCredential
5+
from azure.cosmos import CosmosClient, PartitionKey, exceptions
6+
from azure.identity import DefaultAzureCredential
7+
import os
8+
import uuid
9+
import json
10+
11+
# Function app instance that the trigger decorator in this file registers against.
# NOTE(review): http_auth_level presumably only affects HTTP-triggered
# functions; the only trigger visible in this file is a blob trigger — confirm.
app = func.FunctionApp(http_auth_level=func.AuthLevel.FUNCTION)
12+
13+
## DEFINITIONS
14+
def initialize_form_recognizer_client():
    """Build a Document Intelligence (Form Recognizer) client from the environment.

    Reads ``FORM_RECOGNIZER_ENDPOINT`` and ``FORM_RECOGNIZER_KEY`` from the
    process environment.

    Returns:
        A ``DocumentAnalysisClient`` created with the configured endpoint and
        an ``AzureKeyCredential`` wrapping the configured key.

    Raises:
        ValueError: If either setting is missing or empty.
    """
    endpoint = os.getenv("FORM_RECOGNIZER_ENDPOINT")
    key = os.getenv("FORM_RECOGNIZER_KEY")

    # os.getenv returns None for unset variables; fail here with a message
    # naming the setting rather than handing None / "" on to the SDK.
    if not endpoint:
        raise ValueError("FORM_RECOGNIZER_ENDPOINT must be a non-empty string")
    if not key:
        raise ValueError("FORM_RECOGNIZER_KEY must be a non-empty string")

    logging.info(f"Form Recognizer endpoint: {endpoint}")
    return DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key))
21+
22+
def read_pdf_content(myblob):
    """Return the full contents of the triggering blob.

    Args:
        myblob: Blob stream exposing ``name`` and ``read()``.

    Returns:
        Whatever ``myblob.read()`` returns.
    """
    blob_name = myblob.name
    logging.info(f"Reading PDF content from blob: {blob_name}")
    pdf_content = myblob.read()
    return pdf_content
25+
26+
def analyze_pdf(form_recognizer_client, pdf_bytes):
    """Run the ``prebuilt-layout`` model over a PDF and return the analysis result.

    Args:
        form_recognizer_client: Client exposing ``begin_analyze_document``.
        pdf_bytes: PDF content, passed to the client as ``document``.

    Returns:
        The object returned by ``poller.result()``, unchanged.
    """
    logging.info("Starting PDF layout analysis.")
    poller = form_recognizer_client.begin_analyze_document(
        model_id="prebuilt-layout",
        document=pdf_bytes
    )
    logging.info("PDF layout analysis in progress.")
    result = poller.result()
    logging.info("PDF layout analysis completed.")

    # Count a missing (None) collection as empty so the summary log line
    # cannot fail an otherwise successful analysis.
    page_count = len(result.pages or [])
    table_count = len(result.tables or [])
    style_count = len(result.styles or [])
    logging.info(f"Document has {page_count} page(s), {table_count} table(s), and {style_count} style(s).")
    return result
37+
38+
def extract_layout_data(result):
    """Convert a layout analysis result into a JSON-friendly document.

    The returned dict has a freshly generated UUID string under ``"id"`` and a
    ``"pages"`` list with one entry per page holding the page number, line
    texts, tables and selection marks. Everything extracted is also written
    to the log at INFO level.

    A table is attached to every page that appears in its bounding regions,
    so a table spanning several pages is repeated (with all of its cells) on
    each of those pages.

    Args:
        result: Analysis result exposing ``styles``, ``pages`` and ``tables``.
            Any of these collections (and the per-page ``lines`` /
            ``selection_marks``, per-table ``bounding_regions``) may be
            ``None``; they are then treated as empty.

    Returns:
        The layout document as a plain ``dict``.
    """
    logging.info("Extracting layout data from analysis result.")

    layout_data = {
        "id": str(uuid.uuid4()),
        "pages": []
    }

    # Log styles
    for style in result.styles or []:
        content_type = "handwritten" if style.is_handwritten else "no handwritten"
        logging.info(f"Document contains {content_type} content")

    all_tables = result.tables or []

    # Process each page
    for page in result.pages or []:
        logging.info(f"--- Page {page.page_number} ---")
        lines = page.lines or []
        selection_marks = page.selection_marks or []

        page_data = {
            "page_number": page.page_number,
            "lines": [line.content for line in lines],
            "tables": [],
            "selection_marks": [
                {"state": mark.state, "confidence": mark.confidence}
                for mark in selection_marks
            ]
        }

        # Log extracted lines
        for line_idx, line in enumerate(lines):
            logging.info(f"Line {line_idx}: '{line.content}'")

        # Log selection marks
        for selection_mark in selection_marks:
            logging.info(
                f"Selection mark is '{selection_mark.state}' with confidence {selection_mark.confidence}"
            )

        # Extract the tables that have at least one bounding region on this page
        page_tables = [
            table for table in all_tables
            if any(
                region.page_number == page.page_number
                for region in (table.bounding_regions or [])
            )
        ]

        for table_index, table in enumerate(page_tables):
            logging.info(f"Table {table_index}: {table.row_count} rows, {table.column_count} columns")
            page_data["tables"].append(_table_to_dict(table))

        layout_data["pages"].append(page_data)

    # The preview is purely diagnostic: a value json cannot encode must not
    # prevent the caller from receiving the extracted data.
    try:
        preview = json.dumps(layout_data, indent=2)
        logging.info("Structured layout data preview:\n" + preview)
    except (TypeError, ValueError) as e:
        logging.warning(f"Could not serialize layout data for preview: {e}")

    return layout_data


def _table_to_dict(table):
    """Return one table as a dict of row/column counts and stripped cell texts, logging each cell."""
    table_data = {
        "row_count": table.row_count,
        "column_count": table.column_count,
        "cells": []
    }

    for cell in table.cells or []:
        content = (cell.content or "").strip()
        table_data["cells"].append({
            "row_index": cell.row_index,
            "column_index": cell.column_index,
            "content": content
        })
        logging.info(f"Cell[{cell.row_index}][{cell.column_index}]: '{content}'")

    return table_data
109+
110+
def save_layout_data_to_cosmos(layout_data):
    """Upsert the extracted layout document into Cosmos DB.

    Connects to the account at ``COSMOS_DB_ENDPOINT`` with
    ``DefaultAzureCredential``, makes sure the ``ContosoDBDocIntellig``
    database and the ``Layouts`` container (partitioned on ``/id``) exist,
    and upserts ``layout_data`` into that container.

    Args:
        layout_data: The document to store.

    Raises:
        ValueError: If ``COSMOS_DB_ENDPOINT`` is missing or empty.
        Exception: Connection and upsert failures are logged and re-raised so
            the caller does not report a successful save after a failure.
            Errors from preparing the database or container propagate as
            raised by the SDK.
    """
    try:
        endpoint = os.getenv("COSMOS_DB_ENDPOINT")
        if not endpoint:
            raise ValueError("COSMOS_DB_ENDPOINT must be a non-empty string")
        aad_credentials = DefaultAzureCredential()
        client = CosmosClient(endpoint, credential=aad_credentials, consistency_level='Session')
        logging.info("Successfully connected to Cosmos DB using AAD default credential")
    except Exception as e:
        logging.error(f"Error connecting to Cosmos DB: {e}")
        raise

    database_name = "ContosoDBDocIntellig"
    container_name = "Layouts"

    # NOTE(review): creating databases/containers with an AAD identity may be
    # rejected by the service depending on the roles assigned — confirm the
    # identity's permissions or pre-provision these resources.
    try:
        database = client.create_database_if_not_exists(database_name)
        # The call succeeds whether or not the database already existed, so
        # the log must not claim that it was just created.
        logging.info(f"Database '{database_name}' is ready (created if it did not exist).")
    except exceptions.CosmosResourceExistsError:
        database = client.get_database_client(database_name)
        logging.info(f"Database '{database_name}' already exists.")

    database.read()
    logging.info(f"Reading into '{database_name}' DB")

    try:
        container = database.create_container(
            id=container_name,
            partition_key=PartitionKey(path="/id"),
            offer_throughput=400
        )
        logging.info(f"Container '{container_name}' did not exist. Created it.")
    except exceptions.CosmosResourceExistsError:
        container = database.get_container_client(container_name)
        logging.info(f"Container '{container_name}' already exists.")

    container.read()
    logging.info(f"Reading into '{container_name}' container")

    try:
        response = container.upsert_item(layout_data)
        logging.info(f"Saved processed layout data to Cosmos DB. Response: {response}")
    except Exception as e:
        logging.error(f"Error inserting item into Cosmos DB: {e}")
        raise
155+
156+
## MAIN
6157
@app.blob_trigger(arg_name="myblob", path="pdfinvoices/{name}",
                  connection="invoicecontosostorage_STORAGE")
def BlobTriggerContosoPDFLayoutsDocIntelligence(myblob: func.InputStream):
    """Analyze a PDF that arrives under ``pdfinvoices/`` and store its layout.

    Stages, in order: create the Document Intelligence client, read the blob,
    run layout analysis, extract the structured layout data, save it to
    Cosmos DB. Each stage logs its own failure and ends the invocation
    without raising.

    NOTE(review): because failures are swallowed here, the Functions host
    presumably treats the invocation as successful and will not retry the
    blob — confirm that this best-effort behavior is intended.

    Args:
        myblob: The triggering blob.
    """
    logging.info(f"Python blob trigger function processed blob\n"
                 f"Name: {myblob.name}\n"
                 f"Blob Size: {myblob.length} bytes")

    # Client setup and blob reading are separate stages so that a
    # configuration problem is not reported as a PDF read error.
    try:
        form_recognizer_client = initialize_form_recognizer_client()
    except Exception as e:
        logging.error(f"Error initializing Document Intelligence client: {e}")
        return

    try:
        pdf_bytes = read_pdf_content(myblob)
        logging.info("Successfully read PDF content from blob.")
    except Exception as e:
        logging.error(f"Error reading PDF: {e}")
        return

    try:
        result = analyze_pdf(form_recognizer_client, pdf_bytes)
        logging.info("Successfully analyzed PDF using Document Intelligence.")
    except Exception as e:
        logging.error(f"Error analyzing PDF: {e}")
        return

    try:
        layout_data = extract_layout_data(result)
        logging.info("Successfully extracted layout data.")
    except Exception as e:
        logging.error(f"Error extracting layout data: {e}")
        return

    try:
        save_layout_data_to_cosmos(layout_data)
        logging.info("Successfully saved layout data to Cosmos DB.")
    except Exception as e:
        logging.error(f"Error saving layout data to Cosmos DB: {e}")

src/requirements.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,7 @@
33
# Manually managing azure-functions-worker may cause unexpected issues
44

55
azure-functions
6+
azure-ai-formrecognizer
7+
azure-core
8+
azure-cosmos==4.3.0
9+
azure-identity==1.7.0

0 commit comments

Comments
 (0)