Skip to content

Commit e85f8c5

Browse files
committed
Store bbox as list of pixel floats, add storage container just for extracted images
1 parent 16a0ec6 commit e85f8c5

File tree

8 files changed

+146
-54
lines changed

8 files changed

+146
-54
lines changed

app/backend/app.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,10 @@
6666
CONFIG_CHAT_HISTORY_COSMOS_ENABLED,
6767
CONFIG_CREDENTIAL,
6868
CONFIG_DEFAULT_REASONING_EFFORT,
69+
CONFIG_IMAGE_BLOB_CONTAINER_CLIENT,
6970
CONFIG_INGESTER,
7071
CONFIG_LANGUAGE_PICKER_ENABLED,
72+
CONFIG_MULTIMODAL_ENABLED,
7173
CONFIG_OPENAI_CLIENT,
7274
CONFIG_QUERY_REWRITING_ENABLED,
7375
CONFIG_REASONING_EFFORT_ENABLED,
@@ -84,7 +86,6 @@
8486
CONFIG_USER_BLOB_CONTAINER_CLIENT,
8587
CONFIG_USER_UPLOAD_ENABLED,
8688
CONFIG_VECTOR_SEARCH_ENABLED,
87-
CONFIG_MULTIMODAL_ENABLED
8889
)
8990
from core.authentication import AuthenticationHelper
9091
from core.sessionhelper import create_session_id
@@ -182,7 +183,9 @@ async def ask(auth_claims: dict[str, Any]):
182183
context["auth_claims"] = auth_claims
183184
try:
184185
approach: Approach = cast(Approach, current_app.config[CONFIG_ASK_APPROACH])
185-
r = await approach.run(request_json["messages"], context=context, session_state=request_json.get("session_state"))
186+
r = await approach.run(
187+
request_json["messages"], context=context, session_state=request_json.get("session_state")
188+
)
186189
return jsonify(r)
187190
except Exception as error:
188191
return error_response(error, "/ask")
@@ -404,6 +407,7 @@ async def setup_clients():
404407
# Replace these with your own values, either in environment variables or directly here
405408
AZURE_STORAGE_ACCOUNT = os.environ["AZURE_STORAGE_ACCOUNT"]
406409
AZURE_STORAGE_CONTAINER = os.environ["AZURE_STORAGE_CONTAINER"]
410+
AZURE_IMAGESTORAGE_CONTAINER = os.environ.get("AZURE_IMAGESTORAGE_CONTAINER")
407411
AZURE_USERSTORAGE_ACCOUNT = os.environ.get("AZURE_USERSTORAGE_ACCOUNT")
408412
AZURE_USERSTORAGE_CONTAINER = os.environ.get("AZURE_USERSTORAGE_CONTAINER")
409413
AZURE_SEARCH_SERVICE = os.environ["AZURE_SEARCH_SERVICE"]
@@ -511,6 +515,15 @@ async def setup_clients():
511515
f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net", AZURE_STORAGE_CONTAINER, credential=azure_credential
512516
)
513517

518+
# Set up the image storage container client if configured
519+
image_blob_container_client = None
520+
if AZURE_IMAGESTORAGE_CONTAINER:
521+
image_blob_container_client = ContainerClient(
522+
f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net",
523+
AZURE_IMAGESTORAGE_CONTAINER,
524+
credential=azure_credential,
525+
)
526+
514527
# Set up authentication helper
515528
search_index = None
516529
if AZURE_USE_AUTHENTICATION:
@@ -636,6 +649,8 @@ async def setup_clients():
636649
current_app.config[CONFIG_SEARCH_CLIENT] = search_client
637650
current_app.config[CONFIG_AGENT_CLIENT] = agent_client
638651
current_app.config[CONFIG_BLOB_CONTAINER_CLIENT] = blob_container_client
652+
if image_blob_container_client:
653+
current_app.config[CONFIG_IMAGE_BLOB_CONTAINER_CLIENT] = image_blob_container_client
639654
current_app.config[CONFIG_AUTH_CLIENT] = auth_helper
640655

641656
current_app.config[CONFIG_SEMANTIC_RANKER_DEPLOYED] = AZURE_SEARCH_SEMANTIC_RANKER != "disabled"
@@ -710,12 +725,15 @@ async def setup_clients():
710725
vision_token_provider=token_provider,
711726
)
712727

728+
713729
@bp.after_app_serving
async def close_clients():
    """Close the long-lived service clients stored in the app config.

    The search client and the main blob container client are always closed.
    The user-upload and image blob container clients are closed only when
    they are present (and truthy) in the app config.
    """
    app_config = current_app.config

    # Always-present clients.
    await app_config[CONFIG_SEARCH_CLIENT].close()
    await app_config[CONFIG_BLOB_CONTAINER_CLIENT].close()

    # Optional clients, closed in the same order as they are registered above.
    for optional_key in (CONFIG_USER_BLOB_CONTAINER_CLIENT, CONFIG_IMAGE_BLOB_CONTAINER_CLIENT):
        optional_client = app_config.get(optional_key)
        if optional_client:
            await optional_client.close()
719737

720738

721739
def create_app():

app/backend/config.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
CONFIG_ASK_APPROACH = "ask_approach"
44
CONFIG_CHAT_APPROACH = "chat_approach"
55
CONFIG_BLOB_CONTAINER_CLIENT = "blob_container_client"
6+
CONFIG_IMAGE_BLOB_CONTAINER_CLIENT = "image_blob_container_client"
67
CONFIG_USER_UPLOAD_ENABLED = "user_upload_enabled"
78
CONFIG_USER_BLOB_CONTAINER_CLIENT = "user_blob_container_client"
89
CONFIG_AUTH_CLIENT = "auth_client"
@@ -30,4 +31,4 @@
3031
CONFIG_COSMOS_HISTORY_CLIENT = "cosmos_history_client"
3132
CONFIG_COSMOS_HISTORY_CONTAINER = "cosmos_history_container"
3233
CONFIG_COSMOS_HISTORY_VERSION = "cosmos_history_version"
33-
CONFIG_MULTIMODAL_ENABLED = "multimodal_enabled"
34+
CONFIG_MULTIMODAL_ENABLED = "multimodal_enabled"

app/backend/prepdocs.py

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,14 @@
22
import asyncio
33
import logging
44
import os
5+
from enum import Enum
56
from typing import Optional, Union
67

78
from azure.core.credentials import AzureKeyCredential
89
from azure.core.credentials_async import AsyncTokenCredential
910
from azure.identity.aio import AzureDeveloperCliCredential, get_bearer_token_provider
10-
from rich.logging import RichHandler
1111
from openai import AsyncAzureOpenAI, AsyncOpenAI
12+
from rich.logging import RichHandler
1213

1314
from load_azd_env import load_azd_env
1415
from prepdocslib.blobmanager import BlobManager
@@ -31,11 +32,14 @@
3132
LocalListFileStrategy,
3233
)
3334
from prepdocslib.parser import Parser
34-
from prepdocslib.pdfparser import DocumentAnalysisParser, LocalPdfParser, MediaDescriptionStrategy
35+
from prepdocslib.pdfparser import (
36+
DocumentAnalysisParser,
37+
LocalPdfParser,
38+
MediaDescriptionStrategy,
39+
)
3540
from prepdocslib.strategy import DocumentAction, SearchInfo, Strategy
3641
from prepdocslib.textparser import TextParser
3742
from prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter
38-
from enum import Enum
3943

4044
logger = logging.getLogger("scripts")
4145

@@ -86,11 +90,14 @@ def setup_blob_manager(
8690
subscription_id: str,
8791
store_page_images: bool,
8892
storage_key: Union[str, None] = None,
93+
image_storage_container: Union[str, None] = None,  # Forwarded to BlobManager as image_container
8994
):
9095
storage_creds: Union[AsyncTokenCredential, str] = azure_credential if storage_key is None else storage_key
96+
9197
return BlobManager(
9298
endpoint=f"https://{storage_account}.blob.core.windows.net",
9399
container=storage_container,
100+
image_container=image_storage_container,
94101
account=storage_account,
95102
credential=storage_creds,
96103
resourceGroup=storage_resource_group,
@@ -178,6 +185,7 @@ def setup_embeddings_service(
178185
disable_batch=disable_batch_vectors,
179186
)
180187

188+
181189
def setup_openai_client(
182190
openai_host: OpenAIHost,
183191
azure_openai_api_key: Union[str, None] = None,
@@ -231,6 +239,7 @@ def setup_openai_client(
231239
)
232240
return openai_client
233241

242+
234243
def setup_file_processors(
235244
azure_credential: AsyncTokenCredential,
236245
document_intelligence_service: Union[str, None],
@@ -255,7 +264,15 @@ def setup_file_processors(
255264
doc_int_parser = DocumentAnalysisParser(
256265
endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/",
257266
credential=documentintelligence_creds,
258-
media_description_strategy = MediaDescriptionStrategy.OPENAI if use_multimodal else MediaDescriptionStrategy.CONTENTUNDERSTANDING if use_content_understanding else MediaDescriptionStrategy.NONE,
267+
media_description_strategy=(
268+
MediaDescriptionStrategy.OPENAI
269+
if use_multimodal
270+
else (
271+
MediaDescriptionStrategy.CONTENTUNDERSTANDING
272+
if use_content_understanding
273+
else MediaDescriptionStrategy.NONE
274+
)
275+
),
259276
openai_client=openai_client,
260277
openai_model=openai_model,
261278
openai_deployment=openai_deployment,
@@ -384,7 +401,9 @@ async def main(strategy: Strategy, setup_index: bool = True):
384401
args = parser.parse_args()
385402

386403
if args.verbose:
387-
logging.basicConfig(format="%(message)s", datefmt="[%X]", handlers=[RichHandler(rich_tracebacks=True)], level=logging.WARNING)
404+
logging.basicConfig(
405+
format="%(message)s", datefmt="[%X]", handlers=[RichHandler(rich_tracebacks=True)], level=logging.WARNING
406+
)
388407
# We only set the level to INFO for our logger,
389408
# to avoid seeing the noisy INFO level logs from the Azure SDKs
390409
logger.setLevel(logging.DEBUG)
@@ -448,6 +467,7 @@ async def main(strategy: Strategy, setup_index: bool = True):
448467
subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"],
449468
store_page_images=use_multimodal,
450469
storage_key=clean_key_if_exists(args.storagekey),
470+
image_storage_container=os.environ.get("AZURE_IMAGESTORAGE_CONTAINER"), # Pass the image container
451471
)
452472
list_file_strategy = setup_list_file_strategy(
453473
azure_credential=azd_credential,
@@ -460,7 +480,7 @@ async def main(strategy: Strategy, setup_index: bool = True):
460480

461481
openai_host = OpenAIHost(os.environ["OPENAI_HOST"])
462482
# https://learn.microsoft.com/azure/ai-services/openai/api-version-deprecation#latest-ga-api-release
463-
azure_openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION") or "2024-06-01"
483+
azure_openai_api_version = os.getenv("AZURE_OPENAI_API_VERSION") or "2024-06-01"
464484
emb_model_dimensions = 1536
465485
if os.getenv("AZURE_OPENAI_EMB_DIMENSIONS"):
466486
emb_model_dimensions = int(os.environ["AZURE_OPENAI_EMB_DIMENSIONS"])
@@ -490,7 +510,6 @@ async def main(strategy: Strategy, setup_index: bool = True):
490510
openai_organization=os.getenv("OPENAI_ORGANIZATION"),
491511
)
492512

493-
494513
ingestion_strategy: Strategy
495514
if use_int_vectorization:
496515

app/backend/prepdocslib/blobmanager.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,13 @@ def __init__(
3535
resourceGroup: str,
3636
subscriptionId: str,
3737
store_page_images: bool = False,
38+
image_container: Optional[str] = None,  # Container for extracted images; upload_document_image raises ValueError when unset
3839
):
3940
self.endpoint = endpoint
4041
self.credential = credential
4142
self.account = account
4243
self.container = container
44+
self.image_container = image_container
4345
self.store_page_images = store_page_images
4446
self.resourceGroup = resourceGroup
4547
self.subscriptionId = subscriptionId
@@ -60,11 +62,17 @@ async def upload_blob(self, file: File) -> Optional[list[str]]:
6062
blob_client = await container_client.upload_blob(blob_name, reopened_file, overwrite=True)
6163
file.url = blob_client.url
6264
return None
63-
64-
async def upload_document_image(self, document_file: File, image_bytes: bytes, image_filename: str) -> Optional[str]:
65+
66+
async def upload_document_image(
67+
self, document_file: File, image_bytes: bytes, image_filename: str
68+
) -> Optional[str]:
69+
if self.image_container is None:
70+
raise ValueError(
71+
"Image container name is not set. Re-run `azd provision` to automatically set up the images container."
72+
)
6573
async with BlobServiceClient(
6674
account_url=self.endpoint, credential=self.credential, max_single_put_size=4 * 1024 * 1024
67-
) as service_client, service_client.get_container_client(self.container) as container_client:
75+
) as service_client, service_client.get_container_client(self.image_container) as container_client:
6876
if not await container_client.exists():
6977
await container_client.create_container()
7078
blob_name = BlobManager.blob_name_from_file_name(document_file.content.name) + "/" + image_filename

app/backend/prepdocslib/page.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
1-
from typing import Sequence
21
from dataclasses import dataclass, field
32

43

54
@dataclass
65
class ImageOnPage:
76
bytes: bytes
8-
bbox: tuple[float, float, float, float]
7+
bbox: list[float, float, float, float] # Pixels
98
filename: str
109
description: str
1110
url: str | None = None
1211
embedding: list[float] | None = None
1312

13+
1414
@dataclass
1515
class Page:
1616
"""
@@ -21,11 +21,13 @@ class Page:
2121
offset (int): If the text of the entire Document was concatenated into a single string, the index of the first character on the page. For example, if page 1 had the text "hello" and page 2 had the text "world", the offset of page 2 is 5 (the length of "hello")
2222
text (str): The text of the page
2323
"""
24+
2425
page_num: int
2526
offset: int
2627
text: str
2728
images: list[ImageOnPage] = field(default_factory=list)
2829

30+
2931
@dataclass
3032
class SplitPage:
3133
"""
@@ -35,6 +37,7 @@ class SplitPage:
3537
page_num (int): Page number (0-indexed)
3638
text (str): The text of the section
3739
"""
40+
3841
page_num: int
3942
text: str
40-
images: list[ImageOnPage] = field(default_factory=list)
43+
images: list[ImageOnPage] = field(default_factory=list)

0 commit comments

Comments
 (0)