2
2
import asyncio
3
3
import logging
4
4
import os
5
+ from enum import Enum
5
6
from typing import Optional , Union
6
7
7
8
from azure .core .credentials import AzureKeyCredential
8
9
from azure .core .credentials_async import AsyncTokenCredential
9
10
from azure .identity .aio import AzureDeveloperCliCredential , get_bearer_token_provider
10
- from rich .logging import RichHandler
11
11
from openai import AsyncAzureOpenAI , AsyncOpenAI
12
+ from rich .logging import RichHandler
12
13
13
14
from load_azd_env import load_azd_env
14
15
from prepdocslib .blobmanager import BlobManager
31
32
LocalListFileStrategy ,
32
33
)
33
34
from prepdocslib .parser import Parser
34
- from prepdocslib .pdfparser import DocumentAnalysisParser , LocalPdfParser , MediaDescriptionStrategy
35
+ from prepdocslib .pdfparser import (
36
+ DocumentAnalysisParser ,
37
+ LocalPdfParser ,
38
+ MediaDescriptionStrategy ,
39
+ )
35
40
from prepdocslib .strategy import DocumentAction , SearchInfo , Strategy
36
41
from prepdocslib .textparser import TextParser
37
42
from prepdocslib .textsplitter import SentenceTextSplitter , SimpleTextSplitter
38
- from enum import Enum
39
43
40
44
logger = logging .getLogger ("scripts" )
41
45
@@ -86,11 +90,14 @@ def setup_blob_manager(
86
90
subscription_id : str ,
87
91
store_page_images : bool ,
88
92
storage_key : Union [str , None ] = None ,
93
+ image_storage_container : Union [str , None ] = None , # Added this parameter
89
94
):
90
95
storage_creds : Union [AsyncTokenCredential , str ] = azure_credential if storage_key is None else storage_key
96
+
91
97
return BlobManager (
92
98
endpoint = f"https://{ storage_account } .blob.core.windows.net" ,
93
99
container = storage_container ,
100
+ image_container = image_storage_container ,
94
101
account = storage_account ,
95
102
credential = storage_creds ,
96
103
resourceGroup = storage_resource_group ,
@@ -178,6 +185,7 @@ def setup_embeddings_service(
178
185
disable_batch = disable_batch_vectors ,
179
186
)
180
187
188
+
181
189
def setup_openai_client (
182
190
openai_host : OpenAIHost ,
183
191
azure_openai_api_key : Union [str , None ] = None ,
@@ -231,6 +239,7 @@ def setup_openai_client(
231
239
)
232
240
return openai_client
233
241
242
+
234
243
def setup_file_processors (
235
244
azure_credential : AsyncTokenCredential ,
236
245
document_intelligence_service : Union [str , None ],
@@ -255,7 +264,15 @@ def setup_file_processors(
255
264
doc_int_parser = DocumentAnalysisParser (
256
265
endpoint = f"https://{ document_intelligence_service } .cognitiveservices.azure.com/" ,
257
266
credential = documentintelligence_creds ,
258
- media_description_strategy = MediaDescriptionStrategy .OPENAI if use_multimodal else MediaDescriptionStrategy .CONTENTUNDERSTANDING if use_content_understanding else MediaDescriptionStrategy .NONE ,
267
+ media_description_strategy = (
268
+ MediaDescriptionStrategy .OPENAI
269
+ if use_multimodal
270
+ else (
271
+ MediaDescriptionStrategy .CONTENTUNDERSTANDING
272
+ if use_content_understanding
273
+ else MediaDescriptionStrategy .NONE
274
+ )
275
+ ),
259
276
openai_client = openai_client ,
260
277
openai_model = openai_model ,
261
278
openai_deployment = openai_deployment ,
@@ -384,7 +401,9 @@ async def main(strategy: Strategy, setup_index: bool = True):
384
401
args = parser .parse_args ()
385
402
386
403
if args .verbose :
387
- logging .basicConfig (format = "%(message)s" , datefmt = "[%X]" , handlers = [RichHandler (rich_tracebacks = True )], level = logging .WARNING )
404
+ logging .basicConfig (
405
+ format = "%(message)s" , datefmt = "[%X]" , handlers = [RichHandler (rich_tracebacks = True )], level = logging .WARNING
406
+ )
388
407
# We only set the level to DEBUG for our own logger,
389
408
# to avoid seeing the noisy INFO level logs from the Azure SDKs
390
409
logger .setLevel (logging .DEBUG )
@@ -448,6 +467,7 @@ async def main(strategy: Strategy, setup_index: bool = True):
448
467
subscription_id = os .environ ["AZURE_SUBSCRIPTION_ID" ],
449
468
store_page_images = use_multimodal ,
450
469
storage_key = clean_key_if_exists (args .storagekey ),
470
+ image_storage_container = os .environ .get ("AZURE_IMAGESTORAGE_CONTAINER" ), # Pass the image container
451
471
)
452
472
list_file_strategy = setup_list_file_strategy (
453
473
azure_credential = azd_credential ,
@@ -460,7 +480,7 @@ async def main(strategy: Strategy, setup_index: bool = True):
460
480
461
481
openai_host = OpenAIHost (os .environ ["OPENAI_HOST" ])
462
482
# https://learn.microsoft.com/azure/ai-services/openai/api-version-deprecation#latest-ga-api-release
463
- azure_openai_api_version = os .getenv ("AZURE_OPENAI_API_VERSION" ) or "2024-06-01"
483
+ azure_openai_api_version = os .getenv ("AZURE_OPENAI_API_VERSION" ) or "2024-06-01"
464
484
emb_model_dimensions = 1536
465
485
if os .getenv ("AZURE_OPENAI_EMB_DIMENSIONS" ):
466
486
emb_model_dimensions = int (os .environ ["AZURE_OPENAI_EMB_DIMENSIONS" ])
@@ -490,7 +510,6 @@ async def main(strategy: Strategy, setup_index: bool = True):
490
510
openai_organization = os .getenv ("OPENAI_ORGANIZATION" ),
491
511
)
492
512
493
-
494
513
ingestion_strategy : Strategy
495
514
if use_int_vectorization :
496
515
0 commit comments