Skip to content

Commit 983b1f7

Browse files
2 parents b0860ee + e92eba1 commit 983b1f7

32 files changed

+4389
-1234
lines changed

.github/workflows/sync-branches.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ jobs:
1515

1616
steps:
1717
- name: Checkout repository
18-
uses: actions/checkout@v3
18+
uses: actions/checkout@v4
1919
with:
2020
fetch-depth: 0 # Fetch all history for accurate branch comparison
2121

code/backend/batch/batch_push_results.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,19 +28,22 @@ def _get_file_name_from_message(message_body) -> str:
2828
)
2929
def batch_push_results(msg: func.QueueMessage) -> None:
    """Route a queued document event to the matching handler.

    The queue message body is decoded as UTF-8 JSON and dispatched on its
    ``eventType`` field.

    Args:
        msg: Queue message whose body is a JSON document.

    Raises:
        NotImplementedError: If ``eventType`` is neither empty,
            ``Microsoft.Storage.BlobCreated`` nor
            ``Microsoft.Storage.BlobDeleted``.
    """
    message_body = json.loads(msg.get_body().decode("utf-8"))
    logger.info("Process Document Event queue function triggered: %s", message_body)

    event_type = message_body.get("eventType", "")
    # We handle "" in this scenario for backwards compatibility
    # This function is primarily triggered by an Event Grid queue message from the blob storage
    # However, it can also be triggered using a legacy schema from BatchStartProcessing
    if event_type in ("", "Microsoft.Storage.BlobCreated"):
        logger.info("Handling 'Blob Created' event with message body: %s", message_body)
        _process_document_created_event(message_body)

    elif event_type == "Microsoft.Storage.BlobDeleted":
        logger.info("Handling 'Blob Deleted' event with message body: %s", message_body)
        _process_document_deleted_event(message_body)

    else:
        # No exception is being handled at this point, so logger.exception()
        # would attach an empty "NoneType: None" traceback; log a plain error.
        logger.error("Received an unrecognized event type: %s", event_type)
        raise NotImplementedError(f"Unknown event type received: {event_type}")
4548

4649

code/backend/batch/utilities/helpers/azure_form_recognizer_helper.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1+
import logging
12
from azure.core.credentials import AzureKeyCredential
23
from azure.ai.formrecognizer import DocumentAnalysisClient
34
from azure.identity import DefaultAzureCredential
45
import html
56
import traceback
67
from .env_helper import EnvHelper
78

9+
logger = logging.getLogger(__name__)
10+
811

912
class AzureFormRecognizerClient:
1013
def __init__(self) -> None:
@@ -75,6 +78,8 @@ def begin_analyze_document_from_url(
7578
model_id = "prebuilt-layout" if use_layout else "prebuilt-read"
7679

7780
try:
81+
logger.info("Method begin_analyze_document_from_url started")
82+
logger.info(f"Model ID selected: {model_id}")
7883
poller = self.document_analysis_client.begin_analyze_document_from_url(
7984
model_id, document_url=source_url
8085
)
@@ -144,4 +149,7 @@ def begin_analyze_document_from_url(
144149

145150
return page_map
146151
except Exception as e:
152+
logger.exception(f"Exception in begin_analyze_document_from_url: {e}")
147153
raise ValueError(f"Error: {traceback.format_exc()}. Error: {e}")
154+
finally:
155+
logger.info("Method begin_analyze_document_from_url ended")

code/backend/batch/utilities/helpers/config/config_helper.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,21 +190,27 @@ def _set_new_config_properties(config: dict, default_config: dict):
190190
@staticmethod
@functools.cache
def get_active_config_or_default():
    """Build the active ``Config``, starting from the default configuration.

    When ``LOAD_CONFIG_FROM_BLOB_STORAGE`` is set and the config file exists
    in the config container, the stored JSON replaces the default and is
    passed through ``_set_new_config_properties`` together with the default.
    Otherwise the default configuration is used as-is. The result is
    memoized by ``functools.cache``.
    """
    logger.info("Method get_active_config_or_default started")
    env_helper = EnvHelper()
    active = ConfigHelper.get_default_config()

    if env_helper.LOAD_CONFIG_FROM_BLOB_STORAGE:
        logger.info("Loading configuration from Blob Storage")
        storage = AzureBlobStorageClient(container_name=CONFIG_CONTAINER_NAME)

        if not storage.file_exists(CONFIG_FILE_NAME):
            logger.info(
                "Configuration file not found in Blob Storage, using default configuration"
            )
        else:
            logger.info("Configuration file found in Blob Storage")
            defaults = active
            active = json.loads(storage.download_file(CONFIG_FILE_NAME))
            ConfigHelper._set_new_config_properties(active, defaults)

    logger.info("Method get_active_config_or_default ended")
    return Config(active)
209215

210216
@staticmethod

code/backend/batch/utilities/helpers/embedders/integrated_vectorization_embedder.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,16 @@ class IntegratedVectorizationEmbedder(EmbedderBase):
1515
def __init__(self, env_helper: EnvHelper):
    """Keep the supplied environment helper and create a fresh ``LLMHelper``."""
    # Same assignment order as before: env helper first, then the LLM helper.
    self.env_helper = env_helper
    llm: LLMHelper = LLMHelper()
    self.llm_helper = llm
    logger.info("Initialized IntegratedVectorizationEmbedder.")
1819

1920
def embed_file(self, source_url: str, file_name: str = None):
    """Run integrated vectorization for ``source_url``.

    ``file_name`` is only used in the log message; the value returned by
    ``process_using_integrated_vectorization`` is discarded.
    """
    start_message = (
        f"Starting embed_file for source_url: {source_url}, file_name: {file_name}."
    )
    logger.info(start_message)
    self.process_using_integrated_vectorization(source_url=source_url)
2125

2226
def process_using_integrated_vectorization(self, source_url: str):
27+
logger.info(f"Starting integrated vectorization for source_url: {source_url}.")
2328
config = ConfigHelper.get_active_config_or_default()
2429
try:
2530
search_datasource = AzureSearchDatasource(self.env_helper)
@@ -35,14 +40,20 @@ def process_using_integrated_vectorization(self, source_url: str):
3540
self.env_helper.AZURE_SEARCH_INDEXER_NAME,
3641
skillset_name=search_skillset_result.name,
3742
)
43+
logger.info("Integrated vectorization process completed successfully.")
3844
return indexer_result
3945
except Exception as e:
4046
logger.error(f"Error processing {source_url}: {e}")
4147
raise e
4248

4349
def reprocess_all(self):
    """Re-run indexing for all content.

    Runs the configured indexer when it already exists; otherwise falls
    back to ``process_using_integrated_vectorization`` with
    ``source_url="all"``.
    """
    logger.info("Starting reprocess_all operation.")
    indexer = AzureSearchIndexer(self.env_helper)
    indexer_name = self.env_helper.AZURE_SEARCH_INDEXER_NAME

    if not indexer.indexer_exists(indexer_name):
        logger.info("Indexer does not exist. Starting full processing.")
        self.process_using_integrated_vectorization(source_url="all")
        return

    logger.info(f"Running indexer: {indexer_name}.")
    indexer.run_indexer(indexer_name)

code/backend/batch/utilities/helpers/embedders/postgres_embedder.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
class PostgresEmbedder(EmbedderBase):
2222
def __init__(self, blob_client: AzureBlobStorageClient, env_helper: EnvHelper):
23+
logger.info("Initializing PostgresEmbedder.")
2324
self.env_helper = env_helper
2425
self.llm_helper = LLMHelper()
2526
self.azure_postgres_helper = AzurePostgresHelper()
@@ -33,6 +34,7 @@ def __init__(self, blob_client: AzureBlobStorageClient, env_helper: EnvHelper):
3334
self.embedding_configs[ext] = processor
3435

3536
def embed_file(self, source_url: str, file_name: str):
37+
logger.info(f"Embedding file: {file_name} from source: {source_url}")
3638
file_extension = file_name.split(".")[-1].lower()
3739
embedding_config = self.embedding_configs.get(file_extension)
3840
self.__embed(
@@ -48,32 +50,42 @@ def embed_file(self, source_url: str, file_name: str):
4850
def __embed(
    self, source_url: str, file_extension: str, embedding_config: EmbeddingConfig
):
    """Load, chunk, embed and store the documents found at ``source_url``.

    Raises:
        NotImplementedError: If the embedding config asks for advanced image
            processing and ``file_extension`` is one of the configured
            advanced-image-processing types.
    """
    logger.info(f"Starting embedding process for source: {source_url}")

    wants_image_processing = (
        embedding_config.use_advanced_image_processing
        and file_extension
        in self.config.get_advanced_image_processing_image_types()
    )
    if wants_image_processing:
        logger.error("Advanced image processing is not supported in PostgresEmbedder.")
        raise NotImplementedError(
            "Advanced image processing is not supported in PostgresEmbedder."
        )

    logger.info(f"Loading documents from source: {source_url}")
    loaded = self.document_loading.load(source_url, embedding_config.loading)
    chunks: List[SourceDocument] = self.document_chunking.chunk(
        loaded, embedding_config.chunking
    )
    logger.info("Chunked into document chunks.")

    # Each chunk is converted (and embedded) in order, one at a time.
    documents_to_upload = [
        self.__convert_to_search_document(chunk) for chunk in chunks
    ]

    if not documents_to_upload:
        logger.warning("No documents to upload.")
        return

    logger.info(f"Uploading {len(documents_to_upload)} documents to vector store.")
    self.azure_postgres_helper.create_vector_store(documents_to_upload)
7586

7687
def __convert_to_search_document(self, document: SourceDocument):
88+
logger.info(f"Generating embeddings for document ID: {document.id}")
7789
embedded_content = self.llm_helper.generate_embeddings(document.content)
7890
metadata = {
7991
"id": document.id,
@@ -84,6 +96,7 @@ def __convert_to_search_document(self, document: SourceDocument):
8496
"offset": document.offset,
8597
"page_number": document.page_number,
8698
}
99+
logger.info(f"Metadata generated for document ID: {document.id}")
87100
return {
88101
"id": document.id,
89102
"content": document.content,

code/backend/batch/utilities/helpers/embedders/push_embedder.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
class PushEmbedder(EmbedderBase):
2626
def __init__(self, blob_client: AzureBlobStorageClient, env_helper: EnvHelper):
27+
logger.info("Initializing PushEmbedder")
2728
self.env_helper = env_helper
2829
self.llm_helper = LLMHelper()
2930
self.azure_search_helper = AzureSearchHelper()
@@ -33,11 +34,14 @@ def __init__(self, blob_client: AzureBlobStorageClient, env_helper: EnvHelper):
3334
self.blob_client = blob_client
3435
self.config = ConfigHelper.get_active_config_or_default()
3536
self.embedding_configs = {}
37+
logger.info("Loading document processors")
3638
for processor in self.config.document_processors:
3739
ext = processor.document_type.lower()
3840
self.embedding_configs[ext] = processor
41+
logger.info("Document processors loaded")
3942

4043
def embed_file(self, source_url: str, file_name: str):
44+
logger.info(f"Embedding file: {file_name} from URL: {source_url}")
4145
file_extension = file_name.split(".")[-1].lower()
4246
embedding_config = self.embedding_configs.get(file_extension)
4347
self.__embed(
@@ -46,19 +50,22 @@ def embed_file(self, source_url: str, file_name: str):
4650
embedding_config=embedding_config,
4751
)
4852
if file_extension != "url":
53+
logger.info(f"Upserting blob metadata for file: {file_name}")
4954
self.blob_client.upsert_blob_metadata(
5055
file_name, {"embeddings_added": "true"}
5156
)
5257

5358
def __embed(
5459
self, source_url: str, file_extension: str, embedding_config: EmbeddingConfig
5560
):
61+
logger.info(f"Processing embedding for file extension: {file_extension}")
5662
documents_to_upload: List[SourceDocument] = []
5763
if (
5864
embedding_config.use_advanced_image_processing
5965
and file_extension
6066
in self.config.get_advanced_image_processing_image_types()
6167
):
68+
logger.info(f"Using advanced image processing for: {source_url}")
6269
caption = self.__generate_image_caption(source_url)
6370
caption_vector = self.llm_helper.generate_embeddings(caption)
6471

@@ -69,6 +76,7 @@ def __embed(
6976
)
7077
)
7178
else:
79+
logger.info(f"Loading documents from source: {source_url}")
7280
documents: List[SourceDocument] = self.document_loading.load(
7381
source_url, embedding_config.loading
7482
)
@@ -81,6 +89,7 @@ def __embed(
8189

8290
# Upload documents (which are chunks) to search index in batches
8391
if documents_to_upload:
92+
logger.info("Uploading documents in batches")
8493
batch_size = self.env_helper.AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE
8594
search_client = self.azure_search_helper.get_search_client()
8695
for i in range(0, len(documents_to_upload), batch_size):
@@ -93,6 +102,7 @@ def __embed(
93102
logger.warning("No documents to upload.")
94103

95104
def __generate_image_caption(self, source_url):
105+
logger.info(f"Generating image caption for URL: {source_url}")
96106
model = self.env_helper.AZURE_OPENAI_VISION_MODEL
97107
caption_system_message = """You are an assistant that generates rich descriptions of images.
98108
You need to be accurate in the information you extract and detailed in the descriptons you generate.
@@ -116,9 +126,11 @@ def __generate_image_caption(self, source_url):
116126

117127
response = self.llm_helper.get_chat_completion(messages, model)
118128
caption = response.choices[0].message.content
129+
logger.info("Caption generation completed")
119130
return caption
120131

121132
def __convert_to_search_document(self, document: SourceDocument):
133+
logger.info(f"Converting document ID {document.id} to search document format")
122134
embedded_content = self.llm_helper.generate_embeddings(document.content)
123135
metadata = {
124136
self.env_helper.AZURE_SEARCH_FIELDS_ID: document.id,
@@ -151,6 +163,7 @@ def __create_image_document(
151163
content: str,
152164
content_vector: List[float],
153165
):
166+
logger.info(f"Creating image document for source URL: {source_url}")
154167
parsed_url = urlparse(source_url)
155168

156169
file_url = parsed_url.scheme + "://" + parsed_url.netloc + parsed_url.path

code/backend/batch/utilities/helpers/env_helper.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,14 @@ def __load_config(self, **kwargs) -> None:
356356

357357
self.PROMPT_FLOW_DEPLOYMENT_NAME = os.getenv("PROMPT_FLOW_DEPLOYMENT_NAME", "")
358358

359+
self.OPEN_AI_FUNCTIONS_SYSTEM_PROMPT = os.getenv(
360+
"OPEN_AI_FUNCTIONS_SYSTEM_PROMPT", ""
361+
)
362+
self.SEMENTIC_KERNEL_SYSTEM_PROMPT = os.getenv(
363+
"SEMENTIC_KERNEL_SYSTEM_PROMPT", ""
364+
)
365+
logger.info("Initializing EnvHelper completed")
366+
359367
def is_chat_model(self):
360368
if "gpt-4" in self.AZURE_OPENAI_MODEL_NAME.lower():
361369
return True

code/backend/batch/utilities/helpers/llm_helper.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import logging
12
from openai import AzureOpenAI
23
from typing import List, Union, cast
34
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
@@ -10,9 +11,12 @@
1011
from azure.identity import DefaultAzureCredential
1112
from .env_helper import EnvHelper
1213

14+
logger = logging.getLogger(__name__)
15+
1316

1417
class LLMHelper:
1518
def __init__(self):
19+
logger.info("Initializing LLMHelper")
1620
self.env_helper: EnvHelper = EnvHelper()
1721
self.auth_type_keys = self.env_helper.is_auth_type_keys()
1822
self.token_provider = self.env_helper.AZURE_TOKEN_PROVIDER
@@ -38,6 +42,8 @@ def __init__(self):
3842
)
3943
self.embedding_model = self.env_helper.AZURE_OPENAI_EMBEDDING_MODEL
4044

45+
logger.info("Initializing LLMHelper completed")
46+
4147
def get_llm(self):
4248
if self.auth_type_keys:
4349
return AzureChatOpenAI(

code/backend/batch/utilities/orchestrator/lang_chain_agent.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ async def orchestrate(
5656
self, user_message: str, chat_history: List[dict], **kwargs: dict
5757
) -> list[dict]:
5858

59+
logger.info("Method orchestrate of lang_chain_agent started")
5960
# Call Content Safety tool
6061
if self.config.prompts.enable_content_safety:
6162
if response := self.call_content_safety_input(user_message):
@@ -122,4 +123,5 @@ async def orchestrate(
122123
answer=answer.answer,
123124
source_documents=answer.source_documents,
124125
)
126+
logger.info("Method orchestrate of lang_chain_agent ended")
125127
return messages

0 commit comments

Comments
 (0)