11import base64
2- import importlib
32import json
3+ import logging
44from collections .abc import Iterable
55from dataclasses import dataclass , field
66from typing import Any
1010
1111from document_extractor import function_app as document_extractor
1212from figure_processor import function_app as figure_processor
13+ from prepdocslib .fileprocessor import FileProcessor
14+ from prepdocslib .textparser import TextParser
15+ from prepdocslib .textsplitter import SentenceTextSplitter
1316from tests .mocks import TEST_PNG_BYTES
1417from text_processor import function_app as text_processor
1518
@@ -75,8 +78,6 @@ async def parse(self, content: Any):
7578 page = document_extractor .Page (page_num = 0 , offset = 0 , text = page_text , images = [figure ])
7679
7780 # Set up mock file processors and settings
78- from prepdocslib .fileprocessor import FileProcessor
79-
8081 mock_file_processors = {
8182 ".pdf" : FileProcessor (StubParser ([page ]), None ),
8283 }
@@ -123,8 +124,6 @@ async def parse(self, content: Any):
123124
124125@pytest .mark .asyncio
125126async def test_document_extractor_requires_single_record (monkeypatch : pytest .MonkeyPatch ) -> None :
126- from prepdocslib .fileprocessor import FileProcessor
127-
128127 mock_settings = document_extractor .GlobalSettings (
129128 file_processors = {".pdf" : FileProcessor (None , None )},
130129 azure_credential = object (),
@@ -138,8 +137,6 @@ async def test_document_extractor_requires_single_record(monkeypatch: pytest.Mon
138137
139138@pytest .mark .asyncio
140139async def test_document_extractor_handles_processing_exception (monkeypatch : pytest .MonkeyPatch ) -> None :
141- from prepdocslib .fileprocessor import FileProcessor
142-
143140 async def failing_process (data : dict [str , Any ]) -> dict [str , Any ]:
144141 raise RuntimeError ("boom" )
145142
@@ -179,8 +176,6 @@ async def test_document_extractor_invalid_json_returns_error() -> None:
179176
180177@pytest .mark .asyncio
181178async def test_document_extractor_process_document_http_error (monkeypatch : pytest .MonkeyPatch ) -> None :
182- from prepdocslib .fileprocessor import FileProcessor
183-
184179 class FailingParser :
185180 async def parse (self , content ):
186181 raise document_extractor .HttpResponseError (message = "fail" )
@@ -215,11 +210,10 @@ def test_document_extractor_missing_file_data() -> None:
215210
def test_document_extractor_managed_identity_reload(monkeypatch: pytest.MonkeyPatch) -> None:
    """Configuring with AZURE_CLIENT_ID set yields a ManagedIdentityCredential.

    The module-level settings are re-configured without the variable afterwards
    so later tests do not inherit the managed-identity configuration.
    """
    monkeypatch.setenv("AZURE_CLIENT_ID", "client-123")
    try:
        document_extractor.configure_global_settings()
        assert isinstance(
            document_extractor.settings.azure_credential,
            document_extractor.ManagedIdentityCredential,
        )
    finally:
        # Run the reset even when the assertion fails; otherwise the global
        # settings would leak into whichever test runs next.
        monkeypatch.delenv("AZURE_CLIENT_ID", raising=False)
        document_extractor.configure_global_settings()
223217
224218
225219@pytest .mark .asyncio
@@ -297,64 +291,82 @@ def test_figure_processor_initialisation_with_env(monkeypatch: pytest.MonkeyPatc
297291 monkeypatch .setenv ("AZURE_OPENAI_CHATGPT_DEPLOYMENT" , "deploy" )
298292 monkeypatch .setenv ("AZURE_VISION_ENDPOINT" , "https://vision" )
299293
300- import sys
301- from pathlib import Path
302-
303- fp_root = Path (__file__ ).parent .parent / "app" / "functions" / "figure_processor"
304- sys .path .insert (0 , str (fp_root ))
305-
306- fp_servicesetup = importlib .import_module ("prepdocslib.servicesetup" )
307- fp_embeddings = importlib .import_module ("prepdocslib.embeddings" )
294+ call_state : dict [str , Any ] = {}
295+
296+ class StubCredential :
297+ def __init__ (self , client_id : str | None = None ):
298+ call_state ["credential_client_id" ] = client_id
299+
300+ def fake_setup_blob_manager (** kwargs : Any ) -> str :
301+ call_state ["blob_manager_kwargs" ] = kwargs
302+ return "blob"
303+
304+ def fake_setup_figure_processor (** kwargs : Any ) -> str :
305+ call_state ["figure_processor_kwargs" ] = kwargs
306+ return "figproc"
307+
308+ def fake_setup_openai_client (
309+ * ,
310+ openai_host : Any ,
311+ azure_credential : Any ,
312+ azure_openai_service : str | None ,
313+ azure_openai_custom_url : str | None ,
314+ ) -> tuple [str , None ]:
315+ call_state ["openai_client_args" ] = {
316+ "openai_host" : openai_host ,
317+ "azure_credential" : azure_credential ,
318+ "azure_openai_service" : azure_openai_service ,
319+ "azure_openai_custom_url" : azure_openai_custom_url ,
320+ }
321+ return ("openai-client" , None )
308322
309- monkeypatch .setattr (fp_servicesetup , "setup_blob_manager" , lambda ** _ : "blob" )
310- monkeypatch .setattr (fp_servicesetup , "setup_figure_processor" , lambda ** _ : "figproc" )
311- monkeypatch .setattr (fp_servicesetup , "setup_openai_client" , lambda ** _ : ("openai-client" , None ))
323+ def fake_get_bearer_token_provider (credential : Any , scope : str ):
324+ call_state ["token_scope" ] = scope
325+ call_state ["token_credential" ] = credential
326+ return lambda : "token"
312327
313328 class DummyImageEmbeddings :
314329 def __init__ (self , endpoint : str , token_provider ):
315330 self .endpoint = endpoint
316331 self .token_provider = token_provider
317332
318- monkeypatch .setattr (fp_embeddings , "ImageEmbeddings" , DummyImageEmbeddings )
319- monkeypatch .setattr ("azure.identity.aio.get_bearer_token_provider" , lambda * _ , ** __ : lambda : "token" )
333+ monkeypatch .setattr (figure_processor , "ManagedIdentityCredential" , StubCredential )
334+ monkeypatch .setattr (figure_processor , "setup_blob_manager" , fake_setup_blob_manager )
335+ monkeypatch .setattr (figure_processor , "setup_figure_processor" , fake_setup_figure_processor )
336+ monkeypatch .setattr (figure_processor , "setup_openai_client" , fake_setup_openai_client )
337+ monkeypatch .setattr (figure_processor , "get_bearer_token_provider" , fake_get_bearer_token_provider )
338+ monkeypatch .setattr (figure_processor , "ImageEmbeddings" , DummyImageEmbeddings )
339+ monkeypatch .setattr (figure_processor , "settings" , None )
320340
321- module = importlib .reload (figure_processor )
322- module .configure_global_settings ()
341+ figure_processor .configure_global_settings ()
323342
324- assert module .settings .blob_manager == "blob"
325- assert module .settings .figure_processor == "figproc"
326- assert isinstance (module .settings .image_embeddings , DummyImageEmbeddings )
343+ assert figure_processor .settings is not None
344+ assert figure_processor .settings .blob_manager == "blob"
345+ assert figure_processor .settings .figure_processor == "figproc"
346+ embeddings = figure_processor .settings .image_embeddings
347+ assert isinstance (embeddings , DummyImageEmbeddings )
348+ assert embeddings .endpoint == "https://vision"
349+ assert embeddings .token_provider () == "token"
327350
328- # Reset module to default configuration for subsequent tests
329- for var in [
330- "AZURE_CLIENT_ID" ,
331- "AZURE_STORAGE_ACCOUNT" ,
332- "AZURE_IMAGESTORAGE_CONTAINER" ,
333- "USE_MULTIMODAL" ,
334- "AZURE_OPENAI_SERVICE" ,
335- "AZURE_OPENAI_CHATGPT_DEPLOYMENT" ,
336- "AZURE_VISION_ENDPOINT" ,
337- ]:
338- monkeypatch .delenv (var , raising = False )
339- sys .path .remove (str (fp_root ))
340- importlib .reload (figure_processor )
351+ assert call_state ["credential_client_id" ] == "client-456"
352+ assert call_state ["blob_manager_kwargs" ]["storage_account" ] == "acct"
353+ assert call_state ["figure_processor_kwargs" ]["use_multimodal" ] is True
354+ assert call_state ["token_scope" ] == "https://cognitiveservices.azure.com/.default"
355+ assert isinstance (call_state ["token_credential" ], StubCredential )
356+ assert call_state ["openai_client_args" ]["azure_openai_service" ] == "svc"
357+ assert call_state ["openai_client_args" ]["azure_credential" ] is call_state ["token_credential" ]
341358
342359
def test_figure_processor_warns_when_openai_incomplete(monkeypatch: pytest.MonkeyPatch, caplog) -> None:
    """Figure processor is created with warning when USE_MULTIMODAL is true but OpenAI config is incomplete."""
    monkeypatch.setenv("USE_MULTIMODAL", "true")
    monkeypatch.setenv("AZURE_STORAGE_ACCOUNT", "acct")
    monkeypatch.setenv("AZURE_IMAGESTORAGE_CONTAINER", "images")
    # Make sure the OpenAI configuration really is missing, even if the
    # surrounding environment happens to define it.
    # NOTE(review): these are the OpenAI variables used elsewhere in this file;
    # confirm there are no others that configure_global_settings() reads.
    monkeypatch.delenv("AZURE_OPENAI_SERVICE", raising=False)
    monkeypatch.delenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT", raising=False)
    # Patch the module-level settings through monkeypatch so that whatever
    # configure_global_settings() stores there is rolled back after the test.
    monkeypatch.setattr(figure_processor, "settings", None)

    # Pin the capture level so the check does not depend on logger state left
    # behind by other tests.
    with caplog.at_level(logging.WARNING):
        figure_processor.configure_global_settings()

    # A FigureProcessor object is created even with incomplete config
    assert figure_processor.settings is not None
    assert figure_processor.settings.figure_processor is not None
    assert "USE_MULTIMODAL is true but Azure OpenAI configuration incomplete" in caplog.text
358370
359371
360372@pytest .mark .asyncio
@@ -371,9 +383,6 @@ async def create_embeddings(self, texts: list[str]) -> list[list[float]]:
371383 return [[0.41 , 0.42 , 0.43 ] for _ in texts ]
372384
373385 # Set up mock file processors with stub splitter
374- from prepdocslib .fileprocessor import FileProcessor
375- from prepdocslib .textparser import TextParser
376-
377386 mock_file_processors = {
378387 ".pdf" : FileProcessor (TextParser (), StubSplitter ()),
379388 }
@@ -562,21 +571,12 @@ async def test_text_processor_invalid_json(monkeypatch: pytest.MonkeyPatch) -> N
@pytest.mark.asyncio
async def test_text_processor_with_client_id(monkeypatch: pytest.MonkeyPatch) -> None:
    """Test text processor uses ManagedIdentityCredential with client ID."""
    # Set the AZURE_CLIENT_ID environment variable
    monkeypatch.setenv("AZURE_CLIENT_ID", "test-client-id")
    try:
        # Currently this only checks that configuration succeeds with a client ID.
        # TODO: assert on the resulting credential type once the settings
        # attribute that exposes it is confirmed.
        text_processor.configure_global_settings()
    finally:
        # Always restore the default configuration, even if the call above
        # raises, so later tests do not see the client-ID-based settings.
        monkeypatch.delenv("AZURE_CLIENT_ID", raising=False)
        text_processor.configure_global_settings()
580580
581581
582582@pytest .mark .asyncio
@@ -589,10 +589,6 @@ async def test_text_processor_embeddings_setup(monkeypatch: pytest.MonkeyPatch)
589589@pytest .mark .asyncio
590590async def test_text_processor_no_sections (monkeypatch : pytest .MonkeyPatch ) -> None :
591591 """Test text processor handles empty sections."""
592- from prepdocslib .fileprocessor import FileProcessor
593- from prepdocslib .textparser import TextParser
594- from prepdocslib .textsplitter import SentenceTextSplitter
595-
596592 mock_file_processors = {
597593 ".pdf" : FileProcessor (TextParser (), SentenceTextSplitter ()),
598594 }
@@ -640,12 +636,6 @@ def mock_process_text(pages, file, splitter, category):
640636@pytest .mark .asyncio
641637async def test_text_processor_embeddings_not_initialized (monkeypatch : pytest .MonkeyPatch , caplog ) -> None :
642638 """Test text processor logs warning when embeddings requested but not initialized."""
643- import logging
644-
645- from prepdocslib .fileprocessor import FileProcessor
646- from prepdocslib .textparser import TextParser
647- from prepdocslib .textsplitter import SentenceTextSplitter
648-
649639 mock_file_processors = {
650640 ".pdf" : FileProcessor (TextParser (), SentenceTextSplitter ()),
651641 }
@@ -690,10 +680,6 @@ def mock_process_text(pages, file, splitter, category):
690680@pytest .mark .asyncio
691681async def test_text_processor_empty_chunk_skipped (monkeypatch : pytest .MonkeyPatch ) -> None :
692682 """Test text processor skips empty chunks."""
693- from prepdocslib .fileprocessor import FileProcessor
694- from prepdocslib .textparser import TextParser
695- from prepdocslib .textsplitter import SentenceTextSplitter
696-
697683 mock_file_processors = {
698684 ".pdf" : FileProcessor (TextParser (), SentenceTextSplitter ()),
699685 }
@@ -744,10 +730,6 @@ def mock_process_text(pages, file, splitter, category):
744730@pytest .mark .asyncio
745731async def test_text_processor_with_multimodal_embeddings (monkeypatch : pytest .MonkeyPatch ) -> None :
746732 """Test text processor includes image embeddings when use_multimodal is true."""
747- from prepdocslib .fileprocessor import FileProcessor
748- from prepdocslib .textparser import TextParser
749- from prepdocslib .textsplitter import SentenceTextSplitter
750-
751733 mock_file_processors = {
752734 ".pdf" : FileProcessor (TextParser (), SentenceTextSplitter ()),
753735 }
@@ -810,12 +792,6 @@ def mock_process_text(pages, file, splitter, category):
810792@pytest .mark .asyncio
811793async def test_text_processor_embedding_dimension_mismatch (monkeypatch : pytest .MonkeyPatch , caplog ) -> None :
812794 """Test text processor logs warning when embedding dimensions don't match."""
813- import logging
814-
815- from prepdocslib .fileprocessor import FileProcessor
816- from prepdocslib .textparser import TextParser
817- from prepdocslib .textsplitter import SentenceTextSplitter
818-
819795 mock_embedding_service = type ("MockEmbeddingService" , (), {})()
820796
821797 async def mock_create_embeddings (texts ):
@@ -867,12 +843,6 @@ def mock_process_text(pages, file, splitter, category):
867843@pytest .mark .asyncio
868844async def test_text_processor_embeddings_missing_warning (monkeypatch : pytest .MonkeyPatch , caplog ) -> None :
869845 """Test text processor logs warning when embeddings are requested but missing."""
870- import logging
871-
872- from prepdocslib .fileprocessor import FileProcessor
873- from prepdocslib .textparser import TextParser
874- from prepdocslib .textsplitter import SentenceTextSplitter
875-
876846 mock_embedding_service = type ("MockEmbeddingService" , (), {})()
877847
878848 async def mock_create_embeddings (texts ):
0 commit comments