More code cleanup

pamelafox · pamelafox · commit 1db5f14c03e1 · 2025-11-11T15:29:05.000-08:00
diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py
@@ -26,6 +26,7 @@
 from prepdocslib.parser import Parser
 from prepdocslib.servicesetup import (
     OpenAIHost,
+    clean_key_if_exists,
     select_parser,
     setup_blob_manager,
     setup_embeddings_service,
@@ -41,13 +42,6 @@
 logger = logging.getLogger("scripts")
 
 
-def clean_key_if_exists(key: Optional[str]) -> Optional[str]:
-    """Remove leading and trailing whitespace from a key if it exists. If the key is empty, return None."""
-    if key is not None and key.strip() != "":
-        return key.strip()
-    return None
-
-
 async def check_search_service_connectivity(search_service: str) -> bool:
     """Check if the search service is accessible by hitting the /ping endpoint."""
     ping_url = f"https://{search_service}.search.windows.net/ping"
diff --git a/app/backend/prepdocslib/figureprocessor.py b/app/backend/prepdocslib/figureprocessor.py
@@ -40,63 +40,61 @@ def __init__(
         openai_deployment: str | None = None,
         content_understanding_endpoint: str | None = None,
     ) -> None:
-        self._credential = credential
+        self.credential = credential
         self.strategy = strategy
-        self._openai_client = openai_client
-        self._openai_model = openai_model
-        self._openai_deployment = openai_deployment
-        self._content_understanding_endpoint = content_understanding_endpoint
-        self._media_describer: MediaDescriber | None = None
-        self._content_understanding_ready = False
+        self.openai_client = openai_client
+        self.openai_model = openai_model
+        self.openai_deployment = openai_deployment
+        self.content_understanding_endpoint = content_understanding_endpoint
+        self.media_describer: MediaDescriber | None = None
+        self.content_understanding_ready = False
 
     async def get_media_describer(self) -> MediaDescriber | None:
         """Return (and lazily create) the media describer for this processor."""
 
         if self.strategy == MediaDescriptionStrategy.NONE:
             return None
 
-        if self._media_describer is not None:
-            return self._media_describer
+        if self.media_describer is not None:
+            return self.media_describer
 
         if self.strategy == MediaDescriptionStrategy.CONTENTUNDERSTANDING:
-            if self._content_understanding_endpoint is None:
+            if self.content_understanding_endpoint is None:
                 raise ValueError("Content Understanding strategy requires an endpoint")
-            if self._credential is None:
+            if self.credential is None:
                 raise ValueError("Content Understanding strategy requires a credential")
-            if isinstance(self._credential, AzureKeyCredential):
+            if isinstance(self.credential, AzureKeyCredential):
                 raise ValueError(
                     "Content Understanding does not support key credentials; provide a token credential instead"
                 )
-            self._media_describer = ContentUnderstandingDescriber(
-                self._content_understanding_endpoint, self._credential
-            )
-            return self._media_describer
+            self.media_describer = ContentUnderstandingDescriber(self.content_understanding_endpoint, self.credential)
+            return self.media_describer
 
         if self.strategy == MediaDescriptionStrategy.OPENAI:
-            if self._openai_client is None or self._openai_model is None:
+            if self.openai_client is None or self.openai_model is None:
                 raise ValueError("OpenAI strategy requires both a client and a model name")
-            self._media_describer = MultimodalModelDescriber(
-                self._openai_client, model=self._openai_model, deployment=self._openai_deployment
+            self.media_describer = MultimodalModelDescriber(
+                self.openai_client, model=self.openai_model, deployment=self.openai_deployment
             )
-            return self._media_describer
+            return self.media_describer
 
         logger.warning("Unknown media description strategy '%s'; skipping description", self.strategy)
         return None
 
     def mark_content_understanding_ready(self) -> None:
         """Record that the Content Understanding analyzer exists to avoid recreating it."""
 
-        self._content_understanding_ready = True
+        self.content_understanding_ready = True
 
     async def describe(self, image_bytes: bytes) -> str | None:
         """Generate a description for the provided image bytes if a describer is available."""
 
         describer = await self.get_media_describer()
         if describer is None:
             return None
-        if isinstance(describer, ContentUnderstandingDescriber) and not self._content_understanding_ready:
+        if isinstance(describer, ContentUnderstandingDescriber) and not self.content_understanding_ready:
             await describer.create_analyzer()
-            self._content_understanding_ready = True
+            self.content_understanding_ready = True
         return await describer.describe_image(image_bytes)
 
 
diff --git a/app/backend/prepdocslib/page.py b/app/backend/prepdocslib/page.py
@@ -21,47 +21,23 @@ def to_skill_payload(
         self,
         file_name: str,
         *,
-        include_bytes: bool = False,
         include_bytes_base64: bool = True,
     ) -> dict[str, Any]:
-        """Serialize this figure for the figure_processor skill output.
-
-        Parameters:
-            file_name: Source document file name.
-            include_bytes: When True, include the raw ``bytes`` field. Defaults to False to avoid
-                bloating payload size and because JSON serialization of raw bytes is not desired.
-            include_bytes_base64: When True (default), include a base64 representation of the image
-                as ``bytes_base64`` for downstream skills that might still need the encoded image.
-
-        Notes:
-            - Previous behavior always included both the raw bytes (via ``asdict``) and a base64 copy.
-              This is wasteful for typical pipelines where only the blob ``url`` plus lightweight
-              metadata are required. The new defaults favor minimal payload size.
-            - Callers needing the raw bytes can opt-in with ``include_bytes=True`` (e.g., for a
-              chained skill that has not yet persisted the blob or for debugging scenarios).
-        """
-
         data = asdict(self)
 
-        if not include_bytes and "bytes" in data:
-            # Remove raw bytes to keep payload lean (and JSON-friendly without extra handling).
-            data.pop("bytes", None)
+        # Remove raw bytes to keep payload lean (and JSON-friendly without extra handling).
+        data.pop("bytes", None)
 
+        # Optionally include base64-encoded bytes for skills that need it
         if include_bytes_base64:
-            # Always base64 from the current in-memory bytes, not from any cached version, to ensure fidelity.
             b = self.bytes if isinstance(self.bytes, (bytes, bytearray)) else b""
             data["bytes_base64"] = base64.b64encode(b).decode("utf-8")
 
-        # Remove None values to prevent document extractor from emitting fields that will be
-        # enriched by figure processor, avoiding potential conflicts in Azure AI Search enrichment merge
-        data = {k: v for k, v in data.items() if v is not None}
-
         data["document_file_name"] = file_name
         return data
 
     @classmethod
     def from_skill_payload(cls, data: dict[str, Any]) -> tuple["ImageOnPage", str]:
-        """Deserialize a figure skill payload into an ImageOnPage, normalizing fields."""
         # Decode base64 image data (optional - may be omitted if already persisted to blob)
         bytes_base64 = data.get("bytes_base64")
         if bytes_base64:
diff --git a/app/backend/prepdocslib/servicesetup.py b/app/backend/prepdocslib/servicesetup.py
@@ -23,6 +23,13 @@
 logger = logging.getLogger("scripts")
 
 
+def clean_key_if_exists(key: Optional[str]) -> Optional[str]:
+    """Remove leading and trailing whitespace from a key if it exists. If the key is empty, return None."""
+    if key is not None and key.strip() != "":
+        return key.strip()
+    return None
+
+
 class OpenAIHost(str, Enum):
     """Supported OpenAI hosting styles.
 
diff --git a/app/backend/prepdocslib/textprocessor.py b/app/backend/prepdocslib/textprocessor.py
@@ -1,26 +1,18 @@
 """Utilities for processing document text and combining it with figure descriptions."""
 
 import logging
-from typing import TYPE_CHECKING
 
-if TYPE_CHECKING:  # pragma: no cover - used only for type hints
-    from .listfilestrategy import File
-    from .page import Page
-    from .searchmanager import Section
-    from .textsplitter import TextSplitter
+from .figureprocessor import build_figure_markup
+from .listfilestrategy import File
+from .page import Page
+from .searchmanager import Section
+from .textsplitter import TextSplitter
 
 logger = logging.getLogger("scripts")
 
 
 def combine_text_with_figures(page: "Page") -> None:
-    """Replace figure placeholders in page text with full description markup.
-
-    This is Skill #3 (text_processor) in the three-skill pipeline.
-    After figures have been described and enriched, this replaces their
-    placeholders in the page text with the full <figure> markup.
-    """
-    from .figureprocessor import build_figure_markup
-
+    """Replace figure placeholders in page text with full description markup."""
     for image in page.images:
         if image.description and image.placeholder in page.text:
             figure_markup = build_figure_markup(image, image.description)
@@ -39,22 +31,9 @@ def process_text(
     category: str | None = None,
 ) -> list["Section"]:
     """Process document text and figures into searchable sections.
-
-    This is Skill #3 (text_processor) in the three-skill pipeline.
     Combines text with figure descriptions, splits into chunks, and
     associates figures with their containing sections.
-
-    Args:
-        pages: List of parsed pages with enriched figures
-        file: Original file being processed
-        splitter: Text splitter for chunking content
-        category: Optional category for sections
-
-    Returns:
-        List of Sections ready for indexing
     """
-    from .searchmanager import Section
-
     # Step 1: Combine text with figures on each page
     for page in pages:
         combine_text_with_figures(page)
diff --git a/app/backend/setup_cloud_ingestion.py b/app/backend/setup_cloud_ingestion.py
@@ -3,7 +3,6 @@
 import asyncio
 import logging
 import os
-from typing import Optional
 
 from azure.core.credentials_async import AsyncTokenCredential
 from azure.identity.aio import AzureDeveloperCliCredential
@@ -13,8 +12,10 @@
 from load_azd_env import load_azd_env
 from prepdocslib.blobmanager import BlobManager
 from prepdocslib.cloudingestionstrategy import CloudIngestionStrategy
+from prepdocslib.listfilestrategy import LocalListFileStrategy
 from prepdocslib.servicesetup import (
     OpenAIHost,
+    clean_key_if_exists,
     setup_blob_manager,
     setup_embeddings_service,
     setup_openai_client,
@@ -25,13 +26,6 @@
 logger = logging.getLogger("scripts")
 
 
-def clean_key_if_exists(key: Optional[str]) -> Optional[str]:
-    """Remove leading and trailing whitespace from a key if it exists. If the key is empty, return None."""
-    if key is not None and key.strip() != "":
-        return key.strip()
-    return None
-
-
 async def setup_cloud_ingestion_strategy(
     azure_credential: AsyncTokenCredential,
     document_action: DocumentAction = DocumentAction.Add,
@@ -107,10 +101,8 @@ async def setup_cloud_ingestion_strategy(
         disable_batch=False,
     )
 
-    # Create a minimal list file strategy (cloud ingestion doesn't use file listing)
-    from prepdocslib.listfilestrategy import LocalListFileStrategy
-
-    list_file_strategy = LocalListFileStrategy(path_pattern="", enable_global_documents=False)
+    # Create a list file strategy for uploading files from the data folder
+    list_file_strategy = LocalListFileStrategy(path_pattern="data/*", enable_global_documents=False)
 
     # Create the cloud ingestion strategy
     ingestion_strategy = CloudIngestionStrategy(
@@ -174,13 +166,9 @@ async def main():
         await ingestion_strategy.run()
 
     finally:
-        # Gracefully close any async clients/credentials
-        try:
-            await blob_manager.close_clients()
-            await openai_client.close()
-            await azd_credential.close()
-        except Exception as e:
-            logger.debug(f"Failed to close async clients cleanly: {e}")
+        await blob_manager.close_clients()
+        await openai_client.close()
+        await azd_credential.close()
 
 
 if __name__ == "__main__":
diff --git a/app/functions/figure_processor/function_app.py b/app/functions/figure_processor/function_app.py
@@ -161,7 +161,7 @@ async def process_figure_request(req: func.HttpRequest) -> func.HttpResponse:
                 image_embeddings_client=settings.image_embeddings,
                 figure_processor=settings.figure_processor,
             )
-            figure_payload = image_on_page.to_skill_payload(file_name, include_bytes_base64=False, include_bytes=False)
+            figure_payload = image_on_page.to_skill_payload(file_name, include_bytes_base64=False)
             output_values.append(
                 {
                     "recordId": record_id,
diff --git a/infra/main.bicep b/infra/main.bicep
@@ -468,7 +468,7 @@ var appEnvVariables = {
   AZURE_SEARCH_SERVICE: searchService.outputs.name
   AZURE_SEARCH_SEMANTIC_RANKER: actualSearchServiceSemanticRankerLevel
   AZURE_SEARCH_QUERY_REWRITING: searchServiceQueryRewriting
-  AZURE_VISION_ENDPOINT: useMultimodal ? vision!.outputs.endpoint : ''
+  AZURE_VISION_ENDPOINT: useMultimodal ? vision.outputs.endpoint : ''
   AZURE_SEARCH_QUERY_LANGUAGE: searchQueryLanguage
   AZURE_SEARCH_QUERY_SPELLER: searchQuerySpeller
   AZURE_SEARCH_FIELD_NAME_EMBEDDING: searchFieldNameEmbedding
@@ -656,7 +656,7 @@ module acaAuth 'core/host/container-apps-auth.bicep' = if (deploymentTarget == '
   }
 }
 
-// FUNCTION APPS FOR CLOUD INGESTION
+// Optional Azure Functions for document ingestion and processing
 module functions 'app/functions.bicep' = if (useCloudIngestion) {
   name: 'functions'
   scope: resourceGroup
@@ -1445,11 +1445,11 @@ output AZURE_OPENAI_EVAL_MODEL string = isAzureOpenAiHost && useEval ? eval.mode
 output AZURE_OPENAI_SEARCHAGENT_DEPLOYMENT string = isAzureOpenAiHost && useAgenticRetrieval ? searchAgent.deploymentName : ''
 output AZURE_OPENAI_SEARCHAGENT_MODEL string = isAzureOpenAiHost && useAgenticRetrieval ? searchAgent.modelName : ''
 output AZURE_OPENAI_REASONING_EFFORT string  = defaultReasoningEffort
-output AZURE_SPEECH_SERVICE_ID string = useSpeechOutputAzure ? speech!.outputs.resourceId : ''
-output AZURE_SPEECH_SERVICE_LOCATION string = useSpeechOutputAzure ? speech!.outputs.location : ''
+output AZURE_SPEECH_SERVICE_ID string = useSpeechOutputAzure ? speech.outputs.resourceId : ''
+output AZURE_SPEECH_SERVICE_LOCATION string = useSpeechOutputAzure ? speech.outputs.location : ''
 
-output AZURE_VISION_ENDPOINT string = useMultimodal ? vision!.outputs.endpoint : ''
-output AZURE_CONTENTUNDERSTANDING_ENDPOINT string = useMediaDescriberAzureCU ? contentUnderstanding!.outputs.endpoint : ''
+output AZURE_VISION_ENDPOINT string = useMultimodal ? vision.outputs.endpoint : ''
+output AZURE_CONTENTUNDERSTANDING_ENDPOINT string = useMediaDescriberAzureCU ? contentUnderstanding.outputs.endpoint : ''
 
 output AZURE_DOCUMENTINTELLIGENCE_SERVICE string = documentIntelligence.outputs.name
 output AZURE_DOCUMENTINTELLIGENCE_RESOURCE_GROUP string = documentIntelligenceResourceGroup.name
diff --git a/tests/test_pdfparser.py b/tests/test_pdfparser.py
diff --git a/tests/test_prepdocslib_filestrategy.py b/tests/test_prepdocslib_filestrategy.py

Original file line number	Diff line number	Diff line change
`@@ -161,7 +161,7 @@ async def process_figure_request(req: func.HttpRequest) -> func.HttpResponse:`
`161`	`161`	`image_embeddings_client=settings.image_embeddings,`
`162`	`162`	`figure_processor=settings.figure_processor,`
`163`	`163`	`)`
`164`		`- figure_payload = image_on_page.to_skill_payload(file_name, include_bytes_base64=False, include_bytes=False)`
	`164`	`+ figure_payload = image_on_page.to_skill_payload(file_name, include_bytes_base64=False)`
`165`	`165`	`output_values.append(`
`166`	`166`	`{`
`167`	`167`	`"recordId": record_id,`