Skip to content

Commit 1db5f14

Browse files
committed
More code cleanup
1 parent b733d20 commit 1db5f14

File tree

10 files changed

+73
-137
lines changed

10 files changed

+73
-137
lines changed

app/backend/prepdocs.py

Lines changed: 1 addition & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@
2626
from prepdocslib.parser import Parser
2727
from prepdocslib.servicesetup import (
2828
OpenAIHost,
29+
clean_key_if_exists,
2930
select_parser,
3031
setup_blob_manager,
3132
setup_embeddings_service,
@@ -41,13 +42,6 @@
4142
logger = logging.getLogger("scripts")
4243

4344

44-
def clean_key_if_exists(key: Optional[str]) -> Optional[str]:
45-
"""Remove leading and trailing whitespace from a key if it exists. If the key is empty, return None."""
46-
if key is not None and key.strip() != "":
47-
return key.strip()
48-
return None
49-
50-
5145
async def check_search_service_connectivity(search_service: str) -> bool:
5246
"""Check if the search service is accessible by hitting the /ping endpoint."""
5347
ping_url = f"https://{search_service}.search.windows.net/ping"

app/backend/prepdocslib/figureprocessor.py

Lines changed: 21 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -40,63 +40,61 @@ def __init__(
4040
openai_deployment: str | None = None,
4141
content_understanding_endpoint: str | None = None,
4242
) -> None:
43-
self._credential = credential
43+
self.credential = credential
4444
self.strategy = strategy
45-
self._openai_client = openai_client
46-
self._openai_model = openai_model
47-
self._openai_deployment = openai_deployment
48-
self._content_understanding_endpoint = content_understanding_endpoint
49-
self._media_describer: MediaDescriber | None = None
50-
self._content_understanding_ready = False
45+
self.openai_client = openai_client
46+
self.openai_model = openai_model
47+
self.openai_deployment = openai_deployment
48+
self.content_understanding_endpoint = content_understanding_endpoint
49+
self.media_describer: MediaDescriber | None = None
50+
self.content_understanding_ready = False
5151

5252
async def get_media_describer(self) -> MediaDescriber | None:
5353
"""Return (and lazily create) the media describer for this processor."""
5454

5555
if self.strategy == MediaDescriptionStrategy.NONE:
5656
return None
5757

58-
if self._media_describer is not None:
59-
return self._media_describer
58+
if self.media_describer is not None:
59+
return self.media_describer
6060

6161
if self.strategy == MediaDescriptionStrategy.CONTENTUNDERSTANDING:
62-
if self._content_understanding_endpoint is None:
62+
if self.content_understanding_endpoint is None:
6363
raise ValueError("Content Understanding strategy requires an endpoint")
64-
if self._credential is None:
64+
if self.credential is None:
6565
raise ValueError("Content Understanding strategy requires a credential")
66-
if isinstance(self._credential, AzureKeyCredential):
66+
if isinstance(self.credential, AzureKeyCredential):
6767
raise ValueError(
6868
"Content Understanding does not support key credentials; provide a token credential instead"
6969
)
70-
self._media_describer = ContentUnderstandingDescriber(
71-
self._content_understanding_endpoint, self._credential
72-
)
73-
return self._media_describer
70+
self.media_describer = ContentUnderstandingDescriber(self.content_understanding_endpoint, self.credential)
71+
return self.media_describer
7472

7573
if self.strategy == MediaDescriptionStrategy.OPENAI:
76-
if self._openai_client is None or self._openai_model is None:
74+
if self.openai_client is None or self.openai_model is None:
7775
raise ValueError("OpenAI strategy requires both a client and a model name")
78-
self._media_describer = MultimodalModelDescriber(
79-
self._openai_client, model=self._openai_model, deployment=self._openai_deployment
76+
self.media_describer = MultimodalModelDescriber(
77+
self.openai_client, model=self.openai_model, deployment=self.openai_deployment
8078
)
81-
return self._media_describer
79+
return self.media_describer
8280

8381
logger.warning("Unknown media description strategy '%s'; skipping description", self.strategy)
8482
return None
8583

8684
def mark_content_understanding_ready(self) -> None:
8785
"""Record that the Content Understanding analyzer exists to avoid recreating it."""
8886

89-
self._content_understanding_ready = True
87+
self.content_understanding_ready = True
9088

9189
async def describe(self, image_bytes: bytes) -> str | None:
9290
"""Generate a description for the provided image bytes if a describer is available."""
9391

9492
describer = await self.get_media_describer()
9593
if describer is None:
9694
return None
97-
if isinstance(describer, ContentUnderstandingDescriber) and not self._content_understanding_ready:
95+
if isinstance(describer, ContentUnderstandingDescriber) and not self.content_understanding_ready:
9896
await describer.create_analyzer()
99-
self._content_understanding_ready = True
97+
self.content_understanding_ready = True
10098
return await describer.describe_image(image_bytes)
10199

102100

app/backend/prepdocslib/page.py

Lines changed: 3 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -21,47 +21,23 @@ def to_skill_payload(
2121
self,
2222
file_name: str,
2323
*,
24-
include_bytes: bool = False,
2524
include_bytes_base64: bool = True,
2625
) -> dict[str, Any]:
27-
"""Serialize this figure for the figure_processor skill output.
28-
29-
Parameters:
30-
file_name: Source document file name.
31-
include_bytes: When True, include the raw ``bytes`` field. Defaults to False to avoid
32-
bloating payload size and because JSON serialization of raw bytes is not desired.
33-
include_bytes_base64: When True (default), include a base64 representation of the image
34-
as ``bytes_base64`` for downstream skills that might still need the encoded image.
35-
36-
Notes:
37-
- Previous behavior always included both the raw bytes (via ``asdict``) and a base64 copy.
38-
This is wasteful for typical pipelines where only the blob ``url`` plus lightweight
39-
metadata are required. The new defaults favor minimal payload size.
40-
- Callers needing the raw bytes can opt-in with ``include_bytes=True`` (e.g., for a
41-
chained skill that has not yet persisted the blob or for debugging scenarios).
42-
"""
43-
4426
data = asdict(self)
4527

46-
if not include_bytes and "bytes" in data:
47-
# Remove raw bytes to keep payload lean (and JSON-friendly without extra handling).
48-
data.pop("bytes", None)
28+
# Remove raw bytes to keep payload lean (and JSON-friendly without extra handling).
29+
data.pop("bytes", None)
4930

31+
# Optionally include base64-encoded bytes for skills that need it
5032
if include_bytes_base64:
51-
# Always base64 from the current in-memory bytes, not from any cached version, to ensure fidelity.
5233
b = self.bytes if isinstance(self.bytes, (bytes, bytearray)) else b""
5334
data["bytes_base64"] = base64.b64encode(b).decode("utf-8")
5435

55-
# Remove None values to prevent document extractor from emitting fields that will be
56-
# enriched by figure processor, avoiding potential conflicts in Azure AI Search enrichment merge
57-
data = {k: v for k, v in data.items() if v is not None}
58-
5936
data["document_file_name"] = file_name
6037
return data
6138

6239
@classmethod
6340
def from_skill_payload(cls, data: dict[str, Any]) -> tuple["ImageOnPage", str]:
64-
"""Deserialize a figure skill payload into an ImageOnPage, normalizing fields."""
6541
# Decode base64 image data (optional - may be omitted if already persisted to blob)
6642
bytes_base64 = data.get("bytes_base64")
6743
if bytes_base64:

app/backend/prepdocslib/servicesetup.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,13 @@
2323
logger = logging.getLogger("scripts")
2424

2525

26+
def clean_key_if_exists(key: Optional[str]) -> Optional[str]:
27+
"""Remove leading and trailing whitespace from a key if it exists. If the key is empty, return None."""
28+
if key is not None and key.strip() != "":
29+
return key.strip()
30+
return None
31+
32+
2633
class OpenAIHost(str, Enum):
2734
"""Supported OpenAI hosting styles.
2835

app/backend/prepdocslib/textprocessor.py

Lines changed: 6 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -1,26 +1,18 @@
11
"""Utilities for processing document text and combining it with figure descriptions."""
22

33
import logging
4-
from typing import TYPE_CHECKING
54

6-
if TYPE_CHECKING: # pragma: no cover - used only for type hints
7-
from .listfilestrategy import File
8-
from .page import Page
9-
from .searchmanager import Section
10-
from .textsplitter import TextSplitter
5+
from .figureprocessor import build_figure_markup
6+
from .listfilestrategy import File
7+
from .page import Page
8+
from .searchmanager import Section
9+
from .textsplitter import TextSplitter
1110

1211
logger = logging.getLogger("scripts")
1312

1413

1514
def combine_text_with_figures(page: "Page") -> None:
16-
"""Replace figure placeholders in page text with full description markup.
17-
18-
This is Skill #3 (text_processor) in the three-skill pipeline.
19-
After figures have been described and enriched, this replaces their
20-
placeholders in the page text with the full <figure> markup.
21-
"""
22-
from .figureprocessor import build_figure_markup
23-
15+
"""Replace figure placeholders in page text with full description markup."""
2416
for image in page.images:
2517
if image.description and image.placeholder in page.text:
2618
figure_markup = build_figure_markup(image, image.description)
@@ -39,22 +31,9 @@ def process_text(
3931
category: str | None = None,
4032
) -> list["Section"]:
4133
"""Process document text and figures into searchable sections.
42-
43-
This is Skill #3 (text_processor) in the three-skill pipeline.
4434
Combines text with figure descriptions, splits into chunks, and
4535
associates figures with their containing sections.
46-
47-
Args:
48-
pages: List of parsed pages with enriched figures
49-
file: Original file being processed
50-
splitter: Text splitter for chunking content
51-
category: Optional category for sections
52-
53-
Returns:
54-
List of Sections ready for indexing
5536
"""
56-
from .searchmanager import Section
57-
5837
# Step 1: Combine text with figures on each page
5938
for page in pages:
6039
combine_text_with_figures(page)

app/backend/setup_cloud_ingestion.py

Lines changed: 7 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,6 @@
33
import asyncio
44
import logging
55
import os
6-
from typing import Optional
76

87
from azure.core.credentials_async import AsyncTokenCredential
98
from azure.identity.aio import AzureDeveloperCliCredential
@@ -13,8 +12,10 @@
1312
from load_azd_env import load_azd_env
1413
from prepdocslib.blobmanager import BlobManager
1514
from prepdocslib.cloudingestionstrategy import CloudIngestionStrategy
15+
from prepdocslib.listfilestrategy import LocalListFileStrategy
1616
from prepdocslib.servicesetup import (
1717
OpenAIHost,
18+
clean_key_if_exists,
1819
setup_blob_manager,
1920
setup_embeddings_service,
2021
setup_openai_client,
@@ -25,13 +26,6 @@
2526
logger = logging.getLogger("scripts")
2627

2728

28-
def clean_key_if_exists(key: Optional[str]) -> Optional[str]:
29-
"""Remove leading and trailing whitespace from a key if it exists. If the key is empty, return None."""
30-
if key is not None and key.strip() != "":
31-
return key.strip()
32-
return None
33-
34-
3529
async def setup_cloud_ingestion_strategy(
3630
azure_credential: AsyncTokenCredential,
3731
document_action: DocumentAction = DocumentAction.Add,
@@ -107,10 +101,8 @@ async def setup_cloud_ingestion_strategy(
107101
disable_batch=False,
108102
)
109103

110-
# Create a minimal list file strategy (cloud ingestion doesn't use file listing)
111-
from prepdocslib.listfilestrategy import LocalListFileStrategy
112-
113-
list_file_strategy = LocalListFileStrategy(path_pattern="", enable_global_documents=False)
104+
# Create a list file strategy for uploading files from the data folder
105+
list_file_strategy = LocalListFileStrategy(path_pattern="data/*", enable_global_documents=False)
114106

115107
# Create the cloud ingestion strategy
116108
ingestion_strategy = CloudIngestionStrategy(
@@ -174,13 +166,9 @@ async def main():
174166
await ingestion_strategy.run()
175167

176168
finally:
177-
# Gracefully close any async clients/credentials
178-
try:
179-
await blob_manager.close_clients()
180-
await openai_client.close()
181-
await azd_credential.close()
182-
except Exception as e:
183-
logger.debug(f"Failed to close async clients cleanly: {e}")
169+
await blob_manager.close_clients()
170+
await openai_client.close()
171+
await azd_credential.close()
184172

185173

186174
if __name__ == "__main__":

app/functions/figure_processor/function_app.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -161,7 +161,7 @@ async def process_figure_request(req: func.HttpRequest) -> func.HttpResponse:
161161
image_embeddings_client=settings.image_embeddings,
162162
figure_processor=settings.figure_processor,
163163
)
164-
figure_payload = image_on_page.to_skill_payload(file_name, include_bytes_base64=False, include_bytes=False)
164+
figure_payload = image_on_page.to_skill_payload(file_name, include_bytes_base64=False)
165165
output_values.append(
166166
{
167167
"recordId": record_id,

infra/main.bicep

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -468,7 +468,7 @@ var appEnvVariables = {
468468
AZURE_SEARCH_SERVICE: searchService.outputs.name
469469
AZURE_SEARCH_SEMANTIC_RANKER: actualSearchServiceSemanticRankerLevel
470470
AZURE_SEARCH_QUERY_REWRITING: searchServiceQueryRewriting
471-
AZURE_VISION_ENDPOINT: useMultimodal ? vision!.outputs.endpoint : ''
471+
AZURE_VISION_ENDPOINT: useMultimodal ? vision.outputs.endpoint : ''
472472
AZURE_SEARCH_QUERY_LANGUAGE: searchQueryLanguage
473473
AZURE_SEARCH_QUERY_SPELLER: searchQuerySpeller
474474
AZURE_SEARCH_FIELD_NAME_EMBEDDING: searchFieldNameEmbedding
@@ -656,7 +656,7 @@ module acaAuth 'core/host/container-apps-auth.bicep' = if (deploymentTarget == '
656656
}
657657
}
658658

659-
// FUNCTION APPS FOR CLOUD INGESTION
659+
// Optional Azure Functions for document ingestion and processing
660660
module functions 'app/functions.bicep' = if (useCloudIngestion) {
661661
name: 'functions'
662662
scope: resourceGroup
@@ -1445,11 +1445,11 @@ output AZURE_OPENAI_EVAL_MODEL string = isAzureOpenAiHost && useEval ? eval.mode
14451445
output AZURE_OPENAI_SEARCHAGENT_DEPLOYMENT string = isAzureOpenAiHost && useAgenticRetrieval ? searchAgent.deploymentName : ''
14461446
output AZURE_OPENAI_SEARCHAGENT_MODEL string = isAzureOpenAiHost && useAgenticRetrieval ? searchAgent.modelName : ''
14471447
output AZURE_OPENAI_REASONING_EFFORT string = defaultReasoningEffort
1448-
output AZURE_SPEECH_SERVICE_ID string = useSpeechOutputAzure ? speech!.outputs.resourceId : ''
1449-
output AZURE_SPEECH_SERVICE_LOCATION string = useSpeechOutputAzure ? speech!.outputs.location : ''
1448+
output AZURE_SPEECH_SERVICE_ID string = useSpeechOutputAzure ? speech.outputs.resourceId : ''
1449+
output AZURE_SPEECH_SERVICE_LOCATION string = useSpeechOutputAzure ? speech.outputs.location : ''
14501450

1451-
output AZURE_VISION_ENDPOINT string = useMultimodal ? vision!.outputs.endpoint : ''
1452-
output AZURE_CONTENTUNDERSTANDING_ENDPOINT string = useMediaDescriberAzureCU ? contentUnderstanding!.outputs.endpoint : ''
1451+
output AZURE_VISION_ENDPOINT string = useMultimodal ? vision.outputs.endpoint : ''
1452+
output AZURE_CONTENTUNDERSTANDING_ENDPOINT string = useMediaDescriberAzureCU ? contentUnderstanding.outputs.endpoint : ''
14531453

14541454
output AZURE_DOCUMENTINTELLIGENCE_SERVICE string = documentIntelligence.outputs.name
14551455
output AZURE_DOCUMENTINTELLIGENCE_RESOURCE_GROUP string = documentIntelligenceResourceGroup.name

0 commit comments

Comments
 (0)