Skip to content

Commit be98004

Browse files
committed
Consolidate docs
1 parent 8df151f commit be98004

File tree

8 files changed

+89
-1298
lines changed

8 files changed

+89
-1298
lines changed

app/backend/prepdocslib/blobmanager.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -432,6 +432,7 @@ async def upload_blob(self, file: File) -> str:
432432
blob_client = await container_client.upload_blob(blob_name, reopened_file, overwrite=True)
433433
file.url = blob_client.url
434434

435+
assert file.url is not None, "file.url must be set after upload"
435436
return unquote(file.url)
436437

437438
async def upload_document_image(

app/backend/prepdocslib/cloudingestionstrategy.py

Lines changed: 24 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -71,7 +71,7 @@ def __init__(
7171
use_acls: bool = False,
7272
use_multimodal: bool = False,
7373
enforce_access_control: bool = False,
74-
search_user_assigned_identity_resource_id: str | None = None,
74+
search_user_assigned_identity_resource_id: str,
7575
) -> None:
7676
self.list_file_strategy = list_file_strategy
7777
self.blob_manager = blob_manager
@@ -297,38 +297,31 @@ async def setup(self) -> None:
297297
skillset = self._build_skillset()
298298
await indexer_client.create_or_update_skillset(skillset)
299299

300+
indexer = SearchIndexer(
301+
name=self.indexer_name,
302+
description="Indexer orchestrating cloud ingestion pipeline",
303+
data_source_name=self.data_source_name,
304+
target_index_name=self.search_info.index_name,
305+
skillset_name=self.skillset_name,
306+
parameters=IndexingParameters(
307+
configuration=IndexingParametersConfiguration(
308+
query_timeout=None, # type: ignore
309+
data_to_extract="storageMetadata",
310+
allow_skillset_to_read_file_data=True,
311+
)
312+
),
313+
)
314+
await indexer_client.create_or_update_indexer(indexer)
315+
300316
async def run(self) -> None:
301-
if self.document_action == DocumentAction.Add:
302-
files = self.list_file_strategy.list()
303-
async for file in files:
304-
try:
305-
await self.blob_manager.upload_blob(file)
306-
finally:
307-
if file:
308-
file.close()
309-
elif self.document_action == DocumentAction.Remove:
310-
paths = self.list_file_strategy.list_paths()
311-
async for path in paths:
312-
await self.blob_manager.remove_blob(path)
313-
elif self.document_action == DocumentAction.RemoveAll:
314-
await self.blob_manager.remove_blob()
315-
316-
indexer = SearchIndexer(
317-
name=self.indexer_name,
318-
description="Indexer orchestrating cloud ingestion pipeline",
319-
data_source_name=self.data_source_name,
320-
target_index_name=self.search_info.index_name,
321-
skillset_name=self.skillset_name,
322-
parameters=IndexingParameters(
323-
configuration=IndexingParametersConfiguration(
324-
query_timeout=None,
325-
data_to_extract="storageMetadata",
326-
allow_skillset_to_read_file_data=True,
327-
)
328-
),
329-
)
317+
files = self.list_file_strategy.list()
318+
async for file in files:
319+
try:
320+
await self.blob_manager.upload_blob(file)
321+
finally:
322+
if file:
323+
file.close()
330324

331325
async with self.search_info.create_search_indexer_client() as indexer_client:
332-
await indexer_client.create_or_update_indexer(indexer)
333326
await indexer_client.run_indexer(self.indexer_name)
334327
logger.info("Triggered indexer '%s' for cloud ingestion", self.indexer_name)

app/backend/prepdocslib/page.py

Lines changed: 14 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -85,15 +85,25 @@ def from_skill_payload(cls, data: dict[str, Any]) -> tuple["ImageOnPage", str]:
8585
else:
8686
bbox = (0, 0, 0, 0)
8787

88+
filename = data.get("filename")
89+
figure_id = data.get("figure_id")
90+
placeholder = data.get("placeholder")
91+
assert filename is not None, "filename is required"
92+
assert figure_id is not None, "figure_id is required"
93+
94+
# Generate placeholder if not provided
95+
if placeholder is None:
96+
placeholder = f'<figure id="{figure_id}"></figure>'
97+
8898
image = cls(
8999
bytes=raw_bytes,
90100
bbox=bbox,
91101
page_num=page_num,
92-
filename=data.get("filename"),
93-
figure_id=data.get("figure_id"),
94-
placeholder=data.get("placeholder"),
102+
filename=filename,
103+
figure_id=figure_id,
104+
placeholder=placeholder,
95105
mime_type=data.get("mime_type") or "image/png",
96-
title=data.get("title"),
106+
title=data.get("title") or "",
97107
description=data.get("description"),
98108
url=data.get("url"),
99109
)

app/backend/setup_cloud_ingestion.py

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -7,9 +7,11 @@
77

88
from azure.core.credentials_async import AsyncTokenCredential
99
from azure.identity.aio import AzureDeveloperCliCredential
10+
from openai import AsyncOpenAI
1011
from rich.logging import RichHandler
1112

1213
from load_azd_env import load_azd_env
14+
from prepdocslib.blobmanager import BlobManager
1315
from prepdocslib.cloudingestionstrategy import CloudIngestionStrategy
1416
from prepdocslib.servicesetup import (
1517
OpenAIHost,
@@ -33,13 +35,13 @@ def clean_key_if_exists(key: Optional[str]) -> Optional[str]:
3335
async def setup_cloud_ingestion_strategy(
3436
azure_credential: AsyncTokenCredential,
3537
document_action: DocumentAction = DocumentAction.Add,
36-
) -> CloudIngestionStrategy:
38+
) -> tuple[CloudIngestionStrategy, AsyncOpenAI, AsyncTokenCredential, BlobManager]:
3739
"""Setup the cloud ingestion strategy with all required services."""
3840

3941
# Get environment variables
4042
search_service = os.environ["AZURE_SEARCH_SERVICE"]
4143
index_name = os.environ["AZURE_SEARCH_INDEX"]
42-
search_user_assigned_identity_resource_id = os.environ.get("AZURE_SEARCH_USER_ASSIGNED_IDENTITY_RESOURCE_ID")
44+
search_user_assigned_identity_resource_id = os.environ["AZURE_SEARCH_USER_ASSIGNED_IDENTITY_RESOURCE_ID"]
4345
storage_account = os.environ["AZURE_STORAGE_ACCOUNT"]
4446
storage_container = os.environ["AZURE_STORAGE_CONTAINER"]
4547
storage_resource_group = os.environ["AZURE_STORAGE_RESOURCE_GROUP"]
@@ -168,7 +170,8 @@ async def main():
168170
# Setup the indexer, skillset, and data source
169171
logger.info("Setting up indexer, skillset, and data source...")
170172
await ingestion_strategy.setup()
171-
logger.info("Cloud ingestion setup complete!")
173+
logger.info("Triggering initial indexing run...")
174+
await ingestion_strategy.run()
172175

173176
finally:
174177
# Gracefully close any async clients/credentials

0 commit comments

Comments
 (0)