Skip to content

Commit 806828e

Browse files
committed
Integrated vectorization and user upload work
1 parent 29c44c8 commit 806828e

File tree

10 files changed

+153
-41
lines changed

10 files changed

+153
-41
lines changed

app/backend/app.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@
9898
clean_key_if_exists,
9999
setup_embeddings_service,
100100
setup_file_processors,
101+
setup_image_embeddings_service,
101102
setup_search_info,
102103
)
103104
from prepdocslib.filestrategy import UploadUserFileStrategy
@@ -596,11 +597,18 @@ async def setup_clients():
596597
openai_org=OPENAI_ORGANIZATION,
597598
disable_vectors=os.getenv("USE_VECTORS", "").lower() == "false",
598599
)
600+
image_embeddings_service = setup_image_embeddings_service(
601+
azure_credential=azure_credential,
602+
vision_endpoint=AZURE_VISION_ENDPOINT,
603+
use_multimodal=USE_MULTIMODAL,
604+
)
599605
ingester = UploadUserFileStrategy(
600606
search_info=search_info,
601-
embeddings=text_embeddings_service,
602607
file_processors=file_processors,
608+
embeddings=text_embeddings_service,
609+
image_embeddings=image_embeddings_service,
603610
search_field_name_embedding=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
611+
blob_manager=user_blob_container_client,
604612
)
605613
current_app.config[CONFIG_INGESTER] = ingester
606614

app/backend/prepdocs.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -393,11 +393,6 @@ async def main(strategy: Strategy, setup_index: bool = True):
393393
required=False,
394394
help="Optional. Use this Azure Document Intelligence account key instead of the current user identity to login (use az login to set current user for Azure)",
395395
)
396-
parser.add_argument(
397-
"--searchserviceassignedid",
398-
required=False,
399-
help="Search service system assigned Identity (Managed identity) (used for integrated vectorization)",
400-
)
401396

402397
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
403398
args = parser.parse_args()
@@ -526,10 +521,15 @@ async def main(strategy: Strategy, setup_index: bool = True):
526521
embeddings=openai_embeddings_service,
527522
search_field_name_embedding=os.environ["AZURE_SEARCH_FIELD_NAME_EMBEDDING"],
528523
subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"],
529-
search_service_user_assigned_id=args.searchserviceassignedid,
530524
search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"),
531525
use_acls=use_acls,
532526
category=args.category,
527+
use_multimodal=use_multimodal,
528+
image_embeddings=setup_image_embeddings_service(
529+
azure_credential=azd_credential,
530+
vision_endpoint=os.getenv("AZURE_VISION_ENDPOINT"),
531+
use_multimodal=use_multimodal,
532+
),
533533
)
534534
else:
535535
file_processors = setup_file_processors(

app/backend/prepdocslib/blobmanager.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ async def upload_blob(self, file: File) -> Optional[list[str]]:
4848
if file.url is None:
4949
with open(file.content.name, "rb") as reopened_file:
5050
blob_name = BlobManager.blob_name_from_file_name(file.content.name)
51-
logger.info("Uploading blob for document %s", blob_name)
51+
logger.info("Uploading blob for document '%s'", blob_name)
5252
blob_client = await container_client.upload_blob(blob_name, reopened_file, overwrite=True)
5353
file.url = blob_client.url
5454
return None
@@ -108,7 +108,7 @@ async def upload_document_image(
108108
blob_name = (
109109
f"{self.blob_name_from_file_name(document_file.content.name)}/page{image_page_num}/{image_filename}"
110110
)
111-
logger.info("Uploading blob for document image %s", blob_name)
111+
logger.info("Uploading blob for document image '%s'", blob_name)
112112
blob_client = await container_client.upload_blob(blob_name, output, overwrite=True)
113113
return blob_client.url
114114
return None

app/backend/prepdocslib/filestrategy.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,11 +145,13 @@ def __init__(
145145
embeddings: Optional[OpenAIEmbeddings] = None,
146146
image_embeddings: Optional[ImageEmbeddings] = None,
147147
search_field_name_embedding: Optional[str] = None,
148+
blob_manager: Optional[BlobManager] = None,
148149
):
149150
self.file_processors = file_processors
150151
self.embeddings = embeddings
151152
self.image_embeddings = image_embeddings
152153
self.search_info = search_info
154+
self.blob_manager = blob_manager
153155
self.search_manager = SearchManager(
154156
search_info=self.search_info,
155157
search_analyzer_name=None,
@@ -164,7 +166,7 @@ def __init__(
164166
async def add_file(self, file: File):
165167
if self.image_embeddings:
166168
logging.warning("Image embeddings are not currently supported for the user upload feature")
167-
sections = await parse_file(file, self.file_processors)
169+
sections = await parse_file(file, self.file_processors, None, self.blob_manager, self.image_embeddings)
168170
if sections:
169171
await self.search_manager.update_content(sections, url=file.url)
170172

app/backend/prepdocslib/integratedvectorizerstrategy.py

Lines changed: 71 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,11 @@
55
NativeBlobSoftDeleteDeletionDetectionPolicy,
66
)
77
from azure.search.documents.indexes.models import (
8+
AIServicesAccountIdentity,
89
AzureOpenAIEmbeddingSkill,
10+
BlobIndexerImageAction,
11+
IndexingParameters,
12+
IndexingParametersConfiguration,
913
IndexProjectionMode,
1014
InputFieldMappingEntry,
1115
OutputFieldMappingEntry,
@@ -16,12 +20,17 @@
1620
SearchIndexerIndexProjection,
1721
SearchIndexerIndexProjectionSelector,
1822
SearchIndexerIndexProjectionsParameters,
23+
SearchIndexerKnowledgeStore,
24+
SearchIndexerKnowledgeStoreFileProjectionSelector,
25+
SearchIndexerKnowledgeStoreProjection,
1926
SearchIndexerSkillset,
27+
ShaperSkill,
2028
SplitSkill,
29+
VisionVectorizeSkill,
2130
)
2231

2332
from .blobmanager import BlobManager
24-
from .embeddings import AzureOpenAIEmbeddingService
33+
from .embeddings import AzureOpenAIEmbeddingService, ImageEmbeddings
2534
from .listfilestrategy import ListFileStrategy
2635
from .searchmanager import SearchManager
2736
from .strategy import DocumentAction, SearchInfo, Strategy
@@ -42,20 +51,20 @@ def __init__(
4251
embeddings: AzureOpenAIEmbeddingService,
4352
search_field_name_embedding: str,
4453
subscription_id: str,
45-
search_service_user_assigned_id: str,
4654
document_action: DocumentAction = DocumentAction.Add,
4755
search_analyzer_name: Optional[str] = None,
4856
use_acls: bool = False,
4957
category: Optional[str] = None,
58+
use_multimodal: bool = False,
59+
image_embeddings: Optional[ImageEmbeddings] = None,
5060
):
51-
5261
self.list_file_strategy = list_file_strategy
5362
self.blob_manager = blob_manager
5463
self.document_action = document_action
5564
self.embeddings = embeddings
65+
self.image_embeddings = image_embeddings
5666
self.search_field_name_embedding = search_field_name_embedding
5767
self.subscription_id = subscription_id
58-
self.search_user_assigned_identity = search_service_user_assigned_id
5968
self.search_analyzer_name = search_analyzer_name
6069
self.use_acls = use_acls
6170
self.category = category
@@ -64,6 +73,7 @@ def __init__(
6473
self.skillset_name = f"{prefix}-skillset"
6574
self.indexer_name = f"{prefix}-indexer"
6675
self.data_source_name = f"{prefix}-blob"
76+
self.use_multimodal = use_multimodal and image_embeddings is not None
6777

6878
async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset:
6979
"""
@@ -97,6 +107,23 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
97107
outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
98108
)
99109

110+
vision_embedding_skill = VisionVectorizeSkill(
111+
name="vision-embedding-skill",
112+
description="Skill to generate image embeddings via Azure AI Vision",
113+
context="/document/normalized_images/*",
114+
model_version="2023-04-15",
115+
inputs=[InputFieldMappingEntry(name="image", source="/document/normalized_images/*")],
116+
outputs=[OutputFieldMappingEntry(name="vector", target_name="image_vector")],
117+
)
118+
vision_embedding_shaper_skill = ShaperSkill(
119+
name="vision-embedding-shaper-skill",
120+
description="Shaper skill to ensure image embeddings are in the correct format",
121+
context="/document/normalized_images/*",
122+
inputs=[InputFieldMappingEntry(name="embedding", source="/document/normalized_images/*/image_vector")],
123+
outputs=[OutputFieldMappingEntry(name="output", target_name="images")],
124+
)
125+
# TODO: project images into a container
126+
100127
index_projection = SearchIndexerIndexProjection(
101128
selectors=[
102129
SearchIndexerIndexProjectionSelector(
@@ -111,6 +138,7 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
111138
InputFieldMappingEntry(
112139
name=self.search_field_name_embedding, source="/document/pages/*/vector"
113140
),
141+
InputFieldMappingEntry(name="images", source="/document/normalized_images/*/images"),
114142
],
115143
),
116144
],
@@ -119,11 +147,36 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
119147
),
120148
)
121149

150+
indexer_skills = [split_skill, embedding_skill]
151+
if self.use_multimodal:
152+
indexer_skills.extend([vision_embedding_skill, vision_embedding_shaper_skill])
153+
extra_params = {}
154+
if self.use_multimodal:
155+
extra_params = {
156+
"cognitive_services_account": AIServicesAccountIdentity(subdomain_url=self.image_embeddings.endpoint),
157+
"knowledge_store": SearchIndexerKnowledgeStore(
158+
storage_connection_string=self.blob_manager.get_managedidentity_connectionstring(),
159+
projections=[
160+
SearchIndexerKnowledgeStoreProjection(
161+
files=[
162+
SearchIndexerKnowledgeStoreFileProjectionSelector(
163+
storage_container=self.blob_manager.image_container,
164+
source="/document/normalized_images/*",
165+
)
166+
]
167+
)
168+
],
169+
),
170+
}
171+
172+
# We still need to map the images onto url in the images complex field type
173+
# something about key path
122174
skillset = SearchIndexerSkillset(
123175
name=self.skillset_name,
124176
description="Skillset to chunk documents and generate embeddings",
125-
skills=[split_skill, embedding_skill],
177+
skills=indexer_skills,
126178
index_projection=index_projection,
179+
**extra_params,
127180
)
128181

129182
return skillset
@@ -137,7 +190,7 @@ async def setup(self):
137190
use_int_vectorization=True,
138191
embeddings=self.embeddings,
139192
field_name_embedding=self.search_field_name_embedding,
140-
search_images=False,
193+
search_images=self.use_multimodal,
141194
)
142195

143196
await search_manager.create_index()
@@ -175,12 +228,24 @@ async def run(self):
175228
await self.blob_manager.remove_blob()
176229

177230
# Create an indexer
231+
extra_params = {}
232+
if self.use_multimodal:
233+
extra_params = {
234+
"parameters": IndexingParameters(
235+
configuration=IndexingParametersConfiguration(
236+
query_timeout=None, # Current bug in AI Search SDK
237+
image_action=BlobIndexerImageAction.GENERATE_NORMALIZED_IMAGES,
238+
),
239+
)
240+
}
241+
178242
indexer = SearchIndexer(
179243
name=self.indexer_name,
180244
description="Indexer to index documents and generate embeddings",
181245
skillset_name=self.skillset_name,
182246
target_index_name=self.search_info.index_name,
183247
data_source_name=self.data_source_name,
248+
**extra_params,
184249
)
185250

186251
indexer_client = self.search_info.create_search_indexer_client()

app/backend/prepdocslib/listfilestrategy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ def check_md5(self, path: str) -> bool:
102102
stored_hash = md5_f.read()
103103

104104
if stored_hash and stored_hash.strip() == existing_hash.strip():
105-
logger.info("Skipping %s, no changes detected.", path)
105+
logger.info("Skipping '%s', no changes detected.", path)
106106
return True
107107

108108
# Write the hash

app/backend/prepdocslib/mediadescriber.py

Lines changed: 36 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,16 @@
66
import aiohttp
77
from azure.core.credentials_async import AsyncTokenCredential
88
from azure.identity.aio import get_bearer_token_provider
9-
from openai import AsyncOpenAI
9+
from openai import AsyncOpenAI, RateLimitError
1010
from rich.progress import Progress
11-
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
11+
from tenacity import (
12+
AsyncRetrying,
13+
retry,
14+
retry_if_exception_type,
15+
stop_after_attempt,
16+
wait_fixed,
17+
wait_random_exponential,
18+
)
1219

1320
logger = logging.getLogger("scripts")
1421

@@ -116,29 +123,39 @@ def __init__(self, openai_client: AsyncOpenAI, model: str, deployment: Optional[
116123
self.deployment = deployment
117124

118125
async def describe_image(self, image_bytes: bytes) -> str:
126+
def before_retry_sleep(retry_state):
127+
logger.info("Rate limited on the OpenAI chat completions API, sleeping before retrying...")
128+
119129
image_base64 = base64.b64encode(image_bytes).decode("utf-8")
120130
image_datauri = f"data:image/png;base64,{image_base64}"
121131

122-
response = await self.openai_client.chat.completions.create(
123-
model=self.model if self.deployment is None else self.deployment,
124-
max_tokens=500,
125-
messages=[
126-
{
127-
"role": "system",
128-
"content": "You are a helpful assistant that describes images from organizational documents.",
129-
},
130-
{
131-
"role": "user",
132-
"content": [
132+
async for attempt in AsyncRetrying(
133+
retry=retry_if_exception_type(RateLimitError),
134+
wait=wait_random_exponential(min=15, max=60),
135+
stop=stop_after_attempt(15),
136+
before_sleep=before_retry_sleep,
137+
):
138+
with attempt:
139+
response = await self.openai_client.chat.completions.create(
140+
model=self.model if self.deployment is None else self.deployment,
141+
max_tokens=500,
142+
messages=[
133143
{
134-
"text": "Describe image with no more than 5 sentences. Do not speculate about anything you don't know.",
135-
"type": "text",
144+
"role": "system",
145+
"content": "You are a helpful assistant that describes images from organizational documents.",
146+
},
147+
{
148+
"role": "user",
149+
"content": [
150+
{
151+
"text": "Describe image with no more than 5 sentences. Do not speculate about anything you don't know.",
152+
"type": "text",
153+
},
154+
{"image_url": {"url": image_datauri, "detail": "auto"}, "type": "image_url"},
155+
],
136156
},
137-
{"image_url": {"url": image_datauri, "detail": "auto"}, "type": "image_url"},
138157
],
139-
},
140-
],
141-
)
158+
)
142159
description = ""
143160
if response.choices and response.choices[0].message.content:
144161
description = response.choices[0].message.content.strip()

docs/multimodal.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,13 @@ For more details on how this feature works, read [this blog post](https://techco
2727

2828
### Prerequisites
2929

30-
* Create a [AI Vision account in Azure Portal first](https://ms.portal.azure.com/#create/Microsoft.CognitiveServicesComputerVision), so that you can agree to the Responsible AI terms for that resource. You can delete that account after agreeing. (TODO: Is this still needed?)
31-
* The use of a chat completion model that supports multimodal inputs. The default model for the repository is currently `gpt-4.1-mini`, which does support multimodal inputs.
30+
* The use of a chat completion model that supports multimodal inputs. The default model for the repository is currently `gpt-4.1-mini`, which does support multimodal inputs. The `gpt-4o-mini` model technically supports multimodal inputs, but due to how image tokens are calculated, you need a much higher deployment capacity to use it effectively. Please try `gpt-4.1-mini` first, and experiment with other models later.
3231

3332
### Deployment
3433

3534
1. **Enable multimodal capabilities:**
3635

37-
First, make sure you do *not* have integrated vectorization enabled, since that is currently incompatible:
36+
First, make sure you do *not* have integrated vectorization enabled, since that is currently incompatible: (TODO!)
3837

3938
```shell
4039
azd env set USE_FEATURE_INT_VECTORIZATION false

infra/main.bicep

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1109,6 +1109,16 @@ module storageRoleSearchService 'core/security/role.bicep' = if (useIntegratedVe
11091109
}
11101110
}
11111111

1112+
module storageRoleContributorSearchService 'core/security/role.bicep' = if (useIntegratedVectorization && useMultimodal) {
1113+
scope: storageResourceGroup
1114+
name: 'storage-role-contributor-searchservice'
1115+
params: {
1116+
principalId: searchService.outputs.principalId
1117+
roleDefinitionId: 'ba92f5b4-2d11-453d-a403-e96b0029c9fe' // Storage Blob Data Contributor
1118+
principalType: 'ServicePrincipal'
1119+
}
1120+
}
1121+
11121122
// Used to issue search queries
11131123
// https://learn.microsoft.com/azure/search/search-security-rbac
11141124
module searchRoleBackend 'core/security/role.bicep' = {

0 commit comments

Comments
 (0)