Skip to content

Commit 493ece4

Browse files
committed
Integrated vectorization progress
1 parent 8ed8a63 commit 493ece4

File tree

2 files changed

+22
-3
lines changed

2 files changed

+22
-3
lines changed

app/backend/prepdocslib/integratedvectorizerstrategy.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -119,10 +119,16 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
119119
name="vision-embedding-shaper-skill",
120120
description="Shaper skill to ensure image embeddings are in the correct format",
121121
context="/document/normalized_images/*",
122-
inputs=[InputFieldMappingEntry(name="embedding", source="/document/normalized_images/*/image_vector")],
122+
inputs=[
123+
InputFieldMappingEntry(name="embedding", source="/document/normalized_images/*/image_vector"),
124+
InputFieldMappingEntry(
125+
name="url",
126+
# source=f'="{self.blob_manager.endpoint}/images/"+$(/document/normalized_images/*/imagePath)'
127+
source="=$(/document/normalized_images/*/imagePath)",
128+
),
129+
],
123130
outputs=[OutputFieldMappingEntry(name="output", target_name="images")],
124131
)
125-
# TODO: project images into a container
126132

127133
index_projection = SearchIndexerIndexProjection(
128134
selectors=[
@@ -171,6 +177,10 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
171177

172178
# We still need to map the images onto url in the images complex field type
173179
# something about key path
180+
# id = "feb5e192afb6_aHR0cHM6Ly9zdHh4azRxenEzdGFoaWMyLmJsb2IuY29yZS53aW5kb3dzLm5ldC9jb250ZW50L05vcnRod2luZF9IZWFsdGhfUGx1c19CZW5lZml0c19EZXRhaWxzLnBkZg2_pages_65",
181+
# parent_id is the folder name
182+
# https://stxxk4qzq3tahic2.blob.core.windows.net/images/aHR0cHM6Ly9zdHh4azRxenEzdGFoaWMyLmJsb2IuY29yZS53aW5kb3dzLm5ldC9jb250ZW50L0JlbmVmaXRfT3B0aW9ucy5wZGY1/normalized_images_1.jpg
183+
174184
skillset = SearchIndexerSkillset(
175185
name=self.skillset_name,
176186
description="Skillset to chunk documents and generate embeddings",
@@ -235,7 +245,7 @@ async def run(self):
235245
configuration=IndexingParametersConfiguration(
236246
query_timeout=None, # Current bug in AI Search SDK
237247
image_action=BlobIndexerImageAction.GENERATE_NORMALIZED_IMAGES,
238-
),
248+
)
239249
)
240250
}
241251

todo.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,15 @@ TODO:
2222

2323
To decide:
2424
* For the user data lake client, how often should we double-check that the ACL matches the oid, versus assuming the URLs convey that? (For example, when fetching the image?)
25+
* add a note that we only check owner, not full access control
26+
* add owner check to each blob-related method (/list, /remove)
27+
* move fetch image into blobmanager
28+
* WHY ARE THERE NO OIDS on the uploaded sections????
2529

2630
Later:
2731
Agentic: Incompatible since it doesn't retrieve images. We would need to do a follow-up search query to get each document, like filter: id eq 'x' or id eq 'y' or ...
32+
33+
34+
"id":
35+
"feb5e192afb6_aHR0cHM6Ly9zdHh4azRxenEzdGFoaWMyLmJsb2IuY29yZS53aW5kb3dzLm5ldC9jb250ZW50L05vcnRod2luZF9IZWFsdGhfUGx1c19CZW5lZml0c19EZXRhaWxzLnBkZg2_pages_65",
36+
"aHR0cHM6Ly9zdHh4azRxenEzdGFoaWMyLmJsb2IuY29yZS53aW5kb3dzLm5ldC9jb250ZW50L05vcnRod2luZF9IZWFsdGhfUGx1c19CZW5lZml0c19EZXRhaWxzLnBkZg2",

0 commit comments

Comments
 (0)