Integrated vectorization progress

pamelafox · pamelafox · commit 493ece47010f · 2025-07-03T15:00:35.000-07:00
diff --git a/app/backend/prepdocslib/integratedvectorizerstrategy.py b/app/backend/prepdocslib/integratedvectorizerstrategy.py
@@ -119,10 +119,16 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
             name="vision-embedding-shaper-skill",
             description="Shaper skill to ensure image embeddings are in the correct format",
             context="/document/normalized_images/*",
-            inputs=[InputFieldMappingEntry(name="embedding", source="/document/normalized_images/*/image_vector")],
+            inputs=[
+                InputFieldMappingEntry(name="embedding", source="/document/normalized_images/*/image_vector"),
+                InputFieldMappingEntry(
+                    name="url",
+                    # source=f'="{self.blob_manager.endpoint}/images/"+$(/document/normalized_images/*/imagePath)'
+                    source="=$(/document/normalized_images/*/imagePath)",
+                ),
+            ],
             outputs=[OutputFieldMappingEntry(name="output", target_name="images")],
         )
-        # TODO: project images into a container
 
         index_projection = SearchIndexerIndexProjection(
             selectors=[
@@ -171,6 +177,10 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
 
         # We still need to map the images onto url in the images complex field type
         # something about key path
+        # id = "feb5e192afb6_aHR0cHM6Ly9zdHh4azRxenEzdGFoaWMyLmJsb2IuY29yZS53aW5kb3dzLm5ldC9jb250ZW50L05vcnRod2luZF9IZWFsdGhfUGx1c19CZW5lZml0c19EZXRhaWxzLnBkZg2_pages_65",
+        # parent_id is the folder name
+        # https://stxxk4qzq3tahic2.blob.core.windows.net/images/aHR0cHM6Ly9zdHh4azRxenEzdGFoaWMyLmJsb2IuY29yZS53aW5kb3dzLm5ldC9jb250ZW50L0JlbmVmaXRfT3B0aW9ucy5wZGY1/normalized_images_1.jpg
+
         skillset = SearchIndexerSkillset(
             name=self.skillset_name,
             description="Skillset to chunk documents and generate embeddings",
@@ -235,7 +245,7 @@ async def run(self):
                     configuration=IndexingParametersConfiguration(
                         query_timeout=None,  # Current bug in AI Search SDK
                         image_action=BlobIndexerImageAction.GENERATE_NORMALIZED_IMAGES,
-                    ),
+                    )
                 )
             }
 
diff --git a/todo.txt b/todo.txt
@@ -22,6 +22,15 @@ TODO:
 
 To decide:
 * For user data lake client, how often should we double check the ACL matches the oid, versus assuming the URLs convey that? (Like when fetching the image?)
+   * add a note that we only check owner, not full access control
+   * add owner check to each blob-related method (/list, /remove)
+   * move fetch image into blobmanager
+   * Investigate: why are there no oids on the uploaded sections?
 
 Later:
 Agentic: Incompatible since it doesnt retrieve images. We would need to do a follow-up search query to get each document, like filter: id eq 'x' or id eq 'y' or....
+
+
+"id":
+"feb5e192afb6_aHR0cHM6Ly9zdHh4azRxenEzdGFoaWMyLmJsb2IuY29yZS53aW5kb3dzLm5ldC9jb250ZW50L05vcnRod2luZF9IZWFsdGhfUGx1c19CZW5lZml0c19EZXRhaWxzLnBkZg2_pages_65",
+"aHR0cHM6Ly9zdHh4azRxenEzdGFoaWMyLmJsb2IuY29yZS53aW5kb3dzLm5ldC9jb250ZW50L05vcnRod2luZF9IZWFsdGhfUGx1c19CZW5lZml0c19EZXRhaWxzLnBkZg2",