from prepdocslib.textprocessor import process_text
from prepdocslib.textsplitter import SentenceTextSplitter

-app = func.FunctionApp()
+# Mark the function as anonymous since we are protecting it with built-in auth instead
+app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS)

logger = logging.getLogger(__name__)

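Since the host now runs with `AuthLevel.ANONYMOUS`, callers no longer present a function key; protection is expected to come from the platform's built-in authentication in front of the app. Below is a rough sketch of how a test client could call the skill endpoint with a bearer token instead of `x-functions-key`. The URL, the audience, and the payload fields are placeholders, not values taken from this PR.

```python
# Sketch only: calling the skill endpoint once built-in auth fronts the function app.
# The function URL and application ID URI are hypothetical placeholders.
import requests
from azure.identity import DefaultAzureCredential

FUNCTION_URL = "https://<function-app>.azurewebsites.net/api/process"  # placeholder host
APP_ID_URI = "api://<app-registration-client-id>"  # audience configured on built-in auth

token = DefaultAzureCredential().get_token(f"{APP_ID_URI}/.default").token

# Skill inputs omitted here; the actual "data" fields depend on the skillset definition.
payload = {"values": [{"recordId": "1", "data": {}}]}

resp = requests.post(
    FUNCTION_URL,
    json=payload,
    headers={"Authorization": f"Bearer {token}"},  # no x-functions-key at ANONYMOUS level
    timeout=30,
)
resp.raise_for_status()
print(resp.json())
```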
AZURE_OPENAI_EMB_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT", "")
AZURE_OPENAI_EMB_MODEL_NAME = os.getenv("AZURE_OPENAI_EMB_MODEL_NAME", "text-embedding-3-large")
AZURE_OPENAI_EMB_DIMENSIONS = int(os.getenv("AZURE_OPENAI_EMB_DIMENSIONS", "3072"))
-AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "")
+AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-06-01")

GLOBAL_CREDENTIAL: ManagedIdentityCredential | None
EMBEDDING_SERVICE: AzureOpenAIEmbeddingService | None
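The 3072 default for `AZURE_OPENAI_EMB_DIMENSIONS` matches `text-embedding-3-large`, but the two settings can drift apart if the deployment is overridden. A small startup sanity check like the sketch below (not part of this diff) would surface that early; the model-to-dimension table only covers the common Azure OpenAI embedding models.

```python
# Optional startup check (sketch, not in this PR): warn if the configured dimensions
# don't match the model's native output size.
import logging
import os

logger = logging.getLogger(__name__)

# Native output dimensions for common Azure OpenAI embedding models.
KNOWN_MODEL_DIMENSIONS = {
    "text-embedding-ada-002": 1536,
    "text-embedding-3-small": 1536,
    "text-embedding-3-large": 3072,
}

model_name = os.getenv("AZURE_OPENAI_EMB_MODEL_NAME", "text-embedding-3-large")
dimensions = int(os.getenv("AZURE_OPENAI_EMB_DIMENSIONS", "3072"))

expected = KNOWN_MODEL_DIMENSIONS.get(model_name)
if expected is not None and dimensions != expected:
    # text-embedding-3-* models accept a reduced "dimensions" parameter, so a mismatch
    # is not necessarily wrong, but the search index vector field must agree with it.
    logger.warning(
        "AZURE_OPENAI_EMB_DIMENSIONS=%d differs from %s's native %d", dimensions, model_name, expected
    )
```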


@app.function_name(name="process_text")
-@app.route(route="process", methods=["POST"])
+@app.route(route="process", methods=["POST"], auth_level=func.AuthLevel.ANONYMOUS)
async def process_text_entry(req: func.HttpRequest) -> func.HttpResponse:
    """Azure Search custom skill entry point for chunking and embeddings."""

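Only the decorator and signature of the entry point are visible in this hunk. As a Web API custom skill target, it receives and must return the custom skill JSON envelope: a `values` array keyed by `recordId`. The sketch below only illustrates that envelope with a hypothetical async per-record processor; the real handler in this PR does the chunking and embedding work.

```python
# Minimal sketch of the custom skill request/response envelope; "process_record" is a
# hypothetical async callable standing in for the real chunking/embedding logic.
import json
from typing import Any

import azure.functions as func


async def handle_skill_request(req: func.HttpRequest, process_record) -> func.HttpResponse:
    body = req.get_json()
    results: list[dict[str, Any]] = []
    for record in body.get("values", []):
        record_id = record.get("recordId")
        try:
            data = await process_record(record.get("data", {}))
            results.append({"recordId": record_id, "data": data})
        except Exception as exc:  # a real handler would be more selective
            results.append({"recordId": record_id, "data": {}, "errors": [{"message": str(exc)}]})
    return func.HttpResponse(json.dumps({"values": results}), mimetype="application/json")
```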
@@ -169,7 +170,7 @@ async def _process_document(data: dict[str, Any]) -> list[dict[str, Any]]:
            if not figure_payload:
                logger.warning("Figure ID %s not found in figures metadata for page %d", fid, page_num)
                continue
-            image_on_page = ImageOnPage.from_skill_payload(figure_payload)
+            image_on_page, _ = ImageOnPage.from_skill_payload(figure_payload)
            page_obj.images.append(image_on_page)
        pages.append(page_obj)

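This hunk tracks an API change in prepdocslib: `ImageOnPage.from_skill_payload` now apparently returns a tuple whose first element is the `ImageOnPage`, and the second value is unused here. If both return shapes ever need to be supported at a call site, a tiny shim like this hypothetical helper would do; it is purely illustrative, the real signature lives in prepdocslib.

```python
# Hypothetical compatibility shim: accept either a bare ImageOnPage (old API) or a
# (ImageOnPage, extra) tuple (new API) and return just the image object.
def unwrap_image(result):
    return result[0] if isinstance(result, tuple) else result
```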
@@ -202,7 +203,7 @@ async def _process_document(data: dict[str, Any]) -> list[dict[str, Any]]:
        content = section.chunk.text.strip()
        if not content:
            continue
-        embedding_vec = embeddings[idx] if embeddings else []
+        embedding_vec = embeddings[idx] if embeddings else None
        image_refs: list[dict[str, Any]] = []
        for image in section.chunk.images:
            ref = {
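Switching the "no embedding" sentinel from `[]` to `None` matters for the gate added further down: an empty list is falsy but not `None`, so with `[]` the old code always wrote an (empty) `embedding` field, while `None` lets the new code omit the field entirely. A two-line illustration of the difference:

```python
# Why None rather than [] as the sentinel: truthiness and identity checks disagree on [].
embedding_vec = []
print(bool(embedding_vec))        # False -> looks like "no embedding"
print(embedding_vec is not None)  # True  -> yet it would still be written to the index
```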
@@ -216,16 +217,29 @@ async def _process_document(data: dict[str, Any]) -> list[dict[str, Any]]:
            if USE_MULTIMODAL and image.embedding is not None:
                ref["imageEmbedding"] = image.embedding
            image_refs.append(ref)
-        outputs.append(
-            {
-                "id": f"{normalized_id}-{idx:04d}",
-                "content": content,
-                "embedding": embedding_vec,
-                "sourcepage": BlobManager.sourcepage_from_file_page(file_name, section.chunk.page_num),
-                "sourcefile": file_name,
-                "parent_id": storage_url,
-                **({"images": image_refs} if image_refs else {}),
-            }
-        )
+        chunk_entry: dict[str, Any] = {
+            "id": f"{normalized_id}-{idx:04d}",
+            "content": content,
+            "sourcepage": BlobManager.sourcepage_from_file_page(file_name, section.chunk.page_num),
+            "sourcefile": file_name,
+            "parent_id": storage_url,
+            **({"images": image_refs} if image_refs else {}),
+        }
+
+        if embedding_vec is not None:
+            if len(embedding_vec) == AZURE_OPENAI_EMB_DIMENSIONS:
+                chunk_entry["embedding"] = embedding_vec
+            else:
+                logger.warning(
+                    "Skipping embedding for %s chunk %d due to dimension mismatch (expected %d, got %d)",
+                    file_name,
+                    idx,
+                    AZURE_OPENAI_EMB_DIMENSIONS,
+                    len(embedding_vec),
+                )
+        elif USE_VECTORS:
+            logger.warning("Embeddings were requested but missing for %s chunk %d", file_name, idx)
+
+        outputs.append(chunk_entry)

    return outputs
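For reference, one element of `outputs` ends up shaped roughly like the record below. The values are invented: the `id` prefix depends on `normalized_id`, the `sourcepage` string is whatever `BlobManager.sourcepage_from_file_page` returns, and `embedding`/`images` appear only when the conditions above are met.

```python
# Invented example record, shown only to make the emitted chunk shape concrete.
example_chunk = {
    "id": "report_pdf-0003",              # f"{normalized_id}-{idx:04d}"
    "content": "Text of the fourth chunk of the document.",
    "embedding": [0.012, -0.034, 0.007],  # truncated; only written when len() == AZURE_OPENAI_EMB_DIMENSIONS
    "sourcepage": "report.pdf#page=4",    # as produced by sourcepage_from_file_page
    "sourcefile": "report.pdf",
    "parent_id": "https://<storage-account>.blob.core.windows.net/content/report.pdf",
    # "images": [...]                     # present only when the chunk carries figure references
}
```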