from prepdocslib.textprocessor import process_text
from prepdocslib.textsplitter import SentenceTextSplitter

-app = func.FunctionApp()
+# Mark the function as anonymous since we are protecting it with built-in auth instead
+app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS)

logger = logging.getLogger(__name__)

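Since the host now runs with `AuthLevel.ANONYMOUS`, callers no longer present a function key; protection is expected to come from the platform's built-in authentication in front of the app. Below is a rough sketch of how a test client could call the skill endpoint with a bearer token instead of `x-functions-key`. The URL, the audience, and the payload fields are placeholders, not values taken from this PR.

```python
# Sketch only: calling the skill endpoint once built-in auth fronts the function app.
# The function URL and application ID URI are hypothetical placeholders.
import requests
from azure.identity import DefaultAzureCredential

FUNCTION_URL = "https://<function-app>.azurewebsites.net/api/process"  # placeholder host
APP_ID_URI = "api://<app-registration-client-id>"  # audience configured on built-in auth

token = DefaultAzureCredential().get_token(f"{APP_ID_URI}/.default").token

# Skill inputs omitted here; the actual "data" fields depend on the skillset definition.
payload = {"values": [{"recordId": "1", "data": {}}]}

resp = requests.post(
    FUNCTION_URL,
    json=payload,
    headers={"Authorization": f"Bearer {token}"},  # no x-functions-key at ANONYMOUS level
    timeout=30,
)
resp.raise_for_status()
print(resp.json())
```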
AZURE_OPENAI_EMB_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT", "")
AZURE_OPENAI_EMB_MODEL_NAME = os.getenv("AZURE_OPENAI_EMB_MODEL_NAME", "text-embedding-3-large")
AZURE_OPENAI_EMB_DIMENSIONS = int(os.getenv("AZURE_OPENAI_EMB_DIMENSIONS", "3072"))
-AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "")
+AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-06-01")

GLOBAL_CREDENTIAL: ManagedIdentityCredential | None
EMBEDDING_SERVICE: AzureOpenAIEmbeddingService | None
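The 3072 default for `AZURE_OPENAI_EMB_DIMENSIONS` matches `text-embedding-3-large`, but the two settings can drift apart if the deployment is overridden. A small startup sanity check like the sketch below (not part of this diff) would surface that early; the model-to-dimension table only covers the common Azure OpenAI embedding models.

```python
# Optional startup check (sketch, not in this PR): warn if the configured dimensions
# don't match the model's native output size.
import logging
import os

logger = logging.getLogger(__name__)

# Native output dimensions for common Azure OpenAI embedding models.
KNOWN_MODEL_DIMENSIONS = {
    "text-embedding-ada-002": 1536,
    "text-embedding-3-small": 1536,
    "text-embedding-3-large": 3072,
}

model_name = os.getenv("AZURE_OPENAI_EMB_MODEL_NAME", "text-embedding-3-large")
dimensions = int(os.getenv("AZURE_OPENAI_EMB_DIMENSIONS", "3072"))

expected = KNOWN_MODEL_DIMENSIONS.get(model_name)
if expected is not None and dimensions != expected:
    # text-embedding-3-* models accept a reduced "dimensions" parameter, so a mismatch
    # is not necessarily wrong, but the search index vector field must agree with it.
    logger.warning(
        "AZURE_OPENAI_EMB_DIMENSIONS=%d differs from %s's native %d", dimensions, model_name, expected
    )
```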


@app.function_name(name="process_text")
-@app.route(route="process", methods=["POST"])
+@app.route(route="process", methods=["POST"], auth_level=func.AuthLevel.ANONYMOUS)
async def process_text_entry(req: func.HttpRequest) -> func.HttpResponse:
    """Azure Search custom skill entry point for chunking and embeddings."""

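Only the decorator and signature of the entry point are visible in this hunk. As a Web API custom skill target, it receives and must return the custom skill JSON envelope: a `values` array keyed by `recordId`. The sketch below only illustrates that envelope with a hypothetical async per-record processor; the real handler in this PR does the chunking and embedding work.

```python
# Minimal sketch of the custom skill request/response envelope; "process_record" is a
# hypothetical async callable standing in for the real chunking/embedding logic.
import json
from typing import Any

import azure.functions as func


async def handle_skill_request(req: func.HttpRequest, process_record) -> func.HttpResponse:
    body = req.get_json()
    results: list[dict[str, Any]] = []
    for record in body.get("values", []):
        record_id = record.get("recordId")
        try:
            data = await process_record(record.get("data", {}))
            results.append({"recordId": record_id, "data": data})
        except Exception as exc:  # a real handler would be more selective
            results.append({"recordId": record_id, "data": {}, "errors": [{"message": str(exc)}]})
    return func.HttpResponse(json.dumps({"values": results}), mimetype="application/json")
```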
@@ -169,7 +170,7 @@ async def _process_document(data: dict[str, Any]) -> list[dict[str, Any]]:
            if not figure_payload:
                logger.warning("Figure ID %s not found in figures metadata for page %d", fid, page_num)
                continue
-            image_on_page = ImageOnPage.from_skill_payload(figure_payload)
+            image_on_page, _ = ImageOnPage.from_skill_payload(figure_payload)
            page_obj.images.append(image_on_page)
        pages.append(page_obj)

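This hunk tracks an API change in prepdocslib: `ImageOnPage.from_skill_payload` now apparently returns a tuple whose first element is the `ImageOnPage`, and the second value is unused here. If both return shapes ever need to be supported at a call site, a tiny shim like this hypothetical helper would do; it is purely illustrative, the real signature lives in prepdocslib.

```python
# Hypothetical compatibility shim: accept either a bare ImageOnPage (old API) or a
# (ImageOnPage, extra) tuple (new API) and return just the image object.
def unwrap_image(result):
    return result[0] if isinstance(result, tuple) else result
```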
@@ -202,7 +203,7 @@ async def _process_document(data: dict[str, Any]) -> list[dict[str, Any]]:
        content = section.chunk.text.strip()
        if not content:
            continue
-        embedding_vec = embeddings[idx] if embeddings else []
+        embedding_vec = embeddings[idx] if embeddings else None
        image_refs: list[dict[str, Any]] = []
        for image in section.chunk.images:
            ref = {
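Switching the "no embedding" sentinel from `[]` to `None` matters for the gate added further down: an empty list is falsy but not `None`, so with `[]` the old code always wrote an (empty) `embedding` field, while `None` lets the new code omit the field entirely. A two-line illustration of the difference:

```python
# Why None rather than [] as the sentinel: truthiness and identity checks disagree on [].
embedding_vec = []
print(bool(embedding_vec))        # False -> looks like "no embedding"
print(embedding_vec is not None)  # True  -> yet it would still be written to the index
```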
@@ -216,16 +217,29 @@ async def _process_document(data: dict[str, Any]) -> list[dict[str, Any]]:
            if USE_MULTIMODAL and image.embedding is not None:
                ref["imageEmbedding"] = image.embedding
            image_refs.append(ref)
-        outputs.append(
-            {
-                "id": f"{normalized_id}-{idx:04d}",
-                "content": content,
-                "embedding": embedding_vec,
-                "sourcepage": BlobManager.sourcepage_from_file_page(file_name, section.chunk.page_num),
-                "sourcefile": file_name,
-                "parent_id": storage_url,
-                **({"images": image_refs} if image_refs else {}),
-            }
-        )
+        chunk_entry: dict[str, Any] = {
+            "id": f"{normalized_id}-{idx:04d}",
+            "content": content,
+            "sourcepage": BlobManager.sourcepage_from_file_page(file_name, section.chunk.page_num),
+            "sourcefile": file_name,
+            "parent_id": storage_url,
+            **({"images": image_refs} if image_refs else {}),
+        }
+
+        if embedding_vec is not None:
+            if len(embedding_vec) == AZURE_OPENAI_EMB_DIMENSIONS:
+                chunk_entry["embedding"] = embedding_vec
+            else:
+                logger.warning(
+                    "Skipping embedding for %s chunk %d due to dimension mismatch (expected %d, got %d)",
+                    file_name,
+                    idx,
+                    AZURE_OPENAI_EMB_DIMENSIONS,
+                    len(embedding_vec),
+                )
+        elif USE_VECTORS:
+            logger.warning("Embeddings were requested but missing for %s chunk %d", file_name, idx)
+
+        outputs.append(chunk_entry)

    return outputs
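For reference, one element of `outputs` ends up shaped roughly like the record below. The values are invented: the `id` prefix depends on `normalized_id`, the `sourcepage` string is whatever `BlobManager.sourcepage_from_file_page` returns, and `embedding`/`images` appear only when the conditions above are met.

```python
# Invented example record, shown only to make the emitted chunk shape concrete.
example_chunk = {
    "id": "report_pdf-0003",              # f"{normalized_id}-{idx:04d}"
    "content": "Text of the fourth chunk of the document.",
    "embedding": [0.012, -0.034, 0.007],  # truncated; only written when len() == AZURE_OPENAI_EMB_DIMENSIONS
    "sourcepage": "report.pdf#page=4",    # as produced by sourcepage_from_file_page
    "sourcefile": "report.pdf",
    "parent_id": "https://<storage-account>.blob.core.windows.net/content/report.pdf",
    # "images": [...]                     # present only when the chunk carries figure references
}
```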