Skip to content

Commit d8dd729

Browse files
committed
latest changes to get auth working
1 parent 9ac595f commit d8dd729

File tree

9 files changed

+114
-41
lines changed

9 files changed

+114
-41
lines changed

app/backend/prepdocs.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -461,11 +461,11 @@ async def main(strategy: Strategy, setup_index: bool = True):
461461
raise ValueError("Cloud ingestion requires Azure OpenAI embeddings to configure the search index.")
462462

463463
document_extractor_uri = require_env_var("DOCUMENT_EXTRACTOR_SKILL_ENDPOINT")
464-
document_extractor_resource_id = require_env_var("DOCUMENT_EXTRACTOR_SKILL_RESOURCE_ID")
464+
document_extractor_resource_id = require_env_var("DOCUMENT_EXTRACTOR_SKILL_AUTH_RESOURCE_ID")
465465
figure_processor_uri = require_env_var("FIGURE_PROCESSOR_SKILL_ENDPOINT")
466-
figure_processor_resource_id = require_env_var("FIGURE_PROCESSOR_SKILL_RESOURCE_ID")
466+
figure_processor_resource_id = require_env_var("FIGURE_PROCESSOR_SKILL_AUTH_RESOURCE_ID")
467467
text_processor_uri = require_env_var("TEXT_PROCESSOR_SKILL_ENDPOINT")
468-
text_processor_resource_id = require_env_var("TEXT_PROCESSOR_SKILL_RESOURCE_ID")
468+
text_processor_resource_id = require_env_var("TEXT_PROCESSOR_SKILL_AUTH_RESOURCE_ID")
469469
search_embedding_field = require_env_var("AZURE_SEARCH_FIELD_NAME_EMBEDDING")
470470

471471
ingestion_strategy = CloudIngestionStrategy(

app/backend/prepdocslib/cloudingestionstrategy.py

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
NativeBlobSoftDeleteDeletionDetectionPolicy,
1111
)
1212
from azure.search.documents.indexes.models import (
13+
IndexingParameters,
14+
IndexingParametersConfiguration,
1315
IndexProjectionMode,
1416
InputFieldMappingEntry,
1517
OutputFieldMappingEntry,
@@ -102,33 +104,23 @@ def __init__(
102104
self.indexer_name = f"{prefix}-indexer"
103105
self.data_source_name = f"{prefix}-blob"
104106

105-
def _ensure_default_scope(val: str) -> str:
106-
# If already ends with '/.default' keep as-is.
107-
if val.endswith("/.default"):
108-
return val
109-
# If already ends with '.default' without the leading slash (rare variant), keep as-is.
110-
if val.endswith(".default"):
111-
return val
112-
# Append '/.default' consistently (works for both raw appId and api://appId forms).
113-
return f"{val}/.default"
114-
115107
self.document_extractor = _SkillConfig(
116108
name=f"{prefix}-document-extractor-skill",
117109
description="Custom skill that downloads and parses source documents",
118110
uri=document_extractor_uri,
119-
auth_resource_id=_ensure_default_scope(document_extractor_auth_resource_id),
111+
auth_resource_id=document_extractor_auth_resource_id,
120112
)
121113
self.figure_processor = _SkillConfig(
122114
name=f"{prefix}-figure-processor-skill",
123115
description="Custom skill that enriches individual figures",
124116
uri=figure_processor_uri,
125-
auth_resource_id=_ensure_default_scope(figure_processor_auth_resource_id),
117+
auth_resource_id=figure_processor_auth_resource_id,
126118
)
127119
self.text_processor = _SkillConfig(
128120
name=f"{prefix}-text-processor-skill",
129121
description="Custom skill that merges figures, chunks text, and generates embeddings",
130122
uri=text_processor_uri,
131-
auth_resource_id=_ensure_default_scope(text_processor_auth_resource_id),
123+
auth_resource_id=text_processor_auth_resource_id,
132124
)
133125

134126
self._search_manager: SearchManager | None = None
@@ -166,12 +158,10 @@ def _build_document_extractor_skill(self) -> WebApiSkill:
166158
# Managed identity: Search service authenticates against the function app using this resource ID.
167159
auth_resource_id=self.document_extractor.auth_resource_id,
168160
inputs=[
169-
InputFieldMappingEntry(name="blobUrl", source="/document/metadata_storage_path"),
161+
# Provide the binary payload expected by the document extractor custom skill.
162+
InputFieldMappingEntry(name="file_data", source="/document/file_data"),
170163
InputFieldMappingEntry(name="file_name", source="/document/metadata_storage_name"),
171164
InputFieldMappingEntry(name="content_type", source="/document/metadata_storage_content_type"),
172-
InputFieldMappingEntry(
173-
name="metadata_storage_sas_token", source="/document/metadata_storage_sas_token"
174-
),
175165
],
176166
outputs=outputs,
177167
)
@@ -310,6 +300,15 @@ async def run(self) -> None:
310300
data_source_name=self.data_source_name,
311301
target_index_name=self.search_info.index_name,
312302
skillset_name=self.skillset_name,
303+
parameters=IndexingParameters(
304+
configuration=IndexingParametersConfiguration(
305+
query_timeout=None,
306+
# markdown_parsing_submode=None,
307+
data_to_extract="contentAndMetadata",
308+
# markdown_header_depth=None,
309+
allow_skillset_to_read_file_data=True,
310+
)
311+
),
313312
)
314313

315314
async with self.search_info.create_search_indexer_client() as indexer_client:

app/functions/document_extractor/function_app.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from prepdocslib.ingestionhelpers import select_parser
1818
from prepdocslib.page import Page
1919

20-
app = func.FunctionApp()
20+
app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS)
2121

2222
logger = logging.getLogger(__name__)
2323

@@ -38,7 +38,7 @@
3838

3939

4040
@app.function_name(name="extract")
41-
@app.route(route="extract", methods=["POST"])
41+
@app.route(route="extract", methods=["POST"], auth_level=func.AuthLevel.ANONYMOUS)
4242
async def extract_document(req: func.HttpRequest) -> func.HttpResponse:
4343
"""
4444
Azure Search Custom Skill: Extract document content

app/functions/document_extractor/host.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
{
22
"version": "2.0",
3+
"extensions": {
4+
"mcp": {
5+
"system": {
6+
"webhookAuthorizationLevel": "anonymous"
7+
}
8+
}
9+
},
310
"extensionBundle": {
411
"id": "Microsoft.Azure.Functions.ExtensionBundle",
512
"version": "[4.*, 5.0.0)"

app/functions/figure_processor/function_app.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@
3131
setup_openai_client,
3232
)
3333

34-
app = func.FunctionApp()
34+
# Use the anonymous function auth level (no function key required); requests are authenticated by App Service built-in auth instead.
35+
app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS)
3536

3637
logger = logging.getLogger(__name__)
3738

@@ -118,7 +119,7 @@
118119

119120

120121
@app.function_name(name="process_figure")
121-
@app.route(route="process", methods=["POST"])
122+
@app.route(route="process", methods=["POST"], auth_level=func.AuthLevel.ANONYMOUS)
122123
async def process_figure_request(req: func.HttpRequest) -> func.HttpResponse:
123124
"""Entrypoint for Azure Search custom skill calls."""
124125

app/functions/figure_processor/host.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
{
22
"version": "2.0",
3+
"extensions": {
4+
"mcp": {
5+
"system": {
6+
"webhookAuthorizationLevel": "anonymous"
7+
}
8+
}
9+
},
310
"extensionBundle": {
411
"id": "Microsoft.Azure.Functions.ExtensionBundle",
512
"version": "[4.*, 5.0.0)"

app/functions/text_processor/function_app.py

Lines changed: 30 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@
2121
from prepdocslib.textprocessor import process_text
2222
from prepdocslib.textsplitter import SentenceTextSplitter
2323

24-
app = func.FunctionApp()
24+
# Use the anonymous function auth level (no function key required); requests are authenticated by App Service built-in auth instead.
25+
app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS)
2526

2627
logger = logging.getLogger(__name__)
2728

@@ -33,7 +34,7 @@
3334
AZURE_OPENAI_EMB_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT", "")
3435
AZURE_OPENAI_EMB_MODEL_NAME = os.getenv("AZURE_OPENAI_EMB_MODEL_NAME", "text-embedding-3-large")
3536
AZURE_OPENAI_EMB_DIMENSIONS = int(os.getenv("AZURE_OPENAI_EMB_DIMENSIONS", "3072"))
36-
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "")
37+
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-06-01")
3738

3839
GLOBAL_CREDENTIAL: ManagedIdentityCredential | None
3940
EMBEDDING_SERVICE: AzureOpenAIEmbeddingService | None
@@ -84,7 +85,7 @@
8485

8586

8687
@app.function_name(name="process_text")
87-
@app.route(route="process", methods=["POST"])
88+
@app.route(route="process", methods=["POST"], auth_level=func.AuthLevel.ANONYMOUS)
8889
async def process_text_entry(req: func.HttpRequest) -> func.HttpResponse:
8990
"""Azure Search custom skill entry point for chunking and embeddings."""
9091

@@ -169,7 +170,7 @@ async def _process_document(data: dict[str, Any]) -> list[dict[str, Any]]:
169170
if not figure_payload:
170171
logger.warning("Figure ID %s not found in figures metadata for page %d", fid, page_num)
171172
continue
172-
image_on_page = ImageOnPage.from_skill_payload(figure_payload)
173+
image_on_page, _ = ImageOnPage.from_skill_payload(figure_payload)
173174
page_obj.images.append(image_on_page)
174175
pages.append(page_obj)
175176

@@ -202,7 +203,7 @@ async def _process_document(data: dict[str, Any]) -> list[dict[str, Any]]:
202203
content = section.chunk.text.strip()
203204
if not content:
204205
continue
205-
embedding_vec = embeddings[idx] if embeddings else []
206+
embedding_vec = embeddings[idx] if embeddings else None
206207
image_refs: list[dict[str, Any]] = []
207208
for image in section.chunk.images:
208209
ref = {
@@ -216,16 +217,29 @@ async def _process_document(data: dict[str, Any]) -> list[dict[str, Any]]:
216217
if USE_MULTIMODAL and image.embedding is not None:
217218
ref["imageEmbedding"] = image.embedding
218219
image_refs.append(ref)
219-
outputs.append(
220-
{
221-
"id": f"{normalized_id}-{idx:04d}",
222-
"content": content,
223-
"embedding": embedding_vec,
224-
"sourcepage": BlobManager.sourcepage_from_file_page(file_name, section.chunk.page_num),
225-
"sourcefile": file_name,
226-
"parent_id": storage_url,
227-
**({"images": image_refs} if image_refs else {}),
228-
}
229-
)
220+
chunk_entry: dict[str, Any] = {
221+
"id": f"{normalized_id}-{idx:04d}",
222+
"content": content,
223+
"sourcepage": BlobManager.sourcepage_from_file_page(file_name, section.chunk.page_num),
224+
"sourcefile": file_name,
225+
"parent_id": storage_url,
226+
**({"images": image_refs} if image_refs else {}),
227+
}
228+
229+
if embedding_vec is not None:
230+
if len(embedding_vec) == AZURE_OPENAI_EMB_DIMENSIONS:
231+
chunk_entry["embedding"] = embedding_vec
232+
else:
233+
logger.warning(
234+
"Skipping embedding for %s chunk %d due to dimension mismatch (expected %d, got %d)",
235+
file_name,
236+
idx,
237+
AZURE_OPENAI_EMB_DIMENSIONS,
238+
len(embedding_vec),
239+
)
240+
elif USE_VECTORS:
241+
logger.warning("Embeddings were requested but missing for %s chunk %d", file_name, idx)
242+
243+
outputs.append(chunk_entry)
230244

231245
return outputs

app/functions/text_processor/host.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
{
22
"version": "2.0",
3+
"extensions": {
4+
"mcp": {
5+
"system": {
6+
"webhookAuthorizationLevel": "anonymous"
7+
}
8+
}
9+
},
310
"extensionBundle": {
411
"id": "Microsoft.Azure.Functions.ExtensionBundle",
512
"version": "[4.*, 5.0.0)"

infra/app/functions-app.bicep

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ var baseAppSettings = {
4949
AzureWebJobsStorage__queueServiceUri: stg.properties.primaryEndpoints.queue
5050
AzureWebJobsStorage__tableServiceUri: stg.properties.primaryEndpoints.table
5151
FUNCTIONS_EXTENSION_VERSION: '~4'
52+
AZURE_CLIENT_ID: identityClientId
5253
}
5354

5455
// Optional Application Insights settings
@@ -59,6 +60,8 @@ var appInsightsSettings = !empty(applicationInsightsName) ? {
5960

6061
var easyAuthSettings = {
6162
OVERRIDE_USE_MI_FIC_ASSERTION_CLIENTID: identityClientId
63+
WEBSITE_AUTH_PRM_DEFAULT_WITH_SCOPES: '${authIdentifierUri}/user_impersonation'
64+
WEBSITE_AUTH_AAD_ALLOWED_TENANTS: authTenantId
6265
}
6366

6467
// Merge all app settings
@@ -124,6 +127,15 @@ resource auth 'Microsoft.Web/sites/config@2022-03-01' = {
124127
unauthenticatedClientAction: 'Return401'
125128
redirectToProvider: 'azureactivedirectory'
126129
}
130+
httpSettings: {
131+
requireHttps: true
132+
routes: {
133+
apiPrefix: '/.auth'
134+
}
135+
forwardProxy: {
136+
convention: 'NoProxy'
137+
}
138+
}
127139
identityProviders: {
128140
azureActiveDirectory: {
129141
enabled: true
@@ -139,10 +151,36 @@ resource auth 'Microsoft.Web/sites/config@2022-03-01' = {
139151
]
140152
defaultAuthorizationPolicy: {
141153
allowedPrincipals: {}
142-
allowedApplications: [authClientId]
154+
allowedApplications: null // TODO: Restrict to the AI Search app's client ID (previously [authClientId])
143155
}
144156
}
157+
isAutoProvisioned: false
158+
}
159+
}
160+
login: {
161+
routes: {
162+
logoutEndpoint: '/.auth/logout'
145163
}
164+
tokenStore: {
165+
enabled: true
166+
tokenRefreshExtensionHours: 72
167+
fileSystem: {}
168+
azureBlobStorage: {}
169+
}
170+
preserveUrlFragmentsForLogins: false
171+
allowedExternalRedirectUrls: []
172+
cookieExpiration: {
173+
convention: 'FixedTime'
174+
timeToExpiration: '08:00:00'
175+
}
176+
nonce: {
177+
validateNonce: true
178+
nonceExpirationInterval: '00:05:00'
179+
}
180+
}
181+
platform: {
182+
enabled: true
183+
runtimeVersion: '~1'
146184
}
147185
}
148186
}

0 commit comments

Comments
 (0)