Apply acls to citations (#1160)

mattgotteiner · pamelafox · web-flow · commit bec59be74108 · 2024-01-24T17:39:29.000-08:00
* working

* object urls

* fixes and feedback

* fix typing errors

* adding more tests

* Update app/backend/app.py

* Update app/backend/app.py

Co-authored-by: Pamela Fox <pamela.fox@gmail.com>

* addressing feedback

* more feedback

---------

Co-authored-by: Matt Gotteiner &lt;[email protected]&gt;
Co-authored-by: Pamela Fox <pamela.fox@gmail.com>
diff --git a/app/backend/app.py b/app/backend/app.py
@@ -5,7 +5,7 @@
 import mimetypes
 import os
 from pathlib import Path
-from typing import AsyncGenerator, cast
+from typing import Any, AsyncGenerator, Dict, cast
 
 from azure.core.exceptions import ResourceNotFoundError
 from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider
@@ -14,7 +14,7 @@
 from azure.search.documents.aio import SearchClient
 from azure.search.documents.indexes.aio import SearchIndexClient
 from azure.storage.blob.aio import BlobServiceClient
-from openai import APIError, AsyncAzureOpenAI, AsyncOpenAI
+from openai import AsyncAzureOpenAI, AsyncOpenAI
 from opentelemetry.instrumentation.aiohttp_client import AioHttpClientInstrumentor
 from opentelemetry.instrumentation.asgi import OpenTelemetryMiddleware
 from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
@@ -36,24 +36,20 @@
 from approaches.chatreadretrievereadvision import ChatReadRetrieveReadVisionApproach
 from approaches.retrievethenread import RetrieveThenReadApproach
 from approaches.retrievethenreadvision import RetrieveThenReadVisionApproach
+from config import (
+    CONFIG_ASK_APPROACH,
+    CONFIG_ASK_VISION_APPROACH,
+    CONFIG_AUTH_CLIENT,
+    CONFIG_BLOB_CONTAINER_CLIENT,
+    CONFIG_CHAT_APPROACH,
+    CONFIG_CHAT_VISION_APPROACH,
+    CONFIG_GPT4V_DEPLOYED,
+    CONFIG_OPENAI_CLIENT,
+    CONFIG_SEARCH_CLIENT,
+)
 from core.authentication import AuthenticationHelper
-
-CONFIG_OPENAI_TOKEN = "openai_token"
-CONFIG_CREDENTIAL = "azure_credential"
-CONFIG_ASK_APPROACH = "ask_approach"
-CONFIG_ASK_VISION_APPROACH = "ask_vision_approach"
-CONFIG_CHAT_VISION_APPROACH = "chat_vision_approach"
-CONFIG_CHAT_APPROACH = "chat_approach"
-CONFIG_BLOB_CONTAINER_CLIENT = "blob_container_client"
-CONFIG_AUTH_CLIENT = "auth_client"
-CONFIG_GPT4V_DEPLOYED = "gpt4v_deployed"
-CONFIG_SEARCH_CLIENT = "search_client"
-CONFIG_OPENAI_CLIENT = "openai_client"
-ERROR_MESSAGE = """The app encountered an error processing your request.
-If you are an administrator of the app, view the full error in the logs. See aka.ms/appservice-logs for more information.
-Error type: {error_type}
-"""
-ERROR_MESSAGE_FILTER = """Your message contains content that was flagged by the OpenAI content filter."""
+from decorators import authenticated, authenticated_path
+from error import error_dict, error_response
 
 bp = Blueprint("routes", __name__, static_folder="static")
 # Fix Windows registry issue with mimetypes
@@ -83,11 +79,16 @@ async def assets(path):
     return await send_from_directory(Path(__file__).resolve().parent / "static" / "assets", path)
 
 
-# Serve content files from blob storage from within the app to keep the example self-contained.
-# *** NOTE *** this assumes that the content files are public, or at least that all users of the app
-# can access all the files. This is also slow and memory hungry.
 @bp.route("/content/<path>")
+@authenticated_path
 async def content_file(path: str):
+    """
+    Serve content files from blob storage from within the app to keep the example self-contained.
+    *** NOTE *** if you are using app services authentication, this route will return unauthorized to all users that are not logged in
+    if AZURE_ENFORCE_ACCESS_CONTROL is not set or false, logged in users can access all files regardless of access control
+    if AZURE_ENFORCE_ACCESS_CONTROL is set to true, logged in users can only access files they have access to
+    This is also slow and memory hungry.
+    """
     # Remove page number from path, filename-1.txt -> filename.txt
     if path.find("#page=") > 0:
         path_parts = path.rsplit("#page=", 1)
@@ -110,28 +111,15 @@ async def content_file(path: str):
     return await send_file(blob_file, mimetype=mime_type, as_attachment=False, attachment_filename=path)
 
 
-def error_dict(error: Exception) -> dict:
-    if isinstance(error, APIError) and error.code == "content_filter":
-        return {"error": ERROR_MESSAGE_FILTER}
-    return {"error": ERROR_MESSAGE.format(error_type=type(error))}
-
-
-def error_response(error: Exception, route: str, status_code: int = 500):
-    logging.exception("Exception in %s: %s", route, error)
-    if isinstance(error, APIError) and error.code == "content_filter":
-        status_code = 400
-    return jsonify(error_dict(error)), status_code
-
-
 @bp.route("/ask", methods=["POST"])
-async def ask():
+@authenticated
+async def ask(auth_claims: Dict[str, Any]):
     if not request.is_json:
         return jsonify({"error": "request must be json"}), 415
     request_json = await request.get_json()
     context = request_json.get("context", {})
-    auth_helper = current_app.config[CONFIG_AUTH_CLIENT]
+    context["auth_claims"] = auth_claims
     try:
-        context["auth_claims"] = await auth_helper.get_auth_claims_if_enabled(request.headers)
         use_gpt4v = context.get("overrides", {}).get("use_gpt4v", False)
         approach: Approach
         if use_gpt4v and CONFIG_ASK_VISION_APPROACH in current_app.config:
@@ -163,14 +151,14 @@ async def format_as_ndjson(r: AsyncGenerator[dict, None]) -> AsyncGenerator[str,
 
 
 @bp.route("/chat", methods=["POST"])
-async def chat():
+@authenticated
+async def chat(auth_claims: Dict[str, Any]):
     if not request.is_json:
         return jsonify({"error": "request must be json"}), 415
     request_json = await request.get_json()
     context = request_json.get("context", {})
-    auth_helper = current_app.config[CONFIG_AUTH_CLIENT]
+    context["auth_claims"] = auth_claims
     try:
-        context["auth_claims"] = await auth_helper.get_auth_claims_if_enabled(request.headers)
         use_gpt4v = context.get("overrides", {}).get("use_gpt4v", False)
         approach: Approach
         if use_gpt4v and CONFIG_CHAT_VISION_APPROACH in current_app.config:
diff --git a/app/backend/config.py b/app/backend/config.py
@@ -0,0 +1,11 @@
+CONFIG_OPENAI_TOKEN = "openai_token"  # NOTE(review): not in app.py's import list from config in this change - confirm still used
+CONFIG_CREDENTIAL = "azure_credential"  # NOTE(review): not in app.py's import list from config in this change - confirm still used
+CONFIG_ASK_APPROACH = "ask_approach"
+CONFIG_ASK_VISION_APPROACH = "ask_vision_approach"  # app.py tests for this key in current_app.config when use_gpt4v is set
+CONFIG_CHAT_VISION_APPROACH = "chat_vision_approach"  # app.py tests for this key in current_app.config when use_gpt4v is set
+CONFIG_CHAT_APPROACH = "chat_approach"
+CONFIG_BLOB_CONTAINER_CLIENT = "blob_container_client"
+CONFIG_AUTH_CLIENT = "auth_client"  # current_app.config key the route decorators use to fetch the authentication helper
+CONFIG_GPT4V_DEPLOYED = "gpt4v_deployed"
+CONFIG_SEARCH_CLIENT = "search_client"  # current_app.config key authenticated_path uses to fetch the search client
+CONFIG_OPENAI_CLIENT = "openai_client"
diff --git a/app/backend/core/authentication.py b/app/backend/core/authentication.py
@@ -5,6 +5,7 @@
 from typing import Any, Optional
 
 import aiohttp
+from azure.search.documents.aio import SearchClient
 from azure.search.documents.indexes.models import SearchIndex
 from msal import ConfidentialClientApplication
 from msal.token_cache import TokenCache
@@ -216,3 +217,23 @@ async def get_auth_claims_if_enabled(self, headers: dict) -> dict[str, Any]:
             if self.require_access_control:
                 raise
             return {}
+
+    async def check_path_auth(self, path: str, auth_claims: dict[str, Any], search_client: SearchClient) -> bool:
+        """Return True when the caller described by auth_claims may read the source file at path.
+
+        Access is granted if build_security_filters yields no filter, or if the search index returns
+        at least one document whose sourcepage equals path and that satisfies the security filter.
+        """
+        security_filter = self.build_security_filters(overrides={}, auth_claims=auth_claims)
+        if not security_filter:
+            return True
+
+        # OData string literals escape a single quote by doubling it. Without this, a quote in the
+        # requested path ends the literal early and the remainder is parsed as filter syntax.
+        escaped_path = path.replace("'", "''")
+        path_filter = f"{security_filter} and (sourcepage eq '{escaped_path}')"
+
+        results = await search_client.search(search_text="*", top=1, filter=path_filter)
+        async for _ in results:
+            return True
+        return False
diff --git a/app/backend/decorators.py b/app/backend/decorators.py
@@ -0,0 +1,55 @@
+import logging
+from functools import wraps
+from typing import Any, Callable, Dict
+
+from quart import abort, current_app, request
+
+from config import CONFIG_AUTH_CLIENT, CONFIG_SEARCH_CLIENT
+from core.authentication import AuthError
+from error import error_response
+
+
+def authenticated_path(route_fn: Callable[[str], Any]):
+    """
+    Wrap a file-serving route so the requested path is authorized before the route itself runs
+    """
+
+    @wraps(route_fn)
+    async def path_guard(path=""):
+        # The helper resolves the caller's claims; the search client is what check_path_auth queries
+        app_config = current_app.config
+        helper, index_client = app_config[CONFIG_AUTH_CLIENT], app_config[CONFIG_SEARCH_CLIENT]
+        try:
+            claims = await helper.get_auth_claims_if_enabled(request.headers)
+            allowed = await helper.check_path_auth(path, claims, index_client)
+        except AuthError:
+            abort(403)
+        except Exception as error:
+            logging.exception("Problem checking path auth %s", error)
+            return error_response(error, route="/content")
+
+        # A False result from check_path_auth is answered with 403, the same as an AuthError
+        if allowed:
+            return await route_fn(path)
+
+        abort(403)
+
+    return path_guard
+
+
+def authenticated(route_fn: Callable[[Dict[str, Any]], Any]):
+    """
+    Wrap a route so it is called with the auth_claims dictionary derived from the request headers; AuthError becomes 403
+    """
+
+    @wraps(route_fn)
+    async def claims_injector():
+        helper = current_app.config[CONFIG_AUTH_CLIENT]
+        try:
+            claims = await helper.get_auth_claims_if_enabled(request.headers)
+        except AuthError:
+            abort(403)
+        else:
+            return await route_fn(claims)
+
+    return claims_injector
diff --git a/app/backend/error.py b/app/backend/error.py
@@ -0,0 +1,23 @@
+import logging
+
+from openai import APIError
+from quart import jsonify
+
+ERROR_MESSAGE = """The app encountered an error processing your request.
+If you are an administrator of the app, view the full error in the logs. See aka.ms/appservice-logs for more information.
+Error type: {error_type}
+"""
+ERROR_MESSAGE_FILTER = """Your message contains content that was flagged by the OpenAI content filter."""
+
+
+def error_dict(error: Exception) -> dict:
+    """Build the JSON-serializable error payload for error; content-filter hits get a dedicated message."""
+    flagged = isinstance(error, APIError) and error.code == "content_filter"
+    return {"error": ERROR_MESSAGE_FILTER if flagged else ERROR_MESSAGE.format(error_type=type(error))}
+
+
+def error_response(error: Exception, route: str, status_code: int = 500):
+    """Log error for route and return a (JSON body, HTTP status) pair; content-filter hits are sent as 400."""
+    logging.exception("Exception in %s: %s", route, error)
+    flagged = isinstance(error, APIError) and error.code == "content_filter"
+    return jsonify(error_dict(error)), 400 if flagged else status_code
diff --git a/app/frontend/src/api/api.ts b/app/frontend/src/api/api.ts
@@ -3,7 +3,7 @@ const BACKEND_URI = "";
 import { ChatAppResponse, ChatAppResponseOrError, ChatAppRequest, Config } from "./models";
 import { useLogin, appServicesToken } from "../authConfig";
 
-function getHeaders(idToken: string | undefined): Record<string, string> {
+export function getHeaders(idToken: string | undefined): Record<string, string> {
     var headers: Record<string, string> = {
         "Content-Type": "application/json"
     };
diff --git a/app/frontend/src/components/AnalysisPanel/AnalysisPanel.tsx b/app/frontend/src/components/AnalysisPanel/AnalysisPanel.tsx
@@ -7,6 +7,10 @@ import { SupportingContent } from "../SupportingContent";
 import { ChatAppResponse } from "../../api";
 import { AnalysisPanelTabs } from "./AnalysisPanelTabs";
 import { ThoughtProcess } from "./ThoughtProcess";
+import { useMsal } from "@azure/msal-react";
+import { getHeaders } from "../../api";
+import { useLogin, getToken } from "../../authConfig";
+import { useState, useEffect } from "react";
 
 interface Props {
     className: string;
@@ -23,6 +27,25 @@ export const AnalysisPanel = ({ answer, activeTab, activeCitation, citationHeigh
     const isDisabledThoughtProcessTab: boolean = !answer.choices[0].context.thoughts;
     const isDisabledSupportingContentTab: boolean = !answer.choices[0].context.data_points;
     const isDisabledCitationTab: boolean = !activeCitation;
+    const [citation, setCitation] = useState("");
+
+    const client = useLogin ? useMsal().instance : undefined;
+
+    // Download the active citation using the headers built by getHeaders and expose it to the
+    // viewer as an object URL, since an <img>/<iframe> src request cannot carry those headers.
+    const fetchCitation = async () => {
+        const token = client ? await getToken(client) : undefined;
+        if (activeCitation) {
+            const response = await fetch(activeCitation, { method: "GET", headers: getHeaders(token) });
+            const citationContent = await response.blob();
+            const citationObjectUrl = URL.createObjectURL(citationContent);
+            setCitation(citationObjectUrl);
+        }
+    };
+    // Depend on activeCitation so that selecting a different citation refetches it.
+    useEffect(() => {
+        fetchCitation();
+    }, [activeCitation]);
 
     return (
         <Pivot
@@ -50,9 +73,9 @@ export const AnalysisPanel = ({ answer, activeTab, activeCitation, citationHeigh
                 headerButtonProps={isDisabledCitationTab ? pivotItemDisabledStyle : undefined}
             >
                 {activeCitation?.endsWith(".png") ? (
-                    <img src={activeCitation} className={styles.citationImg} />
+                    <img src={citation} className={styles.citationImg} />
                 ) : (
-                    <iframe title="Citation" src={activeCitation} width="100%" height={citationHeight} />
+                    <iframe title="Citation" src={citation} width="100%" height={citationHeight} />
                 )}
             </PivotItem>
         </Pivot>
diff --git a/tests/test_authenticationhelper.py b/tests/test_authenticationhelper.py