From 07d04172800b094cf5c7c49b025a31cd38b30fc5 Mon Sep 17 00:00:00 2001 From: Jared Refamonte Date: Sat, 12 Apr 2025 17:28:12 +0800 Subject: [PATCH 1/8] In `/embed`, check document sensitivity label before processing file --- app/routes/document_routes.py | 98 +++++++++++++++++++++++++++++++++++ requirements.txt | 6 ++- 2 files changed, 103 insertions(+), 1 deletion(-) diff --git a/app/routes/document_routes.py b/app/routes/document_routes.py index a1b32b2c..087b8ce7 100644 --- a/app/routes/document_routes.py +++ b/app/routes/document_routes.py @@ -34,8 +34,23 @@ from app.utils.document_loader import get_loader, clean_text, process_documents from app.utils.health import is_health_ok +import zipfile +import re +import pikepdf +from lxml import etree +from xml.etree import ElementTree as ET +from typing import Optional + router = APIRouter() +CONFIDENTIAL_LABELS = [ + "confidential", + "highly confidential", + "restricted", + "top secret", + "internal", + "privileged" +] @router.get("/ids") async def get_all_ids(): @@ -372,6 +387,18 @@ async def embed_file( chunk_size = 64 * 1024 # 64 KB while content := await file.read(chunk_size): await temp_file.write(content) + + # 🔐 Run Sensitivity Check BEFORE processing + sensitivity_label = await detect_sensitivity_label(temp_file_path, file.filename) + + logger.debug("File sensitivity label: %s", sensitivity_label) + + if sensitivity_label and any(label.lower() in sensitivity_label.lower() for label in CONFIDENTIAL_LABELS): + raise HTTPException( + status_code=400, + detail=f"File not processed due to sensitivity level: {sensitivity_label}." 
+ ) + except Exception as e: logger.error( "Failed to save uploaded file | Path: %s | Error: %s | Traceback: %s", @@ -604,3 +631,74 @@ async def query_embeddings_by_file_ids(body: QueryMultipleBody): traceback.format_exc(), ) raise HTTPException(status_code=500, detail=str(e)) + + +# ------------------------------------------------------- +# 📁 Sensitivity Label Extractor +# ------------------------------------------------------- + +async def detect_sensitivity_label(file_path: str, filename: str) -> Optional[str]: + if filename.endswith(".docx") or filename.endswith(".xlsx") or filename.endswith(".pptx"): + return extract_office_sensitivity_label(file_path) + elif filename.endswith(".pdf"): + return extract_pdf_sensitivity_label(file_path) + return None + +def extract_office_sensitivity_label(file_path: str) -> Optional[str]: + try: + with zipfile.ZipFile(file_path, "r") as zipf: + if "docProps/custom.xml" in zipf.namelist(): + with zipf.open("docProps/custom.xml") as custom_file: + xml_content = custom_file.read().decode("utf-8") + tree = ET.fromstring(xml_content) + + # Define namespaces + ns = { + 'cp': 'http://schemas.openxmlformats.org/officeDocument/2006/custom-properties', + 'vt': 'http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes' + } + + # Loop through all property elements + for prop in tree.findall("cp:property", ns): + name = prop.attrib.get("name", "") + if name.endswith("_Name") or "ClassificationWatermarkText" in name: + value_elem = prop.find("vt:lpwstr", ns) + if value_elem is not None: + return value_elem.text.strip().lower() + except Exception as e: + logger.warning("Failed to extract Office label: %s", str(e)) + + return None + + +def extract_pdf_sensitivity_label(file_path: str) -> Optional[str]: + try: + with pikepdf.open(file_path) as pdf: + xmp = pdf.open_metadata() + xml_content = str(xmp) + + tree = ET.fromstring(xml_content) + + # Define namespace for pdfx (used in your metadata) + ns = { + 'pdfx': 
'http://ns.adobe.com/pdfx/1.3/', + 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' + } + + # Search all rdf:Description elements under rdf:RDF + for description in tree.findall('.//rdf:Description', ns): + for key, value in description.attrib.items(): + for elem in description: + tag = elem.tag + if tag.startswith('{%s}' % ns['pdfx']) and tag.endswith('_Name') and elem.text: + label = elem.text.strip() + logger.info(f"Found sensitivity label: {label}") + for known in CONFIDENTIAL_LABELS: + if label.lower() == known.lower(): + return known + return label # Return even if not in known list + + except Exception as e: + logger.warning("Failed to extract PDF label: %s", str(e)) + + return None diff --git a/requirements.txt b/requirements.txt index ef964e02..8657879b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,4 +34,8 @@ cryptography==44.0.1 python-magic==0.4.27 python-pptx==0.6.23 xlrd==2.0.1 -pydantic==2.9.2 \ No newline at end of file +pydantic==2.9.2 +pikepdf +python-docx +lxml +zipfile36 \ No newline at end of file From bf28b3bfab400a338edededd95faf9bbc5c678ba Mon Sep 17 00:00:00 2001 From: Jared Refamonte Date: Tue, 15 Apr 2025 00:40:36 +0800 Subject: [PATCH 2/8] only files with `ALLOWED_LABELS` will be allowed --- README.md | 57 ++++++++++++++++++++--------------- app/routes/document_routes.py | 25 +++------------ app/utils/sensitivity.py | 45 +++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 46 deletions(-) create mode 100644 app/utils/sensitivity.py diff --git a/README.md b/README.md index 37bdcdf8..6ed55b7a 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # ID-based RAG FastAPI ## Overview + This project integrates Langchain with FastAPI in an Asynchronous, Scalable manner, providing a framework for document indexing and retrieval, using PostgreSQL/pgvector. Files are organized into embeddings by `file_id`. 
The primary use case is for integration with [LibreChat](https://librechat.ai), but this simple API can be used for any ID-based use case. @@ -10,6 +11,7 @@ The main reason to use the ID approach is to work with embeddings on a file-leve The API will evolve over time to employ different querying/re-ranking methods, embedding models, and vector stores. ## Features + - **Document Management**: Methods for adding, retrieving, and deleting documents. - **Vector Store**: Utilizes Langchain's vector store for efficient document retrieval. - **Asynchronous Support**: Offers async operations for enhanced performance. @@ -29,6 +31,7 @@ The API will evolve over time to employ different querying/re-ranking methods, e - Local: - Make sure to setup `DB_HOST` to the correct database hostname - Run the following commands (preferably in a [virtual environment](https://realpython.com/python-virtual-environments-a-primer/)) + ```bash pip install -r requirements.txt uvicorn main:app @@ -39,7 +42,7 @@ uvicorn main:app The following environment variables are required to run the application: - `RAG_OPENAI_API_KEY`: The API key for OpenAI API Embeddings (if using default settings). - - Note: `OPENAI_API_KEY` will work but `RAG_OPENAI_API_KEY` will override it in order to not conflict with LibreChat setting. + - Note: `OPENAI_API_KEY` will work but `RAG_OPENAI_API_KEY` will override it in order to not conflict with LibreChat setting. - `RAG_OPENAI_BASEURL`: (Optional) The base URL for your OpenAI API Embeddings - `RAG_OPENAI_PROXY`: (Optional) Proxy for OpenAI API Embeddings - `VECTOR_DB_TYPE`: (Optional) select vector database type, default to `pgvector`. @@ -51,6 +54,7 @@ The following environment variables are required to run the application: - `RAG_HOST`: (Optional) The hostname or IP address where the API server will run. Defaults to "0.0.0.0" - `RAG_PORT`: (Optional) The port number where the API server will run. Defaults to port 8000. 
- `JWT_SECRET`: (Optional) The secret key used for verifying JWT tokens for requests. + - The secret is only used for verification. This basic approach assumes a signed JWT from elsewhere. - Omit to run API without requiring authentication @@ -63,19 +67,19 @@ The following environment variables are required to run the application: - `CONSOLE_JSON`: (Optional) Set to "True" to log as json for Cloud Logging aggregations - `EMBEDDINGS_PROVIDER`: (Optional) either "openai", "bedrock", "azure", "huggingface", "huggingfacetei" or "ollama", where "huggingface" uses sentence_transformers; defaults to "openai" - `EMBEDDINGS_MODEL`: (Optional) Set a valid embeddings model to use from the configured provider. - - **Defaults** - - openai: "text-embedding-3-small" - - azure: "text-embedding-3-small" (will be used as your Azure Deployment) - - huggingface: "sentence-transformers/all-MiniLM-L6-v2" - - huggingfacetei: "http://huggingfacetei:3000". Hugging Face TEI uses model defined on TEI service launch. - - ollama: "nomic-embed-text" - - bedrock: "amazon.titan-embed-text-v1" + - **Defaults** + - openai: "text-embedding-3-small" + - azure: "text-embedding-3-small" (will be used as your Azure Deployment) + - huggingface: "sentence-transformers/all-MiniLM-L6-v2" + - huggingfacetei: "http://huggingfacetei:3000". Hugging Face TEI uses model defined on TEI service launch. + - ollama: "nomic-embed-text" + - bedrock: "amazon.titan-embed-text-v1" - `RAG_AZURE_OPENAI_API_VERSION`: (Optional) Default is `2023-05-15`. The version of the Azure OpenAI API. - `RAG_AZURE_OPENAI_API_KEY`: (Optional) The API key for Azure OpenAI service. - - Note: `AZURE_OPENAI_API_KEY` will work but `RAG_AZURE_OPENAI_API_KEY` will override it in order to not conflict with LibreChat setting. + - Note: `AZURE_OPENAI_API_KEY` will work but `RAG_AZURE_OPENAI_API_KEY` will override it in order to not conflict with LibreChat setting. 
- `RAG_AZURE_OPENAI_ENDPOINT`: (Optional) The endpoint URL for Azure OpenAI service, including the resource. - - Example: `https://YOUR_RESOURCE_NAME.openai.azure.com`. - - Note: `AZURE_OPENAI_ENDPOINT` will work but `RAG_AZURE_OPENAI_ENDPOINT` will override it in order to not conflict with LibreChat setting. + - Example: `https://YOUR_RESOURCE_NAME.openai.azure.com`. + - Note: `AZURE_OPENAI_ENDPOINT` will work but `RAG_AZURE_OPENAI_ENDPOINT` will override it in order to not conflict with LibreChat setting. - `HF_TOKEN`: (Optional) if needed for `huggingface` option. - `OLLAMA_BASE_URL`: (Optional) defaults to `http://ollama:11434`. - `ATLAS_SEARCH_INDEX`: (Optional) the name of the vector search index if using Atlas MongoDB, defaults to `vector_index` @@ -83,6 +87,8 @@ The following environment variables are required to run the application: - `AWS_DEFAULT_REGION`: (Optional) defaults to `us-east-1` - `AWS_ACCESS_KEY_ID`: (Optional) needed for bedrock embeddings - `AWS_SECRET_ACCESS_KEY`: (Optional) needed for bedrock embeddings +- `ALLOWED_LABELS`: (Optional) A comma-separated list of sensitivity labels that are permitted for file processing. + - Default: public,personal,none Make sure to set these environment variables before running the application. You can set them in a `.env` file or as system environment variables. @@ -97,7 +103,7 @@ COLLECTION_NAME= ATLAS_SEARCH_INDEX= ``` -The `ATLAS_MONGO_DB_URI` could be the same or different from what is used by LibreChat. Even if it is the same, the `$COLLECTION_NAME` collection needs to be a completely new one, separate from all collections used by LibreChat. In addition, create a vector search index for collection above (remember to assign `$ATLAS_SEARCH_INDEX`) with the following json: +The `ATLAS_MONGO_DB_URI` could be the same or different from what is used by LibreChat. Even if it is the same, the `$COLLECTION_NAME` collection needs to be a completely new one, separate from all collections used by LibreChat. 
In addition, create a vector search index for collection above (remember to assign `$ATLAS_SEARCH_INDEX`) with the following json: ```json { @@ -118,31 +124,32 @@ The `ATLAS_MONGO_DB_URI` could be the same or different from what is used by Lib Follow one of the [four documented methods](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-index/#procedure) to create the vector index. - ### Cloud Installation Settings: #### AWS: + Make sure your RDS Postgres instance adheres to this requirement: `The pgvector extension version 0.5.0 is available on database instances in Amazon RDS running PostgreSQL 15.4-R2 and higher, 14.9-R2 and higher, 13.12-R2 and higher, and 12.16-R2 and higher in all applicable AWS Regions, including the AWS GovCloud (US) Regions.` In order to setup RDS Postgres with RAG API, you can follow these steps: -* Create a RDS Instance/Cluster using the provided [AWS Documentation](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/USER_CreateDBInstance.html). -* Login to the RDS Cluster using the Endpoint connection string from the RDS Console or from your IaC Solution output. -* The login is via the *Master User*. -* Create a dedicated database for rag_api: -``` create database rag_api;```. -* Create a dedicated user\role for that database: -``` create role rag;``` +- Create a RDS Instance/Cluster using the provided [AWS Documentation](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/USER_CreateDBInstance.html). +- Login to the RDS Cluster using the Endpoint connection string from the RDS Console or from your IaC Solution output. +- The login is via the _Master User_. +- Create a dedicated database for rag_api: + ` create database rag_api;`. 
+- Create a dedicated user\role for that database: + ` create role rag;` -* Switch to the database you just created: ```\c rag_api``` -* Enable the Vector extension: ```create extension vector;``` -* Use the documentation provided above to set up the connection string to the RDS Postgres Instance\Cluster. +- Switch to the database you just created: `\c rag_api` +- Enable the Vector extension: `create extension vector;` +- Use the documentation provided above to set up the connection string to the RDS Postgres Instance\Cluster. Notes: - * Even though you're logging with a Master user, it doesn't have all the super user privileges, that's why we cannot use the command: ```create role x with superuser;``` - * If you do not enable the extension, rag_api service will throw an error that it cannot create the extension due to the note above. + +- Even though you're logging with a Master user, it doesn't have all the super user privileges, that's why we cannot use the command: `create role x with superuser;` +- If you do not enable the extension, rag_api service will throw an error that it cannot create the extension due to the note above. 
### Dev notes: diff --git a/app/routes/document_routes.py b/app/routes/document_routes.py index 087b8ce7..96405916 100644 --- a/app/routes/document_routes.py +++ b/app/routes/document_routes.py @@ -40,18 +40,10 @@ from lxml import etree from xml.etree import ElementTree as ET from typing import Optional +from app.utils.sensitivity import assert_sensitivity_allowed router = APIRouter() -CONFIDENTIAL_LABELS = [ - "confidential", - "highly confidential", - "restricted", - "top secret", - "internal", - "privileged" -] - @router.get("/ids") async def get_all_ids(): try: @@ -390,15 +382,10 @@ async def embed_file( # 🔐 Run Sensitivity Check BEFORE processing sensitivity_label = await detect_sensitivity_label(temp_file_path, file.filename) + assert_sensitivity_allowed(sensitivity_label) logger.debug("File sensitivity label: %s", sensitivity_label) - if sensitivity_label and any(label.lower() in sensitivity_label.lower() for label in CONFIDENTIAL_LABELS): - raise HTTPException( - status_code=400, - detail=f"File not processed due to sensitivity level: {sensitivity_label}." 
- ) - except Exception as e: logger.error( "Failed to save uploaded file | Path: %s | Error: %s | Traceback: %s", @@ -663,8 +650,7 @@ def extract_office_sensitivity_label(file_path: str) -> Optional[str]: name = prop.attrib.get("name", "") if name.endswith("_Name") or "ClassificationWatermarkText" in name: value_elem = prop.find("vt:lpwstr", ns) - if value_elem is not None: - return value_elem.text.strip().lower() + return value_elem.text.strip().lower() except Exception as e: logger.warning("Failed to extract Office label: %s", str(e)) @@ -693,10 +679,7 @@ def extract_pdf_sensitivity_label(file_path: str) -> Optional[str]: if tag.startswith('{%s}' % ns['pdfx']) and tag.endswith('_Name') and elem.text: label = elem.text.strip() logger.info(f"Found sensitivity label: {label}") - for known in CONFIDENTIAL_LABELS: - if label.lower() == known.lower(): - return known - return label # Return even if not in known list + return label except Exception as e: logger.warning("Failed to extract PDF label: %s", str(e)) diff --git a/app/utils/sensitivity.py b/app/utils/sensitivity.py new file mode 100644 index 00000000..6d25a4d3 --- /dev/null +++ b/app/utils/sensitivity.py @@ -0,0 +1,45 @@ +import os +from typing import Optional +from fastapi import HTTPException +from dotenv import load_dotenv +from app.config import logger + +# Load .env +load_dotenv() + +# Default values +DEFAULT_ALLOWED_LABELS = [ + "public", + "personal", + "none" +] + +def get_env_list(key: str, fallback: list[str]) -> list[str]: + raw_value = os.getenv(key) + if raw_value: + return [item.strip().lower() for item in raw_value.split(",") if item.strip()] + return fallback + +ALLOWED_LABELS = get_env_list("ALLOWED_LABELS", DEFAULT_ALLOWED_LABELS) + + +def normalize_label(label: Optional[str]) -> str: + return label.strip().lower() if label else "" + +def is_label_allowed(label: Optional[str]) -> bool: + if label is None: + return "none" in ALLOWED_LABELS + + normalized = normalize_label(label) + return 
any(allowed in normalized for allowed in ALLOWED_LABELS) + + + +def assert_sensitivity_allowed(sensitivity_label: str): + if is_label_allowed(sensitivity_label): + return + + raise HTTPException( + status_code=400, + detail=f"File not processed due to unauthorized sensitivity level: {sensitivity_label}." + ) From 4d6f5fd198af95f949b8f3a7d24f5dbfc8d130f4 Mon Sep 17 00:00:00 2001 From: Jared Refamonte Date: Tue, 15 Apr 2025 01:25:34 +0800 Subject: [PATCH 3/8] If `ALLOWED_LABELS` is not provided in `.env`, process all files --- README.md | 4 ++-- app/utils/sensitivity.py | 20 +++++++------------- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 6ed55b7a..734a0534 100644 --- a/README.md +++ b/README.md @@ -87,8 +87,8 @@ The following environment variables are required to run the application: - `AWS_DEFAULT_REGION`: (Optional) defaults to `us-east-1` - `AWS_ACCESS_KEY_ID`: (Optional) needed for bedrock embeddings - `AWS_SECRET_ACCESS_KEY`: (Optional) needed for bedrock embeddings -- `ALLOWED_LABELS`: (Optional) A comma-separated list of sensitivity labels that are permitted for file processing. - - Default: public,personal,none +- `ALLOWED_LABELS`: (Optional) A comma-separated list of sensitivity labels that are permitted for processing of `.docx`, `.pptx`, `.xlsx`, and `.pdf` files. + - Omit to allow all files to be processed regardless of sensitivity labels. Make sure to set these environment variables before running the application. You can set them in a `.env` file or as system environment variables. 
diff --git a/app/utils/sensitivity.py b/app/utils/sensitivity.py index 6d25a4d3..fa27f23b 100644 --- a/app/utils/sensitivity.py +++ b/app/utils/sensitivity.py @@ -7,34 +7,28 @@ # Load .env load_dotenv() -# Default values -DEFAULT_ALLOWED_LABELS = [ - "public", - "personal", - "none" -] - -def get_env_list(key: str, fallback: list[str]) -> list[str]: +def get_env_list(key: str) -> list[str]: raw_value = os.getenv(key) if raw_value: return [item.strip().lower() for item in raw_value.split(",") if item.strip()] - return fallback - -ALLOWED_LABELS = get_env_list("ALLOWED_LABELS", DEFAULT_ALLOWED_LABELS) + return [] # Return an empty list if the key is not found +ALLOWED_LABELS = get_env_list("ALLOWED_LABELS") def normalize_label(label: Optional[str]) -> str: return label.strip().lower() if label else "" def is_label_allowed(label: Optional[str]) -> bool: + # If no allowed labels are defined, allow all labels + if not ALLOWED_LABELS: + return True + if label is None: return "none" in ALLOWED_LABELS normalized = normalize_label(label) return any(allowed in normalized for allowed in ALLOWED_LABELS) - - def assert_sensitivity_allowed(sensitivity_label: str): if is_label_allowed(sensitivity_label): return From 821037e71145a34bf7bedcefba24689a3bc7f184 Mon Sep 17 00:00:00 2001 From: Jared Refamonte Date: Tue, 15 Apr 2025 01:30:11 +0800 Subject: [PATCH 4/8] Move sensitivity checker functions to `/utils/sensitivity.py` --- app/routes/document_routes.py | 78 +---------------------------------- app/utils/sensitivity.py | 72 ++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 76 deletions(-) diff --git a/app/routes/document_routes.py b/app/routes/document_routes.py index 96405916..bb5d3f2d 100644 --- a/app/routes/document_routes.py +++ b/app/routes/document_routes.py @@ -33,14 +33,7 @@ from app.services.vector_store.async_pg_vector import AsyncPgVector from app.utils.document_loader import get_loader, clean_text, process_documents from app.utils.health import 
is_health_ok - -import zipfile -import re -import pikepdf -from lxml import etree -from xml.etree import ElementTree as ET -from typing import Optional -from app.utils.sensitivity import assert_sensitivity_allowed +from app.utils.sensitivity import assert_sensitivity_allowed, detect_sensitivity_label router = APIRouter() @@ -617,71 +610,4 @@ async def query_embeddings_by_file_ids(body: QueryMultipleBody): str(e), traceback.format_exc(), ) - raise HTTPException(status_code=500, detail=str(e)) - - -# ------------------------------------------------------- -# 📁 Sensitivity Label Extractor -# ------------------------------------------------------- - -async def detect_sensitivity_label(file_path: str, filename: str) -> Optional[str]: - if filename.endswith(".docx") or filename.endswith(".xlsx") or filename.endswith(".pptx"): - return extract_office_sensitivity_label(file_path) - elif filename.endswith(".pdf"): - return extract_pdf_sensitivity_label(file_path) - return None - -def extract_office_sensitivity_label(file_path: str) -> Optional[str]: - try: - with zipfile.ZipFile(file_path, "r") as zipf: - if "docProps/custom.xml" in zipf.namelist(): - with zipf.open("docProps/custom.xml") as custom_file: - xml_content = custom_file.read().decode("utf-8") - tree = ET.fromstring(xml_content) - - # Define namespaces - ns = { - 'cp': 'http://schemas.openxmlformats.org/officeDocument/2006/custom-properties', - 'vt': 'http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes' - } - - # Loop through all property elements - for prop in tree.findall("cp:property", ns): - name = prop.attrib.get("name", "") - if name.endswith("_Name") or "ClassificationWatermarkText" in name: - value_elem = prop.find("vt:lpwstr", ns) - return value_elem.text.strip().lower() - except Exception as e: - logger.warning("Failed to extract Office label: %s", str(e)) - - return None - - -def extract_pdf_sensitivity_label(file_path: str) -> Optional[str]: - try: - with pikepdf.open(file_path) as 
pdf: - xmp = pdf.open_metadata() - xml_content = str(xmp) - - tree = ET.fromstring(xml_content) - - # Define namespace for pdfx (used in your metadata) - ns = { - 'pdfx': 'http://ns.adobe.com/pdfx/1.3/', - 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' - } - - # Search all rdf:Description elements under rdf:RDF - for description in tree.findall('.//rdf:Description', ns): - for key, value in description.attrib.items(): - for elem in description: - tag = elem.tag - if tag.startswith('{%s}' % ns['pdfx']) and tag.endswith('_Name') and elem.text: - label = elem.text.strip() - logger.info(f"Found sensitivity label: {label}") - return label - - except Exception as e: - logger.warning("Failed to extract PDF label: %s", str(e)) - - return None + raise HTTPException(status_code=500, detail=str(e)) \ No newline at end of file diff --git a/app/utils/sensitivity.py b/app/utils/sensitivity.py index fa27f23b..dcefc765 100644 --- a/app/utils/sensitivity.py +++ b/app/utils/sensitivity.py @@ -3,6 +3,12 @@ from fastapi import HTTPException from dotenv import load_dotenv from app.config import logger +import zipfile +import re +import pikepdf +from lxml import etree +from xml.etree import ElementTree as ET +from typing import Optional # Load .env load_dotenv() @@ -37,3 +43,69 @@ def assert_sensitivity_allowed(sensitivity_label: str): status_code=400, detail=f"File not processed due to unauthorized sensitivity level: {sensitivity_label}." 
) + +# ------------------------------------------------------- +# 📁 Sensitivity Label Extractor +# ------------------------------------------------------- + +async def detect_sensitivity_label(file_path: str, filename: str) -> Optional[str]: + if filename.endswith(".docx") or filename.endswith(".xlsx") or filename.endswith(".pptx"): + return extract_office_sensitivity_label(file_path) + elif filename.endswith(".pdf"): + return extract_pdf_sensitivity_label(file_path) + return None + +def extract_office_sensitivity_label(file_path: str) -> Optional[str]: + try: + with zipfile.ZipFile(file_path, "r") as zipf: + if "docProps/custom.xml" in zipf.namelist(): + with zipf.open("docProps/custom.xml") as custom_file: + xml_content = custom_file.read().decode("utf-8") + tree = ET.fromstring(xml_content) + + # Define namespaces + ns = { + 'cp': 'http://schemas.openxmlformats.org/officeDocument/2006/custom-properties', + 'vt': 'http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes' + } + + # Loop through all property elements + for prop in tree.findall("cp:property", ns): + name = prop.attrib.get("name", "") + if name.endswith("_Name") or "ClassificationWatermarkText" in name: + value_elem = prop.find("vt:lpwstr", ns) + return value_elem.text.strip().lower() + except Exception as e: + logger.warning("Failed to extract Office label: %s", str(e)) + + return None + + +def extract_pdf_sensitivity_label(file_path: str) -> Optional[str]: + try: + with pikepdf.open(file_path) as pdf: + xmp = pdf.open_metadata() + xml_content = str(xmp) + + tree = ET.fromstring(xml_content) + + # Define namespace for pdfx (used in your metadata) + ns = { + 'pdfx': 'http://ns.adobe.com/pdfx/1.3/', + 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' + } + + # Search all rdf:Description elements under rdf:RDF + for description in tree.findall('.//rdf:Description', ns): + for key, value in description.attrib.items(): + for elem in description: + tag = elem.tag + if tag.startswith('{%s}' 
% ns['pdfx']) and tag.endswith('_Name') and elem.text: + label = elem.text.strip() + logger.info(f"Found sensitivity label: {label}") + return label + + except Exception as e: + logger.warning("Failed to extract PDF label: %s", str(e)) + + return None From 0e940d13344a58d446dd7451fd00d28cc627f3bc Mon Sep 17 00:00:00 2001 From: Jared Refamonte Date: Tue, 15 Apr 2025 01:55:36 +0800 Subject: [PATCH 5/8] Add `CHECKED_DOC_TYPES` in `.env` to easily specify which file types will be checked --- README.md | 14 ++++++++++++-- app/utils/sensitivity.py | 41 +++++++++++++++++++++++++++++++--------- 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 734a0534..abd49d5b 100644 --- a/README.md +++ b/README.md @@ -87,8 +87,18 @@ The following environment variables are required to run the application: - `AWS_DEFAULT_REGION`: (Optional) defaults to `us-east-1` - `AWS_ACCESS_KEY_ID`: (Optional) needed for bedrock embeddings - `AWS_SECRET_ACCESS_KEY`: (Optional) needed for bedrock embeddings -- `ALLOWED_LABELS`: (Optional) A comma-separated list of sensitivity labels that are permitted for processing of `.docx`, `.pptx`, `.xlsx`, and `.pdf` files. - - Omit to allow all files to be processed regardless of sensitivity labels. +- `ALLOWED_LABELS`: (Optional) A comma-separated list of sensitivity labels that are allowed for processing. If no labels are specified, all labels will be allowed by default. + + - Example: ALLOWED_LABELS=personal,none + - Default: If not defined, all labels are allowed. + +- `CHECKED_DOC_TYPES`: (Optional) A comma-separated list of document types (by file extension) to be checked for sensitivity labels. If no types are specified, all supported document types (pdf, docx, xlsx, pptx) will be checked by default. + - Example: CHECKED_DOC_TYPES=pdf,docx,pptx + - Default: If not defined, all supported types (pdf, docx, xlsx, pptx) are checked. 
+ +Example: ALLOWED_LABELS=personal,none + +Default: If not defined, all labels are allowed. Make sure to set these environment variables before running the application. You can set them in a `.env` file or as system environment variables. diff --git a/app/utils/sensitivity.py b/app/utils/sensitivity.py index dcefc765..ee3e0cac 100644 --- a/app/utils/sensitivity.py +++ b/app/utils/sensitivity.py @@ -4,11 +4,9 @@ from dotenv import load_dotenv from app.config import logger import zipfile -import re import pikepdf from lxml import etree from xml.etree import ElementTree as ET -from typing import Optional # Load .env load_dotenv() @@ -19,7 +17,12 @@ def get_env_list(key: str) -> list[str]: return [item.strip().lower() for item in raw_value.split(",") if item.strip()] return [] # Return an empty list if the key is not found +# Configuration for allowed document types and labels ALLOWED_LABELS = get_env_list("ALLOWED_LABELS") +CHECKED_DOC_TYPES = get_env_list("CHECKED_DOC_TYPES") + +# Define the supported document types for checking (by default, all these are checked if no CHECKED_DOC_TYPES is defined) +SUPPORTED_DOC_TYPES = ["pdf", "docx", "xlsx", "pptx"] def normalize_label(label: Optional[str]) -> str: return label.strip().lower() if label else "" @@ -35,6 +38,19 @@ def is_label_allowed(label: Optional[str]) -> bool: normalized = normalize_label(label) return any(allowed in normalized for allowed in ALLOWED_LABELS) +def is_doc_type_allowed(filename: str) -> bool: + """ + Check if the document type (based on its file extension) is allowed according to the config. 
+ """ + file_ext = filename.split('.')[-1].lower() + + # If CHECKED_DOC_TYPES is defined, check if the file type is in the allowed list + if CHECKED_DOC_TYPES: + return file_ext in CHECKED_DOC_TYPES + + # If CHECKED_DOC_TYPES is not defined, allow all document types by default + return file_ext in SUPPORTED_DOC_TYPES + def assert_sensitivity_allowed(sensitivity_label: str): if is_label_allowed(sensitivity_label): return @@ -49,11 +65,18 @@ def assert_sensitivity_allowed(sensitivity_label: str): # ------------------------------------------------------- async def detect_sensitivity_label(file_path: str, filename: str) -> Optional[str]: + # First, check if the file type is allowed + if not is_doc_type_allowed(filename): + logger.warning(f"Document type {filename.split('.')[-1]} is not allowed for sensitivity check.") + return None + + # Proceed with the sensitivity label extraction if the type is allowed if filename.endswith(".docx") or filename.endswith(".xlsx") or filename.endswith(".pptx"): return extract_office_sensitivity_label(file_path) elif filename.endswith(".pdf"): return extract_pdf_sensitivity_label(file_path) - return None + + return None # Return None if the file type is not supported def extract_office_sensitivity_label(file_path: str) -> Optional[str]: try: @@ -98,12 +121,12 @@ def extract_pdf_sensitivity_label(file_path: str) -> Optional[str]: # Search all rdf:Description elements under rdf:RDF for description in tree.findall('.//rdf:Description', ns): for key, value in description.attrib.items(): - for elem in description: - tag = elem.tag - if tag.startswith('{%s}' % ns['pdfx']) and tag.endswith('_Name') and elem.text: - label = elem.text.strip() - logger.info(f"Found sensitivity label: {label}") - return label + for elem in description: + tag = elem.tag + if tag.startswith('{%s}' % ns['pdfx']) and tag.endswith('_Name') and elem.text: + label = elem.text.strip() + logger.info(f"Found sensitivity label: {label}") + return label except Exception as e: 
logger.warning("Failed to extract PDF label: %s", str(e)) From 1f93a650799cc4394f5018bfc9931530d3692306 Mon Sep 17 00:00:00 2001 From: Jared Refamonte Date: Wed, 16 Apr 2025 03:48:18 +0800 Subject: [PATCH 6/8] Update sensitivity checker to support special characters in labels --- app/utils/sensitivity.py | 66 +++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/app/utils/sensitivity.py b/app/utils/sensitivity.py index ee3e0cac..2bf2994c 100644 --- a/app/utils/sensitivity.py +++ b/app/utils/sensitivity.py @@ -7,49 +7,51 @@ import pikepdf from lxml import etree from xml.etree import ElementTree as ET +import json # Load .env load_dotenv() -def get_env_list(key: str) -> list[str]: +def get_env_json_list(key: str) -> list[str]: raw_value = os.getenv(key) - if raw_value: - return [item.strip().lower() for item in raw_value.split(",") if item.strip()] - return [] # Return an empty list if the key is not found + try: + return [item.strip().lower() for item in json.loads(raw_value)] if raw_value else [] + except json.JSONDecodeError: + logger.warning(f"Failed to parse {key} as JSON list.") + return [] + +def get_env_bool(key: str, default: bool = False) -> bool: + val = os.getenv(key) + return val.lower() in ("1", "true", "yes") if val is not None else default -# Configuration for allowed document types and labels -ALLOWED_LABELS = get_env_list("ALLOWED_LABELS") -CHECKED_DOC_TYPES = get_env_list("CHECKED_DOC_TYPES") +# Configuration +DOC_FLTR_ENABLED = get_env_bool("DOC_FLTR_ENABLED") +DOC_FLTR_ALLOWED_LABELS = get_env_json_list("DOC_FLTR_ALLOWED_LABELS") +DOC_FLTR_FILE_TYPES = get_env_json_list("DOC_FLTR_FILE_TYPES") -# Define the supported document types for checking (by default, all these are checked if no CHECKED_DOC_TYPES is defined) -SUPPORTED_DOC_TYPES = ["pdf", "docx", "xlsx", "pptx"] +SUPPORTED_FILE_TYPES = ["pdf", "docx", "xlsx", "pptx"] def normalize_label(label: Optional[str]) -> str: return 
def is_label_allowed(label: Optional[str]) -> bool:
    """Decide whether a document carrying *label* may be processed.

    Unlabeled documents (label is None) are always accepted.  When filtering
    is disabled, or enabled without an allow-list, every label is accepted;
    otherwise the normalized label must appear in DOC_FLTR_ALLOWED_LABELS.
    """
    if label is None:
        # Files with no sensitivity label are always allowed through.
        return True
    filtering_active = DOC_FLTR_ENABLED and DOC_FLTR_ALLOWED_LABELS
    if not filtering_active:
        return True
    return normalize_label(label) in DOC_FLTR_ALLOWED_LABELS


def is_doc_type_allowed(filename: str) -> bool:
    """Return True when *filename*'s extension should be label-checked.

    Uses DOC_FLTR_FILE_TYPES when configured, falling back to the built-in
    SUPPORTED_FILE_TYPES list otherwise.
    """
    extension = filename.rsplit(".", 1)[-1].lower()
    checked_types = DOC_FLTR_FILE_TYPES or SUPPORTED_FILE_TYPES
    return extension in checked_types
async def detect_sensitivity_label(file_path: str, filename: str) -> Optional[str]:
    """Return the document's sensitivity label, or None when there is none.

    Returns None immediately when filtering is disabled or the file type is
    not configured for checking.  Dispatches to the Office (OOXML) or PDF
    extractor based on the file extension.
    """
    if not DOC_FLTR_ENABLED:
        return None

    if not is_doc_type_allowed(filename):
        logger.warning(f"Document type {filename.split('.')[-1]} is not allowed for sensitivity check.")
        return None

    # Compare extensions case-insensitively: is_doc_type_allowed() lowercases,
    # so "REPORT.PDF" passed the type check but previously skipped extraction
    # entirely and slipped through unlabeled.
    lowered = filename.lower()
    if lowered.endswith((".docx", ".xlsx", ".pptx")):
        return extract_office_sensitivity_label(file_path)
    if lowered.endswith(".pdf"):
        return extract_pdf_sensitivity_label(file_path)

    return None


def extract_office_sensitivity_label(file_path: str) -> Optional[str]:
    """Extract a sensitivity label from an OOXML file's docProps/custom.xml.

    Looks for MIP-style custom properties (names ending in "_Name", or
    containing "ClassificationWatermarkText").  Returns the trimmed,
    lowercased label, or None when absent or on any extraction error.
    """
    try:
        with zipfile.ZipFile(file_path, "r") as zipf:
            if "docProps/custom.xml" not in zipf.namelist():
                return None
            with zipf.open("docProps/custom.xml") as custom_file:
                tree = ET.fromstring(custom_file.read().decode("utf-8"))

            ns = {
                'cp': 'http://schemas.openxmlformats.org/officeDocument/2006/custom-properties',
                'vt': 'http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes'
            }

            for prop in tree.findall("cp:property", ns):
                name = prop.attrib.get("name", "")
                if name.endswith("_Name") or "ClassificationWatermarkText" in name:
                    value_elem = prop.find("vt:lpwstr", ns)
                    # Guard against an empty <vt:lpwstr/>: .text is None there,
                    # and the original raised (then swallowed) AttributeError
                    # instead of continuing to scan the remaining properties.
                    if value_elem is not None and value_elem.text:
                        return value_elem.text.strip().lower()
    except Exception as e:
        logger.warning("Failed to extract Office label: %s", str(e))

    return None


def extract_pdf_sensitivity_label(file_path: str) -> Optional[str]:
    """Extract a sensitivity label from a PDF's XMP metadata via pikepdf.

    Scans rdf:Description children for pdfx elements whose tag ends in
    "_Name".  Returns the trimmed label text, or None when absent or on any
    extraction error.
    NOTE(review): unlike the Office extractor, this returns the label with
    its original casing; is_label_allowed() normalizes before comparing.
    """
    try:
        with pikepdf.open(file_path) as pdf:
            xml_content = str(pdf.open_metadata())
            tree = ET.fromstring(xml_content)

            ns = {
                'pdfx': 'http://ns.adobe.com/pdfx/1.3/',
                'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
            }

            for description in tree.findall('.//rdf:Description', ns):
                # BUG FIX: the original wrapped this scan in a loop over
                # description.attrib.items(), so a Description with no XML
                # attributes was never inspected at all (and one with several
                # attributes was scanned once per attribute).
                for elem in description:
                    tag = elem.tag
                    if tag.startswith('{%s}' % ns['pdfx']) and tag.endswith('_Name') and elem.text:
                        label = elem.text.strip()
                        logger.info(f"Found sensitivity label: {label}")
                        return label
    except Exception as e:
        logger.warning("Failed to extract PDF label: %s", str(e))

    return None
deletions(-) diff --git a/README.md b/README.md index abd49d5b..d9e29cc7 100644 --- a/README.md +++ b/README.md @@ -87,18 +87,22 @@ The following environment variables are required to run the application: - `AWS_DEFAULT_REGION`: (Optional) defaults to `us-east-1` - `AWS_ACCESS_KEY_ID`: (Optional) needed for bedrock embeddings - `AWS_SECRET_ACCESS_KEY`: (Optional) needed for bedrock embeddings -- `ALLOWED_LABELS`: (Optional) A comma-separated list of sensitivity labels that are allowed for processing. If no labels are specified, all labels will be allowed by default. - - Example: ALLOWED_LABELS=personal,none - - Default: If not defined, all labels are allowed. - -- `CHECKED_DOC_TYPES`: (Optional) A comma-separated list of document types (by file extension) to be checked for sensitivity labels. If no types are specified, all supported document types (pdf, docx, xlsx, pptx) will be checked by default. - - Example: CHECKED_DOC_TYPES=pdf,docx,pptx - - Default: If not defined, all supported types (pdf, docx, xlsx, pptx) are checked. - -Example: ALLOWED_LABELS=personal,none - -Default: If not defined, all labels are allowed. +- `DOC_FLTR_ENABLED`: Enables or disables sensitivity label filtering. + - Type: boolean + - Accepted values: true, 1, yes (case-insensitive) + - Default: false if not set +- `DOC_FLTR_ALLOWED_LABELS`: A JSON array of allowed sensitivity labels. If a document's label is not included in this list, it will be rejected when filtering is enabled. + - Type: JSON list of strings + - Format: Must be a valid JSON array (e.g., ["public", "confidential"]) + - Note: Labels are normalized (trimmed and lowercased). Special characters and spaces are allowed. + - Default: If unset or an empty array, all labels are allowed + - Example: `DOC_FLTR_ALLOWED_LABELS=["public", "personal", "confidential", "company name - confidential"]` +- `DOC_FLTR_FILE_TYPES`: A JSON array of allowed file extensions (e.g., "pdf", "docx"). Only these types will be checked for labels. 
+ - Type: JSON list of strings + - Note: File extensions should be lowercase and without dots. + - Default: If unset, defaults to ["pdf", "docx", "xlsx", "pptx"] + - Example: `DOC_FLTR_FILE_TYPES=["pdf", "docx"]` Make sure to set these environment variables before running the application. You can set them in a `.env` file or as system environment variables. From 0f2addf79dd23c83b83d470ddcbc36bf0d47788b Mon Sep 17 00:00:00 2001 From: Jared Refamonte Date: Fri, 18 Apr 2025 00:09:57 +0800 Subject: [PATCH 8/8] only load sensitivity functions when filtering is enabled --- app/routes/document_routes.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/app/routes/document_routes.py b/app/routes/document_routes.py index bb5d3f2d..711eb45a 100644 --- a/app/routes/document_routes.py +++ b/app/routes/document_routes.py @@ -33,7 +33,6 @@ from app.services.vector_store.async_pg_vector import AsyncPgVector from app.utils.document_loader import get_loader, clean_text, process_documents from app.utils.health import is_health_ok -from app.utils.sensitivity import assert_sensitivity_allowed, detect_sensitivity_label router = APIRouter() @@ -373,11 +372,15 @@ async def embed_file( while content := await file.read(chunk_size): await temp_file.write(content) - # 🔐 Run Sensitivity Check BEFORE processing - sensitivity_label = await detect_sensitivity_label(temp_file_path, file.filename) - assert_sensitivity_allowed(sensitivity_label) + # Run Sensitivity Check BEFORE processing + if os.getenv("DOC_FLTR_ENABLED"): + # Lazy import: only load sensitivity functions when filtering is enabled + from app.utils.sensitivity import detect_sensitivity_label, assert_sensitivity_allowed - logger.debug("File sensitivity label: %s", sensitivity_label) + sensitivity_label = await detect_sensitivity_label(temp_file_path, file.filename) + assert_sensitivity_allowed(sensitivity_label) + + logger.debug("File sensitivity label: %s", sensitivity_label) except Exception as e: 
logger.error(