diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index ab89daada2..80960bd60b 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -1,6 +1,6 @@
 {
     "name": "Azure Search OpenAI Demo",
-    "image": "mcr.microsoft.com/devcontainers/python:3.11-bookworm",
+    "image": "mcr.microsoft.com/devcontainers/python:3.13-bookworm",
     "features": {
         "ghcr.io/devcontainers/features/node:1": {
             // This should match the version of Node.js in Github Actions workflows
diff --git a/.github/workflows/evaluate.yaml b/.github/workflows/evaluate.yaml
index f4fd9e74b3..9c7fd149b1 100644
--- a/.github/workflows/evaluate.yaml
+++ b/.github/workflows/evaluate.yaml
@@ -128,7 +128,7 @@ jobs:
         uses: astral-sh/setup-uv@v6
         with:
           enable-cache: true
-          version: "0.4.20"
+          version: "0.9.5"
           cache-dependency-glob: "requirements**.txt"
           python-version: "3.11"
diff --git a/.github/workflows/python-test.yaml b/.github/workflows/python-test.yaml
index 24a5b5cb70..2ae5ba3afb 100644
--- a/.github/workflows/python-test.yaml
+++ b/.github/workflows/python-test.yaml
@@ -25,7 +25,7 @@ jobs:
       fail-fast: false
       matrix:
         os: ["ubuntu-latest", "windows-latest"]
-        python_version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+        python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
         node_version: ["20.14", "22"]
     steps:
       - uses: actions/checkout@v5
@@ -36,7 +36,7 @@ jobs:
         uses: astral-sh/setup-uv@v6
         with:
           enable-cache: true
-          version: "0.4.20"
+          version: "0.9.5"
           cache-dependency-glob: "requirements**.txt"
           python-version: ${{ matrix.python_version }}
           activate-environment: true
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index aa106e2f47..87b8af219a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,7 +7,7 @@ repos:
       - id: end-of-file-fixer
       - id: trailing-whitespace
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.9.3
+    rev: v0.14.2
     hooks:
       - id: ruff
   - repo: https://github.com/psf/black
diff --git a/AGENTS.md b/AGENTS.md
index 0021d98852..dc5b4faaa6 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -86,6 +86,18 @@ When you're running tests, make sure you activate the .venv virtual environment
 source .venv/bin/activate
 ```
+To check for coverage, run the following command:
+
+```shell
+pytest --cov --cov-report=annotate:cov_annotate
+```
+
+Open the cov_annotate directory to view the annotated source code. There will be one file per source file. If a file has 100% coverage, all of its lines are covered by tests, so you do not need to open it.
+
+For each file with less than 100% coverage, find the matching file in cov_annotate and review it.
+
+If a line starts with a ! (exclamation mark), that line is not covered by tests. Add tests to cover the missing lines.
+
 ## Sending pull requests
 When sending pull requests, make sure to follow the PULL_REQUEST_TEMPLATE.md format.
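The `!` marker referenced in the new AGENTS.md instructions comes from coverage.py's annotate report: each line of the copied source is prefixed with `> ` when the tests executed it and `! ` when they did not. A hypothetical excerpt (the file content below is invented for illustration and is not part of this patch) might look like:

```
> def clean_key_if_exists(key):
>     if key is not None and key.strip() != "":
>         return key.strip()
!     return None
```

Here a test exercising the empty-key case would be needed so the final `return None` line is covered on the next run.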
@@ -95,7 +107,7 @@ When sending pull requests, make sure to follow the PULL_REQUEST_TEMPLATE.md for
 To upgrade a particular package in the backend, use the following command, replacing `` with the name of the package you want to upgrade:
 ```shell
-cd app/backend && uv pip compile requirements.in -o requirements.txt --python-version 3.9 --upgrade-package package-name
+cd app/backend && uv pip compile requirements.in -o requirements.txt --python-version 3.10 --upgrade-package package-name
 ```
 ## Checking Python type hints
@@ -103,7 +115,7 @@ cd app/backend && uv pip compile requirements.in -o requirements.txt --python-ve
 To check Python type hints, use the following command:
 ```shell
-cd app/backend && mypy . --config-file=../pyproject.toml
+cd app/backend && mypy . --config-file=../../pyproject.toml
 ```
 ```shell
diff --git a/README.md b/README.md
index f53d895551..e7a8aa2aac 100644
--- a/README.md
+++ b/README.md
@@ -131,7 +131,7 @@ A related option is VS Code Dev Containers, which will open the project in your
 1. Install the required tools:
     - [Azure Developer CLI](https://aka.ms/azure-dev/install)
-    - [Python 3.9, 3.10, or 3.11](https://www.python.org/downloads/)
+    - [Python 3.10, 3.11, 3.12, 3.13, or 3.14](https://www.python.org/downloads/)
       - **Important**: Python and the pip package manager must be in the path in Windows for the setup scripts to work.
       - **Important**: Ensure you can run `python --version` from console. On Ubuntu, you might need to run `sudo apt install python-is-python3` to link `python` to `python3`.
     - [Node.js 20+](https://nodejs.org/download/)
diff --git a/app/backend/Dockerfile b/app/backend/Dockerfile
index a84bd6e0b7..647873f589 100644
--- a/app/backend/Dockerfile
+++ b/app/backend/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.11-bullseye
+FROM python:3.13-bookworm
 WORKDIR /app
diff --git a/app/backend/app.py b/app/backend/app.py
index ae38e70b12..f7b36794b4 100644
--- a/app/backend/app.py
+++ b/app/backend/app.py
@@ -5,9 +5,9 @@
 import mimetypes
 import os
 import time
-from collections.abc import AsyncGenerator, Awaitable
+from collections.abc import AsyncGenerator, Awaitable, Callable
 from pathlib import Path
-from typing import Any, Callable, Union, cast
+from typing import Any, cast
 from azure.cognitiveservices.speech import (
     ResultReason,
@@ -477,7 +477,7 @@ async def setup_clients():
     # Use the current user identity for keyless authentication to Azure services.
     # This assumes you use 'azd auth login' locally, and managed identity when deployed on Azure.
     # The managed identity is setup in the infra/ folder.
- azure_credential: Union[AzureDeveloperCliCredential, ManagedIdentityCredential] + azure_credential: AzureDeveloperCliCredential | ManagedIdentityCredential azure_ai_token_provider: Callable[[], Awaitable[str]] if RUNNING_ON_AZURE: current_app.logger.info("Setting up Azure credential using ManagedIdentityCredential") diff --git a/app/backend/approaches/approach.py b/app/backend/approaches/approach.py index 7c941d36b8..a8f3134c78 100644 --- a/app/backend/approaches/approach.py +++ b/app/backend/approaches/approach.py @@ -2,7 +2,7 @@ from abc import ABC from collections.abc import AsyncGenerator, Awaitable from dataclasses import dataclass, field -from typing import Any, Optional, TypedDict, Union, cast +from typing import Any, Optional, TypedDict, cast from azure.search.documents.agent.aio import KnowledgeAgentRetrievalClient from azure.search.documents.agent.models import ( @@ -190,7 +190,7 @@ def build_filter(self, overrides: dict[str, Any]) -> Optional[str]: filters.append("category eq '{}'".format(include_category.replace("'", "''"))) if exclude_category: filters.append("category ne '{}'".format(exclude_category.replace("'", "''"))) - return None if len(filters) == 0 else " and ".join(filters) + return None if not filters else " and ".join(filters) async def search( self, @@ -520,7 +520,7 @@ def create_chat_completion( temperature: Optional[float] = None, n: Optional[int] = None, reasoning_effort: Optional[ChatCompletionReasoningEffort] = None, - ) -> Union[Awaitable[ChatCompletion], Awaitable[AsyncStream[ChatCompletionChunk]]]: + ) -> Awaitable[ChatCompletion] | Awaitable[AsyncStream[ChatCompletionChunk]]: if chatgpt_model in self.GPT_REASONING_MODELS: params: dict[str, Any] = { # max_tokens is not supported diff --git a/app/backend/approaches/chatreadretrieveread.py b/app/backend/approaches/chatreadretrieveread.py index 78aeddc39b..149edb4a23 100644 --- a/app/backend/approaches/chatreadretrieveread.py +++ b/app/backend/approaches/chatreadretrieveread.py @@ -1,7 +1,7 @@ import json import re from collections.abc import AsyncGenerator, Awaitable -from typing import Any, Optional, Union, cast +from typing import Any, Optional, cast from azure.search.documents.agent.aio import KnowledgeAgentRetrievalClient from azure.search.documents.aio import SearchClient @@ -215,7 +215,7 @@ async def run_until_final_call( overrides: dict[str, Any], auth_claims: dict[str, Any], should_stream: bool = False, - ) -> tuple[ExtraInfo, Union[Awaitable[ChatCompletion], Awaitable[AsyncStream[ChatCompletionChunk]]]]: + ) -> tuple[ExtraInfo, Awaitable[ChatCompletion] | Awaitable[AsyncStream[ChatCompletionChunk]]]: use_agentic_retrieval = True if overrides.get("use_agentic_retrieval") else False original_user_query = messages[-1]["content"] @@ -243,7 +243,7 @@ async def run_until_final_call( ) chat_coroutine = cast( - Union[Awaitable[ChatCompletion], Awaitable[AsyncStream[ChatCompletionChunk]]], + Awaitable[ChatCompletion] | Awaitable[AsyncStream[ChatCompletionChunk]], self.create_chat_completion( self.chatgpt_deployment, self.chatgpt_model, diff --git a/app/backend/chat_history/cosmosdb.py b/app/backend/chat_history/cosmosdb.py index 764278bc76..4f32d1b100 100644 --- a/app/backend/chat_history/cosmosdb.py +++ b/app/backend/chat_history/cosmosdb.py @@ -1,6 +1,6 @@ import os import time -from typing import Any, Union +from typing import Any from azure.cosmos.aio import ContainerProxy, CosmosClient from azure.identity.aio import AzureDeveloperCliCredential, ManagedIdentityCredential @@ -209,9 +209,7 @@ async def 
setup_clients(): AZURE_CHAT_HISTORY_DATABASE = os.getenv("AZURE_CHAT_HISTORY_DATABASE") AZURE_CHAT_HISTORY_CONTAINER = os.getenv("AZURE_CHAT_HISTORY_CONTAINER") - azure_credential: Union[AzureDeveloperCliCredential, ManagedIdentityCredential] = current_app.config[ - CONFIG_CREDENTIAL - ] + azure_credential: AzureDeveloperCliCredential | ManagedIdentityCredential = current_app.config[CONFIG_CREDENTIAL] if USE_CHAT_HISTORY_COSMOS: current_app.logger.info("USE_CHAT_HISTORY_COSMOS is true, setting up CosmosDB client") diff --git a/app/backend/core/sessionhelper.py b/app/backend/core/sessionhelper.py index ddda8e03b7..ca3042a85c 100644 --- a/app/backend/core/sessionhelper.py +++ b/app/backend/core/sessionhelper.py @@ -1,10 +1,10 @@ import uuid -from typing import Union +from typing import Optional def create_session_id( config_chat_history_cosmos_enabled: bool, config_chat_history_browser_enabled: bool -) -> Union[str, None]: +) -> Optional[str]: if config_chat_history_cosmos_enabled: return str(uuid.uuid4()) if config_chat_history_browser_enabled: diff --git a/app/backend/decorators.py b/app/backend/decorators.py index 6638767435..451bb3a558 100644 --- a/app/backend/decorators.py +++ b/app/backend/decorators.py @@ -1,6 +1,7 @@ import logging +from collections.abc import Callable from functools import wraps -from typing import Any, Callable, TypeVar, cast +from typing import Any, TypeVar, cast from quart import abort, current_app, request diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index 1e8bd9a10d..5debb5b537 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -3,7 +3,7 @@ import logging import os from enum import Enum -from typing import Optional, Union +from typing import Optional import aiohttp from azure.core.credentials import AzureKeyCredential @@ -45,7 +45,7 @@ logger = logging.getLogger("scripts") -def clean_key_if_exists(key: Union[str, None]) -> Union[str, None]: +def clean_key_if_exists(key: Optional[str]) -> Optional[str]: """Remove leading and trailing whitespace from a key if it exists. 
If the key is empty, return None.""" if key is not None and key.strip() != "": return key.strip() @@ -69,16 +69,16 @@ async def setup_search_info( search_service: str, index_name: str, azure_credential: AsyncTokenCredential, - use_agentic_retrieval: Union[bool, None] = None, - azure_openai_endpoint: Union[str, None] = None, - agent_name: Union[str, None] = None, - agent_max_output_tokens: Union[int, None] = None, - azure_openai_searchagent_deployment: Union[str, None] = None, - azure_openai_searchagent_model: Union[str, None] = None, - search_key: Union[str, None] = None, - azure_vision_endpoint: Union[str, None] = None, + use_agentic_retrieval: Optional[bool] = None, + azure_openai_endpoint: Optional[str] = None, + agent_name: Optional[str] = None, + agent_max_output_tokens: Optional[int] = None, + azure_openai_searchagent_deployment: Optional[str] = None, + azure_openai_searchagent_model: Optional[str] = None, + search_key: Optional[str] = None, + azure_vision_endpoint: Optional[str] = None, ) -> SearchInfo: - search_creds: Union[AsyncTokenCredential, AzureKeyCredential] = ( + search_creds: AsyncTokenCredential | AzureKeyCredential = ( azure_credential if search_key is None else AzureKeyCredential(search_key) ) if use_agentic_retrieval and azure_openai_searchagent_model is None: @@ -104,10 +104,10 @@ def setup_blob_manager( storage_container: str, storage_resource_group: str, subscription_id: str, - storage_key: Union[str, None] = None, - image_storage_container: Union[str, None] = None, # Added this parameter + storage_key: Optional[str] = None, + image_storage_container: Optional[str] = None, # Added this parameter ): - storage_creds: Union[AsyncTokenCredential, str] = azure_credential if storage_key is None else storage_key + storage_creds: AsyncTokenCredential | str = azure_credential if storage_key is None else storage_key return BlobManager( endpoint=f"https://{storage_account}.blob.core.windows.net", @@ -122,18 +122,18 @@ def setup_blob_manager( def setup_list_file_strategy( azure_credential: AsyncTokenCredential, - local_files: Union[str, None], - datalake_storage_account: Union[str, None], - datalake_filesystem: Union[str, None], - datalake_path: Union[str, None], - datalake_key: Union[str, None], + local_files: Optional[str], + datalake_storage_account: Optional[str], + datalake_filesystem: Optional[str], + datalake_path: Optional[str], + datalake_key: Optional[str], enable_global_documents: bool = False, ): list_file_strategy: ListFileStrategy if datalake_storage_account: if datalake_filesystem is None or datalake_path is None: raise ValueError("DataLake file system and path are required when using Azure Data Lake Gen2") - adls_gen2_creds: Union[AsyncTokenCredential, str] = azure_credential if datalake_key is None else datalake_key + adls_gen2_creds: AsyncTokenCredential | str = azure_credential if datalake_key is None else datalake_key logger.info("Using Data Lake Gen2 Storage Account: %s", datalake_storage_account) list_file_strategy = ADLSGen2ListFileStrategy( data_lake_storage_account=datalake_storage_account, @@ -164,13 +164,13 @@ def setup_embeddings_service( openai_host: OpenAIHost, emb_model_name: str, emb_model_dimensions: int, - azure_openai_service: Union[str, None], - azure_openai_custom_url: Union[str, None], - azure_openai_deployment: Union[str, None], - azure_openai_key: Union[str, None], + azure_openai_service: Optional[str], + azure_openai_custom_url: Optional[str], + azure_openai_deployment: Optional[str], + azure_openai_key: Optional[str], 
azure_openai_api_version: str, - openai_key: Union[str, None], - openai_org: Union[str, None], + openai_key: Optional[str], + openai_org: Optional[str], disable_vectors: bool = False, disable_batch_vectors: bool = False, ): @@ -179,7 +179,7 @@ def setup_embeddings_service( return None if openai_host in [OpenAIHost.AZURE, OpenAIHost.AZURE_CUSTOM]: - azure_open_ai_credential: Union[AsyncTokenCredential, AzureKeyCredential] = ( + azure_open_ai_credential: AsyncTokenCredential | AzureKeyCredential = ( azure_credential if azure_openai_key is None else AzureKeyCredential(azure_openai_key) ) return AzureOpenAIEmbeddingService( @@ -207,12 +207,12 @@ def setup_embeddings_service( def setup_openai_client( openai_host: OpenAIHost, azure_credential: AsyncTokenCredential, - azure_openai_api_key: Union[str, None] = None, - azure_openai_api_version: Union[str, None] = None, - azure_openai_service: Union[str, None] = None, - azure_openai_custom_url: Union[str, None] = None, - openai_api_key: Union[str, None] = None, - openai_organization: Union[str, None] = None, + azure_openai_api_key: Optional[str] = None, + azure_openai_api_version: Optional[str] = None, + azure_openai_service: Optional[str] = None, + azure_openai_custom_url: Optional[str] = None, + openai_api_key: Optional[str] = None, + openai_organization: Optional[str] = None, ): if openai_host not in OpenAIHost: raise ValueError(f"Invalid OPENAI_HOST value: {openai_host}. Must be one of {[h.value for h in OpenAIHost]}.") @@ -264,23 +264,23 @@ def setup_openai_client( def setup_file_processors( azure_credential: AsyncTokenCredential, - document_intelligence_service: Union[str, None], - document_intelligence_key: Union[str, None] = None, + document_intelligence_service: Optional[str], + document_intelligence_key: Optional[str] = None, local_pdf_parser: bool = False, local_html_parser: bool = False, use_content_understanding: bool = False, use_multimodal: bool = False, - openai_client: Union[AsyncOpenAI, None] = None, - openai_model: Union[str, None] = None, - openai_deployment: Union[str, None] = None, - content_understanding_endpoint: Union[str, None] = None, + openai_client: Optional[AsyncOpenAI] = None, + openai_model: Optional[str] = None, + openai_deployment: Optional[str] = None, + content_understanding_endpoint: Optional[str] = None, ): sentence_text_splitter = SentenceTextSplitter() doc_int_parser: Optional[DocumentAnalysisParser] = None # check if Azure Document Intelligence credentials are provided if document_intelligence_service is not None: - documentintelligence_creds: Union[AsyncTokenCredential, AzureKeyCredential] = ( + documentintelligence_creds: AsyncTokenCredential | AzureKeyCredential = ( azure_credential if document_intelligence_key is None else AzureKeyCredential(document_intelligence_key) ) doc_int_parser = DocumentAnalysisParser( @@ -348,8 +348,8 @@ def setup_file_processors( def setup_image_embeddings_service( - azure_credential: AsyncTokenCredential, vision_endpoint: Union[str, None], use_multimodal: bool -) -> Union[ImageEmbeddings, None]: + azure_credential: AsyncTokenCredential, vision_endpoint: Optional[str], use_multimodal: bool +) -> Optional[ImageEmbeddings]: image_embeddings_service: Optional[ImageEmbeddings] = None if use_multimodal: if vision_endpoint is None: diff --git a/app/backend/prepdocslib/blobmanager.py b/app/backend/prepdocslib/blobmanager.py index 49e3a5a9f3..cb179615ff 100644 --- a/app/backend/prepdocslib/blobmanager.py +++ b/app/backend/prepdocslib/blobmanager.py @@ -3,7 +3,7 @@ import os import re 
from pathlib import Path -from typing import IO, Any, Optional, TypedDict, Union +from typing import IO, Any, Optional, TypedDict from urllib.parse import unquote from azure.core.credentials_async import AsyncTokenCredential @@ -169,7 +169,7 @@ async def _ensure_directory(self, directory_path: str, user_oid: str) -> DataLak await directory_client.set_access_control(owner=user_oid) return directory_client - async def upload_blob(self, file: Union[File, IO], filename: str, user_oid: str) -> str: + async def upload_blob(self, file: File | IO, filename: str, user_oid: str) -> str: """ Uploads a file directly to the user's directory in ADLS (no subdirectory). @@ -393,7 +393,7 @@ def __init__( self, endpoint: str, container: str, - credential: Union[AsyncTokenCredential, str], + credential: AsyncTokenCredential | str, image_container: Optional[str] = None, account: Optional[str] = None, resource_group: Optional[str] = None, diff --git a/app/backend/prepdocslib/embeddings.py b/app/backend/prepdocslib/embeddings.py index 3d1af5d293..fe70da2b1f 100644 --- a/app/backend/prepdocslib/embeddings.py +++ b/app/backend/prepdocslib/embeddings.py @@ -1,7 +1,7 @@ import logging from abc import ABC -from collections.abc import Awaitable -from typing import Callable, Optional, Union +from collections.abc import Awaitable, Callable +from typing import Optional from urllib.parse import urljoin import aiohttp @@ -160,13 +160,13 @@ class AzureOpenAIEmbeddingService(OpenAIEmbeddings): def __init__( self, - open_ai_service: Union[str, None], - open_ai_deployment: Union[str, None], + open_ai_service: Optional[str], + open_ai_deployment: Optional[str], open_ai_model_name: str, open_ai_dimensions: int, open_ai_api_version: str, - credential: Union[AsyncTokenCredential, AzureKeyCredential], - open_ai_custom_url: Union[str, None] = None, + credential: AsyncTokenCredential | AzureKeyCredential, + open_ai_custom_url: Optional[str] = None, disable_batch: bool = False, ): super().__init__(open_ai_model_name, open_ai_dimensions, disable_batch) @@ -184,7 +184,7 @@ def __init__( async def create_client(self) -> AsyncOpenAI: class AuthArgs(TypedDict, total=False): api_key: str - azure_ad_token_provider: Callable[[], Union[str, Awaitable[str]]] + azure_ad_token_provider: Callable[[], str | Awaitable[str]] auth_args = AuthArgs() if isinstance(self.credential, AzureKeyCredential): diff --git a/app/backend/prepdocslib/listfilestrategy.py b/app/backend/prepdocslib/listfilestrategy.py index c2e95ba212..7302bc7ed8 100644 --- a/app/backend/prepdocslib/listfilestrategy.py +++ b/app/backend/prepdocslib/listfilestrategy.py @@ -7,7 +7,7 @@ from abc import ABC from collections.abc import AsyncGenerator from glob import glob -from typing import IO, Optional, Union +from typing import IO, Optional from azure.core.credentials_async import AsyncTokenCredential from azure.storage.filedatalake.aio import ( @@ -148,7 +148,7 @@ def __init__( data_lake_storage_account: str, data_lake_filesystem: str, data_lake_path: str, - credential: Union[AsyncTokenCredential, str], + credential: AsyncTokenCredential | str, enable_global_documents: bool = False, ): self.data_lake_storage_account = data_lake_storage_account diff --git a/app/backend/prepdocslib/pdfparser.py b/app/backend/prepdocslib/pdfparser.py index a7996ea573..6589c854a3 100644 --- a/app/backend/prepdocslib/pdfparser.py +++ b/app/backend/prepdocslib/pdfparser.py @@ -4,7 +4,7 @@ import uuid from collections.abc import AsyncGenerator from enum import Enum -from typing import IO, Optional, Union 
+from typing import IO, Optional import pymupdf from azure.ai.documentintelligence.aio import DocumentIntelligenceClient @@ -65,15 +65,15 @@ class DocumentAnalysisParser(Parser): def __init__( self, endpoint: str, - credential: Union[AsyncTokenCredential, AzureKeyCredential], + credential: AsyncTokenCredential | AzureKeyCredential, model_id="prebuilt-layout", media_description_strategy: Enum = MediaDescriptionStrategy.NONE, # If using OpenAI, this is the client to use - openai_client: Union[AsyncOpenAI, None] = None, + openai_client: Optional[AsyncOpenAI] = None, openai_model: Optional[str] = None, openai_deployment: Optional[str] = None, # If using Content Understanding, this is the endpoint for the service - content_understanding_endpoint: Union[str, None] = None, + content_understanding_endpoint: Optional[str] = None, # should this take the blob storage info too? ): self.model_id = model_id @@ -98,7 +98,7 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]: ) as document_intelligence_client: file_analyzed = False - media_describer: Union[ContentUnderstandingDescriber, MultimodalModelDescriber, None] = None + media_describer: Optional[ContentUnderstandingDescriber | MultimodalModelDescriber] = None if self.media_description_strategy == MediaDescriptionStrategy.CONTENTUNDERSTANDING: if self.content_understanding_endpoint is None: raise ValueError( @@ -169,9 +169,11 @@ class ObjectType(Enum): TABLE = 0 FIGURE = 1 + MaskEntry = tuple[ObjectType, Optional[int]] + page_offset = page.spans[0].offset page_length = page.spans[0].length - mask_chars: list[tuple[ObjectType, Union[int, None]]] = [(ObjectType.NONE, None)] * page_length + mask_chars: list[MaskEntry] = [(ObjectType.NONE, None)] * page_length # mark all positions of the table spans in the page for table_idx, table in enumerate(tables_on_page): for span in table.spans: @@ -191,7 +193,7 @@ class ObjectType(Enum): # build page text by replacing characters in table spans with table html page_text = "" - added_objects = set() # set of object types todo mypy + added_objects: set[MaskEntry] = set() for idx, mask_char in enumerate(mask_chars): object_type, object_idx = mask_char if object_type == ObjectType.NONE: diff --git a/app/backend/prepdocslib/strategy.py b/app/backend/prepdocslib/strategy.py index 64673dd5dd..946b129e80 100644 --- a/app/backend/prepdocslib/strategy.py +++ b/app/backend/prepdocslib/strategy.py @@ -1,6 +1,6 @@ from abc import ABC from enum import Enum -from typing import Optional, Union +from typing import Optional from azure.core.credentials import AzureKeyCredential from azure.core.credentials_async import AsyncTokenCredential @@ -19,7 +19,7 @@ class SearchInfo: def __init__( self, endpoint: str, - credential: Union[AsyncTokenCredential, AzureKeyCredential], + credential: AsyncTokenCredential | AzureKeyCredential, index_name: str, use_agentic_retrieval: Optional[bool] = False, agent_name: Optional[str] = None, diff --git a/app/backend/requirements.txt b/app/backend/requirements.txt index 9f043afab4..ae7e80b54c 100644 --- a/app/backend/requirements.txt +++ b/app/backend/requirements.txt @@ -1,5 +1,5 @@ # This file was autogenerated by uv via the following command: -# uv pip compile requirements.in -o requirements.txt --python-version 3.9 +# uv pip compile requirements.in -o requirements.txt --python-version 3.10 aiofiles==24.1.0 # via # prompty @@ -140,10 +140,7 @@ idna==3.10 # requests # yarl importlib-metadata==8.0.0 - # via - # flask - # opentelemetry-api - # quart + # via opentelemetry-api 
isodate==0.6.1 # via # azure-ai-documentintelligence @@ -326,7 +323,7 @@ packaging==24.1 # via # opentelemetry-instrumentation # opentelemetry-instrumentation-flask -pillow==10.4.0 +pillow==12.0.0 # via -r requirements.in priority==2.0.0 # via hypercorn @@ -396,8 +393,10 @@ taskgroup==0.2.2 # via hypercorn tenacity==9.1.2 # via -r requirements.in -tiktoken==0.8.0 - # via -r requirements.in +tiktoken==0.12.0 + # via + # -r requirements.in + # opentelemetry-instrumentation-openai tomli==2.2.1 # via hypercorn tqdm==4.66.5 @@ -431,8 +430,6 @@ typing-extensions==4.15.0 # pydantic # pydantic-core # pypdf - # quart - # quart-cors # taskgroup # typing-inspection # uvicorn diff --git a/pyproject.toml b/pyproject.toml index 195e98998d..44072581ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool.ruff] -target-version = "py39" +target-version = "py310" lint.select = ["E", "F", "I", "UP"] -lint.ignore = ["E501", "E701"] # line too long, multiple statements on one line +lint.ignore = ["E501", "E701", "UP045"] # line too long, multiple statements on one line, keep Optional[X] src = ["app/backend", "scripts"] [tool.ruff.lint.isort] @@ -23,7 +23,7 @@ show_missing = true [tool.mypy] check_untyped_defs = true -python_version = 3.9 +python_version = "3.10" [[tool.mypy.overrides]] module = [ diff --git a/requirements-dev.txt b/requirements-dev.txt index 70f7aa3aee..edc9571a50 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,5 @@ -r app/backend/requirements.txt -ruff +ruff>=0.14.2 black pytest pytest-asyncio diff --git a/scripts/manageacl.py b/scripts/manageacl.py index f715fe0ba0..c95cdf8e9c 100644 --- a/scripts/manageacl.py +++ b/scripts/manageacl.py @@ -3,7 +3,7 @@ import json import logging import os -from typing import Any, Union +from typing import Any from urllib.parse import urljoin from uuid import uuid4 @@ -37,7 +37,7 @@ def __init__( acl_action: str, acl_type: str, acl: str, - credentials: Union[AsyncTokenCredential, AzureKeyCredential], + credentials: AsyncTokenCredential | AzureKeyCredential, ): """ Initializes the command @@ -251,7 +251,7 @@ async def main(args: Any): if args.tenant_id is None else AzureDeveloperCliCredential(tenant_id=args.tenant_id, process_timeout=60) ) - search_credential: Union[AsyncTokenCredential, AzureKeyCredential] = azd_credential + search_credential: AsyncTokenCredential | AzureKeyCredential = azd_credential if args.search_key is not None: search_credential = AzureKeyCredential(args.search_key) diff --git a/scripts/pretty_print_jsonl.py b/scripts/pretty_print_jsonl.py deleted file mode 100644 index f14d67455b..0000000000 --- a/scripts/pretty_print_jsonl.py +++ /dev/null @@ -1,128 +0,0 @@ -"""Utility to pretty-format a JSONL (JSON Lines) file. - -NOTE: Classic JSONL expects one JSON object per single line. Once we pretty -print (indent) each object, the result is no longer *strict* JSONL because -objects will span multiple lines. This script offers a few output modes so -you can choose what you need: - -1. Default (stdout): Pretty prints each record (with indentation) separated - by a blank line for readability. -2. --in-place: Rewrites the source file by replacing each original single-line - object with its multi-line, indented representation separated by a blank line. -3. --output : Writes the pretty output to a new file (recommended if you - also want to keep the original valid JSONL file unchanged). -4. 
--as-array: Instead of individual objects, emit a single JSON array containing - all objects, using indentation (this produces standard JSON, not JSONL). - -Examples: - python scripts/pretty_print_jsonl.py evals/ground_truth_multimodal.jsonl - python scripts/pretty_print_jsonl.py evals/ground_truth_multimodal.jsonl --output evals/ground_truth_multimodal.pretty.jsonl - python scripts/pretty_print_jsonl.py evals/ground_truth_multimodal.jsonl --in-place - python scripts/pretty_print_jsonl.py evals/ground_truth_multimodal.jsonl --as-array --output evals/ground_truth_multimodal.pretty.json - -Safeguards: - * Refuses to use --in-place together with --as-array (ambiguous expectations). - * Backs up the original file to .bak before in-place rewrite unless - --no-backup is supplied. -""" - -from __future__ import annotations - -import argparse -import json -import sys -from pathlib import Path - - -def read_jsonl(path: Path): - """Yield parsed JSON objects from a JSONL file. - - Skips empty lines. Raises ValueError with context on parse failures. - """ - for idx, line in enumerate(path.read_text(encoding="utf-8").splitlines(), start=1): - stripped = line.strip() - if not stripped: - continue - try: - yield json.loads(stripped) - except json.JSONDecodeError as e: - raise ValueError(f"Failed to parse JSON on line {idx} of {path}: {e}") from e - - -def write_pretty_individual(objs, indent: int) -> str: - """Return a string with each object pretty JSON, separated by a blank line.""" - parts = [json.dumps(o, indent=indent, ensure_ascii=False) for o in objs] - # Add trailing newline for file friendliness - return "\n\n".join(parts) + "\n" - - -def write_pretty_array(objs, indent: int) -> str: - return json.dumps(list(objs), indent=indent, ensure_ascii=False) + "\n" - - -def parse_args(argv: list[str]) -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Pretty-format a JSONL file.") - parser.add_argument( - "jsonl_file", - type=Path, - help="Path to the source JSONL file (one JSON object per line).", - ) - parser.add_argument("--indent", type=int, default=2, help="Indent level for json.dumps (default: 2)") - group = parser.add_mutually_exclusive_group() - group.add_argument( - "--in-place", - action="store_true", - help="Rewrite the original file with pretty-formatted objects (not strict JSONL).", - ) - group.add_argument( - "--output", - type=Path, - help="Path to write output. 
If omitted and not --in-place, prints to stdout.", - ) - parser.add_argument( - "--as-array", - action="store_true", - help="Emit a single JSON array instead of individual pretty objects.", - ) - parser.add_argument( - "--no-backup", - action="store_true", - help="When using --in-place, do not create a .bak backup file.", - ) - return parser.parse_args(argv) - - -def main(argv: list[str] | None = None) -> int: - args = parse_args(argv or sys.argv[1:]) - - if not args.jsonl_file.exists(): - print(f"Error: File not found: {args.jsonl_file}", file=sys.stderr) - return 1 - - objs = list(read_jsonl(args.jsonl_file)) - - if args.as_array: - output_text = write_pretty_array(objs, args.indent) - else: - output_text = write_pretty_individual(objs, args.indent) - - # Destination logic - if args.in_place: - if not args.no_backup: - backup_path = args.jsonl_file.with_suffix(args.jsonl_file.suffix + ".bak") - if not backup_path.exists(): - backup_path.write_text(args.jsonl_file.read_text(encoding="utf-8"), encoding="utf-8") - args.jsonl_file.write_text(output_text, encoding="utf-8") - print(f"Rewrote {args.jsonl_file} ({len(objs)} objects).") - elif args.output: - args.output.parent.mkdir(parents=True, exist_ok=True) - args.output.write_text(output_text, encoding="utf-8") - print(f"Wrote pretty output to {args.output} ({len(objs)} objects).") - else: - # stdout - sys.stdout.write(output_text) - return 0 - - -if __name__ == "__main__": # pragma: no cover - raise SystemExit(main()) diff --git a/tests/test_auth_init.py b/tests/test_auth_init.py index f02b83dd44..5511a9cdcd 100644 --- a/tests/test_auth_init.py +++ b/tests/test_auth_init.py @@ -1,6 +1,7 @@ import json import os from types import SimpleNamespace +from typing import Optional from unittest import mock import pytest @@ -112,7 +113,7 @@ async def get(self): class FakeOAuthGrant: def __init__(self): self.responses: list[SimpleNamespace] = [] - self.raise_on_post: APIError | None = None + self.raise_on_post: Optional[APIError] = None self.posted = [] self.post_attempts = 0 diff --git a/tests/test_prepdocs.py b/tests/test_prepdocs.py index bb89e6662e..77cb29c0be 100644 --- a/tests/test_prepdocs.py +++ b/tests/test_prepdocs.py @@ -1,13 +1,16 @@ import logging +from argparse import Namespace from unittest.mock import AsyncMock import openai import openai.types import pytest import tenacity +from azure.core.credentials import AzureKeyCredential from httpx import Request, Response from openai.types.create_embedding_response import Usage +import prepdocs from prepdocslib.embeddings import ( AzureOpenAIEmbeddingService, ImageEmbeddings, @@ -248,3 +251,185 @@ async def test_image_embeddings_success(mock_azurehttp_calls): ] mock_token_provider.assert_called_once() + + +def test_setup_blob_manager_respects_storage_key(monkeypatch: pytest.MonkeyPatch) -> None: + captured: dict[str, object] = {} + + class StubBlobManager: + def __init__( + self, + *, + endpoint: str, + container: str, + account: str, + credential: object, + resource_group: str, + subscription_id: str, + image_container: str | None = None, + ) -> None: + captured["endpoint"] = endpoint + captured["container"] = container + captured["account"] = account + captured["credential"] = credential + captured["resource_group"] = resource_group + captured["subscription_id"] = subscription_id + captured["image_container"] = image_container + + monkeypatch.setattr(prepdocs, "BlobManager", StubBlobManager) + + result = prepdocs.setup_blob_manager( + azure_credential=MockAzureCredential(), + 
storage_account="storageacct", + storage_container="docs", + storage_resource_group="rg", + subscription_id="sub-id", + storage_key="override-key", + image_storage_container="images", + ) + + assert isinstance(result, StubBlobManager) + assert captured["credential"] == "override-key" + assert captured["image_container"] == "images" + + +def test_setup_list_file_strategy_uses_datalake_key(monkeypatch: pytest.MonkeyPatch) -> None: + captured: dict[str, object] = {} + + class StubAdlsStrategy: + def __init__( + self, + *, + data_lake_storage_account: str, + data_lake_filesystem: str, + data_lake_path: str, + credential: object, + enable_global_documents: bool = False, + ) -> None: + captured["storage_account"] = data_lake_storage_account + captured["filesystem"] = data_lake_filesystem + captured["path"] = data_lake_path + captured["credential"] = credential + captured["enable_global_documents"] = enable_global_documents + + monkeypatch.setattr(prepdocs, "ADLSGen2ListFileStrategy", StubAdlsStrategy) + + strategy = prepdocs.setup_list_file_strategy( + azure_credential=MockAzureCredential(), + local_files=None, + datalake_storage_account="adlsacct", + datalake_filesystem="filesystem", + datalake_path="path", + datalake_key="custom-key", + enable_global_documents=True, + ) + + assert isinstance(strategy, StubAdlsStrategy) + assert captured["credential"] == "custom-key" + assert captured["enable_global_documents"] is True + + +@pytest.mark.asyncio +async def test_azure_embedding_service_create_client_uses_token_provider( + monkeypatch: pytest.MonkeyPatch, +) -> None: + async def fake_provider() -> str: + return "token" + + def fake_get_bearer_token_provider(credential: object, scope: str): + assert scope == "https://cognitiveservices.azure.com/.default" + return fake_provider + + captured: dict[str, object] = {} + + class StubAsyncAzureOpenAI: + def __init__( + self, + *, + azure_endpoint: str, + azure_deployment: str | None, + api_version: str, + **auth_args: object, + ) -> None: + captured["endpoint"] = azure_endpoint + captured["deployment"] = azure_deployment + captured["api_version"] = api_version + captured["auth_args"] = auth_args + + monkeypatch.setattr("prepdocslib.embeddings.get_bearer_token_provider", fake_get_bearer_token_provider) + monkeypatch.setattr("prepdocslib.embeddings.AsyncAzureOpenAI", StubAsyncAzureOpenAI) + + service = AzureOpenAIEmbeddingService( + open_ai_service="service", + open_ai_deployment="deployment", + open_ai_model_name=MOCK_EMBEDDING_MODEL_NAME, + open_ai_dimensions=MOCK_EMBEDDING_DIMENSIONS, + open_ai_api_version="2024-06-01", + credential=MockAzureCredential(), + ) + + client = await service.create_client() + + assert isinstance(client, StubAsyncAzureOpenAI) + assert captured["endpoint"] == "https://service.openai.azure.com" + assert "azure_ad_token_provider" in captured["auth_args"] + provider = captured["auth_args"]["azure_ad_token_provider"] + assert callable(provider) + assert await provider() == "token" + + +@pytest.mark.asyncio +async def test_manageacl_main_uses_search_key(monkeypatch: pytest.MonkeyPatch) -> None: + from scripts import manageacl as manageacl_module + + monkeypatch.setenv("AZURE_SEARCH_SERVICE", "searchsvc") + monkeypatch.setenv("AZURE_SEARCH_INDEX", "searchindex") + + monkeypatch.setattr(manageacl_module, "load_azd_env", lambda: None) + + class DummyAzureCredential: + def __init__(self, *args, **kwargs) -> None: # pragma: no cover - simple stub + pass + + monkeypatch.setattr(manageacl_module, "AzureDeveloperCliCredential", 
DummyAzureCredential) + + captured: dict[str, object] = {} + + class DummyManageAcl: + def __init__( + self, + *, + service_name: str, + index_name: str, + url: str, + acl_action: str, + acl_type: str | None, + acl: str | None, + credentials: object, + ) -> None: + captured["service_name"] = service_name + captured["index_name"] = index_name + captured["url"] = url + captured["credentials"] = credentials + + async def run(self) -> None: + captured["run_called"] = True + + monkeypatch.setattr(manageacl_module, "ManageAcl", DummyManageAcl) + + args = Namespace( + tenant_id=None, + search_key="secret", + url="https://example/document.pdf", + acl_action="view", + acl_type="oids", + acl="user1", + ) + + await manageacl_module.main(args) + + assert captured["run_called"] is True + assert isinstance(captured["credentials"], AzureKeyCredential) + assert captured["credentials"].key == "secret" + assert captured["service_name"] == "searchsvc" + assert captured["index_name"] == "searchindex"
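Nearly all of the Python churn in this patch is one mechanical rewrite, unlocked by raising the minimum supported Python to 3.10: `Union[X, Y]` annotations become the PEP 604 `X | Y` form, while `Optional[X]` is deliberately kept (ruff rule UP045 is added to the ignore list in pyproject.toml). A minimal sketch of the pattern, using invented function names but the same credential types the patch touches:

```python
from typing import Optional, Union

from azure.core.credentials import AzureKeyCredential
from azure.core.credentials_async import AsyncTokenCredential


# Before (Python 3.9-compatible spelling):
def pick_credential_old(search_key: Union[str, None]) -> Union[AsyncTokenCredential, AzureKeyCredential]:
    ...


# After (Python 3.10+): PEP 604 unions; Optional[...] stays, per the UP045 ignore.
def pick_credential_new(search_key: Optional[str]) -> AsyncTokenCredential | AzureKeyCredential:
    ...
```

Keeping `Optional[X]` while adopting `X | Y` elsewhere matches the explicit `UP045` ignore, so ruff will not keep rewriting the remaining `Optional` annotations on future runs.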