Skip to content

Commit d9fde4f

Browse files
committed
api: Add tracking ID validation
Validate the tracking ID in the API endpoints that require it, ensuring it is a string of length 1 to 256 that starts with an alphanumeric character and otherwise contains only alphanumerics, underscores, and hyphens. The implementation and tests are based on MLflow's internal run ID validation: https://github.com/mlflow/mlflow/blob/92a1664ddbd7ef59f8db45e988e41437d179c3b1/mlflow/utils/validation.py#L374-L377 Signed-off-by: Phoevos Kalemkeris <[email protected]>
1 parent 59e7bb4 commit d9fde4f

File tree

8 files changed

+66
-13
lines changed

8 files changed

+66
-13
lines changed

app/api/dependencies.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,19 @@
11
import logging
2+
import re
3+
from typing import Union
4+
from typing_extensions import Annotated
5+
6+
from fastapi import HTTPException, Query
7+
from starlette.status import HTTP_400_BAD_REQUEST
28

39
from typing import Optional
410
from config import Settings
511
from registry import model_service_registry
612
from model_services.base import AbstractModelService
713
from management.model_manager import ModelManager
814

15+
TRACKING_ID_REGEX = re.compile(r"^[a-zA-Z0-9][\w\-]{0,255}$")
16+
917
logger = logging.getLogger("cms")
1018

1119

@@ -45,3 +53,14 @@ def __init__(self, model_service: AbstractModelService) -> None:
4553

4654
def __call__(self) -> ModelManager:
4755
return self._model_manager
56+
57+
58+
def validate_tracking_id(
59+
tracking_id: Annotated[Union[str, None], Query(description="The tracking ID of the requested task")] = None,
60+
) -> Union[str, None]:
61+
if tracking_id is not None and TRACKING_ID_REGEX.match(tracking_id) is None:
62+
raise HTTPException(
63+
status_code=HTTP_400_BAD_REQUEST,
64+
detail=f"Invalid tracking ID '{tracking_id}', must be an alphanumeric string of length 1 to 256",
65+
)
66+
return tracking_id

app/api/routers/evaluation.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from fastapi.responses import StreamingResponse, JSONResponse
1212

1313
import api.globals as cms_globals
14+
from api.dependencies import validate_tracking_id
1415
from domain import Tags, Scope
1516
from model_services.base import AbstractModelService
1617
from processors.metrics_collector import (
@@ -34,7 +35,7 @@
3435
description="Evaluate the model being served with a trainer export")
3536
async def get_evaluation_with_trainer_export(request: Request,
3637
trainer_export: Annotated[List[UploadFile], File(description="One or more trainer export files to be uploaded")],
37-
tracking_id: Annotated[Union[str, None], Query(description="The tracking ID of the evaluation task")] = None,
38+
tracking_id: Union[str, None] = Depends(validate_tracking_id),
3839
model_service: AbstractModelService = Depends(cms_globals.model_service_dep)) -> JSONResponse:
3940
files = []
4041
file_names = []
@@ -70,7 +71,7 @@ async def get_evaluation_with_trainer_export(request: Request,
7071
description="Sanity check the model being served with a trainer export")
7172
def get_sanity_check_with_trainer_export(request: Request,
7273
trainer_export: Annotated[List[UploadFile], File(description="One or more trainer export files to be uploaded")],
73-
tracking_id: Annotated[Union[str, None], Query(description="The tracking ID of the sanity check task")] = None,
74+
tracking_id: Union[str, None] = Depends(validate_tracking_id),
7475
model_service: AbstractModelService = Depends(cms_globals.model_service_dep)) -> StreamingResponse:
7576
files = []
7677
file_names = []
@@ -106,7 +107,7 @@ def get_inter_annotator_agreement_scores(request: Request,
106107
annotator_a_project_id: Annotated[int, Query(description="The project ID from one annotator")],
107108
annotator_b_project_id: Annotated[int, Query(description="The project ID from another annotator")],
108109
scope: Annotated[str, Query(enum=[s.value for s in Scope], description="The scope for which the score will be calculated, e.g., per_concept, per_document or per_span")],
109-
tracking_id: Annotated[Union[str, None], Query(description="The tracking ID of the IAA task")] = None) -> StreamingResponse:
110+
tracking_id: Union[str, None] = Depends(validate_tracking_id)) -> StreamingResponse:
110111
files = []
111112
for te in trainer_export:
112113
temp_te = tempfile.NamedTemporaryFile()
@@ -143,7 +144,7 @@ def get_inter_annotator_agreement_scores(request: Request,
143144
description="Concatenate multiple trainer export files into a single file for download")
144145
def get_concatenated_trainer_exports(request: Request,
145146
trainer_export: Annotated[List[UploadFile], File(description="A list of trainer export files to be uploaded")],
146-
tracking_id: Annotated[Union[str, None], Query(description="The tracking ID of the concatenation task")] = None) -> JSONResponse:
147+
tracking_id: Union[str, None] = Depends(validate_tracking_id)) -> JSONResponse:
147148
files = []
148149
for te in trainer_export:
149150
temp_te = tempfile.NamedTemporaryFile()
@@ -167,7 +168,7 @@ def get_concatenated_trainer_exports(request: Request,
167168
description="Get annotation stats of trainer export files")
168169
def get_annotation_stats(request: Request,
169170
trainer_export: Annotated[List[UploadFile], File(description="One or more trainer export files to be uploaded")],
170-
tracking_id: Annotated[Union[str, None], Query(description="The tracking ID of the annotation stats task")] = None) -> StreamingResponse:
171+
tracking_id: Union[str, None] = Depends(validate_tracking_id)) -> StreamingResponse:
171172
files = []
172173
file_names = []
173174
for te in trainer_export:

app/api/routers/invocation.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from domain import TextWithAnnotations, TextWithPublicKey, TextStreamItem, ModelCard, Tags
2121
from model_services.base import AbstractModelService
2222
from utils import get_settings
23+
from api.dependencies import validate_tracking_id
2324
from api.utils import get_rate_limiter, encrypt
2425
from management.prometheus_metrics import (
2526
cms_doc_annotations,
@@ -132,7 +133,7 @@ def get_entities_from_multiple_texts(request: Request,
132133
description="Upload a file containing a list of plain text and extract the NER entities in JSON")
133134
def extract_entities_from_multi_text_file(request: Request,
134135
multi_text_file: Annotated[UploadFile, File(description="A file containing a list of plain texts, in the format of [\"text_1\", \"text_2\", ..., \"text_n\"]")],
135-
tracking_id: Annotated[Union[str, None], Query(description="The tracking ID of the bulk processing task")] = None,
136+
tracking_id: Union[str, None] = Depends(validate_tracking_id),
136137
model_service: AbstractModelService = Depends(cms_globals.model_service_dep)) -> StreamingResponse:
137138
with tempfile.NamedTemporaryFile() as data_file:
138139
for line in multi_text_file.file:

app/api/routers/metacat_training.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from starlette.status import HTTP_202_ACCEPTED, HTTP_503_SERVICE_UNAVAILABLE
1111

1212
import api.globals as cms_globals
13+
from api.dependencies import validate_tracking_id
1314
from domain import Tags
1415
from model_services.base import AbstractModelService
1516
from processors.metrics_collector import concat_trainer_exports
@@ -29,7 +30,7 @@ async def train_metacat(request: Request,
2930
epochs: Annotated[int, Query(description="The number of training epochs", ge=0)] = 1,
3031
log_frequency: Annotated[int, Query(description="The number of processed documents after which training metrics will be logged", ge=1)] = 1,
3132
description: Annotated[Union[str, None], Query(description="The description on the training or change logs")] = None,
32-
tracking_id: Annotated[Union[str, None], Query(description="The tracking ID of the training task")] = None,
33+
tracking_id: Union[str, None] = Depends(validate_tracking_id),
3334
model_service: AbstractModelService = Depends(cms_globals.model_service_dep)) -> JSONResponse:
3435
files = []
3536
file_names = []

app/api/routers/preview.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from starlette.status import HTTP_404_NOT_FOUND
1212

1313
import api.globals as cms_globals
14+
from api.dependencies import validate_tracking_id
1415
from domain import Doc, Tags
1516
from model_services.base import AbstractModelService
1617
from processors.metrics_collector import concat_trainer_exports
@@ -27,7 +28,7 @@
2728
description="Extract the NER entities in HTML for preview")
2829
async def get_rendered_entities_from_text(request: Request,
2930
text: Annotated[str, Body(description="The text to be sent to the model for NER", media_type="text/plain")],
30-
tracking_id: Annotated[Union[str, None], Query(description="The tracking ID of the preview task")] = None,
31+
tracking_id: Union[str, None] = Depends(validate_tracking_id),
3132
model_service: AbstractModelService = Depends(cms_globals.model_service_dep)) -> StreamingResponse:
3233
annotations = model_service.annotate(text)
3334
entities = annotations_to_entities(annotations, model_service.model_name)
@@ -50,7 +51,7 @@ def get_rendered_entities_from_trainer_export(request: Request,
5051
trainer_export_str: Annotated[str, Form(description="The trainer export raw JSON string")] = "{\"projects\": []}",
5152
project_id: Annotated[Union[int, None], Query(description="The target project ID, and if not provided, all projects will be included")] = None,
5253
document_id: Annotated[Union[int, None], Query(description="The target document ID, and if not provided, all documents of the target project(s) will be included")] = None,
53-
tracking_id: Annotated[Union[str, None], Query(description="The tracking ID of the trainer export preview task")] = None) -> Response:
54+
tracking_id: Union[str, None] = Depends(validate_tracking_id)) -> Response:
5455
data: Dict = {"projects": []}
5556
if trainer_export is not None:
5657
files = []

app/api/routers/supervised_training.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from starlette.status import HTTP_202_ACCEPTED, HTTP_503_SERVICE_UNAVAILABLE
1111

1212
import api.globals as cms_globals
13+
from api.dependencies import validate_tracking_id
1314
from domain import Tags
1415
from model_services.base import AbstractModelService
1516
from processors.metrics_collector import concat_trainer_exports
@@ -32,7 +33,7 @@ async def train_supervised(request: Request,
3233
test_size: Annotated[Union[float, None], Query(description="The override of the test size in percentage. (For a 'huggingface-ner' model, a negative value can be used to apply the train-validation-test split if implicitly defined in trainer export: 'projects[0]' is used for training, 'projects[1]' for validation, and 'projects[2]' for testing)")] = 0.2,
3334
log_frequency: Annotated[int, Query(description="The number of processed documents after which training metrics will be logged", ge=1)] = 1,
3435
description: Annotated[Union[str, None], Form(description="The description of the training or change logs")] = None,
35-
tracking_id: Annotated[Union[str, None], Query(description="The tracking ID of the training task")] = None,
36+
tracking_id: Union[str, None] = Depends(validate_tracking_id),
3637
model_service: AbstractModelService = Depends(cms_globals.model_service_dep)) -> JSONResponse:
3738
files = []
3839
file_names = []

app/api/routers/unsupervised_training.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from fastapi.responses import JSONResponse
1313
from starlette.status import HTTP_202_ACCEPTED, HTTP_503_SERVICE_UNAVAILABLE
1414
import api.globals as cms_globals
15+
from api.dependencies import validate_tracking_id
1516
from domain import Tags, ModelType
1617
from model_services.base import AbstractModelService
1718
from utils import get_settings
@@ -33,7 +34,7 @@ async def train_unsupervised(request: Request,
3334
test_size: Annotated[Union[float, None], Query(description="The override of the test size in percentage", ge=0.0)] = 0.2,
3435
log_frequency: Annotated[int, Query(description="The number of processed documents after which training metrics will be logged", ge=1)] = 1000,
3536
description: Annotated[Union[str, None], Query(description="The description of the training or change logs")] = None,
36-
tracking_id: Annotated[Union[str, None], Query(description="The tracking ID of the training task")] = None,
37+
tracking_id: Union[str, None] = Depends(validate_tracking_id),
3738
model_service: AbstractModelService = Depends(cms_globals.model_service_dep)) -> JSONResponse:
3839
"""
3940
Upload one or more plain text files and trigger the unsupervised training
@@ -97,7 +98,7 @@ async def train_unsupervised_with_hf_dataset(request: Request,
9798
test_size: Annotated[Union[float, None], Query(description="The override of the test size in percentage will only take effect if the dataset does not have predefined validation or test splits", ge=0.0)] = 0.2,
9899
log_frequency: Annotated[int, Query(description="The number of processed documents after which training metrics will be logged", ge=1)] = 1000,
99100
description: Annotated[Union[str, None], Query(description="The description of the training or change logs")] = None,
100-
tracking_id: Annotated[Union[str, None], Query(description="The tracking ID of the training task")] = None,
101+
tracking_id: Union[str, None] = Depends(validate_tracking_id),
101102
model_service: AbstractModelService = Depends(cms_globals.model_service_dep)) -> JSONResponse:
102103
"""
103104
Trigger the unsupervised training with a dataset from Hugging Face Hub

tests/app/api/test_dependencies.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1-
from api.dependencies import ModelServiceDep
1+
import pytest
2+
from fastapi import HTTPException
3+
4+
from api.dependencies import ModelServiceDep, validate_tracking_id
25
from config import Settings
36
from model_services.medcat_model import MedCATModel
47
from model_services.medcat_model_icd10 import MedCATModelIcd10
@@ -36,3 +39,28 @@ def test_transformer_deid_dep():
3639
def test_huggingface_ner_dep():
3740
model_service_dep = ModelServiceDep("huggingface_ner", Settings())
3841
assert isinstance(model_service_dep(), HuggingFaceNerModel)
42+
43+
44+
@pytest.mark.parametrize(
45+
"run_id",
46+
[
47+
"a" * 32,
48+
"A" * 32,
49+
"a" * 256,
50+
"f0" * 16,
51+
"abcdef0123456789" * 2,
52+
"abcdefghijklmnopqrstuvqxyz",
53+
"123e4567-e89b-12d3-a456-426614174000",
54+
"123e4567e89b12d3a45642661417400",
55+
],
56+
)
57+
def test_validate_tracking_id(run_id):
58+
assert validate_tracking_id(run_id) == run_id
59+
60+
61+
@pytest.mark.parametrize("run_id", ["a/bc" * 8, "", "a" * 400, "*" * 5])
62+
def test_validate_tracking_id_invalid(run_id):
63+
with pytest.raises(HTTPException) as exc_info:
64+
validate_tracking_id(run_id)
65+
assert exc_info.value.status_code == 400
66+
assert "Invalid tracking ID" in exc_info.value.detail

0 commit comments

Comments
 (0)