Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 23 additions & 5 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
matrix:
image:
- name: frontend
context: .
context: ./frontend
dockerfile: ./frontend/Dockerfile
- name: backend
context: .
Expand Down Expand Up @@ -263,16 +263,34 @@ jobs:
kubectl patch configmap backend-config -n model-platform --type merge -p "{\"data\":{\"IMAGE_TAG\":\"${IMAGE_TAG}\"}}"
make k8s-modelplatform IMAGE_TAG=${IMAGE_TAG}

# Poll the backend health endpoint until it answers HTTP 200, giving up after
# 60 attempts spaced 10 seconds apart (replaces the former fixed sleep).
- name: Wait for backend to be ready
  run: |
    HEALTH_URL="http://model-platform.com/api/health/"
    echo "Waiting for backend healthcheck at ${HEALTH_URL}"
    for i in $(seq 1 60); do
      # The default run shell is `bash -e`, so a curl transport failure
      # (DNS error, connection refused) would abort the step on the first
      # attempt. `|| true` keeps the loop alive; curl still prints 000.
      # --max-time stops a hung connection from stalling the loop.
      STATUS=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "${HEALTH_URL}" || true)
      if [ "$STATUS" = "200" ]; then
        echo "Backend is ready (HTTP 200)"
        exit 0
      fi
      echo "Attempt $i/60 — HTTP $STATUS, retrying in 10s..."
      sleep 10
    done
    echo "Backend did not become ready after 10 minutes"
    exit 1

- name: Run end-to-end tests
run: |
echo "Launching end-to-end tests"
uv run pytest tests/tests_end_to_end/test_from_project_creation_to_model_predict.py -v --tb=long

- name: Install Playwright browser (Chromium)
run: uv run playwright install --with-deps chromium

- name: Run frontend e2e tests
run: |
echo "Launching frontend e2e tests"
uv run pytest tests/tests_end_to_end/test_frontend_e2e.py -v --tb=long

- name: Collect logs on failure
if: failure()
run: |
Expand Down Expand Up @@ -316,7 +334,7 @@ jobs:
matrix:
image:
- name: frontend
context: .
context: ./frontend
dockerfile: ./frontend/Dockerfile
- name: backend
context: .
Expand Down
17 changes: 17 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ k8s-network-conf:
kubectl apply -f infrastructure/k8s/nginx-configmap.yaml
kubectl apply -f infrastructure/k8s/nginx-deployment.yaml
kubectl apply -f infrastructure/k8s/ingress.yaml
kubectl rollout restart deployment/nginx-reverse-proxy

k8s-backend:
kubectl apply -f infrastructure/k8s/backend-configmap.yaml
Expand Down Expand Up @@ -96,6 +97,22 @@ k8s-backend-local:
kubectl apply -f -
kubectl rollout restart deployment/backend -n model-platform

# Build the frontend image as model-platform-frontend:local (Dockerfile and
# build context both under frontend/) after loading minikube's docker-env,
# apply the frontend configmap, then render frontend-deployment.yaml with
# envsubst (FRONTEND_IMAGE / IMAGE_TAG), rewrite imagePullPolicy from Always to
# Never, apply it, and restart deployment/frontend in the model-platform
# namespace.
# NOTE(review): the Never pull policy is presumably what makes the cluster use
# the locally built image instead of pulling one — confirm.
k8s-frontend-local:
eval $$(minikube docker-env) && \
docker build -t model-platform-frontend:local -f frontend/Dockerfile frontend
kubectl apply -f infrastructure/k8s/frontend-configmap.yaml
FRONTEND_IMAGE=model-platform-frontend IMAGE_TAG=local \
envsubst < infrastructure/k8s/frontend-deployment.yaml | \
sed 's/imagePullPolicy: Always/imagePullPolicy: Never/' | \
kubectl apply -f -
kubectl rollout restart deployment/frontend -n model-platform

# Aggregate target: runs the k8s-network-conf, k8s-pgsql and k8s-monitoring targets.
k8s-infra: k8s-network-conf k8s-pgsql k8s-monitoring

# Aggregate target: runs the k8s-backend and k8s-frontend targets.
k8s-modelplatform: k8s-backend k8s-frontend

# Run the backend package locally through uv.
dev-back:
uv run python -m backend

# Serve the frontend directory with Python's built-in HTTP server on port 8080.
dev-front:
cd frontend && python -m http.server 8080
8 changes: 8 additions & 0 deletions backend/api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
# Ugly stuff to remove ugly warning, sorry TOUL
import bcrypt
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from loguru import logger

from backend.api import (
Expand Down Expand Up @@ -72,6 +73,13 @@ def create_app() -> FastAPI:
The configured FastAPI application instance.
"""
app = FastAPI(title="Model Platform API", version="1.0.0", lifespan=lifespan)
app.add_middleware(
CORSMiddleware,
allow_origins=["http://localhost:8080", "http://localhost:3000", "http://127.0.0.1:8080"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
app.include_router(health_check.router, prefix="/health", tags=["Health"])
app.include_router(auth_routes.router, prefix="/auth", tags=["Authentication"])
app.include_router(models_routes.router, prefix="/{project_name}/models", tags=["Models"])
Expand Down
29 changes: 29 additions & 0 deletions backend/api/health_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
This module provides a health check endpoint for the API.
"""

import os

import httpx
from fastapi import APIRouter
from loguru import logger
from pydantic import BaseModel

router = APIRouter()
Expand All @@ -25,3 +29,28 @@ def health_check():
A dictionary with the status of the API.
"""
return HealthCheck(status="OK")


@router.get("/storage")
def storage_health_check():
    """Report whether the MinIO/S3 storage is reachable from the backend.

    The probe is issued server-side against MinIO's live health endpoint, so
    the browser never needs to contact an internal cluster URL itself.

    Returns
    -------
    dict
        ``{"status": "ok"}`` when the endpoint answers HTTP 200, otherwise
        ``{"status": "error", "detail": "..."}`` where the detail is the
        missing-configuration notice, the unexpected HTTP status, or the text
        of the exception raised while probing.
    """
    endpoint = os.environ.get("MLFLOW_S3_ENDPOINT_URL", "")
    if not endpoint:
        return {"status": "error", "detail": "MLFLOW_S3_ENDPOINT_URL not configured"}

    # Strip trailing slashes so the joined URL never contains "//".
    live_url = f"{endpoint.rstrip('/')}/minio/health/live"
    try:
        status_code = httpx.get(live_url, timeout=3.0).status_code
    except Exception as exc:
        # Best effort: any probing failure is reported in the payload, not raised.
        logger.warning(f"Storage health check failed: {exc}")
        return {"status": "error", "detail": str(exc)}

    if status_code != 200:
        return {"status": "error", "detail": f"HTTP {status_code}"}
    return {"status": "ok"}
19 changes: 17 additions & 2 deletions backend/api/projects_routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from backend.domain.ports.user_handler import UserHandler
from backend.domain.use_cases import user_usecases
from backend.domain.use_cases.auth_usecases import get_current_user, get_user_adapter
from backend.domain.use_cases.deployed_models import get_registry_status_for_project
from backend.domain.use_cases.governance_usecases import (
download_project_models_governance_information,
return_project_models_governance_information,
Expand Down Expand Up @@ -154,8 +155,12 @@ def governance_route(
registry: ModelRegistry = registry_pool.get_registry_adapter(
project_name, get_project_registry_tracking_uri(project_name, request)
)
project_governance = return_project_models_governance_information(project_name, registry)
return JSONResponse(content={"project_gouvernance": project_governance}, media_type="application/json")
try:
project_governance = return_project_models_governance_information(project_name, registry)
except Exception as e:
logger.exception(f"Error fetching governance data for project {project_name}")
raise HTTPException(status_code=500, detail=str(e))
return JSONResponse(content={"project_governance": project_governance}, media_type="application/json")


@router.get("/{project_name}/users")
Expand Down Expand Up @@ -207,3 +212,13 @@ def route_change_user_role_for_project(
)
success = user_usecases.change_user_role_for_project(email, project_name, role, user_adapter)
return JSONResponse(content={"status": success}, media_type="application/json")


@router.get("/{project_name}/registry_status")
def registry_status_route(
    project_name: str,
    current_user: dict = Depends(get_current_user),
):
    """Expose the K8s deployment status of a project's MLflow registry.

    The ``current_user`` dependency is declared but its value is not used in
    the body; the response is ``{"status": <status string>}`` as computed by
    ``get_registry_status_for_project``.
    """
    return {"status": get_registry_status_for_project(project_name)}
2 changes: 2 additions & 0 deletions backend/domain/entities/model_deployment.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ class ModelDeployment(BaseModel):
deployment_name: str
deployment_date: int
dashboard_uid: str
status: str = "unknown"

def to_json(self) -> dict:
return {
Expand All @@ -19,4 +20,5 @@ def to_json(self) -> dict:
"deployment_name": self.deployment_name,
"deployment_date": str(datetime.fromtimestamp(self.deployment_date)),
"dashboard_url": f"/grafana/d/{self.dashboard_uid}",
"status": self.status,
}
20 changes: 20 additions & 0 deletions backend/domain/use_cases/deployed_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from backend.domain.entities.model_deployment import ModelDeployment
from backend.infrastructure.k8s_deployment_cluster_adapter import K8SDeploymentClusterAdapter
from backend.infrastructure.k8s_registry_deployment_adapter import K8SRegistryDeployment
from backend.utils import sanitize_project_name


def list_deployed_models_with_status_for_a_project(project_name: str) -> list[str]:
Expand All @@ -13,6 +14,25 @@ def list_deployed_models_with_status_for_a_project(project_name: str) -> list[st
return deployed_models_json


def get_registry_status_for_project(project_name: str) -> str:
    """Look up the K8s deployment status of a project's MLflow registry.

    Deployments labelled ``type=model_registry`` are listed in the namespace
    derived from the project name; only the first one found is inspected.

    Returns one of: 'running', 'pending', 'error', 'not_found'.
    'error' is also returned when the cluster lookup itself raises.
    """
    cluster = K8SDeploymentClusterAdapter()
    registry_namespace = sanitize_project_name(project_name)
    try:
        registries = cluster.apps_api_instance.list_namespaced_deployment(
            namespace=registry_namespace, label_selector="type=model_registry"
        ).items
        if registries:
            return cluster._resolve_deployment_status(registries[0].status)
        return "not_found"
    except Exception as exc:
        # Best effort: a failed lookup is logged and reported as "error".
        logger.warning(f"Could not get registry status for {project_name}: {exc}")
        return "error"


def _remove_project_namespace(project_name: str) -> None:
k8s_deployment = K8SRegistryDeployment(project_name)
k8s_deployment.delete_namespace()
Expand Down
11 changes: 8 additions & 3 deletions backend/domain/use_cases/governance_usecases.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,14 @@ def _extract_model_artifacts(

def _filter_events_for_model(project_events: list, model_name: str, version: str):
model_events = []
for event in project_events:
event_entity = event.get("entity").replace("'", '"')
event_entity = json.loads(event_entity)
for event in project_events or []:
raw_entity = event.get("entity")
if not raw_entity:
continue
try:
event_entity = json.loads(raw_entity.replace("'", '"'))
except (json.JSONDecodeError, AttributeError):
continue
if (
"model_name" in event_entity
and event_entity["model_name"] == model_name
Expand Down
11 changes: 11 additions & 0 deletions backend/infrastructure/k8s_deployment_cluster_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,20 @@ def list_deployments_for_project(self, project_name: str) -> list[ModelDeploymen
for deployment in deployments.items:
labels = deployment.metadata.labels
labels["deployment_name"] = deployment.metadata.name
labels["status"] = self._resolve_deployment_status(deployment.status)
deployment_list.append(ModelDeployment(**labels))
return deployment_list

@staticmethod
def _resolve_deployment_status(deployment_status) -> str:
available = deployment_status.available_replicas or 0
desired = deployment_status.replicas or 0
if available >= 1:
return "running"
if desired >= 1:
return "pending"
return "error"

def list_all_registries(self) -> list:
registry_deployments = self.apps_api_instance.list_deployment_for_all_namespaces(
label_selector="type=model_registry"
Expand Down
7 changes: 5 additions & 2 deletions backend/infrastructure/mlflow_model_registry_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,11 @@ def _process_model_versions(model_version: list[ModelVersion]) -> list[dict]:
def _get_model_artifacts_path(self, run_id: str) -> str:
    """Return the artifact path of the model logged by an MLflow run.

    The first directory among the run's listed artifacts is preferred; if
    none of them is a directory, the first entry is used instead.

    Parameters
    ----------
    run_id : str
        Identifier of the MLflow run whose artifacts are listed.

    Returns
    -------
    str
        The ``path`` of the selected artifact.

    Raises
    ------
    IndexError
        If the run has no artifacts.
    """
    logger.info(f"Using mlflow tracking uri: {self.mlflow_client_manager.tracking_uri}")
    logger.info(f"Using mlflow tracking uri: {self.mlflow_client.tracking_uri}")
    artifacts: list[FileInfo] = self.mlflow_client.list_artifacts(run_id)
    if not artifacts:
        # Same exception type as indexing an empty list, but the message names the run.
        raise IndexError(f"No artifacts found for run {run_id}")
    # The model artifact is a directory (MLmodel, pkl, etc.). Pick the first directory;
    # fall back to the first entry if no directory is found.
    for artifact in artifacts:
        if artifact.is_dir:
            return artifact.path
    return artifacts[0].path

def _download_run_id_artifacts(self, run_id: str, artifacts_path: str, destination_path: str) -> str:
return self.mlflow_client.download_artifacts(run_id, artifacts_path, destination_path)
Expand Down
59 changes: 32 additions & 27 deletions frontend/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,27 +1,32 @@
FROM python:3.11-slim

WORKDIR /app

COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/

COPY ./pyproject.toml ./uv.lock ./

RUN mkdir -p ~/.streamlit
COPY ./frontend/config.toml ~/.streamlit/config.toml

RUN uv sync --frozen --no-install-project --group frontend

COPY ./frontend ./frontend


ENV PYTHONPATH=/app
ENV PATH="/app/.venv/bin:$PATH"

EXPOSE 8501

CMD ["streamlit", "run", "frontend/app.py", \
"--server.port=8501", \
"--server.address=0.0.0.0", \
"--server.enableCORS=false", \
"--server.enableXsrfProtection=false", \
"--server.enableWebsocketCompression=false"]
FROM nginx:1.27-alpine

# Copy static frontend files
COPY . /usr/share/nginx/html

# nginx config: serve index.html for all routes (SPA fallback)
RUN printf 'server {\n\
listen 80;\n\
root /usr/share/nginx/html;\n\
index index.html;\n\
location / {\n\
try_files $uri $uri/ /index.html;\n\
}\n\
location ~* \.(css|js|png|jpg|svg|ico|woff2?)$ {\n\
add_header Cache-Control "no-cache";\n\
}\n\
}\n' > /etc/nginx/conf.d/default.conf

# Entrypoint: inject env vars into config.js at runtime
RUN printf '#!/bin/sh\n\
cat > /usr/share/nginx/html/js/config.js <<EOF\n\
window.API_BASE_URL = "${API_BASE_URL:-http://backend.model-platform.svc.cluster.local:8000}";\n\
window.MP_HOST_NAME = "${MP_HOST_NAME:-model-platform.com}";\n\
window.MP_REGISTRY_PATH = "${MP_REGISTRY_PATH:-registry}";\n\
window.MLFLOW_S3_ENDPOINT_URL = "${MLFLOW_S3_ENDPOINT_URL:-}";\n\
EOF\n\
exec nginx -g "daemon off;"\n' > /docker-entrypoint-override.sh && \
chmod +x /docker-entrypoint-override.sh

EXPOSE 80

CMD ["/docker-entrypoint-override.sh"]
3 changes: 0 additions & 3 deletions frontend/__init__.py

This file was deleted.

Empty file.
Loading
Loading