diff --git a/kubernetes/00-namespace.yaml b/kubernetes/00-namespace.yaml new file mode 100644 index 0000000..622e251 --- /dev/null +++ b/kubernetes/00-namespace.yaml @@ -0,0 +1,14 @@ +--- +# Namespace for ICRN Kernel Manager +apiVersion: v1 +kind: Namespace +metadata: + name: kernels + labels: + app: icrn-kernel-manager + name: kernels + annotations: + # Pod Security Standards for hardened cluster + pod-security.kubernetes.io/enforce: restricted + pod-security.kubernetes.io/audit: restricted + pod-security.kubernetes.io/warn: restricted diff --git a/kubernetes/01-pv-pvc.yaml b/kubernetes/01-pv-pvc.yaml new file mode 100644 index 0000000..55fa7dc --- /dev/null +++ b/kubernetes/01-pv-pvc.yaml @@ -0,0 +1,40 @@ +--- +# PersistentVolume for NFS mount to kernel repository +apiVersion: v1 +kind: PersistentVolume +metadata: + name: icrn-kernels-pv + labels: + app: icrn-kernel-manager +spec: + storageClassName: nfs-static + capacity: + storage: 500Gi # Adjust based on your kernel repository size + accessModes: + - ReadWriteMany # ReadWriteMany allows multiple pods with read and write access + nfs: + server: harbor-cc.internal.ncsa.edu + path: /harbor/illinois/iccp/sw/icrn/dev/kernels + readOnly: false + persistentVolumeReclaimPolicy: Retain + +--- +# PersistentVolumeClaim for the kernels data +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: icrn-kernels-pvc + namespace: kernels # Change to your desired namespace + labels: + app: icrn-kernel-manager +spec: + storageClassName: nfs-static + accessModes: + - ReadWriteMany + resources: + requests: + storage: 500Gi + selector: + matchLabels: + app: icrn-kernel-manager + volumeName: icrn-kernels-pv diff --git a/kubernetes/02-web-deployment.yaml b/kubernetes/02-web-deployment.yaml new file mode 100644 index 0000000..2a0a4dc --- /dev/null +++ b/kubernetes/02-web-deployment.yaml @@ -0,0 +1,122 @@ +--- +# Deployment for ICRN Web Interface +apiVersion: apps/v1 +kind: Deployment +metadata: + name: icrn-web + namespace: 
kernels # Change to your desired namespace + labels: + app: icrn-web + component: web-interface +spec: + replicas: 1 # Adjust based on your needs + selector: + matchLabels: + app: icrn-web + template: + metadata: + labels: + app: icrn-web + component: web-interface + spec: + securityContext: + seccompProfile: + type: RuntimeDefault + containers: + - name: web + image: hdpriest0uiuc/icrn-kernel-webserver:latest # Update with your registry/tag + imagePullPolicy: Always + ports: + - containerPort: 8000 + name: http + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + capabilities: + drop: + - ALL + env: + - name: COLLATED_MANIFESTS_PATH + value: "/app/data/collated_manifests.json" + - name: PACKAGE_INDEX_PATH + value: "/app/data/package_index.json" + - name: WORKERS + value: "4" + volumeMounts: + - name: kernels-data + mountPath: /app/data + readOnly: true + livenessProbe: + httpGet: + path: / + port: 8000 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + readinessProbe: + httpGet: + path: / + port: 8000 + initialDelaySeconds: 20 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + resources: + requests: + memory: "256Mi" + cpu: "250m" + limits: + memory: "512Mi" + cpu: "500m" + volumes: + - name: kernels-data + persistentVolumeClaim: + claimName: icrn-kernels-pvc + +--- +# Service for ICRN Web Interface +apiVersion: v1 +kind: Service +metadata: + name: icrn-web-service + namespace: kernels # Change to your desired namespace + labels: + app: icrn-web +spec: + type: ClusterIP # Change to LoadBalancer if needed + selector: + app: icrn-web + ports: + - protocol: TCP + port: 80 + targetPort: 8000 + name: http + +--- +# Ingress for ICRN Web Interface +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: icrn-web-ingress + namespace: kernels # Change to your desired namespace + labels: + app: icrn-web +spec: + ingressClassName: traefik # Adjust based on your ingress 
controller + tls: + - hosts: + - kernels.cori-dev.ncsa.illinois.edu + secretName: icrn-web-tls + rules: + - host: kernels.cori-dev.ncsa.illinois.edu + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: icrn-web-service + port: + number: 80 diff --git a/kubernetes/03-cronjob-indexer.yaml b/kubernetes/03-cronjob-indexer.yaml new file mode 100644 index 0000000..5089d58 --- /dev/null +++ b/kubernetes/03-cronjob-indexer.yaml @@ -0,0 +1,127 @@ +--- +# CronJob to run kernel indexer every hour +apiVersion: batch/v1 +kind: CronJob +metadata: + name: icrn-kernel-indexer + namespace: kernels # Change to your desired namespace + labels: + app: icrn-kernel-manager + component: kernel-indexer +spec: + # Run every hour at minute 0 + schedule: "0 * * * *" + + # Keep last 3 successful and 5 failed jobs for debugging + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 5 + + # Deadline (in seconds) for starting a job that missed its scheduled time + startingDeadlineSeconds: 300 + + jobTemplate: + spec: + # Retry a failed job up to 2 times before marking the Job as failed + backoffLimit: 2 + # Keep finished jobs and their pods for 7 days for debugging + ttlSecondsAfterFinished: 604800 + template: + metadata: + labels: + app: icrn-kernel-manager + component: kernel-indexer + spec: + serviceAccountName: icrn-indexer # See RBAC below + securityContext: + seccompProfile: + type: RuntimeDefault + fsGroup: 55311 + supplementalGroups: + - 55311 + + containers: + - name: kernel-indexer + image: hdpriest0uiuc/icrn-kernel-indexer:latest # Update with your registry/tag + imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + capabilities: + drop: + - ALL + + env: + - name: KERNEL_ROOT + value: "/app/data" # Path where kernels are stored in the NFS mount + + volumeMounts: + - name: kernels-data + mountPath: /app/data + readOnly: false # Needs write access to update index files + + resources: + requests: + memory: "512Mi" + cpu: "500m" + limits: + memory: "1Gi" + cpu: "1000m" 
+ + # Liveness check: fails when the /tmp/indexer.running marker file is missing (first check after 60s, then every 5 minutes) + livenessProbe: + exec: + command: + - /bin/sh + - -c + - test -f /tmp/indexer.running || exit 1 + initialDelaySeconds: 60 + periodSeconds: 300 + + volumes: + - name: kernels-data + persistentVolumeClaim: + claimName: icrn-kernels-pvc + + restartPolicy: Never + +--- +# ServiceAccount for the kernel indexer CronJob +apiVersion: v1 +kind: ServiceAccount +metadata: + name: icrn-indexer + namespace: kernels # Change to your desired namespace + labels: + app: icrn-kernel-manager + +--- +# ClusterRole for kernel indexer (minimal permissions) +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: icrn-indexer-role + labels: + app: icrn-kernel-manager +rules: + # Minimal permissions - adjust as needed + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list"] + +--- +# ClusterRoleBinding for kernel indexer +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: icrn-indexer-binding + labels: + app: icrn-kernel-manager +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: icrn-indexer-role +subjects: + - kind: ServiceAccount + name: icrn-indexer + namespace: kernels # Change to your desired namespace diff --git a/kubernetes/README.md b/kubernetes/README.md new file mode 100644 index 0000000..768c5d5 --- /dev/null +++ b/kubernetes/README.md @@ -0,0 +1,264 @@ +# ICRN Kernel Manager - Kubernetes Deployment + +This directory contains Kubernetes manifests for deploying the ICRN Kernel Manager components. + +## Overview + +The deployment consists of three main components: + +1. **PersistentVolume (PV) & PersistentVolumeClaim (PVC)**: NFS mount to the kernel repository +2. **Web Deployment**: FastAPI web interface (served directly by uvicorn), Service, and Ingress +3. 
**CronJob**: Kernel indexer that runs every hour to generate/update index files + +## Files + +- `01-pv-pvc.yaml` - PersistentVolume and PersistentVolumeClaim for NFS mount +- `02-web-deployment.yaml` - Deployment, Service, and Ingress for web interface +- `03-cronjob-indexer.yaml` - CronJob and RBAC for kernel indexer + +## Prerequisites + +- Kubernetes 1.21+ cluster (the CronJob uses the `batch/v1` API) +- Access to `harbor-cc.internal.ncsa.edu` NFS server from cluster nodes +- Traefik Ingress Controller (the Ingress uses `ingressClassName: traefik`; adjust for your controller) +- Cert-Manager (optional, for HTTPS/TLS) +- kubectl CLI configured to access your cluster + +## Configuration + +Before deploying, update the following: + +### 1. Namespace (All Files) +The manifests use the `kernels` namespace (defined in `00-namespace.yaml`). To use a different one, create it and replace `kernels` in every file: +```bash +kubectl create namespace icrn +# Then update all YAML files: namespace: kernels → namespace: icrn +``` + +### 2. Domain Name (02-web-deployment.yaml) +Update the Ingress host (under both `tls.hosts` and `rules`): +```yaml +- host: icrn-kernels.example.com # Change to your domain +``` + +### 3. Image Registry (All Files) +Update Docker image references if using a private registry: +```yaml +image: hdpriest0uiuc/icrn-kernel-webserver:latest # Update as needed +image: hdpriest0uiuc/icrn-kernel-indexer:latest # Update as needed +``` + +### 4. Kernel Path (03-cronjob-indexer.yaml) +Verify the kernel root path matches the container mount path of the NFS volume: +```yaml +- name: KERNEL_ROOT + value: "/app/data" # Adjust if needed +``` + +### 5. Storage Size (01-pv-pvc.yaml) +Update if your kernel repository is larger than 500Gi: +```yaml +capacity: + storage: 500Gi # Change as needed +``` + +## Deployment + +### 1. Apply Namespace, PV, and PVC +```bash +kubectl apply -f 00-namespace.yaml -f 01-pv-pvc.yaml +``` + +Verify: +```bash +kubectl get pv,pvc +``` + +### 2. Apply Web Deployment +```bash +kubectl apply -f 02-web-deployment.yaml +``` + +Verify: +```bash +kubectl get deployment,service,ingress -l app=icrn-web +kubectl get pods -l app=icrn-web +``` + +### 3. 
Apply CronJob +```bash +kubectl apply -f 03-cronjob-indexer.yaml +``` + +Verify: +```bash +kubectl get cronjob,serviceaccount +``` + +### Deploy All at Once +```bash +kubectl apply -f . +``` + +## Verification + +### Check Pod Status +```bash +kubectl get pods -l app=icrn-web +kubectl logs -l app=icrn-web +``` + +### Check CronJob Status +```bash +kubectl get cronjob icrn-kernel-indexer +kubectl get jobs -l app=icrn-kernel-manager +kubectl logs -l component=kernel-indexer +``` + +### Check PVC Mount +```bash +kubectl exec -it deploy/icrn-web -- ls /app/data/ +kubectl exec -it deploy/icrn-web -- cat /app/data/collated_manifests.json | head +``` + +### Access the Web Interface +```bash +# Port forward to test +kubectl port-forward svc/icrn-web-service 8080:80 + +# Then visit http://localhost:8080 +``` + +Or if Ingress is configured: +``` +https://icrn-kernels.example.com +``` + +## Troubleshooting + +### PVC Not Binding +```bash +kubectl describe pvc icrn-kernels-pvc +kubectl describe pv icrn-kernels-pv +``` + +Check that: +- NFS server is accessible from cluster nodes +- NFS path permissions allow read access (and write access for the indexer) + +### CronJob Not Running +```bash +kubectl get cronjob icrn-kernel-indexer +kubectl describe cronjob icrn-kernel-indexer +``` + +Check that: +- ServiceAccount exists: `kubectl get sa icrn-indexer` +- RBAC bindings exist: `kubectl get clusterrolebinding | grep icrn` + +### Web Pod Crashing +```bash +kubectl logs deploy/icrn-web +``` + +Check: +- NFS is mounted and readable +- JSON files exist: `/app/data/collated_manifests.json`, `/app/data/package_index.json` +- Resource limits aren't too restrictive + +### Missing Index Files +The CronJob generates these files: +- `/app/data/collated_manifests.json` +- `/app/data/package_index.json` + +If missing after first run, check: +- CronJob Job output: `kubectl logs -l component=kernel-indexer` +- NFS mount permissions on source +- Kernel directory structure at `/harbor/illinois/iccp/sw/icrn/dev/kernels` + +## CronJob Schedule + +The kernel indexer runs on this schedule: +``` +0 * * * * (every hour at minute 0) 
+``` + +To change the schedule, edit `spec.schedule` in `03-cronjob-indexer.yaml`: +```yaml +schedule: "0 */6 * * *" # Every 6 hours +schedule: "0 0 * * *" # Daily at midnight +``` + +See [crontab.guru](https://crontab.guru) for schedule syntax. + +## Resource Requirements + +### Web Deployment +- **Requests**: 256Mi memory, 250m CPU +- **Limits**: 512Mi memory, 500m CPU + +### Kernel Indexer CronJob +- **Requests**: 512Mi memory, 500m CPU +- **Limits**: 1Gi memory, 1000m CPU + +Adjust based on your kernel repository size and cluster capacity. + +## Scaling + +### Scale Web Deployment +```bash +kubectl scale deployment icrn-web --replicas=3 +``` + +Or edit the deployment: +```bash +kubectl edit deployment icrn-web +# Change spec.replicas +``` + +## Cleanup + +To remove all components: +```bash +kubectl delete -f 03-cronjob-indexer.yaml +kubectl delete -f 02-web-deployment.yaml +kubectl delete -f 01-pv-pvc.yaml +``` + +## SSL/TLS Configuration + +If using cert-manager for automatic certificate generation: + +1. Install cert-manager: +```bash +kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.0/cert-manager.yaml +``` + +2. Create a ClusterIssuer: +```yaml +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-prod +spec: + acme: + server: https://acme-v02.api.letsencrypt.org/directory + email: your-email@example.com + privateKeySecretRef: + name: letsencrypt-prod + solvers: + - http01: + ingress: + class: traefik +``` + +3. Annotate the Ingress with `cert-manager.io/cluster-issuer: letsencrypt-prod`; cert-manager will then request and manage the certificate stored in the `icrn-web-tls` secret. 
+ +## References + +- [Kubernetes PersistentVolumes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) +- [Kubernetes Deployments](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/) +- [Kubernetes Services](https://kubernetes.io/docs/concepts/services-networking/service/) +- [Kubernetes Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/) +- [Kubernetes CronJobs](https://kubernetes.io/docs/concepts/workloads/controllers/cron-jobs/) +- [NFS Volumes](https://kubernetes.io/docs/concepts/storage/volumes/#nfs) diff --git a/kubernetes/kick-cronjob.sh b/kubernetes/kick-cronjob.sh new file mode 100755 index 0000000..69d31ad --- /dev/null +++ b/kubernetes/kick-cronjob.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# Manually trigger the ICRN kernel indexer cronjob +# Usage: ./kick-cronjob.sh + +NAMESPACE="kernels" +CRONJOB_NAME="icrn-kernel-indexer" +JOB_NAME="${CRONJOB_NAME}-manual-$(date +%s)" + +echo "Deleting old jobs from cronjob '${CRONJOB_NAME}'..." + +# Delete all old jobs (both from cronjob and manual runs) +kubectl get jobs -n "${NAMESPACE}" -o name | grep "${CRONJOB_NAME}" | xargs -r kubectl delete -n "${NAMESPACE}" --wait=true + +echo "Triggering cronjob '${CRONJOB_NAME}' in namespace '${NAMESPACE}'..." + +# Create a job from the cronjob template +kubectl create job "${JOB_NAME}" \ + --from=cronjob/"${CRONJOB_NAME}" \ + -n "${NAMESPACE}" + +if [ $? 
-eq 0 ]; then + echo "✓ Job '${JOB_NAME}' created successfully" + echo "" + echo "Monitor the job with:" + echo " kubectl logs -f job/${JOB_NAME} -n ${NAMESPACE}" + echo "" + echo "View job status:" + echo " kubectl get job ${JOB_NAME} -n ${NAMESPACE}" +else + echo "✗ Failed to create job" + exit 1 +fi diff --git a/web/Dockerfile b/web/Dockerfile index 9e687d1..efb99c9 100644 --- a/web/Dockerfile +++ b/web/Dockerfile @@ -1,10 +1,5 @@ FROM python:3.11-slim -# Install nginx -RUN apt-get update && \ - apt-get install -y nginx && \ - rm -rf /var/lib/apt/lists/* - # Set working directory WORKDIR /app @@ -14,7 +9,6 @@ RUN pip install --no-cache-dir -r requirements.txt # Copy application files COPY kernel_service.py . -COPY nginx.conf /etc/nginx/nginx.conf COPY start.sh /app/start.sh # Copy static files @@ -25,8 +19,8 @@ RUN mkdir -p /app/data && \ sed -i 's/\r$//' /app/start.sh && \ chmod +x /app/start.sh -# Expose port 80 (nginx) -EXPOSE 80 +# Expose port 8000 (uvicorn) +EXPOSE 8000 # Use the startup script CMD ["/bin/bash", "/app/start.sh"] diff --git a/web/kernel_service.py b/web/kernel_service.py index 34431ec..b2cbef0 100644 --- a/web/kernel_service.py +++ b/web/kernel_service.py @@ -12,7 +12,8 @@ from datetime import datetime from typing import Optional, Dict, Any, List from fastapi import FastAPI, HTTPException -from fastapi.responses import JSONResponse +from fastapi.responses import JSONResponse, FileResponse +from fastapi.staticfiles import StaticFiles import uvicorn app = FastAPI(title="ICRN Kernel Manager API", version="1.0.0") @@ -21,9 +22,14 @@ COLLATED_MANIFESTS_PATH = os.getenv("COLLATED_MANIFESTS_PATH", "/app/data/collated_manifests.json") PACKAGE_INDEX_PATH = os.getenv("PACKAGE_INDEX_PATH", "/app/data/package_index.json") KERNEL_ROOT = os.getenv("KERNEL_ROOT", "/app/data") +STATIC_DIR = Path("/app/static") DATA_DIR = Path(COLLATED_MANIFESTS_PATH).parent DATA_DIR.mkdir(parents=True, exist_ok=True) +# Mount static files directory +if STATIC_DIR.exists(): 
+ app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static") + # Global variable to store loaded data collated_manifests: Optional[Dict[str, Any]] = None package_index: Optional[Dict[str, Any]] = None @@ -128,16 +134,20 @@ async def startup_event(): @app.get("/") async def root(): """ - Root endpoint - returns API information. - Note: This service only reads pre-generated JSON files and does not perform indexing. + Root endpoint - serves the web interface index.html. """ - return { - "service": "ICRN Kernel Manager API", - "version": "1.0.0", - "status": "running", - "note": "This service reads pre-generated files only - no indexing performed", - "last_refresh": last_refresh_time.isoformat() if last_refresh_time else None - } + index_path = STATIC_DIR / "index.html" + if index_path.exists(): + return FileResponse(str(index_path)) + else: + # Fallback to API info if index.html is missing + return { + "service": "ICRN Kernel Manager API", + "version": "1.0.0", + "status": "running", + "note": "This service reads pre-generated files only - no indexing performed", + "last_refresh": last_refresh_time.isoformat() if last_refresh_time else None + } @app.get("/health") diff --git a/web/start.sh b/web/start.sh index 82d5d44..6063e40 100644 --- a/web/start.sh +++ b/web/start.sh @@ -1,34 +1,14 @@ #!/bin/bash set -e -echo "Starting nginx and API server..." +echo "Starting FastAPI application with uvicorn..." # Create data directory if it doesn't exist mkdir -p /app/data -# Start nginx in the background -echo "Starting nginx..." -nginx -g "daemon off;" & -NGINX_PID=$! +# Get number of workers from environment variable (default: 4) +WORKERS=${WORKERS:-4} -# Wait a moment for nginx to start -sleep 2 - -# Start the FastAPI application -echo "Starting FastAPI application..." -python /app/kernel_service.py & -API_PID=$! - -# Function to handle shutdown -cleanup() { - echo "Shutting down..." 
- kill $NGINX_PID 2>/dev/null || true - kill $API_PID 2>/dev/null || true - exit 0 -} - -trap cleanup SIGTERM SIGINT - -# Wait for both processes -wait $NGINX_PID $API_PID +echo "Starting uvicorn with $WORKERS workers on port 8000..." +uvicorn kernel_service:app --host 0.0.0.0 --port 8000 --workers $WORKERS