# feat: add CI test for k8s core deployment (#1)
---
name: Kubernetes Integration Test

# This workflow tests the CORE semantic-router Kubernetes deployment.
#
# Test Scope:
# ✅ Core deployment (namespace, pvc, deployment, service, configmap)
# ✅ Manifest validation (kubeconform)
# ✅ Service connectivity (gRPC, metrics, API ports)
# ✅ Security scanning (Trivy, Checkov)
# ✅ Basic syntax validation for observability and ai-gateway configs
#
# Out of Scope (planned for follow-up PRs):
# 🔄 Observability stack deployment (Prometheus + Grafana)
# 🔄 AI Gateway end-to-end testing (Envoy Gateway + InferencePool)

on:
  pull_request:
    paths:
      - "deploy/kubernetes/**"
      - ".github/workflows/k8s-integration-test.yml"
      - "Dockerfile.extproc"
      - "tools/kind/**"
  workflow_dispatch: # Allow manual triggering
  schedule:
    # Run nightly at 3:00 AM UTC
    - cron: "0 3 * * *"

env:
  KIND_VERSION: v0.20.0
  KUBECTL_VERSION: v1.28.0
  KUSTOMIZE_VERSION: v5.2.1

jobs:
  # Static validation: kustomize build + kubeconform schema checks.
  validate-manifests:
    name: Validate Kubernetes Manifests
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Kustomize
        run: |
          curl -s "https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh" | bash
          sudo mv kustomize /usr/local/bin/
          kustomize version

      - name: Validate Kustomize build
        run: |
          echo "Building kustomization..."
          kustomize build deploy/kubernetes > /tmp/k8s-manifests.yaml
          echo "Kustomize build successful!"
          echo "Generated manifests:"
          cat /tmp/k8s-manifests.yaml

      - name: Setup kubeconform
        run: |
          wget https://github.com/yannh/kubeconform/releases/latest/download/kubeconform-linux-amd64.tar.gz
          tar xf kubeconform-linux-amd64.tar.gz
          sudo mv kubeconform /usr/local/bin/
          kubeconform -v

      - name: Validate manifests with kubeconform
        run: |
          echo "Validating Kubernetes manifests..."
          kustomize build deploy/kubernetes | \
            kubeconform -strict -summary \
              -kubernetes-version 1.28.0 \
              -schema-location default \
              -schema-location 'https://raw.githubusercontent.com/datreeio/CRDs-catalog/main/{{.Group}}/{{.ResourceKind}}_{{.ResourceAPIVersion}}.json' \
              -skip CustomResourceDefinition \
              -ignore-missing-schemas

      - name: Upload validated manifests
        uses: actions/upload-artifact@v4
        with:
          name: k8s-manifests
          path: /tmp/k8s-manifests.yaml
          retention-days: 5

  # End-to-end deployment test in an ephemeral kind cluster.
  kind-integration-test:
    name: kind Cluster Integration Test
    runs-on: ubuntu-latest
    needs: validate-manifests
    timeout-minutes: 30
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Build semantic-router image
        uses: docker/build-push-action@v5
        with:
          context: .
          file: ./Dockerfile.extproc
          tags: ghcr.io/vllm-project/semantic-router/extproc:test
          load: true
          cache-from: type=gha
          cache-to: type=gha,mode=max

      - name: Create kind cluster
        # NOTE(review): the action reference was mangled in the source
        # ("helm/[email protected]"); helm/kind-action is the action that takes the
        # version/config/cluster_name/wait inputs used here — confirm the
        # pinned version matches the repo's convention.
        uses: helm/kind-action@v1.8.0
        with:
          version: ${{ env.KIND_VERSION }}
          config: tools/kind/kind-config.yaml
          cluster_name: semantic-router-test
          wait: 120s

      - name: Verify cluster
        run: |
          kubectl cluster-info
          kubectl get nodes
          kubectl version

      - name: Load image to kind cluster
        run: |
          kind load docker-image ghcr.io/vllm-project/semantic-router/extproc:test --name semantic-router-test
          echo "Image loaded successfully!"

      - name: Setup Kustomize
        run: |
          curl -s "https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh" | bash
          sudo mv kustomize /usr/local/bin/

      - name: Create temporary kustomization for testing
        # The overlay references the base manifests via ../ paths and overrides
        # the image tag plus container resources so the pod fits a CI runner.
        # (The original also copied the base kustomization.yaml and ran
        # `kustomize edit set image` first, but the heredoc below overwrote
        # that file anyway, so those dead steps were removed.)
        run: |
          # Create a test overlay directory
          mkdir -p deploy/kubernetes/test-overlay
          cd deploy/kubernetes/test-overlay
          # Create resource list pointing to parent directory
          cat > kustomization.yaml << 'EOF'
          apiVersion: kustomize.config.k8s.io/v1beta1
          kind: Kustomization
          resources:
            - ../namespace.yaml
            - ../pvc.yaml
            - ../deployment.yaml
            - ../service.yaml
          configMapGenerator:
            - name: semantic-router-config
              files:
                - ../config.yaml
                - ../tools_db.json
          namespace: vllm-semantic-router-system
          images:
            - name: ghcr.io/vllm-project/semantic-router/extproc
              newTag: test
          # Reduce resource requirements for CI testing
          patches:
            - patch: |-
                - op: replace
                  path: /spec/template/spec/containers/0/resources/requests/memory
                  value: "1Gi"
                - op: replace
                  path: /spec/template/spec/containers/0/resources/requests/cpu
                  value: "500m"
                - op: replace
                  path: /spec/template/spec/containers/0/resources/limits/memory
                  value: "2Gi"
                - op: replace
                  path: /spec/template/spec/containers/0/resources/limits/cpu
                  value: "1"
                - op: replace
                  path: /spec/template/spec/initContainers/0/resources/requests/memory
                  value: "256Mi"
                - op: replace
                  path: /spec/template/spec/initContainers/0/resources/requests/cpu
                  value: "100m"
                - op: replace
                  path: /spec/template/spec/initContainers/0/resources/limits/memory
                  value: "512Mi"
                - op: replace
                  path: /spec/template/spec/initContainers/0/resources/limits/cpu
                  value: "250m"
              target:
                kind: Deployment
                name: semantic-router
          EOF
          cat kustomization.yaml

      - name: Deploy to kind cluster
        run: |
          echo "Deploying semantic-router to kind cluster..."
          kustomize build deploy/kubernetes/test-overlay | kubectl apply -f -
          echo "Waiting for namespace to be active..."
          kubectl wait --for=jsonpath='{.status.phase}'=Active namespace/vllm-semantic-router-system --timeout=60s
          echo "Deployment initiated. Checking resources..."
          kubectl get all -n vllm-semantic-router-system

      - name: Wait for deployment readiness
        run: |
          echo "Waiting for deployment to be ready (this may take a few minutes)..."
          # Wait for PVC to be bound
          echo "Checking PVC status..."
          kubectl get pvc -n vllm-semantic-router-system
          # Note: In kind, we might need to wait for local-path-provisioner
          timeout 300 bash -c 'until kubectl get pvc -n vllm-semantic-router-system semantic-router-models -o jsonpath="{.status.phase}" | grep -q "Bound"; do echo "Waiting for PVC to be bound..."; sleep 5; done' || true
          # Wait for pods to be created
          echo "Waiting for pods to be created..."
          timeout 120 bash -c 'until kubectl get pods -n vllm-semantic-router-system | grep -q semantic-router; do echo "Waiting for pod creation..."; sleep 5; done'
          # Show pod status
          kubectl get pods -n vllm-semantic-router-system
          # Wait for init container to complete (model download)
          echo "Waiting for init container to complete (downloading models)..."
          kubectl wait --for=condition=Initialized pods -l app=semantic-router -n vllm-semantic-router-system --timeout=600s || {
            echo "Init container did not complete in time. Showing logs..."
            kubectl logs -n vllm-semantic-router-system -l app=semantic-router -c model-downloader --tail=100 || true
            exit 1
          }
          # Wait for main container to be ready
          echo "Waiting for main container to be ready..."
          kubectl wait --for=condition=Ready pods -l app=semantic-router -n vllm-semantic-router-system --timeout=300s || {
            echo "Pod did not become ready in time. Showing status and logs..."
            kubectl describe pods -n vllm-semantic-router-system -l app=semantic-router
            kubectl logs -n vllm-semantic-router-system -l app=semantic-router --tail=100 || true
            exit 1
          }
          echo "Deployment is ready!"

      - name: Verify deployment
        run: |
          echo "=== Verifying Deployment ==="
          # Check deployment status
          kubectl get deployment -n vllm-semantic-router-system semantic-router -o wide
          # Check pod status
          kubectl get pods -n vllm-semantic-router-system -o wide
          # Check services
          kubectl get svc -n vllm-semantic-router-system
          # Check configmaps
          kubectl get configmap -n vllm-semantic-router-system
          # Verify pod is running
          POD_STATUS=$(kubectl get pods -n vllm-semantic-router-system -l app=semantic-router -o jsonpath='{.items[0].status.phase}')
          if [ "$POD_STATUS" != "Running" ]; then
            echo "Error: Pod is not running. Status: $POD_STATUS"
            kubectl describe pods -n vllm-semantic-router-system -l app=semantic-router
            exit 1
          fi
          echo "✓ Pod is running"
          # Verify all containers are ready
          READY_CONTAINERS=$(kubectl get pods -n vllm-semantic-router-system -l app=semantic-router -o jsonpath='{.items[0].status.containerStatuses[0].ready}')
          if [ "$READY_CONTAINERS" != "true" ]; then
            echo "Error: Container is not ready"
            kubectl describe pods -n vllm-semantic-router-system -l app=semantic-router
            exit 1
          fi
          echo "✓ All containers are ready"

      - name: Test service connectivity
        run: |
          echo "=== Testing Service Connectivity ==="
          # Get pod name
          POD_NAME=$(kubectl get pods -n vllm-semantic-router-system -l app=semantic-router -o jsonpath='{.items[0].metadata.name}')
          echo "Pod name: $POD_NAME"
          # Test gRPC port
          echo "Testing gRPC port (50051)..."
          kubectl exec -n vllm-semantic-router-system $POD_NAME -- timeout 5 nc -zv localhost 50051 || {
            echo "Warning: gRPC port test failed"
          }
          # Test metrics port
          echo "Testing metrics port (9190)..."
          kubectl exec -n vllm-semantic-router-system $POD_NAME -- timeout 5 nc -zv localhost 9190 || {
            echo "Warning: Metrics port test failed"
          }
          # Test classify API port
          echo "Testing classify API port (8080)..."
          kubectl exec -n vllm-semantic-router-system $POD_NAME -- timeout 5 nc -zv localhost 8080 || {
            echo "Warning: Classify API port test failed"
          }
          # Port forward for external testing
          echo "Setting up port-forward for testing..."
          kubectl port-forward -n vllm-semantic-router-system svc/semantic-router 8080:8080 &
          PF_PID=$!
          sleep 5
          # Test HTTP endpoint (if available)
          echo "Testing HTTP endpoint..."
          curl -v http://localhost:8080/health || echo "Health endpoint not available or not implemented"
          # Cleanup port-forward
          kill $PF_PID || true
          echo "✓ Service connectivity tests completed"

      - name: Check logs
        if: always()
        run: |
          echo "=== Deployment Logs ==="
          kubectl logs -n vllm-semantic-router-system -l app=semantic-router --tail=200 --all-containers=true || true
          echo "=== Events ==="
          kubectl get events -n vllm-semantic-router-system --sort-by='.lastTimestamp' || true

      - name: Export cluster logs on failure
        if: failure()
        run: |
          echo "=== Exporting cluster information for debugging ==="
          mkdir -p /tmp/k8s-logs
          # Export pod descriptions
          kubectl describe pods -n vllm-semantic-router-system > /tmp/k8s-logs/pod-descriptions.txt || true
          # Export deployment description
          kubectl describe deployment -n vllm-semantic-router-system > /tmp/k8s-logs/deployment-description.txt || true
          # Export all logs
          kubectl logs -n vllm-semantic-router-system -l app=semantic-router --all-containers=true --previous > /tmp/k8s-logs/previous-logs.txt || true
          kubectl logs -n vllm-semantic-router-system -l app=semantic-router --all-containers=true > /tmp/k8s-logs/current-logs.txt || true
          # Export events
          kubectl get events -n vllm-semantic-router-system --sort-by='.lastTimestamp' > /tmp/k8s-logs/events.txt || true
          # Export resource status
          kubectl get all -n vllm-semantic-router-system -o yaml > /tmp/k8s-logs/all-resources.yaml || true

      - name: Upload cluster logs
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: k8s-cluster-logs
          path: /tmp/k8s-logs/
          retention-days: 7

      - name: Cleanup
        if: always()
        run: |
          echo "Cleaning up resources..."
          kubectl delete namespace vllm-semantic-router-system --timeout=60s || true

  # Build-only checks: resource presence, ConfigMap generation, and syntax
  # checks for the observability / ai-gateway kustomizations.
  test-with-custom-config:
    name: Test with Custom Configuration
    runs-on: ubuntu-latest
    needs: validate-manifests
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Kustomize
        run: |
          curl -s "https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh" | bash
          sudo mv kustomize /usr/local/bin/

      - name: Test kustomize with different overlays
        run: |
          echo "Testing base kustomization..."
          kustomize build deploy/kubernetes > /tmp/base-manifests.yaml
          echo "Validating generated resources..."
          # Check if all expected resources are present
          if ! grep -q "kind: Namespace" /tmp/base-manifests.yaml; then
            echo "Error: Namespace not found"
            exit 1
          fi
          if ! grep -q "kind: Deployment" /tmp/base-manifests.yaml; then
            echo "Error: Deployment not found"
            exit 1
          fi
          if ! grep -q "kind: Service" /tmp/base-manifests.yaml; then
            echo "Error: Service not found"
            exit 1
          fi
          if ! grep -q "kind: ConfigMap" /tmp/base-manifests.yaml; then
            echo "Error: ConfigMap not found"
            exit 1
          fi
          echo "✓ All expected resources are present"

      - name: Verify ConfigMap generation
        run: |
          echo "Checking ConfigMap generation..."
          kustomize build deploy/kubernetes | grep -A 20 "kind: ConfigMap"
          # Verify config files are included
          if ! kustomize build deploy/kubernetes | grep -q "config.yaml"; then
            echo "Warning: config.yaml might not be properly included in ConfigMap"
          fi
          if ! kustomize build deploy/kubernetes | grep -q "tools_db.json"; then
            echo "Warning: tools_db.json might not be properly included in ConfigMap"
          fi

      - name: Validate observability kustomization
        run: |
          echo "Validating observability stack kustomization..."
          if [ -d "deploy/kubernetes/observability" ]; then
            kustomize build deploy/kubernetes/observability > /tmp/observability-manifests.yaml
            echo "✓ Observability kustomization is valid"
            # Verify expected resources
            for resource in "Deployment" "Service" "ConfigMap" "PersistentVolumeClaim"; do
              if ! grep -q "kind: $resource" /tmp/observability-manifests.yaml; then
                echo "Warning: $resource not found in observability manifests"
              fi
            done
          else
            echo "Observability directory not found, skipping..."
          fi

      - name: Validate AI Gateway configurations
        run: |
          echo "Validating AI Gateway configurations..."
          # Check if ai-gateway directory exists
          if [ -d "deploy/kubernetes/ai-gateway" ]; then
            # Validate configuration yamls (without CRDs)
            for yaml_file in deploy/kubernetes/ai-gateway/configuration/*.yaml; do
              if [ -f "$yaml_file" ]; then
                echo "Checking $yaml_file..."
                # Basic YAML syntax check
                kubectl create --dry-run=client -f "$yaml_file" || echo "Warning: Issues with $yaml_file"
              fi
            done
            # Validate inference-pool manifests (skip CRD validation as they may not be installed)
            for yaml_file in deploy/kubernetes/ai-gateway/inference-pool/*.yaml; do
              if [ -f "$yaml_file" ]; then
                echo "Checking $yaml_file for YAML syntax..."
                # Just check if it's valid YAML
                kubectl create --dry-run=client -f "$yaml_file" 2>&1 | grep -q "no matches for kind" && echo "✓ $yaml_file syntax valid (CRD not installed)" || echo "Validated $yaml_file"
              fi
            done
            echo "✓ AI Gateway configuration validation completed"
          else
            echo "AI Gateway directory not found, skipping..."
          fi

  # Report-only security scanning (Trivy + Checkov); does not gate the build.
  security-scan:
    name: Security Scan for K8s Manifests
    runs-on: ubuntu-latest
    needs: validate-manifests
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Kustomize
        run: |
          curl -s "https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh" | bash
          sudo mv kustomize /usr/local/bin/

      - name: Run Trivy security scan
        uses: aquasecurity/trivy-action@master
        with:
          scan-type: "config"
          scan-ref: "deploy/kubernetes"
          format: "sarif"
          output: "trivy-results.sarif"
          severity: "CRITICAL,HIGH"
          exit-code: "0" # Don't fail on vulnerabilities, just report
      - name: Upload Trivy results to GitHub Security
        uses: github/codeql-action/upload-sarif@v3
        if: always()
        with:
          sarif_file: "trivy-results.sarif"

      - name: Run Checkov scan
        uses: bridgecrewio/checkov-action@master
        with:
          directory: deploy/kubernetes
          framework: kubernetes
          output_format: cli
          soft_fail: true # Don't fail the build

  # Aggregates job results; runs even when upstream jobs fail.
  summary:
    name: Test Summary
    runs-on: ubuntu-latest
    needs:
      - validate-manifests
      - kind-integration-test
      - test-with-custom-config
      - security-scan
    if: always()
    steps:
      - name: Check test results
        run: |
          echo "=== Kubernetes Integration Test Summary ==="
          echo "Manifest Validation: ${{ needs.validate-manifests.result }}"
          echo "kind Integration Test: ${{ needs.kind-integration-test.result }}"
          echo "Custom Config Test: ${{ needs.test-with-custom-config.result }}"
          echo "Security Scan: ${{ needs.security-scan.result }}"
          # security-scan is reported above but intentionally excluded from the
          # gate: its scanners run with soft_fail / exit-code 0.
          if [[ "${{ needs.validate-manifests.result }}" == "failure" ]] || \
             [[ "${{ needs.kind-integration-test.result }}" == "failure" ]] || \
             [[ "${{ needs.test-with-custom-config.result }}" == "failure" ]]; then
            echo "❌ Some tests failed"
            exit 1
          else
            echo "✅ All tests passed"
          fi