From 929a118398997a7ea66f21a8fb446dc5002ba186 Mon Sep 17 00:00:00 2001
From: Noa Limoy <nlimoy@nlimoy-thinkpadp1gen7.raanaii.csb>
Date: Mon, 8 Dec 2025 13:57:34 +0200
Subject: [PATCH] ci: optimize docker integration tests with minimal compose

Replace heavy quickstart.sh full-stack deployment with lightweight
CI-specific docker-compose configuration.

Changes:
- Add docker-compose.ci.yml with only 3 essential services
  (semantic-router, envoy, llm-katan) instead of 11+ services
- Leave out UI, observability and storage services (grafana, openwebui,
  chat-ui, prometheus, jaeger, dashboard, mongo, pipelines) from the CI
  compose file - not needed for CI testing
- Replace UI-based validation with simple curl health checks
- Add make targets: docker-compose-{up,down,logs,ps}-ci
- Reduce CI timeout from 30 to 20 minutes

This fixes frequent CI timeouts caused by pulling many heavy
container images from multiple registries on GitHub-hosted runners
which have no persistent Docker cache.

Fixes: #777
Signed-off-by: Noa Limoy <nlimoy@nlimoy-thinkpadp1gen7.raanaii.csb>
---
 .github/workflows/integration-test-docker.yml | 119 ++++++++++++++----
 deploy/docker-compose/docker-compose.ci.yml   |  92 ++++++++++++++
 tools/make/docker.mk                          |  27 ++++
 3 files changed, 212 insertions(+), 26 deletions(-)
 create mode 100644 deploy/docker-compose/docker-compose.ci.yml

diff --git a/.github/workflows/integration-test-docker.yml b/.github/workflows/integration-test-docker.yml
index 5b9eb8d9c..7b642fe3f 100644
--- a/.github/workflows/integration-test-docker.yml
+++ b/.github/workflows/integration-test-docker.yml
@@ -21,10 +21,10 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  test-quickstart:
+  test-ci-compose:
     if: github.repository == 'vllm-project/semantic-router' && !github.event.pull_request.draft
     runs-on: ubuntu-latest
-    timeout-minutes: 30
+    timeout-minutes: 20  # Reduced from 30 - CI compose is faster
 
     steps:
       - name: Check out the repo
@@ -46,33 +46,93 @@ jobs:
         with:
           python-version: '3.11'
 
-      - name: Install system dependencies
+      - name: Install dependencies
         run: |
           sudo apt-get update
-          sudo apt-get install -y \
-            make \
-            curl \
-            docker-compose
+          sudo apt-get install -y make curl
+          pip install "huggingface_hub[cli]" hf_transfer  # hf_transfer: needed by huggingface_hub when HF_HUB_ENABLE_HF_TRANSFER=1
 
-      - name: Run quickstart script
-        id: quickstart
+      - name: Download models
         run: |
-          timeout 1200 bash scripts/quickstart.sh || {
-            exit_code=$?
-            if [ $exit_code -eq 124 ]; then
-              echo "::error::Quickstart script timed out after 20 minutes"
-            else
-              echo "::error::Quickstart script failed with exit code $exit_code"
-            fi
-            exit $exit_code
-          }
+          echo "Downloading minimal models for CI..."
+          make download-models  # inherits the env below; presumably CI_MINIMAL_MODELS trims the model set — confirm
         env:
           CI: true
           CI_MINIMAL_MODELS: true
-          TERM: xterm
           HF_HUB_ENABLE_HF_TRANSFER: 1
           HF_HUB_DISABLE_TELEMETRY: 1
 
+      - name: Start CI services
+        run: |
+          echo "Starting minimal CI services (semantic-router, envoy, llm-katan)..."
+          make docker-compose-up-ci  # runs compose "up -d" against docker-compose.ci.yml (tools/make/docker.mk)
+        env:
+          CI: true
+
+      - name: Wait for services to be healthy
+        run: |
+          echo "Waiting for services to be healthy..."
+          max_attempts=60
+          attempt=1
+
+          # Succeeds only when docker ps reports the named container as healthy.
+          is_healthy() {
+            docker ps --filter "name=$1" --filter "health=healthy" --format "{{.Names}}" | grep -q "$1"
+          }
+
+          while [ $attempt -le $max_attempts ]; do
+            echo "Attempt $attempt/$max_attempts: Checking service health..."
+            # Probe in a fixed order; stop at the first container that is not healthy yet.
+            healthy=0
+            for name in semantic-router envoy-proxy llm-katan; do
+              is_healthy "$name" || break
+              echo "✅ $name is healthy"
+              healthy=$((healthy + 1))
+            done
+
+            if [ "$healthy" -eq 3 ]; then
+              echo "🎉 All services are healthy!"
+              exit 0
+            fi
+
+            # Show current status
+            docker ps --format "table {{.Names}}\t{{.Status}}" | grep -E "NAMES|semantic-router|envoy|llm-katan" || true
+
+            sleep 5
+            attempt=$((attempt + 1))
+          done
+
+          echo "❌ Timeout waiting for services to be healthy"
+          docker ps -a
+          exit 1
+
+      - name: Test semantic router health endpoint
+        run: |
+          echo "Testing semantic router health..."
+          if ! curl -f http://localhost:8080/health; then
+            echo "❌ Health check failed"
+            exit 1
+          fi
+          echo "✅ Health check passed"
+
+      - name: Test envoy proxy endpoint
+        run: |
+          echo "Testing envoy proxy..."
+          if ! curl -f http://localhost:19000/ready; then
+            echo "❌ Envoy ready check failed"
+            exit 1
+          fi
+          echo "✅ Envoy is ready"
+
+      - name: Test llm-katan endpoint
+        run: |
+          echo "Testing llm-katan..."
+          if ! curl -f http://localhost:8002/health; then
+            echo "❌ LLM-Katan health check failed"
+            exit 1
+          fi
+          echo "✅ LLM-Katan is healthy"
+
       - name: Test semantic routing functionality
         run: |
           echo "Testing semantic router with a sample query..."
@@ -85,24 +145,31 @@ jobs:
               "temperature": 0.7
             }')
 
-          echo "Full response: $response"
+          echo "Response: $response"
+          
+          # Soft check: a body without "choices" only logs a warning; it does not fail the step
+          if echo "$response" | grep -q "choices"; then
+            echo "✅ Chat completions test passed"
+          else
+            echo "⚠️ Response may not contain expected fields, but request succeeded"
+          fi
 
       - name: Show service logs on failure
         if: failure()
         run: |
           echo "=== Docker Compose Logs ==="
-          docker compose -f deploy/docker-compose/docker-compose.yml logs
+          make docker-compose-logs-ci || docker compose -f deploy/docker-compose/docker-compose.ci.yml logs  # direct compose call is the fallback if make fails
           echo "=== Container Status ==="
           docker ps -a
           echo "=== Semantic Router Logs ==="
-          docker logs semantic-router || true
+          docker logs semantic-router 2>&1 | tail -100 || true  # stderr folded into stdout; last 100 lines only
           echo "=== Envoy Logs ==="
-          docker logs envoy-proxy || true
-          echo "=== Dashboard Logs ==="
-          docker logs semantic-router-dashboard || true
+          docker logs envoy-proxy 2>&1 | tail -100 || true
+          echo "=== LLM-Katan Logs ==="
+          docker logs llm-katan 2>&1 | tail -100 || true
 
       - name: Clean up
         if: always()
         run: |
-          make docker-compose-down || true
+          make docker-compose-down-ci || true  # best-effort: a teardown error must not fail the job
           docker system prune -af --volumes || true
diff --git a/deploy/docker-compose/docker-compose.ci.yml b/deploy/docker-compose/docker-compose.ci.yml
new file mode 100644
index 000000000..9ed541818
--- /dev/null
+++ b/deploy/docker-compose/docker-compose.ci.yml
@@ -0,0 +1,92 @@
+# Minimal Docker Compose for CI testing
+# It runs only the services integration tests need, all from pre-built images (no local build).
+# Excludes: grafana, prometheus, jaeger, openwebui, chat-ui, pipelines, mongo, dashboard
+#
+# Usage:
+#   make docker-compose-up-ci
+#   # or directly:
+#   docker compose -f deploy/docker-compose/docker-compose.ci.yml up -d
+
+services:
+
+  # Semantic Router External Processor Service
+  semantic-router:
+    image: ghcr.io/vllm-project/semantic-router/extproc:latest  # pulled pre-built; this service has no build: section
+    container_name: semantic-router  # fixed name, so `docker logs semantic-router` works
+    ports:
+      - "50051:50051"  # gRPC for ExtProc
+      - "8080:8080"    # HTTP API (health, classify, metrics)
+    volumes:
+      - ../../config:/app/config:ro,z
+      - ../../models:/app/models:ro,z
+      - ~/.cache/huggingface:/root/.cache/huggingface:z  # host HF cache, mounted writable (no ro flag)
+    environment:
+      - LD_LIBRARY_PATH=/app/lib
+      - CONFIG_FILE=${CONFIG_FILE:-/app/config/config.yaml}  # caller's CONFIG_FILE wins; otherwise the default path
+      - HUGGINGFACE_HUB_CACHE=/root/.cache/huggingface
+      - HF_HUB_ENABLE_HF_TRANSFER=1
+    networks:
+      - semantic-network
+    healthcheck:
+      test: ["CMD", "curl", "-f", "localhost:8080/health"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 30s
+
+  # Envoy Proxy Service
+  envoy:
+    image: envoyproxy/envoy:v1.31.7
+    container_name: envoy-proxy  # the workflow filters and reads logs by this name, not by the service key
+    security_opt:
+      - label=disable  # turns off SELinux label confinement for this container
+    ports:
+      - "8801:8801"  # Main proxy port
+      - "19000:19000"  # Admin interface
+    volumes:
+      - ./addons/envoy.yaml:/etc/envoy/envoy.yaml:ro,z
+    command: ["/usr/local/bin/envoy", "-c", "/etc/envoy/envoy.yaml", "--component-log-level", "ext_proc:debug,router:debug"]
+    depends_on:
+      semantic-router:
+        condition: service_healthy  # envoy starts only after semantic-router's healthcheck passes
+    networks:
+      - semantic-network
+    healthcheck:  # sends a raw GET /ready to the admin port via bash /dev/tcp and greps the reply for LIVE
+      test: ["CMD", "bash", "-c", "(echo -e 'GET /ready HTTP/1.1\\r\\nHost: localhost\\r\\n\\r\\n' >&3; timeout 2 cat <&3) 3<>/dev/tcp/localhost/19000 | grep -q LIVE"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 10s
+
+  # LLM Katan service - lightweight mock LLM for testing
+  llm-katan:
+    image: ghcr.io/vllm-project/semantic-router/llm-katan:latest  # pulled pre-built; this service has no build: section
+    container_name: llm-katan
+    ports:
+      - "8002:8002"
+    environment:
+      - HF_HUB_ENABLE_HF_TRANSFER=1
+    volumes:
+      - ../../models:/app/models:ro,z  # the --model path in command below lives under this mount
+      - hf-cache:/home/llmkatan/.cache/huggingface  # named volume declared under the top-level volumes: key
+    networks:
+      semantic-network:
+        ipv4_address: 172.28.0.20  # static IP; presumably referenced by the router/envoy config — confirm
+    command: ["llm-katan", "--model", "/app/models/Qwen/Qwen3-0.6B", "--served-model-name", "qwen3", "--host", "0.0.0.0", "--port", "8002"]
+    healthcheck:
+      test: ["CMD", "curl", "-fsS", "http://localhost:8002/health"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 10s
+
+networks:
+  semantic-network:
+    driver: bridge
+    ipam:
+      config:
+        - subnet: 172.28.0.0/16  # must contain llm-katan's fixed address 172.28.0.20
+
+volumes:
+  hf-cache:  # Hugging Face cache mounted by llm-katan
+
diff --git a/tools/make/docker.mk b/tools/make/docker.mk
index 083fda460..bb193505b 100644
--- a/tools/make/docker.mk
+++ b/tools/make/docker.mk
@@ -179,6 +179,33 @@ docker-compose-down-llm-katan:
 	@echo "Stopping services with $(COMPOSE_CMD) (llm-katan profile)..."
 	@$(COMPOSE_CMD) --profile llm-katan down
 
+##@ CI Docker Compose (minimal services for CI testing)
+
+# CI compose file path. NOTE(review): if COMPOSE_CMD already carries a -f <file>, the extra -f below merges both files — confirm.
+CI_COMPOSE_FILE ?= deploy/docker-compose/docker-compose.ci.yml
+
+docker-compose-up-ci: ## Start minimal CI services (semantic-router, envoy, llm-katan)
+docker-compose-up-ci:
+	@$(LOG_TARGET)
+	@echo "Starting CI services with $(COMPOSE_CMD) (minimal for CI)..."
+	@$(COMPOSE_CMD) -f $(CI_COMPOSE_FILE) up -d
+
+docker-compose-down-ci: ## Stop CI services
+docker-compose-down-ci:
+	@$(LOG_TARGET)
+	@echo "Stopping CI services with $(COMPOSE_CMD)..."
+	@$(COMPOSE_CMD) -f $(CI_COMPOSE_FILE) down
+
+docker-compose-logs-ci: ## Show logs for CI services
+docker-compose-logs-ci:
+	@$(LOG_TARGET)
+	@$(COMPOSE_CMD) -f $(CI_COMPOSE_FILE) logs
+
+docker-compose-ps-ci: ## Show status of CI services
+docker-compose-ps-ci:
+	@$(LOG_TARGET)
+	@$(COMPOSE_CMD) -f $(CI_COMPOSE_FILE) ps
+
 # Help target for Docker commands
 docker-help:
 docker-help: ## Show help for Docker-related make targets and environment variables