# docker-compose.yml

networks:
  pili-network:
    driver: bridge

volumes:
  huggingface_cache:
    driver: local
  vllm_logs:
    driver: local
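
# Named volumes: huggingface_cache keeps downloaded model weights across
# container restarts so vLLM does not re-fetch them; vllm_logs persists
# the vLLM server logs.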

services:
  # Nginx reverse proxy
  nginx:
    image: nginx:alpine
    container_name: nginx-proxy
    ports:
      - "3008:80"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
      - ./logs/nginx:/var/log/nginx
    depends_on:
      # - web
      - pili-api
    restart: unless-stopped
    networks:
      - pili-network
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost/nginx-health"]
      interval: 30s
      timeout: 10s
      retries: 3
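
  # The mounted ./nginx.conf is not part of this file; a minimal sketch of
  # what it is assumed to contain (hypothetical routes, matching the
  # healthcheck above and the pili-api service below):
  #
  #   location /nginx-health { return 200 "ok\n"; }
  #   location / { proxy_pass http://pili-api:8991; }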

  # Main web application
  web:
    build: .
    container_name: scaffold-web
    expose:
      - "3006"
    env_file:
      - .env.prod
    environment:
      - NODE_ENV=production
      - PORT=3006
    restart: unless-stopped
    networks:
      - pili-network
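
  # web publishes no host port: `expose` only advertises 3006 inside
  # pili-network, so it is reachable only from other containers on that
  # network, such as nginx (note the currently commented-out `- web`
  # under nginx's depends_on).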

  # Pili AI Chatbot service
  pili-api:
    image: tannedcung/scaffold-your-shape:pili-20250719
    container_name: pili-chatbot
    expose:
      - "8991"
    env_file:
      - ./.env.pili
    volumes:
      - ./logs:/app/logs
      - ./data:/app/data
    restart: unless-stopped
    networks:
      - pili-network
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8991/api/health"]
      interval: 600s
      timeout: 10s
      retries: 3
      start_period: 40s
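
  # .env.pili is not shown in this file; it is assumed to point the chatbot
  # at the vLLM server over the shared network, e.g. (hypothetical variable
  # name): VLLM_BASE_URL=http://vllm:8000/v1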

  # vLLM AI Model service
  vllm:
    image: vllm/vllm-openai:latest
    container_name: pili_vllm_container
    restart: unless-stopped
    # GPU support for NVIDIA
    runtime: nvidia
    # Port mapping
    ports:
      - "8995:8000"
    # IPC mode for better GPU performance
    ipc: host
    # Shared memory size (important for GPU workloads)
    shm_size: 10gb
    # Environment variables
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
      - CUDA_VISIBLE_DEVICES=0
    # Volume mounts for persistence
    volumes:
      - huggingface_cache:/root/.cache/huggingface
      - vllm_logs:/app/logs
    # Model configuration
    command: >
      --model ${VLLM_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --host 0.0.0.0
      --port 8000
      --served-model-name ${VLLM_SERVED_MODEL_NAME:-llama-3.2-3b}
      --max-model-len ${VLLM_MAX_MODEL_LEN:-16384}
      --gpu-memory-utilization ${VLLM_GPU_MEMORY_UTIL:-0.9}
      --tensor-parallel-size ${VLLM_TENSOR_PARALLEL:-1}
      --enable-prefix-caching
      --enable-auto-tool-choice
      --tool-call-parser llama3_json
      --dtype auto
      --trust-remote-code
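
    # The ${VAR:-default} flags above can be overridden from a .env file in
    # the compose project directory (standard compose interpolation), e.g.:
    #   VLLM_MODEL=meta-llama/Llama-3.1-8B-Instruct
    #   VLLM_MAX_MODEL_LEN=8192
    #   VLLM_GPU_MEMORY_UTIL=0.85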
    # Health check
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    # GPU resource reservation (adjust based on your GPU)
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    networks:
      - pili-network
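
# Quick usage sketch (assumes HUGGING_FACE_HUB_TOKEN is exported or set in
# a local .env file):
#   docker compose up -d
#   curl http://localhost:3008/nginx-health   # proxy health endpoint
#   curl http://localhost:8995/v1/models      # vLLM OpenAI-compatible API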