docker/docker_compose_files/llm/mistral-vllm.yml at master · slabstech/docker · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# Compose stack with a single service, `vllm`, started from the
# vllm/vllm-openai image with a Mistral 7B model and one reserved NVIDIA GPU.
services:
  vllm:
    image: vllm/vllm-openai:latest
    # GPU access is requested twice: through the NVIDIA runtime here and
    # through the device reservation under `deploy` further down.
    runtime: nvidia
    shm_size: "1g"
    environment:
      # Compose substitutes HF_TOKEN from the invoking shell or an .env file.
      HUGGING_FACE_HUB_TOKEN: "${HF_TOKEN}"
    volumes:
      # Bind-mount the host user's Hugging Face cache directory onto the
      # root user's cache path inside the container.
      - "~/.cache/huggingface:/root/.cache/huggingface"
    ports:
      # host:container
      - "8000:8000"
    # Arguments passed to the image's entrypoint, one list item per token.
    command:
      - "--host"
      - "0.0.0.0"
      - "--model"
      # Exactly one model id must follow --model; the alternatives are kept
      # commented out for quick switching.
      - "mistralai/Mistral-7B-v0.3"  # full precision
      # - "neuralmagic/Mistral-7B-Instruct-v0.3-quantized.w8a16"  # quantized 1
      # - "neuralmagic/Mistral-7B-Instruct-v0.3-GPTQ-4bit"
      - "--gpu-memory-utilization"
      - "0.95"
      - "--max-model-len"
      - "8192"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # Pin to GPU 0; swap `device_ids` for `count: all` to reserve
              # every GPU instead.
              # count: all
              device_ids:
                - "0"
              capabilities:
                - gpu