Commit 05be2ef

feat: initial commit for openvino vllm serving microservice (#269)
1 parent 05ed3ae commit 05be2ef

File tree

5 files changed: +226 -0 lines changed
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
data/ov_model
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM debian:12-slim
ARG DEBIAN_FRONTEND=noninteractive
ARG VLLM_VERSION=v0.6.5
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
RUN apt-get update \
    && apt-get upgrade -y \
    && apt-get install --no-install-recommends -y \
        sudo \
        curl \
        git \
        gpg-agent \
        software-properties-common \
        python3.11 \
        python3.11-venv \
    && curl -fsSL https://repositories.intel.com/gpu/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg \
    && echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy client" > /etc/apt/sources.list.d/intel-graphics.list \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
        intel-opencl-icd \
        intel-level-zero-gpu \
        libze1 \
        libze-dev \
        clinfo \
    && addgroup --system intel --gid 1000 \
    && adduser --system --ingroup intel --uid 1000 --home /home/intel intel \
    && echo "intel ALL=(ALL:ALL) NOPASSWD:ALL" > /etc/sudoers.d/intel \
    && rm -rf /var/lib/apt/lists/* \
    && mkdir -p /usr/src \
    && chown -R intel:intel /usr/src

USER intel
WORKDIR /usr/src/app
RUN python3 -m venv /usr/src/.venv \
    && git clone https://github.com/vllm-project/vllm.git
ENV PATH="/usr/src/.venv/bin:$PATH"

WORKDIR /usr/src/app/vllm
RUN git checkout ${VLLM_VERSION} \
    && python3 -m pip install --no-cache-dir --upgrade pip \
    && PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install --no-cache-dir -r requirements-build.txt \
    && PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install --no-cache-dir .

WORKDIR /usr/src/app
RUN opt_in_out --opt_out
COPY --chown=intel:intel entrypoint.sh /usr/src/app/entrypoint.sh
RUN chmod +x /usr/src/app/entrypoint.sh

HEALTHCHECK --interval=1m --timeout=30s --start-period=5s --retries=10 \
    CMD curl -f http://localhost:8000/health || exit 1

ENTRYPOINT [ "/usr/src/app/entrypoint.sh" ]
Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
# VLLM OpenVINO

## Requirements

### Validated Hardware Requirements
- **CPU:** 13th generation Intel Core processors or newer
- **GPU:** Intel® Arc™ graphics
- **RAM:** 32GB (may vary based on model size)
- **Disk:** 128GB (may vary based on model size)

## Quick Start

### 1. Install Operating System
Install the latest [Ubuntu 22.04 LTS Desktop](https://releases.ubuntu.com/jammy/). Refer to the [Ubuntu Desktop installation tutorial](https://ubuntu.com/tutorials/install-ubuntu-desktop#1-overview) if needed.

### 2. Install GPU Driver (Optional)
If you plan to use a GPU for inference, install the appropriate GPU driver:
- **Intel® Arc™ A-Series Graphics:** [Installation Guide](https://github.com/intel/edge-developer-kit-reference-scripts/tree/main/gpu/arc/dg2)
- **Intel® Data Center GPU Flex Series:** [Installation Guide](https://github.com/intel/edge-developer-kit-reference-scripts/tree/main/gpu/flex/ats)
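
After installing the driver, a quick way to confirm the GPU is visible is to check the DRI device nodes and the `render` group, both of which are used by the GPU `docker run` example later in this guide:

```bash
# Sanity-check the GPU driver installation on the host
ls -l /dev/dri          # should list card*/renderD* device nodes
getent group render     # this group ID is passed to docker via --group-add
```
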
### 3. Set Up Docker
Follow the instructions [here](https://docs.docker.com/engine/install/) to install Docker and Docker Compose.
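
If you prefer a scripted setup on Ubuntu, Docker's convenience script is one common route; the linked documentation above remains the authoritative reference:

```bash
# Install Docker Engine via the convenience script, then allow the current user to run docker
curl -fsSL https://get.docker.com | sh
sudo usermod -aG docker $USER
# Log out and back in (or run `newgrp docker`) for the group change to take effect
```
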
### 4. Build the OpenVINO VLLM Docker Image
```bash
docker build -t ov-vllm .
```
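
The Dockerfile pins vLLM through the `VLLM_VERSION` build argument (default `v0.6.5`), so a different tag can be selected at build time if needed:

```bash
# Build against a specific vLLM tag instead of the default
docker build --build-arg VLLM_VERSION=v0.6.5 -t ov-vllm .
```
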
### 5. Run the OpenVINO VLLM container
By default, on container launch, the entrypoint looks for a model in `./data/ov_model` (mounted from the host `./data` directory); if none is found, it downloads the default model, exports it to OpenVINO format at the configured precision, and then starts the vLLM serving endpoint on port 8000.
* **CPU**
```bash
docker run -it --rm \
    -p 8000:8000 \
    -e DEFAULT_MODEL_ID=Qwen/Qwen2.5-7B-Instruct \
    -e MODEL_PRECISION=int4 \
    -e SERVED_MODEL_NAME=ov-vllm \
    -e MAX_MODEL_LEN=2048 \
    -e VLLM_OPENVINO_DEVICE=CPU \
    -e VLLM_OPENVINO_KVCACHE_SPACE=8 \
    -e VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 \
    -e VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
    -v ./data:/usr/src/app/data \
    ov-vllm
```

* **GPU**
```bash
RENDER_GROUP_ID=$(getent group render | cut -d: -f3)
docker run -it --rm \
    --group-add $RENDER_GROUP_ID \
    --device /dev/dri:/dev/dri \
    -p 8000:8000 \
    -e DEFAULT_MODEL_ID=Qwen/Qwen2.5-7B-Instruct \
    -e MODEL_PRECISION=int4 \
    -e SERVED_MODEL_NAME=ov-vllm \
    -e MAX_MODEL_LEN=2048 \
    -e GPU_MEMORY_UTILIZATION=0.9 \
    -e VLLM_OPENVINO_DEVICE=GPU \
    -e VLLM_OPENVINO_KVCACHE_SPACE=8 \
    -e VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 \
    -e VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
    -v ./data:/usr/src/app/data \
    ov-vllm
```
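
The first launch downloads and quantizes the model, which can take a while. The image defines a `/health` health check, so you can poll that endpoint (or watch the health status in `docker ps`) to know when the server is ready:

```bash
# Returns HTTP 200 once the vLLM server is up and serving
curl -f http://localhost:8000/health
```
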
### 6. Test the OpenVINO VLLM with the chat completion API
```bash
curl "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d '{
        "model": "ov-vllm",
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant."
            },
            {
                "role": "user",
                "content": "What is AI?"
            }
        ],
        "stream": true
    }'
```
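
You can also list the models exposed by the OpenAI-compatible API to confirm the served model name matches `SERVED_MODEL_NAME`:

```bash
# Should report the name configured via SERVED_MODEL_NAME (ov-vllm by default)
curl http://localhost:8000/v1/models
```
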

## FAQs
### 1. How can I replace or use my own model?
1. Convert the model into OpenVINO format. Refer to this [link](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide/genai-model-preparation.html) for more information. (A sample export command is shown after the file tree below.)
2. After the model conversion step, place the model in the following file structure.
```bash
.
├── data
│   └── ov_model
│       ├── added_tokens.json
│       ├── config.json
│       ├── generation_config.json
│       ├── merges.txt
│       ├── openvino_model.bin
│       ├── openvino_model.xml
│       ├── special_tokens_map.json
│       ├── tokenizer_config.json
│       ├── tokenizer.json
│       └── vocab.json
├── Dockerfile
├── entrypoint.sh
└── README.md
```
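
For reference, the container's entrypoint performs this export with `optimum-cli`. An equivalent command run on the host (assuming `optimum-intel` with OpenVINO support is installed; the model ID and output path here are only examples) looks like this:

```bash
# Export a Hugging Face model to OpenVINO IR with int4 weight compression
# (mirrors the options used in entrypoint.sh; model ID and path are illustrative)
optimum-cli export openvino \
    --model Qwen/Qwen2.5-7B-Instruct \
    --weight-format int4 \
    --sym \
    --ratio 1.0 \
    --group-size -1 \
    ./data/ov_model
```
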
### 2. How can I change the default model after it has been run once?
1. Delete the existing model located in `./data/ov_model`.
   ```bash
   rm -rf ./data/ov_model
   ```
2. Rerun the `docker run` command with the new `DEFAULT_MODEL_ID` to download and quantize the new model.

### 3. How can I avoid redownloading the model every time it is converted and quantized?
1. Mount the Hugging Face cache path into the container by adding the following flag to the `docker run` command:
   ```bash
   -v $HOME/.cache/huggingface:/home/intel/.cache/huggingface
   ```

usecases/ai/microservices/text-generation/vllm/data/.gitkeep

Whitespace-only changes.
Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

echo -e "Initializing OpenVINO VLLM service ..."
export DEFAULT_MODEL_ID=${DEFAULT_MODEL_ID:-Qwen/Qwen2.5-7B-Instruct}
export MODEL_PATH=${MODEL_PATH:-./data/ov_model}
export MODEL_PRECISION=${MODEL_PRECISION:-int4}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-ov-vllm}
export MAX_MODEL_LEN=${MAX_MODEL_LEN:-2048}
export GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.9}
export VLLM_OPENVINO_DEVICE=${VLLM_OPENVINO_DEVICE:-CPU}
export VLLM_OPENVINO_KVCACHE_SPACE=${VLLM_OPENVINO_KVCACHE_SPACE:-8}
export VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=${VLLM_OPENVINO_CPU_KV_CACHE_PRECISION:-u8}
export VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=${VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS:-ON}

echo -e "Using the following configuration:"
echo -e "- VLLM_OPENVINO_DEVICE: ${VLLM_OPENVINO_DEVICE}"
echo -e "- VLLM_OPENVINO_KVCACHE_SPACE: ${VLLM_OPENVINO_KVCACHE_SPACE}"
echo -e "- VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: ${VLLM_OPENVINO_CPU_KV_CACHE_PRECISION}"
echo -e "- VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: ${VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS}"
echo -e "- DEFAULT_MODEL_ID: ${DEFAULT_MODEL_ID}"
echo -e "- MODEL_PATH: ${MODEL_PATH}"
echo -e "- MODEL_PRECISION: ${MODEL_PRECISION}"
echo -e "- SERVED_MODEL_NAME: ${SERVED_MODEL_NAME}"
echo -e "- MAX_MODEL_LEN: ${MAX_MODEL_LEN}"
echo -e "- GPU_MEMORY_UTILIZATION: ${GPU_MEMORY_UTILIZATION}"

if [ ! -d "$MODEL_PATH" ]; then
    echo -e "Model path does not exist: $MODEL_PATH. Downloading the default model: $DEFAULT_MODEL_ID ..."
    optimum-cli export openvino \
        --model "$DEFAULT_MODEL_ID" \
        --weight-format "$MODEL_PRECISION" \
        --sym \
        --ratio 1.0 \
        --group-size -1 \
        "$MODEL_PATH"
fi

if [ ! -f "$MODEL_PATH/openvino_model.xml" ]; then
    echo -e "Model file does not exist: $MODEL_PATH/openvino_model.xml. Please export the model first and save to $MODEL_PATH"
    exit 1
fi

echo -e "Starting OpenVINO VLLM service ..."
vllm serve "$MODEL_PATH" \
    --served_model_name "$SERVED_MODEL_NAME" \
    --max-model-len "$MAX_MODEL_LEN" \
    --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION"
