Commit 03a7284

Merge branch 'habana_main' into nixl2_buke
2 parents: e4e1418 + 89e6254

File tree

135 files changed: +8447, -2406 lines

Lines changed: 66 additions & 0 deletions (new file)

```dockerfile
# Parameterize base image components for RHEL 8.6/9.2/9.4/9.6 and TencentOS 3.1
ARG DOCKER_URL=vault.habana.ai/gaudi-docker
ARG VERSION=1.21.1
ARG BASE_NAME=rhel8.6
ARG PT_VERSION=2.6.0
ARG REVISION=latest
ARG REPO_TYPE=habanalabs

FROM ${DOCKER_URL}/${VERSION}/${BASE_NAME}/${REPO_TYPE}/pytorch-installer-${PT_VERSION}:${REVISION}

# Parameterize commit/branch for vllm-fork checkout
ARG VLLM_FORK_COMMIT=v0.7.2+Gaudi-1.21.0

ARG BASE_NAME
ENV BASE_NAME=${BASE_NAME}

ENV OMPI_MCA_btl_vader_single_copy_mechanism=none

# Install required packages for RHEL 8.6/9.x and TencentOS 3.1
RUN if echo "$BASE_NAME" | grep -qi "tencentos"; then \
        yum remove -y mpitests_openmpi perftest openmpi opensm-libs || true && \
        yum update -y --exclude=openmpi --exclude=opensm-libs && \
        yum install -y gettext jq python3-pip git --allowerasing --exclude=openmpi --exclude=opensm-libs && \
        ln -sf /usr/bin/python3 /usr/bin/python ; \
    elif echo "$BASE_NAME" | grep -q "^rhel8"; then \
        yum module reset perl -y && \
        yum module enable perl:5.26 -y && \
        yum update -y && \
        yum install -y gettext jq python3-pip git --allowerasing && \
        ln -sf /usr/bin/python3 /usr/bin/python ; \
    else \
        yum update -y && \
        yum install -y gettext jq python3-pip git --allowerasing && \
        ln -sf /usr/bin/python3 /usr/bin/python ; \
    fi

WORKDIR /root

ENV VLLM_PATH=/workspace/vllm

# Clone the vllm-fork repository
RUN mkdir -p $VLLM_PATH && \
    git clone https://github.com/HabanaAI/vllm-fork.git $VLLM_PATH && \
    cd $VLLM_PATH && \
    git remote add upstream https://github.com/vllm-project/vllm.git && \
    git fetch upstream --tags || true && \
    git checkout ${VLLM_FORK_COMMIT}

# Install vllm-fork inside the container
ENV VLLM_TARGET_DEVICE=hpu
RUN pip3 install -v -e $VLLM_PATH
RUN pip3 install -v -e $VLLM_PATH/tests/vllm_test_utils

# Install additional Python packages
RUN pip3 install datasets pandas

# Copy utility scripts and configuration
RUN mkdir -p /root/scripts/
COPY templates /root/scripts/templates/
COPY entrypoints /root/scripts/entrypoints/
COPY server /root/scripts/server/
COPY benchmark /root/scripts/benchmark/
WORKDIR /root/scripts

# Set entrypoint script
ENTRYPOINT ["python3", "-m", "entrypoints.entrypoint_main"]
```
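
Because every base-image component is declared as an ARG, the same Dockerfile can be rebuilt for any of the listed RHEL releases or TencentOS 3.1 without edits. The following is a minimal build sketch: the `-f` path and the image tag are assumptions (this excerpt does not show the file name), and the `.cd/` build context is assumed because the COPY steps expect the `templates`, `entrypoints`, `server`, and `benchmark` directories to be present in the context.

```bash
# Build against a RHEL 9.4 base instead of the default rhel8.6.
# Every --build-arg below maps to an ARG declared at the top of the Dockerfile;
# the -f path and the tag name are illustrative assumptions.
docker build \
  -f .cd/Dockerfile.rhel.pytorch.vllm \
  --build-arg BASE_NAME=rhel9.4 \
  --build-arg VERSION=1.21.1 \
  --build-arg PT_VERSION=2.6.0 \
  --build-arg VLLM_FORK_COMMIT=v0.7.2+Gaudi-1.21.0 \
  -t vllm-gaudi:rhel9.4 \
  .cd/
```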

.cd/Dockerfile.suse.pytorch.vllm

Lines changed: 50 additions & 0 deletions (new file)

```dockerfile
# Parameterize base image components for SUSE (tested on 15.5)
ARG DOCKER_URL=vault.habana.ai/gaudi-docker
ARG VERSION=1.21.1
ARG BASE_NAME=sles15sp5
ARG PT_VERSION=2.6.0
ARG REVISION=latest
ARG REPO_TYPE=habanalabs

FROM ${DOCKER_URL}/${VERSION}/${BASE_NAME}/${REPO_TYPE}/pytorch-installer-${PT_VERSION}:${REVISION}

# Parameterize commit/branch for vllm-fork checkout
ARG VLLM_FORK_COMMIT=v0.7.2+Gaudi-1.21.0

ENV OMPI_MCA_btl_vader_single_copy_mechanism=none

# Install required packages for SUSE
RUN zypper --non-interactive refresh && \
    zypper --non-interactive install gettext jq python3-pip git && \
    ln -sf /usr/bin/python3 /usr/bin/python

WORKDIR /root

ENV VLLM_PATH=/workspace/vllm

# Clone the vllm-fork repository
RUN mkdir -p $VLLM_PATH && \
    git clone https://github.com/HabanaAI/vllm-fork.git $VLLM_PATH && \
    cd $VLLM_PATH && \
    git remote add upstream https://github.com/vllm-project/vllm.git && \
    git fetch upstream --tags || true && \
    git checkout ${VLLM_FORK_COMMIT}

# Install vllm-fork inside the container
ENV VLLM_TARGET_DEVICE=hpu
RUN pip3 install -v -e $VLLM_PATH
RUN pip3 install -v -e $VLLM_PATH/tests/vllm_test_utils

# Install additional Python packages
RUN pip3 install datasets pandas

# Copy utility scripts and configuration
RUN mkdir -p /root/scripts/
COPY templates /root/scripts/templates/
COPY entrypoints /root/scripts/entrypoints/
COPY server /root/scripts/server/
COPY benchmark /root/scripts/benchmark/
WORKDIR /root/scripts

# Set entrypoint script
ENTRYPOINT ["python3", "-m", "entrypoints.entrypoint_main"]
```
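
The SUSE variant follows the same layout; only the package-install step differs (zypper instead of yum). A locally built tag can then be reused as `DOCKER_IMAGE` in the Docker Compose examples from `.cd/README.md`. The sketch below assumes an arbitrary tag name and uses a model from the README's supported-model table.

```bash
# From the vllm-fork repository root: build the SUSE-based image from the
# .cd/ context, then point the compose setup in .cd/ at the locally built tag.
docker build -f .cd/Dockerfile.suse.pytorch.vllm -t vllm-gaudi:sles15sp5 .cd/
cd .cd/
MODEL="Qwen/Qwen2.5-7B-Instruct" \
HF_TOKEN="<your huggingface token>" \
DOCKER_IMAGE=vllm-gaudi:sles15sp5 \
docker compose up
```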

.cd/Dockerfile.ubuntu.pytorch.vllm

Lines changed: 9 additions & 10 deletions
```diff
@@ -30,18 +30,17 @@ RUN mkdir -p $VLLM_PATH && \
 
 # Install vllm-fork inside the container
 ENV VLLM_TARGET_DEVICE=hpu
-RUN pip install -v -e $VLLM_PATH
-RUN pip install -v -e $VLLM_PATH/tests/vllm_test_utils
-
-# Install additional Python packages
-RUN pip install datasets && \
-    pip install pandas
+RUN pip install -v -r $VLLM_PATH/requirements-hpu.txt
+RUN pip install -v -e $VLLM_PATH --no-build-isolation
+RUN pip install -v -e $VLLM_PATH/tests/vllm_test_utils --no-build-isolation
 
 # Copy utility scripts and configuration
-RUN mkdir -p /root/scripts/settings/
-COPY entrypoint.py vllm_autocalc.py vllm_autocalc_rules.py /root/scripts/
-COPY settings/settings_vllm.csv settings/defaults.yaml settings/varlist_conf.yaml settings/template_vllm_server.sh /root/scripts/settings/
+RUN mkdir -p /root/scripts/
+COPY templates /root/scripts/templates/
+COPY entrypoints /root/scripts/entrypoints/
+COPY server /root/scripts/server/
+COPY benchmark /root/scripts/benchmark/
 WORKDIR /root/scripts
 
 # Set entrypoint script
-ENTRYPOINT ["python3", "/root/scripts/entrypoint.py"]
+ENTRYPOINT ["python3", "-m", "entrypoints.entrypoint_main"]
```
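
For reference, the updated install flow in this Dockerfile corresponds to the following commands when run by hand on a Gaudi machine. This is a sketch that assumes a vllm-fork checkout is the current directory; the Dockerfile performs the same steps inside the image.

```bash
# Mirror of the new Dockerfile install sequence: resolve the HPU requirements
# first, then install vllm and its test utilities in editable mode without
# build isolation.
export VLLM_TARGET_DEVICE=hpu
pip install -v -r requirements-hpu.txt
pip install -v -e . --no-build-isolation
pip install -v -e ./tests/vllm_test_utils --no-build-isolation
```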

.cd/README.md

Lines changed: 122 additions & 77 deletions
````diff
@@ -1,6 +1,7 @@
 # vLLM for Gaudi – Quick Start
 
-This guide explains how to quickly run vLLM with multi-model support on Gaudi using a prebuilt Docker image.
+This guide explains how to quickly run vLLM on Gaudi using a prebuilt Docker image and Docker Compose, with options for custom parameters and benchmarking.
+It supports a wide range of validated models, including the LLaMa, Mistral, and Qwen families, with flexible configuration via environment variables or YAML files.
 
 ## Supported Models
 
@@ -22,116 +23,160 @@ This guide explains how to quickly run vLLM with multi-model support on Gaudi us
 | Qwen/Qwen2.5-7B-Instruct | 1 |
 | meta-llama/Llama-3.2-11B-Vision-Instruct | 1 |
 | meta-llama/Llama-3.2-90B-Vision-Instruct | 4 |
+| ibm-granite/granite-8b-code-instruct-4k | 1 |
+| ibm-granite/granite-20b-code-instruct-8k | 1 |
 
 ## How to Use
 
-1. **Use the prebuilt vLLM container**
+### 1. Run the server using Docker Compose
 
-You do **not** need to build the Docker image yourself.
-Use the ready-to-use image from an image registry:
+The recommended and easiest way to start the vLLM server is with Docker Compose. At a minimum, set the following environment variables:
+
+- `MODEL` - Select a model from the table above.
+- `HF_TOKEN` - Your Hugging Face token (generate one at <https://huggingface.co>).
+- `DOCKER_IMAGE` - The vLLM Docker image URL, from the Gaudi registry or a local repository.
+
+**Example usage:**
+
+```bash
+cd vllm-fork/.cd/
+MODEL="Qwen/Qwen2.5-14B-Instruct" \
+HF_TOKEN="<your huggingface token>" \
+DOCKER_IMAGE="<docker image url>" \
+docker compose up
+```
+
+### 2. Running the Server with a Benchmark
+
+To run the benchmark dedicated to a specific model with its default parameters, start Docker Compose with the `--profile benchmark` option:
 
 ```bash
-docker pull <path to a docker image>
+cd vllm-fork/.cd/
+MODEL="Qwen/Qwen2.5-14B-Instruct" \
+HF_TOKEN="<your huggingface token>" \
+DOCKER_IMAGE="<docker image url>" \
+docker compose --profile benchmark up
 ```
 
-2. **Set required environment variables**
+This launches the vLLM server and runs the benchmark suite automatically.
 
-- `export MODEL=` (choose from the table above)
-- `export HF_TOKEN=` (your huggingface token, can be generated from https://huggingface.co)
+### 3. Run the server using Docker Compose with custom parameters
 
-Tips:
-- Model files can be large. For best performance, use an external disk for the Huggingface cache and set `HF_HOME` accordingly.
-Example: `-e HF_HOME=/mnt/huggingface -v /mnt/huggingface:/mnt`\
-- For a quick startup and to skip the initial model warmup (useful for development testing), you can add:
-`-e VLLM_SKIP_WARMUP=true`
+To override the default settings, you can provide additional parameters when starting the server. This is a more advanced approach:
 
-3. **Run the vLLM server**
+- `PT_HPU_LAZY_MODE` - Enables lazy execution mode for HPU (Habana Processing Unit), which may improve performance by batching operations.
+- `VLLM_SKIP_WARMUP` - If enabled, skips the model warmup phase, which can reduce startup time but may affect initial performance.
+- `MAX_MODEL_LEN` - Specifies the maximum sequence length the model can handle.
+- `MAX_NUM_SEQS` - Sets the maximum number of sequences that can be processed simultaneously.
+- `TENSOR_PARALLEL_SIZE` - Defines the number of parallel tensor partitions.
+- `VLLM_EXPONENTIAL_BUCKETING` - Enables or disables the exponential bucketing warmup strategy.
+- `VLLM_DECODE_BLOCK_BUCKET_STEP` - Sets the step size for allocating decode blocks during inference, affecting memory allocation granularity.
+- `VLLM_DECODE_BS_BUCKET_STEP` - Determines the batch size step for decode operations, influencing how batches are grouped and processed.
+- `VLLM_PROMPT_BS_BUCKET_STEP` - Sets the batch size step for prompt processing, impacting how prompt batches are handled.
+- `VLLM_PROMPT_SEQ_BUCKET_STEP` - Controls the step size for prompt sequence allocation, affecting how sequences are bucketed for processing.
+
+**Example usage:**
 
 ```bash
-docker run -it --rm \
--e MODEL=$MODEL \
--e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy \
---cap-add=sys_nice \
---ipc=host \
---runtime=habana \
--e HF_TOKEN=$HF_TOKEN \
--e HABANA_VISIBLE_DEVICES=all \
--p 8000:8000 \
---name vllm-server \
-<docker image name>
+cd vllm-fork/.cd/
+MODEL="Qwen/Qwen2.5-14B-Instruct" \
+HF_TOKEN="<your huggingface token>" \
+DOCKER_IMAGE="<docker image url>" \
+TENSOR_PARALLEL_SIZE=1 \
+MAX_MODEL_LEN=2048 \
+docker compose up
 ```
 
-4. **(Optional) Test the server**
+### 4. Running the Server and Benchmark with Custom Parameters
 
-In a separate terminal:
+You can customize benchmark parameters using:
+
+- `INPUT_TOK` – Number of input tokens per prompt.
+- `OUTPUT_TOK` – Number of output tokens to generate per prompt.
+- `CON_REQ` – Number of concurrent requests to send during benchmarking.
+- `NUM_PROMPTS` – Total number of prompts to use in the benchmark.
+
+**Example usage:**
 
 ```bash
-MODEL= # choose from the table above
-target=localhost
-curl_query="What is DeepLearning?"
-payload="{ \"model\": \"${MODEL}\", \"prompt\": \"${curl_query}\", \"max_tokens\": 128, \"temperature\": 0 }"
-curl -s --noproxy '*' http://${target}:8000/v1/completions -H 'Content-Type: application/json' -d "$payload"
+cd vllm-fork/.cd/
+MODEL="Qwen/Qwen2.5-14B-Instruct" \
+HF_TOKEN="<your huggingface token>" \
+DOCKER_IMAGE="<docker image url>" \
+INPUT_TOK=128 \
+OUTPUT_TOK=128 \
+CON_REQ=16 \
+NUM_PROMPTS=64 \
+docker compose --profile benchmark up
 ```
 
-5. **Customizing server parameters**
+This will launch the vLLM server and run the benchmark suite using your specified parameters.
+
+### 5. Running the Server and Benchmark, Both with Custom Parameters
 
-You can override defaults with additional `-e` variables, for example:
+You can launch the vLLM server and benchmark together, specifying any combination of optional parameters for both the server and the benchmark. Set the desired environment variables before running Docker Compose.
+
+**Example usage:**
 
 ```bash
-docker run -it --rm \
--e MODEL=$MODEL \
--e TENSOR_PARALLEL_SIZE=8 \
--e MAX_MODEL_LEN=8192 \
--e HABANA_VISIBLE_DEVICES=all \
--e HF_TOKEN=$HF_TOKEN \
--e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy \
---runtime=habana \
---cap-add=sys_nice \
---ipc=host \
--p 8000:8000 \
---name vllm-server \
-<docker image name>
+cd vllm-fork/.cd/
+MODEL="Qwen/Qwen2.5-14B-Instruct" \
+HF_TOKEN="<your huggingface token>" \
+DOCKER_IMAGE="<docker image url>" \
+TENSOR_PARALLEL_SIZE=1 \
+MAX_MODEL_LEN=2048 \
+INPUT_TOK=128 \
+OUTPUT_TOK=128 \
+CON_REQ=16 \
+NUM_PROMPTS=64 \
+docker compose --profile benchmark up
 ```
 
-6. **Running multiple instances**
+This command will start the vLLM server and run the benchmark suite using your specified custom parameters.
 
-Each instance should have unique values for `HABANA_VISIBLE_DEVICES`, host port, and container name.
-See [docs.habana.ai - Multiple Tenants](https://docs.habana.ai/en/latest/Orchestration/Multiple_Tenants_on_HPU/Multiple_Dockers_each_with_Single_Workload.html) for details.
+### 6. Running the Server and Benchmark Using Configuration Files
 
-Example for two instances:
+You can also configure the server and benchmark by specifying parameters in configuration files. To do this, set the following environment variables:
+
+- `VLLM_SERVER_CONFIG_FILE` – Path to the server configuration file inside the Docker container.
+- `VLLM_SERVER_CONFIG_NAME` – Name of the server configuration section.
+- `VLLM_BENCHMARK_CONFIG_FILE` – Path to the benchmark configuration file inside the Docker container.
+- `VLLM_BENCHMARK_CONFIG_NAME` – Name of the benchmark configuration section.
+
+**Example:**
 
 ```bash
-# Instance 1
-docker run -it --rm \
--e MODEL=$MODEL \
--e TENSOR_PARALLEL_SIZE=4 \
--e HABANA_VISIBLE_DEVICES=0,1,2,3 \
--e MAX_MODEL_LEN=8192 \
--e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy \
---runtime=habana \
---cap-add=sys_nice \
---ipc=host \
--p 8000:8000 \
---name vllm-server1 \
-<docker image name>
+HF_TOKEN=<your huggingface token> \
+VLLM_SERVER_CONFIG_FILE=server_configurations/server_text.yaml \
+VLLM_SERVER_CONFIG_NAME=llama31_8b_instruct \
+VLLM_BENCHMARK_CONFIG_FILE=benchmark_configurations/benchmark_text.yaml \
+VLLM_BENCHMARK_CONFIG_NAME=llama31_8b_instruct \
+docker compose --profile benchmark up
+```
 
-# Instance 2 (in another terminal)
+> [!NOTE]
+> When using configuration files, you do not need to set the `MODEL` environment variable, as the model name is specified within the configuration file. However, you must still provide your `HF_TOKEN`.
+
+### 7. Running the Server Directly with Docker
+
+For full control, you can run the server using the `docker run` command. This approach allows you to specify any native Docker parameters as needed.
+
+**Example:**
+
+```bash
 docker run -it --rm \
 -e MODEL=$MODEL \
--e TENSOR_PARALLEL_SIZE=4 \
--e HABANA_VISIBLE_DEVICES=4,5,6,7 \
--e MAX_MODEL_LEN=8192 \
--e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy \
---runtime=habana \
+-e HF_TOKEN=$HF_TOKEN \
+-e http_proxy=$http_proxy \
+-e https_proxy=$https_proxy \
+-e no_proxy=$no_proxy \
 --cap-add=sys_nice \
 --ipc=host \
--p 9222:8000 \
---name vllm-server2 \
+--runtime=habana \
+-e HABANA_VISIBLE_DEVICES=all \
+-p 8000:8000 \
+--name vllm-server \
 <docker image name>
 ```
 
-7. **Viewing logs**
-
-```bash
-docker logs -f vllm-server
-```
+This method gives you full flexibility over Docker runtime options.
````
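
Once the server is up (via Docker Compose or `docker run`), a quick smoke test against the OpenAI-compatible completions endpoint still works the same way as the example removed from the README in this commit. The sketch below assumes the default `8000:8000` port mapping and that the server was started with the same model.

```bash
# Send a single completion request to the running server on localhost:8000.
MODEL="Qwen/Qwen2.5-14B-Instruct"   # use the model the server was started with
curl_query="What is DeepLearning?"
payload="{ \"model\": \"${MODEL}\", \"prompt\": \"${curl_query}\", \"max_tokens\": 128, \"temperature\": 0 }"
curl -s --noproxy '*' http://localhost:8000/v1/completions \
  -H 'Content-Type: application/json' -d "$payload"
```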
