Commit 006bcfc

Add dockerfile for LLM examples (#3520)

1 parent c37e519 commit 006bcfc

13 files changed: +918 -71 lines changed

dependency_version.yml

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
```yaml
gcc:
  max-version:
  min-version: 12.3.0
llvm:
  version: 16.0.6
pytorch:
  version: 2.0.1a0
  commit: v2.1.0
torchaudio:
  version: 2.0.1a0
  commit: v2.1.0
torchvision:
  version: 0.16.0a0
  commit: v0.16.0
torch-ccl:
  repo: https://github.com/intel/torch-ccl.git
  commit: c8f89db1639558c1149c4d0eecf90c980064f609
  version: 2.1.100+xpu
deepspeed:
  repo: https://github.com/microsoft/DeepSpeed.git
  version:
  commit: 4fc181b01077521ba42379013ce91a1c294e5d8e
intel-extension-for-deepspeed:
  repo: https://github.com/intel/intel-extension-for-deepspeed.git
  commit: ec332772d8bb71c6d89804a862f039fc91ae96b5
transformers:
  version: 4.31.0
  commit: v4.31.0
protobuf:
  version: 3.20.3
```
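This new file pins the toolchain and component versions used by the build scripts. As a minimal sketch (assuming the flat two-level layout shown above; the variable name and `sed` approach are illustrative, not part of this commit), a shell script could read one of the pins like this:

```bash
# Illustrative only: extract the pinned PyTorch commit tag from dependency_version.yml.
# Assumes the "pytorch:" block layout shown above; the repo's own scripts may parse it differently.
VER_TORCH_COMMIT=$(sed -n '/^pytorch:/,/^[a-z]/{s/^[[:space:]]*commit:[[:space:]]*//p}' dependency_version.yml)
echo "pinned PyTorch commit: ${VER_TORCH_COMMIT}"   # expected output: v2.1.0
```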

docker/Dockerfile.compile

Lines changed: 20 additions & 36 deletions
```diff
@@ -26,19 +26,15 @@ RUN apt update && \
     software-properties-common \
     gnupg \
     gpg-agent
-RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg && \
-    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy unified" | tee /etc/apt/sources.list.d/intel-gpu-jammy.list && \
-    apt update && \
-    apt install -y intel-opencl-icd=23.30.26918.50-736~22.04 \
-    level-zero=1.13.1-719~22.04 \
-    intel-level-zero-gpu=1.3.26918.50-736~22.04 \
-    xpu-smi=1.2.22-31~22.04
-RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
-    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
-    apt update
+COPY ./tools/basekit_driver_install_helper.sh .
+RUN bash ./basekit_driver_install_helper.sh add-apt-repo && \
+    bash ./basekit_driver_install_helper.sh driver
 
-RUN useradd -m ubuntu
-RUN echo 'ubuntu ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers
+ARG GID_RENDER=109
+RUN useradd -m -s /bin/bash ubuntu && \
+    echo 'ubuntu ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers && \
+    groupadd -g $GID_RENDER render && \
+    usermod -a -G video,render ubuntu
 USER ubuntu
 WORKDIR /home/ubuntu
 
@@ -48,35 +44,23 @@ RUN curl -fsSL -v -o miniconda.sh -O https://repo.anaconda.com/miniconda/Minicon
     echo "\nsource ~/miniconda3/bin/activate" >> ./.bashrc
 
 FROM base AS dev
-RUN sudo apt update && \
-    sudo apt install -y level-zero-dev=1.13.1-719~22.04 \
-    intel-level-zero-gpu-dev=1.3.26918.50-736~22.04 \
-    intel-oneapi-dpcpp-cpp-2024.0 \
-    intel-oneapi-mkl-devel-2024.0
-RUN sudo apt clean && \
-    sudo rm -rf /var/lib/apt/lists/* && \
-    if [ -f /etc/apt/apt.conf.d/proxy.conf ]; then sudo rm /etc/apt/apt.conf.d/proxy.conf; fi
-RUN mkdir ./intel-extension-for-pytorch
-COPY . ./intel-extension-for-pytorch/
-RUN sudo chown -R ubuntu:ubuntu ./intel-extension-for-pytorch && \
-    cp ./intel-extension-for-pytorch/scripts/compile_bundle.sh ./ && \
+RUN bash /basekit_driver_install_helper.sh dev
+COPY --chown=ubuntu:ubuntu . ./intel-extension-for-pytorch/
+RUN cp ./intel-extension-for-pytorch/scripts/compile_bundle.sh ./ && \
     sed -i "s/VER_IPEX=.*/VER_IPEX=/" compile_bundle.sh
 RUN . ./miniconda3/bin/activate && \
     conda create -y -n compile_py310 python=3.10 && conda activate compile_py310 && \
-    bash compile_bundle.sh /opt/intel/oneapi/compiler/latest /opt/intel/oneapi/mkl/latest pvc,ats-m150,acm-g11
+    bash compile_bundle.sh /opt/intel/oneapi/compiler/latest /opt/intel/oneapi/mkl/latest pvc,ats-m150,acm-g11 && \
+    mkdir wheels && cp pytorch/dist/*.whl vision/dist/*.whl audio/dist/*.whl intel-extension-for-pytorch/dist/*.whl ./wheels
 
 FROM base AS deploy
-RUN sudo apt update && \
-    sudo apt install -y intel-oneapi-runtime-dpcpp-cpp=2024.0.0-49819 \
-    intel-oneapi-runtime-mkl=2024.0.0-49656
-RUN sudo apt clean && \
+RUN bash /basekit_driver_install_helper.sh runtime && \
+    sudo apt clean && \
     sudo rm -rf /var/lib/apt/lists/* && \
-    if [ -f /etc/apt/apt.conf.d/proxy.conf ]; then sudo rm /etc/apt/apt.conf.d/proxy.conf; fi
-RUN mkdir ./wheels
-COPY --from=dev /home/ubuntu/pytorch/dist/*.whl ./wheels
-COPY --from=dev /home/ubuntu/vision/dist/*.whl ./wheels
-COPY --from=dev /home/ubuntu/audio/dist/*.whl ./wheels
-COPY --from=dev /home/ubuntu/intel-extension-for-pytorch/dist/*.whl ./wheels
+    if [ -f /etc/apt/apt.conf.d/proxy.conf ]; then sudo rm /etc/apt/apt.conf.d/proxy.conf; fi && \
+    sudo rm /basekit_driver_install_helper.sh
+COPY --from=dev --chown=ubuntu:ubuntu /home/ubuntu/wheels ./wheels
+COPY --from=dev --chown=ubuntu:ubuntu /home/ubuntu/intel-extension-for-pytorch/tools/get_libstdcpp_lib.sh .
 RUN . ./miniconda3/bin/activate && \
     conda create -y -n py310 python=3.10 && conda activate py310 && \
     conda install -y libstdcxx-ng=12 libpng libjpeg-turbo -c conda-forge && \
@@ -85,4 +69,4 @@ RUN . ./miniconda3/bin/activate && \
     conda clean -a -y && \
     rm -rf ./wheels && \
     echo "conda activate py310" >> ./.bashrc && \
-    echo "export LD_PRELOAD=${CONDA_PREFIX}/lib/libstdc++.so" >> ./.bashrc
+    ldpreload=$(bash get_libstdcpp_lib.sh) && echo "export LD_PRELOAD=${ldpreload}" >> ./.bashrc && rm get_libstdcpp_lib.sh
```
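The `dev` stage above now collects the wheels it builds into `/home/ubuntu/wheels` before the `deploy` stage copies them in. If you also want those wheels on the host, one possible way (an untested sketch; the image tag and output path are arbitrary placeholders) is to stop at the `dev` stage and copy the directory out:

```bash
# Sketch only: build just the dev stage and copy the built wheels to the host.
DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.compile --target dev -t ipex-xpu-build:dev .
cid=$(docker create ipex-xpu-build:dev)       # create (but do not start) a container from that stage
docker cp "${cid}:/home/ubuntu/wheels" ./wheels
docker rm "${cid}"
```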

docker/README.md

Lines changed: 3 additions & 2 deletions
````diff
@@ -20,7 +20,9 @@ Run the following commands to build a docker image by compiling from source.
 git clone https://github.com/intel/intel-extension-for-pytorch.git
 cd intel-extension-for-pytorch
 git checkout v2.1.10+xpu
-docker build -f docker/Dockerfile.compile -t intel-extension-for-pytorch:2.1.10 .
+git submodule sync
+git submodule update --init --recursive
+docker build -f docker/Dockerfile.compile --build-arg GID_RENDER=$(getent group render | sed -E 's,^render:[^:]*:([^:]*):.*$,\1,') -t intel-extension-for-pytorch:2.1.10 .
 ```
 
 Alternatively, `./build.sh` script has docker build command to install prebuilt wheel files, update all the relevant build arguments and execute the script. Run the command below in current directory.
@@ -44,7 +46,6 @@ docker container so that the GPU is accessible.
 IMAGE_NAME=intel/intel-extension-for-pytorch:xpu
 ```
 ```bash
-
 docker run --rm \
     -v <your-local-dir>:/workspace \
     --device=/dev/dri \
````
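The new `GID_RENDER` build argument should match the GID of the host's `render` group so the non-root `ubuntu` user inside the image can open `/dev/dri`. A quick way to check the host value, plus a run command assembled from the flags used elsewhere in this commit (the image tag and workspace path below are placeholders):

```bash
# The third field is the GID that --build-arg GID_RENDER should carry (109 is the Dockerfile default).
getent group render            # e.g. render:x:109:youruser

# Start the built image with the GPU device nodes mapped through.
docker run --rm -it \
    -v /path/to/your/workspace:/workspace \
    --device=/dev/dri \
    --ipc=host \
    intel-extension-for-pytorch:2.1.10 bash
```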
examples/gpu/inference/python/llm/Dockerfile

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
```dockerfile
# NOTE: To build this you will need a docker version >= 19.03 and DOCKER_BUILDKIT=1
#
# If you do not use buildkit you are not going to have a good time
#
# For reference:
#   https://docs.docker.com/develop/develop-images/build_enhancements/

ARG BASE_IMAGE=ubuntu:22.04
FROM ${BASE_IMAGE} AS base
RUN if [ -f /etc/apt/apt.conf.d/proxy.conf ]; then rm /etc/apt/apt.conf.d/proxy.conf; fi && \
    if [ ! -z ${HTTP_PROXY} ]; then echo "Acquire::http::Proxy \"${HTTP_PROXY}\";" >> /etc/apt/apt.conf.d/proxy.conf; fi && \
    if [ ! -z ${HTTPS_PROXY} ]; then echo "Acquire::https::Proxy \"${HTTPS_PROXY}\";" >> /etc/apt/apt.conf.d/proxy.conf; fi
RUN apt update && \
    apt full-upgrade -y && \
    DEBIAN_FRONTEND=noninteractive apt install --no-install-recommends -y \
    sudo \
    git \
    wget \
    curl \
    vim \
    patch \
    gcc \
    g++ \
    make \
    pkg-config \
    software-properties-common \
    gnupg \
    gpg-agent
COPY ./tools/basekit_driver_install_helper.sh .
RUN bash ./basekit_driver_install_helper.sh add-apt-repo && \
    bash ./basekit_driver_install_helper.sh driver

ARG GID_RENDER=109
RUN useradd -m -s /bin/bash ubuntu && \
    echo 'ubuntu ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers && \
    groupadd -g $GID_RENDER render && \
    usermod -a -G video,render ubuntu
USER ubuntu
WORKDIR /home/ubuntu

RUN curl -fsSL -v -o miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
    bash miniconda.sh -b -p ./miniconda3 && \
    rm miniconda.sh && \
    echo "\nsource ~/miniconda3/bin/activate" >> ./.bashrc

FROM base AS dev
# --build-arg COMPILE=ON to compile from source
ARG COMPILE
RUN bash /basekit_driver_install_helper.sh dev
COPY --chown=ubuntu:ubuntu . ./intel-extension-for-pytorch/
RUN . ./miniconda3/bin/activate && \
    conda create -y -n compile_py310 python=3.10 && conda activate compile_py310 && \
    cd intel-extension-for-pytorch/examples/gpu/inference/python/llm && \
    if [ -z ${COMPILE} ]; then MODE=6; else MODE=2; fi && \
    bash tools/env_setup.sh ${MODE} /opt/intel/oneapi/compiler/latest /opt/intel/oneapi/mkl/latest pvc,ats-m150,acm-g11

FROM base AS deploy
RUN bash /basekit_driver_install_helper.sh runtime && \
    sudo apt clean && \
    sudo rm -rf /var/lib/apt/lists/* && \
    if [ -f /etc/apt/apt.conf.d/proxy.conf ]; then sudo rm /etc/apt/apt.conf.d/proxy.conf; fi && \
    sudo rm /basekit_driver_install_helper.sh
COPY --from=dev --chown=ubuntu:ubuntu /home/ubuntu/intel-extension-for-pytorch/examples/gpu/inference/python/llm ./llm
COPY --from=dev --chown=ubuntu:ubuntu /home/ubuntu/intel-extension-for-pytorch/tools/get_libstdcpp_lib.sh .
RUN . ./miniconda3/bin/activate && \
    conda create -y -n py310 python=3.10 && conda activate py310 && \
    echo "conda activate py310" >> ./.bashrc && \
    ldpreload=$(bash get_libstdcpp_lib.sh) && echo "export LD_PRELOAD=${ldpreload}" >> ./.bashrc && rm get_libstdcpp_lib.sh && \
    cd ./llm && \
    bash tools/env_setup.sh 1 && \
    python -m pip cache purge && \
    conda clean -a -y
```
examples/gpu/inference/python/llm/README.md

Lines changed: 176 additions & 0 deletions
@@ -0,0 +1,176 @@
# Text Generation

We provide inference benchmarking scripts for large language model text generation.<br />
Supported large language model families include LLaMA 2, GPT-J, OPT, and Bloom.<br />
The scripts include both single-instance and distributed (DeepSpeed) use cases.<br />
The scripts cover model generation inference with low-precision cases for different models, targeting the best performance and accuracy (fp16 AMP and weight-only quantization).<br />

# Supported Model List

| MODEL FAMILY | Verified < MODEL ID > (Huggingface hub)| FP16 | Weight only quantization INT4 |
|---|:---:|:---:|:---:|
|LLAMA 2| "meta-llama/Llama-2-7b-hf", "meta-llama/Llama-2-13b-hf", "meta-llama/Llama-2-70b-hf" |||
|GPT-J| "EleutherAI/gpt-j-6b" |||
|OPT|"facebook/opt-6.7b", "facebook/opt-30b"|||
|Bloom|"bigscience/bloom-7b1", "bigscience/bloom"|||

*Note*: The verified models above (including other models in the same model family, like "codellama/CodeLlama-7b-hf" from the LLAMA family) are well supported with all optimizations, such as indirect access KV cache and fused ROPE. For other LLM model families, work is in progress to cover those optimizations, which will expand the model list above.

# Supported Platforms

\* PVC (1550/1100): supports all the models in the model list<br />
\* ATS-M, Arc: support GPT-J (EleutherAI/gpt-j-6b)

# Environment Setup

1. Get the Intel® Extension for PyTorch\* source code

```bash
git clone https://github.com/intel/intel-extension-for-pytorch.git
cd intel-extension-for-pytorch
git checkout v2.1.10+xpu
git submodule sync
git submodule update --init --recursive
```

2.a. It is highly recommended to build a Docker container from the provided `Dockerfile` for single-instance executions.

```bash
# Build an image with the provided Dockerfile by compiling Intel® Extension for PyTorch* from source
DOCKER_BUILDKIT=1 docker build -f examples/gpu/inference/python/llm/Dockerfile --build-arg GID_RENDER=$(getent group render | sed -E 's,^render:[^:]*:([^:]*):.*$,\1,') --build-arg COMPILE=ON -t ipex-llm:2.1.10 .

# Build an image with the provided Dockerfile by installing from Intel® Extension for PyTorch* prebuilt wheel files
DOCKER_BUILDKIT=1 docker build -f examples/gpu/inference/python/llm/Dockerfile --build-arg GID_RENDER=$(getent group render | sed -E 's,^render:[^:]*:([^:]*):.*$,\1,') -t ipex-llm:2.1.10 .

# Run the container with the command below
docker run --rm -it --privileged --device=/dev/dri --ipc=host ipex-llm:2.1.10 bash

# When the command prompt shows inside the docker container, enter the llm examples directory
cd llm
```
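Before running the benchmarks (or choosing option 2.b below instead), it can help to confirm inside the container that the GPU device nodes were actually mapped through and that the `ubuntu` user landed in the expected groups; otherwise the `GID_RENDER` build argument did not match the host. A quick check:

```bash
# Inside the container: the card*/renderD* nodes should be listed and group-accessible by "render"
ls -l /dev/dri
id    # the ubuntu user should show "video" and "render" among its groups
```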
2.b. Alternatively, you can use the provided environment configuration script to set up an environment without using a docker container.

```bash
# GCC 12.3 is required. Installation can be taken care of by the environment configuration script.
# Create a conda environment
conda create -n llm python=3.9 -y
conda activate llm

# Set up the environment with the provided script
cd examples/gpu/inference/python/llm
# If you want to install Intel® Extension for PyTorch\* from prebuilt wheel files, use the command below:
bash ./tools/env_setup.sh 7
# If you want to install Intel® Extension for PyTorch\* from source, use the commands below:
bash ./tools/env_setup.sh 3 <DPCPP_ROOT> <ONEMKL_ROOT> <AOT>
export LD_PRELOAD=$(bash ../../../../../tools/get_libstdcpp_lib.sh)
```

\* `DPCPP_ROOT` is the placeholder for the path where the DPC++ compiler is installed. By default, it is `/opt/intel/oneapi/compiler/latest`.<br />
\* `ONEMKL_ROOT` is the placeholder for the path where oneMKL is installed. By default, it is `/opt/intel/oneapi/mkl/latest`.<br />
\* `AOT` is a text string to enable `Ahead-Of-Time` compilation for specific GPU models. Check the [tutorial](../../../../../docs/tutorials/technical_details/AOT.md) for details.<br />
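For example, with the default oneAPI install locations mentioned above and Ahead-Of-Time compilation targeting the same device list used in the Dockerfiles of this commit, the source-build invocation would look like the following (values are illustrative; substitute your own paths and AOT target):

```bash
# Illustrative source-build setup: default oneAPI paths, AOT list taken from the Dockerfile
bash ./tools/env_setup.sh 3 /opt/intel/oneapi/compiler/latest /opt/intel/oneapi/mkl/latest pvc,ats-m150,acm-g11
export LD_PRELOAD=$(bash ../../../../../tools/get_libstdcpp_lib.sh)
```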
3. Once an environment is configured with either method above, source the environment activation script to set the necessary environment variables.

```bash
# If you use docker images built from the provided Dockerfile, you do NOT need to run the following 2 commands.
source <DPCPP_ROOT>/env/vars.sh
source <ONEMKL_ROOT>/env/vars.sh

# Activate environment variables
source ./tools/env_activate.sh
```
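After the activation script has been sourced, a quick sanity check can confirm that the XPU build is picked up before launching any benchmark. The snippet below only assumes the standard import names; the exact attributes available may vary slightly across versions:

```bash
# Sanity check (assumes the conda env from the steps above is active)
python -c "import torch; import intel_extension_for_pytorch as ipex; print(torch.__version__, ipex.__version__)"
python -c "import torch; import intel_extension_for_pytorch as ipex; print('XPU available:', torch.xpu.is_available())"
```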
# Run Model Generation

| Benchmark mode | FP16 | Weight only quantization INT4 |
|---|:---:|:---:|
|Single instance |||
| Distributed (autotp) |||

## Example usages of the one-click bash script
You can run LLM inference with the one-click bash script `run_benchmark.sh` for all inference cases.
```bash
bash run_benchmark.sh
```

### Single Instance Performance

```bash
# fp16 benchmark
python -u run_generation.py --benchmark -m ${model} --sub-model-name ${sub_model_name} --num-beams ${beam} --num-iter ${iter} --batch-size ${bs} --input-tokens ${input} --max-new-tokens ${out} --device xpu --ipex --dtype float16 --token-latency
```

Notes:

(1) By default, generations use bs = 1, input token size = 1024, output token size = 128, iteration num = 10, and "beam search" with beam size = 4. For beam size = 1 and other settings, please export the corresponding environment variables, for example: "beam=1", "input=32", "output=32", "iter=5".
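As the note says, the defaults can be overridden through environment variables before invoking the one-click script. The variable names below come directly from the note above; the values are only examples:

```bash
# Example overrides (illustrative values); export them before running the one-click script
export beam=1
export input=32
export output=32
export iter=5
bash run_benchmark.sh
```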
### Distributed Performance with DeepSpeed

You can run LLM inference with the one-click bash script `run_benchmark_ds.sh` for all distributed inference cases.
```bash
bash run_benchmark_ds.sh
```

```bash
# distributed env setting
source ${ONECCL_DIR}/build/_install/env/setvars.sh
# fp16 benchmark
mpirun -np 2 --prepend-rank python -u run_generation_with_deepspeed.py --benchmark -m ${model} --sub-model-name ${sub_model_name} --num-beams ${beam} --num-iter ${iter} --batch-size ${bs} --input-tokens ${input} --max-new-tokens ${out} --device xpu --ipex --dtype float16 --token-latency
```

Notes:

(1) By default, generations use bs = 1, input token size = 1024, output token size = 128, iteration num = 10, and "beam search" with beam size = 4. For beam size = 1 and other settings, please export the corresponding environment variables, for example: "beam=1", "input=32", "output=32", "iter=5".

# Advanced Usage

## Weight only quantization with low precision checkpoint (Experimental)

Using INT4 weights can further improve performance by reducing memory bandwidth. However, direct per-channel quantization of weights to INT4 often results in poor accuracy. Some algorithms can modify weights through calibration before quantization to minimize the accuracy drop; GPTQ is one such algorithm. You may generate modified weights and quantization info (scales, zero points) for a certain model with a dataset for specified tasks using such algorithms. The results are saved as a `state_dict` in a `.pt` file. We provide a script here to run GPT-J.

### Single Instance GPT-J Weight only quantization Performance

```bash
# quantization benchmark
# To run the quantization performance benchmark, first get the quantized weight in step (1), then run the benchmark in step (2).
## (1) Get the quantized weight
# download link: https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/xpu/gptj_int4_weight_master.pt
export weight_path=<path-to-your-weight>

## (2) Run the quantization performance test
python -u run_generation.py --device xpu --ipex --dtype float16 --input-tokens ${input} --max-new-tokens ${out} --token-latency --benchmark --num-beams ${beam} -m ${model} --sub-model-name ${sub_model_name} --woq --woq_checkpoint_path ${weight_path}
```
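For step (1), the published GPT-J INT4 checkpoint can simply be downloaded and pointed at via `weight_path`; the URL is the one given above, while the local filename is only a choice:

```bash
# Fetch the pre-generated GPT-J INT4 checkpoint (step 1) and export its path for step 2
wget -O ./gptj_int4_weight_master.pt https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/xpu/gptj_int4_weight_master.pt
export weight_path=$(pwd)/gptj_int4_weight_master.pt
```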
### Single Instance GPT-J Weight only quantization INT4 Accuracy

```bash
# we use the "lambada_standard" task to check accuracy
LLM_ACC_TEST=1 python -u run_generation.py -m ${model} --sub-model-name ${sub_model_name} --ipex --dtype float16 --accuracy-only --acc-tasks ${task} --woq --woq_checkpoint_path ${weight_path}
```

## Single Instance Accuracy

For the accuracy test, choose `{TASK_NAME}` from this [link](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md); by default we use "lambada_standard".

```bash
# one-click bash script
bash run_accuracy.sh

# float16
# e.g. export task=lambada_standard
LLM_ACC_TEST=1 python -u run_generation.py -m ${model} --sub-model-name ${sub_model_name} --ipex --dtype float16 --accuracy-only --acc-tasks ${task}
```

## Distributed Accuracy with DeepSpeed

```bash
# Run distributed accuracy with 2 ranks of one node for float16 with ipex
source ${ONECCL_DIR}/build/_install/env/setvars.sh

# one-click bash script
bash run_accuracy_ds.sh

# float16
LLM_ACC_TEST=1 mpirun -np 2 --prepend-rank python -u run_generation_with_deepspeed.py -m ${model} --sub-model-name ${sub_model_name} --ipex --dtype float16 --accuracy-only --acc-tasks ${task} 2>&1
```
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
```bash
#!/bin/bash

export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
```
