# Text Generation

We provide inference benchmarking scripts for text generation with large language models.<br />
Supported large language model families include LLaMA 2, GPT-J, OPT, and Bloom.<br />
The scripts include both single-instance and distributed (DeepSpeed) use cases.<br />
The scripts cover model generation inference with low-precision cases for different models, with the best performance and accuracy (fp16 AMP and weight-only quantization).<br />

# Supported Model List

| MODEL FAMILY | Verified < MODEL ID > (Hugging Face hub) | FP16 | Weight-only quantization INT4 |
|---|:---:|:---:|:---:|
|LLAMA 2| "meta-llama/Llama-2-7b-hf", "meta-llama/Llama-2-13b-hf", "meta-llama/Llama-2-70b-hf" | ✅ | ❎ |
|GPT-J| "EleutherAI/gpt-j-6b" | ✅ | ✅ |
|OPT|"facebook/opt-6.7b", "facebook/opt-30b"| ✅ | ❎ |
|Bloom|"bigscience/bloom-7b1", "bigscience/bloom"| ✅ | ❎ |

*Note*: The verified models above (including other models in the same model family, like "codellama/CodeLlama-7b-hf" from the LLAMA family) are well supported with all optimizations, such as indirect-access KV cache and fused ROPE. For other LLM model families, work is in progress to cover those optimizations, which will expand the model list above.

# Supported Platforms

\* PVC (1550/1100): supports all the models in the model list<br />
\* ATS-M, Arc: support GPT-J (EleutherAI/gpt-j-6b)

# Environment Setup

1. Get the Intel® Extension for PyTorch\* source code

```bash
git clone https://github.com/intel/intel-extension-for-pytorch.git
cd intel-extension-for-pytorch
git checkout v2.1.10+xpu
git submodule sync
git submodule update --init --recursive
```

2.a. It is highly recommended to build a Docker container from the provided `Dockerfile` for single-instance executions.

```bash
# Build an image with the provided Dockerfile by compiling Intel® Extension for PyTorch* from source
DOCKER_BUILDKIT=1 docker build -f examples/gpu/inference/python/llm/Dockerfile --build-arg GID_RENDER=$(getent group render | sed -E 's,^render:[^:]*:([^:]*):.*$,\1,') --build-arg COMPILE=ON -t ipex-llm:2.1.10 .

# Build an image with the provided Dockerfile by installing Intel® Extension for PyTorch* from prebuilt wheel files
DOCKER_BUILDKIT=1 docker build -f examples/gpu/inference/python/llm/Dockerfile --build-arg GID_RENDER=$(getent group render | sed -E 's,^render:[^:]*:([^:]*):.*$,\1,') -t ipex-llm:2.1.10 .

# Run the container with the command below
docker run --rm -it --privileged --device=/dev/dri --ipc=host ipex-llm:2.1.10 bash

# Once the command prompt shows that you are inside the docker container, enter the llm examples directory
cd llm
```
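
If you want to reuse models already downloaded on the host instead of re-downloading them inside the container, you can mount a Hugging Face cache directory when starting the container. This is a minimal sketch, not part of the provided `Dockerfile`: the host path and the in-container path `/hf_cache` are placeholders to adjust for your setup, and `HF_HOME` is the standard Hugging Face environment variable for relocating the cache.

```bash
# Mount a host Hugging Face cache into the container (paths are examples; adjust to your setup).
# HF_HOME tells transformers/huggingface_hub where to look for and store downloaded models.
docker run --rm -it --privileged --device=/dev/dri --ipc=host \
    -v /path/on/host/hf_cache:/hf_cache \
    -e HF_HOME=/hf_cache \
    ipex-llm:2.1.10 bash
```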

2.b. Alternatively, you can use the provided environment configuration script to set up an environment without a docker container.

```bash
# GCC 12.3 is required. Its installation can be taken care of by the environment configuration script.
# Create a conda environment
conda create -n llm python=3.9 -y
conda activate llm

# Set up the environment with the provided script
cd examples/gpu/inference/python/llm
# If you want to install Intel® Extension for PyTorch\* from prebuilt wheel files, use the command below:
bash ./tools/env_setup.sh 7
# If you want to install Intel® Extension for PyTorch\* from source, use the commands below:
bash ./tools/env_setup.sh 3 <DPCPP_ROOT> <ONEMKL_ROOT> <AOT>
export LD_PRELOAD=$(bash ../../../../../tools/get_libstdcpp_lib.sh)
```

\* `DPCPP_ROOT` is the placeholder for the path where the DPC++ compiler was installed. By default, it is `/opt/intel/oneapi/compiler/latest`.<br />
\* `ONEMKL_ROOT` is the placeholder for the path where oneMKL was installed. By default, it is `/opt/intel/oneapi/mkl/latest`.<br />
\* `AOT` is a text string to enable `Ahead-Of-Time` compilation for specific GPU models. Check the [tutorial](../../../../../docs/tutorials/technical_details/AOT.md) for details.<br />
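
For reference, a source build with a default oneAPI installation could look like the sketch below. The paths match the defaults listed above; the AOT value `pvc` is only an illustrative target (assumed here for Intel® Data Center GPU Max Series) and should be chosen for your own GPU per the AOT tutorial linked above.

```bash
# Example invocation of the setup script for a source build (adjust paths and AOT target as needed)
bash ./tools/env_setup.sh 3 /opt/intel/oneapi/compiler/latest /opt/intel/oneapi/mkl/latest pvc
export LD_PRELOAD=$(bash ../../../../../tools/get_libstdcpp_lib.sh)
```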

3. Once an environment is configured with either method above, set the necessary environment variables with the environment activation script.

```bash
# If you use docker images built from the provided Dockerfile, you do NOT need to run the following 2 commands.
source <DPCPP_ROOT>/env/vars.sh
source <ONEMKL_ROOT>/env/vars.sh

# Activate environment variables
source ./tools/env_activate.sh
```

# Run Model Generations

| Benchmark mode | FP16 | Weight-only quantization INT4 |
|---|:---:|:---:|
|Single instance | ✅ | ✅ |
| Distributed (autotp) | ✅ | ❎ |

## Example usages of the one-click script

You can run LLM inference for all benchmark cases with the one-click bash script "run_benchmark.sh".

```bash
bash run_benchmark.sh
```

### Single Instance Performance

```bash
# fp16 benchmark
python -u run_generation.py --benchmark -m ${model} --sub-model-name ${sub_model_name} --num-beams ${beam} --num-iter ${iter} --batch-size ${bs} --input-tokens ${input} --max-new-tokens ${out} --device xpu --ipex --dtype float16 --token-latency
```

Notes:

(1) By default, generations use bs = 1, input token size = 1024, output token size = 128, iteration num = 10, and "beam search" with beam size = 4. For beam size = 1 and other settings, please export env settings such as "beam=1", "input=32", "output=32", "iter=5", as in the example below.
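
For example, a short-sequence greedy-search run could be configured as follows. This is a minimal sketch based on the note above; it assumes the exported variables are picked up by the one-click benchmark script, and the values are only illustrative.

```bash
# Override the default generation settings (illustrative values)
export beam=1        # beam size 1 (greedy search)
export input=32      # input token size
export output=32     # output token size
export iter=5        # number of benchmark iterations

bash run_benchmark.sh
```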

### Distributed Performance with DeepSpeed

You can run distributed LLM inference for all benchmark cases with the one-click bash script "run_benchmark_ds.sh".

```bash
bash run_benchmark_ds.sh
```

```bash
# distributed env setting
source ${ONECCL_DIR}/build/_install/env/setvars.sh
# fp16 benchmark
mpirun -np 2 --prepend-rank python -u run_generation_with_deepspeed.py --benchmark -m ${model} --sub-model-name ${sub_model_name} --num-beams ${beam} --num-iter ${iter} --batch-size ${bs} --input-tokens ${input} --max-new-tokens ${out} --device xpu --ipex --dtype float16 --token-latency
```

Notes:

(1) By default, generations use bs = 1, input token size = 1024, output token size = 128, iteration num = 10, and "beam search" with beam size = 4. For beam size = 1 and other settings, please export env settings such as "beam=1", "input=32", "output=32", "iter=5", as in the example below.
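
As with the single-instance case, a minimal sketch of overriding the defaults before a distributed run is shown below; it assumes the exported variables are picked up by the one-click distributed script, and the values are only illustrative.

```bash
# Override the default generation settings for the distributed benchmark (illustrative values)
export beam=1
export input=32
export output=32
export iter=5

bash run_benchmark_ds.sh
```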

# Advanced Usage

## Weight-only quantization with low-precision checkpoint (Experimental)

Using INT4 weights can further improve performance by reducing memory bandwidth. However, direct per-channel quantization of weights to INT4 typically results in poor accuracy. Some algorithms modify weights through calibration before quantizing them, to minimize the accuracy drop; GPTQ is one such algorithm. You can generate modified weights and quantization info (scales, zero points) for a given model with a dataset for specified tasks using such algorithms. The results are saved as a `state_dict` in a `.pt` file. We provide a script here to run GPT-J.

### Single Instance GPT-J Weight-only Quantization Performance

```bash
# quantization benchmark
# To run the quantization performance benchmark, first get the quantized weight in step (1), then run the benchmark in step (2).

## (1) Get the quantized weight
# Download link: https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/xpu/gptj_int4_weight_master.pt
export weight_path=<path-to-your-weight>

## (2) Run the quantization performance test
python -u run_generation.py --device xpu --ipex --dtype float16 --input-tokens ${input} --max-new-tokens ${out} --token-latency --benchmark --num-beams ${beam} -m ${model} --sub-model-name ${sub_model_name} --woq --woq_checkpoint_path ${weight_path}
```

### Single Instance GPT-J Weight-only Quantization INT4 Accuracy

```bash
# We use the "lambada_standard" task to check accuracy
LLM_ACC_TEST=1 python -u run_generation.py -m ${model} --sub-model-name ${sub_model_name} --ipex --dtype float16 --accuracy-only --acc-tasks ${task} --woq --woq_checkpoint_path ${weight_path}
```

## Single Instance Accuracy

Set the accuracy test task `{TASK_NAME}` from the choices in this [link](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md); by default we use "lambada_standard".

```bash
# one-click bash script
bash run_accuracy.sh

# float16
LLM_ACC_TEST=1 python -u run_generation.py -m ${model} --sub-model-name ${sub_model_name} --ipex --dtype float16 --accuracy-only --acc-tasks ${task}
```
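
For example, a GPT-J accuracy check could look like the sketch below. The variable values are only illustrative assumptions (in particular, the `--sub-model-name` value is a guess; check the script's accepted names for your setup).

```bash
# Illustrative accuracy run for GPT-J on lambada_standard (values are assumptions; adjust as needed)
model="EleutherAI/gpt-j-6b"
sub_model_name="gpt-j-6b"
task="lambada_standard"
LLM_ACC_TEST=1 python -u run_generation.py -m ${model} --sub-model-name ${sub_model_name} --ipex --dtype float16 --accuracy-only --acc-tasks ${task}
```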

## Distributed Accuracy with DeepSpeed

```bash
# Run distributed accuracy with 2 ranks on one node for float16 with ipex
source ${ONECCL_DIR}/build/_install/env/setvars.sh

# one-click bash script
bash run_accuracy_ds.sh

# float16
LLM_ACC_TEST=1 mpirun -np 2 --prepend-rank python -u run_generation_with_deepspeed.py -m ${model} --sub-model-name ${sub_model_name} --ipex --dtype float16 --accuracy-only --acc-tasks ${task} 2>&1
```