
Commit 6266f89

Add deepspeed in docker (#3829)
* enable deepspeed in docker and update the llm readme accordingly
* update compile_bundle and README.md, remove DOCKER_BUILDKIT
* Update Dockerfile: remove BUILDKIT comments
* Update env_setup.sh: fix llm_eval package name
* Update env_setup.sh: set llm_eval version by dependency_version.yml
* Update dependency_version.yml: add llm_eval version 0.3.0
* update Dockerfile.compile from compile_bundle_main
* add ccl in compile bundle, copy from compile_bundle_main
1 parent 6384575 commit 6266f89

8 files changed: +201, -240 lines changed

dependency_version.yml

Lines changed: 2 additions & 0 deletions
@@ -28,6 +28,8 @@ transformers:
   commit: v4.31.0
 protobuf:
   version: 3.20.3
+llm_eval:
+  version: 0.3.0
 basekit:
   dpcpp-cpp-rt:
     version: 2024.0.0
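For context, the new `llm_eval` entry is consumed by `tools/env_setup.sh` (see its diff below). A minimal sketch of how the pinned version flows from the YAML file into the generated install script, assuming the repository's `tools/yaml_utils.py` helper and the `${AUX_INSTALL_SCRIPT}` path defined in `env_setup.sh`:

```bash
# Sketch: read the pinned llm_eval version from dependency_version.yml ...
VER_LLM_EVAL=$(python tools/yaml_utils.py -f dependency_version.yml -d llm_eval -k version)
# ... and record the matching pip install command, as env_setup.sh does.
echo "python -m pip install llm_eval==${VER_LLM_EVAL}" >> ${AUX_INSTALL_SCRIPT}
```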

docker/Dockerfile.compile

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ RUN cp ./intel-extension-for-pytorch/scripts/compile_bundle.sh ./ && \
     sed -i "s/VER_IPEX=.*/VER_IPEX=/" compile_bundle.sh
 RUN . ./miniconda3/bin/activate && \
     conda create -y -n compile_py310 python=3.10 && conda activate compile_py310 && \
-    bash compile_bundle.sh /opt/intel/oneapi/compiler/latest /opt/intel/oneapi/mkl/latest pvc,ats-m150,acm-g11 && \
+    bash compile_bundle.sh /opt/intel/oneapi/compiler/latest /opt/intel/oneapi/mkl/latest /opt/intel/oneapi/ccl/latest pvc,ats-m150,acm-g11 && \
     mkdir wheels && cp pytorch/dist/*.whl vision/dist/*.whl audio/dist/*.whl intel-extension-for-pytorch/dist/*.whl ./wheels
 
 FROM base AS deploy
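The updated `compile_bundle.sh` now takes the oneCCL root as a third positional argument, before the AOT target list. A hedged sketch of the standalone invocation implied by the change above, assuming the default oneAPI install locations and the same AOT string used in this Dockerfile:

```bash
# Sketch only: argument order follows the updated Dockerfile.compile
# (DPC++ compiler root, oneMKL root, oneCCL root, AOT target list).
bash compile_bundle.sh \
    /opt/intel/oneapi/compiler/latest \
    /opt/intel/oneapi/mkl/latest \
    /opt/intel/oneapi/ccl/latest \
    pvc,ats-m150,acm-g11
```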

examples/gpu/inference/python/llm/Dockerfile

Lines changed: 1 addition & 6 deletions
@@ -1,12 +1,7 @@
-# NOTE: To build this you will need a docker version >= 19.03 and DOCKER_BUILDKIT=1
-#
-# If you do not use buildkit you are not going to have a good time
-#
-# For reference:
-# https://docs.docker.com/develop/develop-images/build_enhancements/
 
 ARG BASE_IMAGE=ubuntu:22.04
 FROM ${BASE_IMAGE} AS base
+SHELL ["/bin/bash", "-c"]
 RUN if [ -f /etc/apt/apt.conf.d/proxy.conf ]; then rm /etc/apt/apt.conf.d/proxy.conf; fi && \
     if [ ! -z ${HTTP_PROXY} ]; then echo "Acquire::http::Proxy \"${HTTP_PROXY}\";" >> /etc/apt/apt.conf.d/proxy.conf; fi && \
     if [ ! -z ${HTTPS_PROXY} ]; then echo "Acquire::https::Proxy \"${HTTPS_PROXY}\";" >> /etc/apt/apt.conf.d/proxy.conf; fi
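With the BuildKit-only comments removed, the image builds with a plain `docker build`. As a hedged illustration of the proxy handling in the RUN step above, this is the apt configuration that step would leave behind when `HTTP_PROXY` is set in the build environment; the proxy URL is hypothetical and the sketch writes to a local file so it can run without root:

```bash
# Hypothetical proxy URL, for illustration only.
HTTP_PROXY=http://proxy.example.com:8080
# Same conditional as the Dockerfile, redirected to a local file instead of
# /etc/apt/apt.conf.d/proxy.conf.
if [ ! -z ${HTTP_PROXY} ]; then
    echo "Acquire::http::Proxy \"${HTTP_PROXY}\";" >> ./proxy.conf
fi
cat ./proxy.conf   # -> Acquire::http::Proxy "http://proxy.example.com:8080";
```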

examples/gpu/inference/python/llm/README.md

Lines changed: 46 additions & 60 deletions
@@ -27,84 +27,75 @@ Here you can find the inference benchmarking scripts for large language models (
 
 ## Environment Setup
 
-1. Get the Intel® Extension for PyTorch\* source code:
+
+### [Recommended] Docker-based environment setup with compilation from source
+
+
 
 ```bash
+# Get the Intel® Extension for PyTorch* source code
 git clone https://github.com/intel/intel-extension-for-pytorch.git
 cd intel-extension-for-pytorch
-git checkout v2.1.10+xpu
+git checkout v2.1.20+xpu
 git submodule sync
 git submodule update --init --recursive
-```
-
-2. Do one of the following:
 
-If you are planning to use DeepSpeed for execution, please use a bare-metal environment directly and follow 2.b session for the environment setup. Otherwise, we recommend you follow 2.a session with Docker, where the environment is already configured.
-
-a. (Recommended) Build a Docker container from the provided `Dockerfile` for single-instance executions.
+# Build an image with the provided Dockerfile by compiling Intel® Extension for PyTorch* from source
+docker build -f examples/gpu/inference/python/llm/Dockerfile --build-arg GID_RENDER=$(getent group render | sed -E 's,^render:[^:]*:([^:]*):.*$,\1,') --build-arg COMPILE=ON -t ipex-llm:2.1.20 .
 
-```bash
-# Build an image with the provided Dockerfile by compiling Intel® Extension for PyTorch* from source
-DOCKER_BUILDKIT=1 docker build -f examples/gpu/inference/python/llm/Dockerfile --build-arg GID_RENDER=$(getent group render | sed -E 's,^render:[^:]*:([^:]*):.*$,\1,') --build-arg COMPILE=ON -t ipex-llm:2.1.10 .
+# Build an image with the provided Dockerfile by installing from Intel® Extension for PyTorch* prebuilt wheel files
+docker build -f examples/gpu/inference/python/llm/Dockerfile --build-arg GID_RENDER=$(getent group render | sed -E 's,^render:[^:]*:([^:]*):.*$,\1,') -t ipex-llm:2.1.20 .
 
-# Build an image with the provided Dockerfile by installing from Intel® Extension for PyTorch* prebuilt wheel files
-DOCKER_BUILDKIT=1 docker build -f examples/gpu/inference/python/llm/Dockerfile --build-arg GID_RENDER=$(getent group render | sed -E 's,^render:[^:]*:([^:]*):.*$,\1,') -t ipex-llm:2.1.10 .
+# Run the container with command below
+docker run --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path \
+--ipc=host --net=host --cap-add=ALL -v /lib/modules:/lib/modules --workdir /workspace \
+--volume `pwd`/examples/gpu/inference/python/llm/:/workspace/llm ipex-llm:2.1.20 /bin/bash
 
-# Run the container with command below
-docker run --rm -it --privileged --device=/dev/dri --ipc=host ipex-llm:2.1.10 bash
 
-# When the command prompt shows inside the docker container, enter llm examples directory
-cd llm
-```
-b. Alternatively, use the provided environment configuration script to set up environment without using a docker container:
+# When the command prompt shows inside the docker container, enter llm examples directory
+cd llm
 
-Make sure the driver and Base Toolkit are installed without using a docker container. Refer to [Installation Guide](https://intel.github.io/intel-extension-for-pytorch/#installation?platform=gpu&version=v2.1.10%2Bxpu&os=linux%2Fwsl2&package=source).
-
-OneCCL is also required if you run with DeepSpeed. We recommend to use apt/yum/dnf to install the oneCCL package. Refer to [Base Toolkit Installation](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html) for adding the APT/YUM/DNF key and sources for first-time users.
+# Activate environment variables
+source ./tools/env_activate.sh
+```
 
-Example command:
+### Conda-based environment setup with compilation from source
 
-```bash
-sudo apt install intel-oneapi-ccl-devel=2021.11.1-6
-sudo yum install intel-oneapi-ccl-devel=2021.11.1-6
-sudo dnf install intel-oneapi-ccl-devel=2021.11.1-6
-```
+Make sure the driver and Base Toolkit are installed without using a docker container. Refer to [Installation Guide](https://intel.github.io/intel-extension-for-pytorch/#installation?platform=gpu&version=v2.1.10%2Bxpu&os=linux%2Fwsl2&package=source).
 
 
-```bash
-# Make sure you have GCC >= 11 is installed on your system.
-# Create a conda environment
-conda create -n llm python=3.10 -y
-conda activate llm
 
-# Setup the environment with the provided script
-cd examples/gpu/inference/python/llm
-# If you want to install Intel® Extension for PyTorch\* from prebuilt wheel files, use the command below:
-bash ./tools/env_setup.sh 7
-# If you want to install Intel® Extension for PyTorch\* from source, use the commands below:
-bash ./tools/env_setup.sh 3 <DPCPP_ROOT> <ONEMKL_ROOT> <ONECCL_ROOT> <AOT>
-export LD_PRELOAD=$(bash ../../../../../tools/get_libstdcpp_lib.sh)
-export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}
-source <DPCPP_ROOT>/env/vars.sh
-source <ONEMKL_ROOT>/env/vars.sh
-source <ONECCL_ROOT>/env/vars.sh
-source <MPI_ROOT>/env/vars.sh
-```
-where <br />
-- `DPCPP_ROOT` is the path to the DPC++ compiler. By default, it is `/opt/intel/oneapi/compiler/latest`.<br />
-- `ONEMKL_ROOT` is the path to oneMKL. By default, it is `/opt/intel/oneapi/mkl/latest`.<br />
-- `ONECCL_ROOT` is the path to oneCCL. By default, it is `/opt/intel/oneapi/ccl/latest`.<br />
-- `MPI_ROOT` is the path to oneAPI MPI library. By default, it is `/opt/intel/oneapi/mpi/latest`.<br />
-- `AOT` is a text string to enable `Ahead-Of-Time` compilation for specific GPU models. Check [tutorial](../../../../../docs/tutorials/technical_details/AOT.md) for details.<br />
+```bash
 
-3. Set necessary environment variables with the environment variables activation script.
+# Get the Intel® Extension for PyTorch* source code
+git clone https://github.com/intel/intel-extension-for-pytorch.git
+cd intel-extension-for-pytorch
+git checkout v2.1.20+xpu
+git submodule sync
+git submodule update --init --recursive
 
-```bash
-# Activate environment variables
+# Make sure you have GCC >= 11 is installed on your system.
+# Create a conda environment
+conda create -n llm python=3.10 -y
+conda activate llm
+conda install pkg-config
+# Setup the environment with the provided script
+cd examples/gpu/inference/python/llm
+# If you want to install Intel® Extension for PyTorch\* from prebuilt wheel files, use the command below:
+bash ./tools/env_setup.sh 7
+# If you want to install Intel® Extension for PyTorch\* from source, use the commands below:
+bash ./tools/env_setup.sh 3 <DPCPP_ROOT> <ONEMKL_ROOT> <ONECCL_ROOT> <AOT>
+export LD_PRELOAD=$(bash ../../../../../tools/get_libstdcpp_lib.sh)
+export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}
 source ./tools/env_activate.sh
+
 ```
 
+where <br />
+- `AOT` is a text string to enable `Ahead-Of-Time` compilation for specific GPU models. Check [tutorial](../../../../../docs/tutorials/technical_details/AOT.md) for details.<br />
 
+
+
 ## Run Models Generation
 
 | Benchmark mode | FP16 | Weight only quantization INT4 |
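The placeholders in `bash ./tools/env_setup.sh 3 <DPCPP_ROOT> <ONEMKL_ROOT> <ONECCL_ROOT> <AOT>` can be hard to read out of the diff above, so here is a hedged, concrete sketch assuming a default oneAPI installation under `/opt/intel/oneapi` and the same AOT target list used in `docker/Dockerfile.compile`; adjust the paths and targets for your own system:

```bash
# Sketch with assumed defaults; substitute your own oneAPI roots and AOT targets.
cd examples/gpu/inference/python/llm
bash ./tools/env_setup.sh 3 \
    /opt/intel/oneapi/compiler/latest \
    /opt/intel/oneapi/mkl/latest \
    /opt/intel/oneapi/ccl/latest \
    pvc,ats-m150,acm-g11
export LD_PRELOAD=$(bash ../../../../../tools/get_libstdcpp_lib.sh)
export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}
source ./tools/env_activate.sh
```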
@@ -141,8 +132,6 @@ bash run_benchmark_ds.sh
 ```
 
 ```bash
-# distributed env setting
-source ${ONECCL_ROOT}/env/setvars.sh
 # fp16 benchmark
 mpirun -np 2 --prepend-rank python -u run_generation_with_deepspeed.py --benchmark -m ${model} --num-beams ${beam} --num-iter ${iter} --batch-size ${bs} --input-tokens ${input} --max-new-tokens ${output} --device xpu --ipex --dtype float16 --token-latency
 ```
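The `mpirun` line above relies on shell variables that are set elsewhere in the README. A hedged example with illustrative values only; the model ID and the numbers below are placeholders, not recommendations:

```bash
# Placeholder values for the variables consumed by the mpirun command above.
model=meta-llama/Llama-2-7b-hf   # hypothetical model ID
beam=4
iter=10
bs=1
input=1024
output=128
mpirun -np 2 --prepend-rank python -u run_generation_with_deepspeed.py --benchmark \
    -m ${model} --num-beams ${beam} --num-iter ${iter} --batch-size ${bs} \
    --input-tokens ${input} --max-new-tokens ${output} \
    --device xpu --ipex --dtype float16 --token-latency
```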
@@ -190,9 +179,6 @@ LLM_ACC_TEST=1 python -u run_generation.py -m ${model} --ipex --dtype float16 --
 ### Distributed Accuracy with DeepSpeed
 
 ```bash
-# Run distributed accuracy with 2 ranks of one node for float16 with ipex
-source ${ONECCL_ROOT}/env/setvars.sh
-
 # one-click bash script
 bash run_accuracy_ds.sh
 
Lines changed: 15 additions & 0 deletions
@@ -1,3 +1,18 @@
 #!/bin/bash
 
+ONEAPI_ROOT=${ONEAPI_ROOT:-/opt/intel/oneapi}
+if test -f ${ONEAPI_ROOT}/setvars.sh ; then
+    source ${ONEAPI_ROOT}/setvars.sh
+else
+    export LD_LIBRARY_PATH=/opt/intel/oneapi/redist/opt/mpi/libfabric/lib:$LD_LIBRARY_PATH
+    export PATH=/opt/intel/oneapi/redist/bin:$PATH
+    export I_MPI_ROOT=/opt/intel/oneapi/redist/lib
+    export CCL_ROOT=/opt/intel/oneapi/redist
+    export FI_PROVIDER_PATH=/opt/intel/oneapi/redist/opt/mpi/libfabric/lib/prov
+fi
+
 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
+export ENABLE_SDP_FUSION=1
+export TORCH_LLM_ALLREDUCE=1
+
+
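For context, this activation script is meant to be sourced from the example directory, as the README's Docker instructions do. A brief hedged usage sketch; the `echo` is only there to confirm the new DeepSpeed-related variables are exported:

```bash
# Source the activation script (falls back to the /opt/intel/oneapi/redist
# layout when setvars.sh is not present, per the diff above).
source ./tools/env_activate.sh
# Confirm the new flags are set (expected output: 1 1 2).
echo ${ENABLE_SDP_FUSION} ${TORCH_LLM_ALLREDUCE} ${SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS}
```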

examples/gpu/inference/python/llm/tools/env_setup.sh

Lines changed: 7 additions & 99 deletions
@@ -50,7 +50,6 @@ fi
 # Save current directory path
 BASEFOLDER=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 WHEELFOLDER=${BASEFOLDER}/../wheels
-TORCH_INSTALL_SCRIPT=${WHEELFOLDER}/torch_install.sh
 AUX_INSTALL_SCRIPT=${WHEELFOLDER}/aux_install.sh
 cd ${BASEFOLDER}/..
 
@@ -99,6 +98,7 @@ if [ $((${MODE} & 0x02)) -ne 0 ]; then
     VER_TORCH=$(python tools/yaml_utils.py -f dependency_version.yml -d pytorch -k version)
     TRANSFORMERS_COMMIT=$(python tools/yaml_utils.py -f dependency_version.yml -d transformers -k commit)
     VER_PROTOBUF=$(python tools/yaml_utils.py -f dependency_version.yml -d protobuf -k version)
+    VER_LLM_EVAL=$(python tools/yaml_utils.py -f dependency_version.yml -d llm_eval -k version)
    VER_IPEX_MAJOR=$(grep "VERSION_MAJOR" version.txt | cut -d " " -f 2)
     VER_IPEX_MINOR=$(grep "VERSION_MINOR" version.txt | cut -d " " -f 2)
     VER_IPEX_PATCH=$(grep "VERSION_PATCH" version.txt | cut -d " " -f 2)
@@ -122,114 +122,29 @@ if [ $((${MODE} & 0x02)) -ne 0 ]; then
     conda install -y cmake ninja
 
     echo "#!/bin/bash" > ${AUX_INSTALL_SCRIPT}
-    echo "#!/bin/bash" > ${TORCH_INSTALL_SCRIPT}
     if [ $((${MODE} & 0x04)) -ne 0 ]; then
-        echo "python -m pip install torch==${VER_TORCH} intel-extension-for-pytorch==${VER_IPEX} oneccl-bind-pt==${VER_TORCHCCL} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/" >> ${TORCH_INSTALL_SCRIPT}
+        echo "python -m pip install torch==${VER_TORCH} intel-extension-for-pytorch==${VER_IPEX} oneccl-bind-pt==${VER_TORCHCCL} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/" >> ${AUX_INSTALL_SCRIPT}
         python -m pip install torch==${VER_TORCH} intel-extension-for-pytorch==${VER_IPEX} oneccl-bind-pt==${VER_TORCHCCL} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
     else
         if [ ! -f ${ONECCL_ROOT}/env/vars.sh ]; then
             echo "oneCCL environment ${ONECCL_ROOT} doesn't seem to exist."
             exit 6
         fi
-        ONEAPIROOT=${ONEMKL_ROOT}/../..
 
         # Install PyTorch and Intel® Extension for PyTorch*
         cp intel-extension-for-pytorch/scripts/compile_bundle.sh .
         sed -i "s/VER_IPEX=.*/VER_IPEX=/" compile_bundle.sh
-        bash compile_bundle.sh ${DPCPP_ROOT} ${ONEMKL_ROOT} ${AOT} 0
+        bash compile_bundle.sh ${DPCPP_ROOT} ${ONEMKL_ROOT} ${ONECCL_ROOT} ${AOT} 1
         cp pytorch/dist/*.whl ${WHEELFOLDER}
         cp intel-extension-for-pytorch/dist/*.whl ${WHEELFOLDER}
-        rm -rf compile_bundle.sh llvm-project llvm-release pytorch
+        cp torch-ccl/dist/*.whl ${WHEELFOLDER}
+        rm -rf compile_bundle.sh llvm-project llvm-release pytorch torch-ccl
         export LD_PRELOAD=$(bash intel-extension-for-pytorch/tools/get_libstdcpp_lib.sh)
-
-        # The following is only for DeepSpeed case
-        #Install oneccl-bind-pt(also named torch-ccl)
-        set +e
-        function env_backup() {
-            key=$1
-            env | grep ${key} > /dev/null
-            if [ $? -gt 0 ]; then
-                echo "unset"
-            else
-                value=$(env | grep "^${key}=")
-                echo ${value#"${key}="}
-            fi
-        }
-        function env_recover() {
-            key=$1
-            value=$2
-            if [ "$value" == "unset" ]; then
-                unset ${key}
-            else
-                export ${key}=${value}
-            fi
-        }
-
-        PKG_CONFIG_PATH_BK=$(env_backup PKG_CONFIG_PATH)
-        ACL_BOARD_VENDOR_PATH_BK=$(env_backup ACL_BOARD_VENDOR_PATH)
-        FPGA_VARS_DIR_BK=$(env_backup FPGA_VARS_DIR)
-        DIAGUTIL_PATH_BK=$(env_backup DIAGUTIL_PATH)
-        MANPATH_BK=$(env_backup MANPATH)
-        CMAKE_PREFIX_PATH_BK=$(env_backup CMAKE_PREFIX_PATH)
-        CMPLR_ROOT_BK=$(env_backup CMPLR_ROOT)
-        FPGA_VARS_ARGS_BK=$(env_backup FPGA_VARS_ARGS)
-        LIBRARY_PATH_BK=$(env_backup LIBRARY_PATH)
-        OCL_ICD_FILENAMES_BK=$(env_backup OCL_ICD_FILENAMES)
-        INTELFPGAOCLSDKROOT_BK=$(env_backup INTELFPGAOCLSDKROOT)
-        LD_LIBRARY_PATH_BK=$(env_backup LD_LIBRARY_PATH)
-        MKLROOT_BK=$(env_backup MKLROOT)
-        NLSPATH_BK=$(env_backup NLSPATH)
-        PATH_BK=$(env_backup PATH)
-        CPATH_BK=$(env_backup CPATH)
-        set -e
-        source ${DPCPP_ROOT}/env/vars.sh
-        source ${ONEMKL_ROOT}/env/vars.sh
-
-        if [ -d torch-ccl ]; then
-            rm -rf torch-ccl
-        fi
-        git clone ${TORCHCCL_REPO}
-        cd torch-ccl
-        git checkout ${TORCHCCL_COMMIT}
-        git submodule sync
-        git submodule update --init --recursive
-        if [ -d ${CONDA_PREFIX}/lib/gcc/x86_64-conda-linux-gnu ]; then
-            export DPCPP_GCC_INSTALL_DIR="${CONDA_PREFIX}/lib/gcc/x86_64-conda-linux-gnu/12.3.0"
-        fi
-        export INTELONEAPIROOT=${ONEAPIROOT}
-        USE_SYSTEM_ONECCL=ON COMPUTE_BACKEND=dpcpp python setup.py bdist_wheel
-        unset INTELONEAPIROOT
-        if [ -d ${CONDA_PREFIX}/lib/gcc/x86_64-conda-linux-gnu ]; then
-            unset DPCPP_GCC_INSTALL_DIR
-        fi
-        cp dist/*.whl ${WHEELFOLDER}
-        python -m pip install dist/*.whl
-        cd ..
-        rm -rf torch-ccl
-
-        set +e
-        env_recover PKG_CONFIG_PATH ${PKG_CONFIG_PATH_BK}
-        env_recover ACL_BOARD_VENDOR_PATH ${ACL_BOARD_VENDOR_PATH_BK}
-        env_recover FPGA_VARS_DIR ${FPGA_VARS_DIR_BK}
-        env_recover DIAGUTIL_PATH ${DIAGUTIL_PATH_BK}
-        env_recover MANPATH ${MANPATH_BK}
-        env_recover CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH_BK}
-        env_recover CMPLR_ROOT ${CMPLR_ROOT_BK}
-        env_recover FPGA_VARS_ARGS ${FPGA_VARS_ARGS_BK}
-        env_recover LIBRARY_PATH ${LIBRARY_PATH_BK}
-        env_recover OCL_ICD_FILENAMES ${OCL_ICD_FILENAMES_BK}
-        env_recover INTELFPGAOCLSDKROOT ${INTELFPGAOCLSDKROOT_BK}
-        env_recover LD_LIBRARY_PATH ${LD_LIBRARY_PATH_BK}
-        env_recover MKLROOT ${MKLROOT_BK}
-        env_recover NLSPATH ${NLSPATH_BK}
-        env_recover PATH ${PATH_BK}
-        env_recover CPATH ${CPATH_BK}
-        set -e
     fi
 
     echo "python -m pip install impi-devel" >> ${AUX_INSTALL_SCRIPT}
-    echo "python -m pip install cpuid accelerate datasets sentencepiece protobuf==${VER_PROTOBUF} huggingface_hub mpi4py mkl" >> ${AUX_INSTALL_SCRIPT}
-    echo "python -m pip install lm_eval" >> ${AUX_INSTALL_SCRIPT}
+    echo "python -m pip install cpuid accelerate datasets sentencepiece diffusers protobuf==${VER_PROTOBUF} huggingface_hub mpi4py mkl" >> ${AUX_INSTALL_SCRIPT}
+    echo "python -m pip install llm_eval==${VER_LLM_EVAL}" >> ${AUX_INSTALL_SCRIPT}
 
 
     # Install Transformers
@@ -277,14 +192,7 @@ if [ $((${MODE} & 0x02)) -ne 0 ]; then
     rm -rf DeepSpeed
 fi
 if [ $((${MODE} & 0x01)) -ne 0 ]; then
-    bash ${TORCH_INSTALL_SCRIPT}
     python -m pip install ${WHEELFOLDER}/*.whl
     bash ${AUX_INSTALL_SCRIPT}
     rm -rf ${WHEELFOLDER}
-    if [ -f ${TORCH_INSTALL_SCRIPT} ]; then
-        rm ${TORCH_INSTALL_SCRIPT}
-    fi
-    if [ -f ${AUX_INSTALL_SCRIPT} ]; then
-        rm ${AUX_INSTALL_SCRIPT}
-    fi
 fi
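The `MODE` checks above (`0x01`, `0x02`, `0x04`) explain the `7` and `3` arguments used in the README: bit `0x02` enables the build/prepare phase, `0x04` switches that phase to prebuilt wheels, and `0x01` installs the collected wheels plus the auxiliary packages. A hedged arithmetic sketch, assuming this is the full meaning of the flags:

```bash
# Assumed decomposition of the MODE argument to env_setup.sh:
#   0x01 -> install the collected wheels and run ${AUX_INSTALL_SCRIPT}
#   0x02 -> run the build/prepare phase
#   0x04 -> within that phase, use prebuilt wheels instead of compile_bundle.sh
echo $((0x01 | 0x02 | 0x04))   # 7: prebuilt-wheel path  (bash ./tools/env_setup.sh 7)
echo $((0x01 | 0x02))          # 3: compile-from-source path (bash ./tools/env_setup.sh 3 ...)
```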
