
Commit f375ec8

[ROCm] Upgrade xformers version for ROCm & update doc (#2079)
Co-authored-by: miloice <[email protected]>
1 parent 518369d commit f375ec8

6 files changed (+84, -56 lines)


Dockerfile.rocm

Lines changed: 2 additions & 2 deletions
@@ -47,12 +47,12 @@ RUN mkdir libs \
 COPY ./ /app/vllm
 
 RUN python3 -m pip install --upgrade pip
-RUN pip install xformers==0.0.22.post7 --no-deps
+RUN pip install xformers==0.0.23 --no-deps
 
 RUN cd /app \
     && cd vllm \
    && pip install -U -r requirements-rocm.txt \
-    && bash patch_xformers-0.0.22.post7.rocm.sh \
+    && bash patch_xformers-0.0.23.rocm.sh \
    && python3 setup.py install \
    && cd ..
 
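
As a point of reference, here is a minimal sketch of how the updated Dockerfile might be exercised; the image tag vllm-rocm and the device flags are illustrative assumptions, not part of this commit:

    # Build the ROCm image from the repository root with the updated Dockerfile.
    docker build -f Dockerfile.rocm -t vllm-rocm .

    # Run it with GPU access; the exact device and permission flags depend on the
    # host ROCm setup (the docs in this commit pass --network=host and --group-add=video).
    docker run -it --network=host --group-add=video \
        --device /dev/kfd --device /dev/dri \
        vllm-rocm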

docs/source/getting_started/amd-installation.rst

Lines changed: 8 additions & 8 deletions
@@ -3,7 +3,7 @@
 Installation with ROCm
 ======================
 
-vLLM 0.2.x onwards supports model inferencing and serving on AMD GPUs with ROCm.
+vLLM 0.2.4 onwards supports model inferencing and serving on AMD GPUs with ROCm.
 At the moment AWQ quantization is not supported in ROCm, but SqueezeLLM quantization has been ported.
 Data types currently supported in ROCm are FP16 and BF16.
 
@@ -29,7 +29,7 @@ Installation options:
 
 .. code-block:: console
 
-    $ docker pull embeddedllminfo/vllm-rocm:vllm-v0.2.3
+    $ docker pull embeddedllminfo/vllm-rocm:vllm-v0.2.4
     $ docker run -it \
        --network=host \
        --group-add=video \
@@ -70,12 +70,12 @@ You can build and install vLLM from source:
 - ROCm's Flash-attention-2 (v2.0.4) does not support sliding windows attention.
 - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
 
-2. Setup `xformers==0.0.22.post7` without dependencies, and apply patches to adapt for ROCm flash attention
+2. Setup `xformers==0.0.23` without dependencies, and apply patches to adapt for ROCm flash attention
 
 .. code-block:: console
 
-   $ pip install xformers==0.0.22.post7 --no-deps
-   $ bash patch_xformers-0.0.22.post7.rocm.sh
+   $ pip install xformers==0.0.23 --no-deps
+   $ bash patch_xformers.rocm.sh
 
 3. Build vLLM.
 
@@ -127,12 +127,12 @@ Alternatively, if you plan to install vLLM-ROCm on a local machine or start from
 - ROCm's Flash-attention-2 (v2.0.4) does not support sliding windows attention.
 - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
 
-2. Setup `xformers==0.0.22.post7` without dependencies, and apply patches to adapt for ROCm flash attention
+2. Setup `xformers==0.0.23` without dependencies, and apply patches to adapt for ROCm flash attention
 
 .. code-block:: console
 
-   $ pip install xformers==0.0.22.post7 --no-deps
-   $ bash patch_xformers-0.0.22.post7.rocm.sh
+   $ pip install xformers==0.0.23 --no-deps
+   $ bash patch_xformers.rocm.sh
 
 3. Build vLLM.
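
Taken together, the updated from-source flow documented above roughly becomes the following; this is a sketch assembled from the Dockerfile and doc changes in this commit, run from inside the vLLM checkout:

    # Install the pinned xformers build without pulling in its CUDA dependencies,
    # then apply the ROCm patches via the renamed helper script.
    pip install xformers==0.0.23 --no-deps
    bash patch_xformers.rocm.sh

    # Install the ROCm requirements and build vLLM itself.
    pip install -U -r requirements-rocm.txt
    python3 setup.py install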

patch_xformers-0.0.22.post7.rocm.sh renamed to patch_xformers.rocm.sh

Lines changed: 17 additions & 6 deletions
@@ -1,21 +1,32 @@
 #!/bin/bash
+set -e
+
+XFORMERS_VERSION="0.0.23"
+
+export XFORMERS_INSTALLED_VERSION=$(python -c 'import xformers; print(xformers.__version__)')
+
+if [ "$XFORMERS_INSTALLED_VERSION" != "$XFORMERS_VERSION" ]; then
+    echo "ERROR: xformers version must be ${XFORMERS_VERSION}. ${XFORMERS_INSTALLED_VERSION} is installed"
+    exit 1
+fi
+
 export XFORMERS_FMHA_FLASH_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.flash.__file__)')
 export XFORMERS_FMHA_COMMON_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.common.__file__)')
 
-echo $XFORMERS_FMHA_FLASH_PATH
-echo $XFORMERS_FMHA_COMMON_PATH
+echo "XFORMERS_FMHA_FLASH_PATH = ${XFORMERS_FMHA_FLASH_PATH}"
+echo "XFORMERS_FMHA_COMMON_PATH = ${XFORMERS_FMHA_COMMON_PATH}"
 
-if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch"; then
+if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-${XFORMERS_VERSION}.rocm.patch"; then
    echo "Applying patch to ${XFORMERS_FMHA_FLASH_PATH}"
-    patch -p0 $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch"
+    patch -p0 $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-${XFORMERS_VERSION}.rocm.patch"
    echo "Successfully patch ${XFORMERS_FMHA_FLASH_PATH}"
 else
    echo "${XFORMERS_FMHA_FLASH_PATH} was patched before"
 fi
 
-if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch"; then
+if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-${XFORMERS_VERSION}.rocm.patch"; then
    echo "Applying patch to ${XFORMERS_FMHA_COMMON_PATH}"
-    patch -p0 $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch"
+    patch -p0 $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-${XFORMERS_VERSION}.rocm.patch"
    echo "Successfully patch ${XFORMERS_FMHA_COMMON_PATH}"
 else
    echo "${XFORMERS_FMHA_COMMON_PATH} was patched before"

requirements-rocm.txt

Lines changed: 0 additions & 1 deletion
@@ -8,7 +8,6 @@ pyarrow  # Required for Ray data.
 sentencepiece  # Required for LLaMA tokenizer.
 numpy
 tokenizers>=0.15.0
-huggingface_hub<0.18,>=0.16.4
 transformers >= 4.36.0  # Required for Mixtral.
 fastapi
 uvicorn[standard]
rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch renamed to rocm_patch/commonpy_xformers-0.0.23.rocm.patch

File renamed without changes.

rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch renamed to rocm_patch/flashpy_xformers-0.0.23.rocm.patch

Lines changed: 57 additions & 39 deletions
@@ -1,6 +1,6 @@
---- /opt/conda/envs/py_3.10/lib/python3.10/site-packages/xformers/ops/fmha/flash.py 2023-11-29 03:17:03.930103539 +0000
-+++ flash.py 2023-11-28 16:14:25.206128903 +0000
-@@ -31,39 +31,39 @@
+--- flash_ori.py 2023-12-13 05:43:31.530752623 +0000
++++ flash_patch.py 2023-12-13 06:00:45.962403104 +0000
+@@ -36,44 +36,44 @@
 
  FLASH_VERSION = "0.0.0"
 try:
@@ -15,9 +15,12 @@
 - from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention
 -
 - FLASH_VERSION = flash_attn.__version__
-- flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:2])
-- if flash_ver_parsed < (2, 3):
-- raise ImportError("Requires 2.3 for sliding window support")
+- flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:3])
+- if (
+- flash_ver_parsed != (2, 3, 6)
+- and os.environ.get("XFORMERS_IGNORE_FLASH_VERSION_CHECK", "0") != "1"
+- ):
+- raise ImportError("Requires Flash attention 2.3.6 for varlen_fwd api")
 + #try:
 + # from ... import _C_flashattention # type: ignore[attr-defined]
 + # from ..._cpp_lib import _build_metadata
@@ -29,88 +32,103 @@
 + from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention
 +
 + FLASH_VERSION = flash_attn.__version__
-+ # flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:2])
-+ # if flash_ver_parsed < (2, 3):
-+ # raise ImportError("Requires 2.3 for sliding window support")
++ # flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:3])
++ # if (
++ # flash_ver_parsed != (2, 3, 6)
++ # and os.environ.get("XFORMERS_IGNORE_FLASH_VERSION_CHECK", "0") != "1"
++ # ):
++ # raise ImportError("Requires Flash attention 2.3.6 for varlen_fwd api")
 
 # create library so that flash-attn goes through the PyTorch Dispatcher
 - _flash_lib = torch.library.Library("xformers_flash", "DEF")
-+ #_flash_lib = torch.library.Library("xformers_flash", "DEF")
-
+-
 - _flash_lib.define(
 - "flash_fwd(Tensor query, Tensor key, Tensor value, "
-- "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, "
+- "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, Tensor? seqused_k, "
 - "int max_seqlen_q, int max_seqlen_k, "
 - "float p, float softmax_scale, "
-- "bool is_causal, int window_size, bool return_softmax) -> (Tensor, Tensor, Tensor)"
+- "bool is_causal, int window_left, "
+- "int window_right, bool return_softmax) -> (Tensor, Tensor, Tensor)"
 - )
---
+++ #_flash_lib = torch.library.Library("xformers_flash", "DEF")
+
 - _flash_lib.define(
 - "flash_bwd(Tensor dout, Tensor query, Tensor key, Tensor value, "
 - "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, "
 - "Tensor cu_seqlens_q, Tensor cu_seqlens_k, "
 - "int max_seqlen_q, int max_seqlen_k, "
-- "float p, float softmax_scale, bool is_causal, int window_size, Tensor rng_state) -> (Tensor, Tensor, Tensor)"
+- "float p, float softmax_scale, bool is_causal, "
+- "int window_left, int window_right, Tensor rng_state) -> (Tensor, Tensor, Tensor)"
 - )
 + #_flash_lib.define(
 + # "flash_fwd(Tensor query, Tensor key, Tensor value, "
-+ # "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, "
++ # "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, Tensor? seqused_k, "
 + # "int max_seqlen_q, int max_seqlen_k, "
 + # "float p, float softmax_scale, "
-+ # "bool is_causal, int window_size, bool return_softmax) -> (Tensor, Tensor, Tensor)"
++ # "bool is_causal, int window_left, "
++ # "int window_right, bool return_softmax) -> (Tensor, Tensor, Tensor)"
 + #)
 +
 + #_flash_lib.define(
 + # "flash_bwd(Tensor dout, Tensor query, Tensor key, Tensor value, "
 + # "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, "
 + # "Tensor cu_seqlens_q, Tensor cu_seqlens_k, "
 + # "int max_seqlen_q, int max_seqlen_k, "
-+ # "float p, float softmax_scale, bool is_causal, int window_size, Tensor rng_state) -> (Tensor, Tensor, Tensor)"
++ # "float p, float softmax_scale, bool is_causal, "
++ # "int window_left, int window_right, Tensor rng_state) -> (Tensor, Tensor, Tensor)"
 + #)
 
 def _flash_fwd(
 query,
-@@ -98,8 +98,8 @@
+@@ -111,8 +111,8 @@
 p,
 softmax_scale,
 is_causal,
-- window_size - 1, # window_size_left
-- -1, # window_size_right
-+ # window_size - 1, # window_size_left
-+ # -1, # window_size_right
+- window_left, # window_size_left
+- window_right, # window_size_right
++ # window_left, # window_size_left
++ # window_right, # window_size_right
 return_softmax,
 None, # rng
 )
-@@ -127,8 +127,8 @@
+@@ -134,15 +134,15 @@
+ out,
+ cu_seq_lens_q,
+ cu_seq_lens_k,
+- seqused_k,
++ # seqused_k,
+ max_seq_len_q,
+ max_seq_len_k,
+ p,
 softmax_scale,
 False,
 is_causal,
-- window_size - 1, # window_size_left
-- -1, # window_size_right
-+ # window_size - 1, # window_size_left
-+ # -1, # window_size_right
+- window_left,
+- window_right,
++ # window_left,
++ # window_right,
 return_softmax,
 None,
 )
-@@ -169,8 +169,8 @@
+@@ -184,8 +184,8 @@
 p,
 softmax_scale,
 is_causal,
-- window_size - 1, # window_size_left
-- -1, # window_size_right
-+ # window_size - 1, # window_size_left
-+ # -1, # window_size_right
+- window_left,
+- window_right,
++ # window_left,
++ # window_right,
 None,
 rng_state,
 )
-@@ -193,15 +193,15 @@
+@@ -208,15 +208,15 @@
 softmax_scale,
 False, # zero_tensors
 is_causal,
-- window_size - 1, # window_size_left
-- -1, # window_size_right
-+ # window_size - 1, # window_size_left
-+ # -1, # window_size_right
+- window_left,
+- window_right,
++ # window_left,
++ # window_right,
 None,
 rng_state,
 )
@@ -123,7 +141,7 @@
 except ImportError:
 pass
 
-@@ -348,7 +348,7 @@
+@@ -400,7 +400,7 @@
 implementation.
 """
 
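
After patching, xformers' flash backend should pick up ROCm's flash_attn package, since the patched module imports flash_attn directly. One hedged way to sanity-check the result (assuming flash_attn is installed in the environment) is to print the FLASH_VERSION the patched module resolved:

    # FLASH_VERSION is set from flash_attn.__version__ in the patched flash.py;
    # it stays at "0.0.0" if the import inside the try block failed.
    python -c 'from xformers.ops.fmha import flash; print(flash.FLASH_VERSION)'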
