# syntax=docker/dockerfile:1

# Global build arguments.
# BASE_IMAGE is required and must be supplied by the builder
# (e.g. --build-arg BASE_IMAGE=<cuda-enabled pytorch image>).
# ARGs declared before the first FROM are only visible in FROM lines;
# the version ARGs are redeclared inside the stages that consume them.
ARG BASE_IMAGE
ARG DEEPSPEED_VERSION="0.9.4"
ARG FLASH_ATTN_VERSION="1.0.7"
# Fetch the flash-attention sources (with submodules) in a tiny
# git-only stage so the heavy builder stages never need git installed.
FROM alpine/git:2.36.3 AS flash-attn-downloader
WORKDIR /git
ARG FLASH_ATTN_VERSION
# Shallow-clone the tagged release and strip .git metadata so the
# bind-mounted source tree stays small and cache-stable.
RUN git clone --recurse-submodules --shallow-submodules -j8 --depth 1 \
    https://github.com/HazyResearch/flash-attention -b v${FLASH_ATTN_VERSION} && \
    rm -rf flash-attention/.git


# Dependencies requiring NVCC are built ahead of time in a separate stage
# so that the ~2 GiB dev library installations don't have to be included
# in the final image.
FROM ${BASE_IMAGE} AS builder-base
# CUDA_VERSION is expected in the base image's environment (standard in
# NVIDIA CUDA images); map it to the "<major>-<minor>" suffix used by
# NVIDIA's versioned apt package names, install the CUDA dev packages
# plus build tooling, and clean apt caches in the same layer so they
# never persist in the image.
RUN export \
  CUDA_MAJOR_VERSION=$(echo $CUDA_VERSION | cut -d. -f1) \
  CUDA_MINOR_VERSION=$(echo $CUDA_VERSION | cut -d. -f2) && \
  export \
  CUDA_PACKAGE_VERSION="${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}" && \
  apt-get -qq update && apt-get install -y --no-install-recommends \
    cuda-nvcc-${CUDA_PACKAGE_VERSION} \
    cuda-nvml-dev-${CUDA_PACKAGE_VERSION} \
    libcurand-dev-${CUDA_PACKAGE_VERSION} \
    libcublas-dev-${CUDA_PACKAGE_VERSION} \
    libcusparse-dev-${CUDA_PACKAGE_VERSION} \
    libcusolver-dev-${CUDA_PACKAGE_VERSION} \
    cuda-nvprof-${CUDA_PACKAGE_VERSION} \
    cuda-profiler-api-${CUDA_PACKAGE_VERSION} \
    libaio-dev \
    ninja-build \
    parallel \
    # gcc-10/g++-10/lld do not need to be installed here, but they improve the build.
    # gfortran-10 is just for compiler_wrapper.f95.
    gcc-10 g++-10 gfortran-10 lld && \
  apt-get clean && \
  rm -rf /var/lib/apt/lists/* && \
  update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 10 && \
  update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 10 && \
  update-alternatives --install \
    /usr/bin/gfortran gfortran /usr/bin/gfortran-10 10 && \
  update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld 1

# /wheels collects built wheels; /build is the working directory for
# the compiler wrapper shared by the downstream builder stages.
RUN mkdir /wheels /build
WORKDIR /build

# DeepSpeed forces -march=native into the compiler options,
# making the result dependent on the processor architecture
# used on the builder machine.
# The compiler wrapper normalizes -march=native to -march=skylake
# along with a couple other transformations before invoking GCC.
COPY compiler_wrapper.f95 .
RUN gfortran -O3 ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95


# Build DeepSpeed's CUDA/C++ extension wheels ahead of time so the final
# image does not need NVCC or the dev libraries at runtime.
FROM builder-base AS deepspeed-builder
# DeepSpeed build flags
# See: https://www.deepspeed.ai/tutorials/advanced-install
# Blank defaults mean "let DeepSpeed decide"; they are unset below
# before the build so DeepSpeed does not misread "" as a value.
ARG DS_BUILD_OPS="1"
ARG DS_BUILD_CPU_ADAM=""
ARG DS_BUILD_FUSED_ADAM=""
ARG DS_BUILD_FUSED_LAMB=""
# sparse_attn has issues with PyTorch >= 2.0.0 as of DeepSpeed 0.9.4
ARG DS_BUILD_SPARSE_ATTN="0"
ARG DS_BUILD_TRANSFORMER=""
ARG DS_BUILD_TRANSFORMER_INFERENCE=""
ARG DS_BUILD_STOCHASTIC_TRANSFORMER=""
ARG DS_BUILD_UTILS=""
ARG DS_BUILD_AIO=""

ARG DEEPSPEED_VERSION

# bash is needed for the ${!VAR} indirect expansion below.
SHELL ["/bin/bash", "-c"]
RUN python3 -m pip install -U --no-cache-dir \
    setuptools wheel pip && \
    { \
    # DeepSpeed doesn't handle blank environment variables
    # in the same way as unset ones, so clear any blank ones.
    for VAR in \
      DS_BUILD_OPS \
      DS_BUILD_CPU_ADAM \
      DS_BUILD_FUSED_ADAM \
      DS_BUILD_FUSED_LAMB \
      DS_BUILD_SPARSE_ATTN \
      DS_BUILD_TRANSFORMER \
      DS_BUILD_TRANSFORMER_INFERENCE \
      DS_BUILD_STOCHASTIC_TRANSFORMER \
      DS_BUILD_UTILS \
      DS_BUILD_AIO; \
    do if [[ -z ${!VAR} ]]; then unset ${VAR}; fi; done; \
    } && \
    # CC points at the gfortran-built wrapper from builder-base, which
    # rewrites -march=native so wheels are reproducible across builders.
    CC=$(realpath -e ./compiler) \
    python3 -m pip wheel -w /wheels \
    --no-cache-dir --no-build-isolation --no-deps \
    deepspeed==${DEEPSPEED_VERSION} && \
    rm ./*
SHELL ["/bin/sh", "-c"]

WORKDIR /wheels


# Build flash-attention and its companion csrc extension wheels.
FROM builder-base AS flash-attn-builder
# NOTE(review): this ARG is not referenced by the RUN below — presumably
# kept so changing the version busts this stage's cache; confirm before
# removing.
ARG FLASH_ATTN_VERSION

# The pre-fetched source tree is bind-mounted read-only for the duration
# of this RUN; each extension directory is built concurrently with
# GNU parallel (a failing job makes parallel — and the build — fail).
RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,target=flash-attention/ \
    python3 -m pip install -U --no-cache-dir \
    packaging setuptools wheel pip && \
    export CC=$(realpath -e ./compiler) && \
    cd flash-attention && \
    parallel 'cd {} && python3 setup.py bdist_wheel --dist-dir /wheels' ::: \
    . \
    csrc/ft_attention \
    csrc/fused_dense_lib \
    csrc/fused_softmax \
    csrc/layer_norm \
    csrc/rotary \
    csrc/xentropy

WORKDIR /wheels


# Final image: the unmodified base plus the prebuilt wheels — none of
# the CUDA dev libraries or compilers from the builder stages.
FROM ${BASE_IMAGE}

# libaio is required at runtime by DeepSpeed's async I/O ops; clean the
# apt caches in the same layer so they never persist in the image.
RUN apt-get -qq update && \
    apt-get install -y --no-install-recommends libaio-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Install all wheels in one step. The bind mounts exist only for the
# duration of this RUN, so no cleanup layer is needed afterwards
# (the previous separate `rm -r /tmp/wheels` layer could not shrink
# earlier layers anyway).
RUN --mount=type=bind,from=deepspeed-builder,source=/wheels,target=/tmp/deepspeed-wheels \
    --mount=type=bind,from=flash-attn-builder,source=/wheels,target=/tmp/flash-attn-wheels \
    python3 -m pip install --no-cache-dir \
    /tmp/deepspeed-wheels/*.whl \
    /tmp/flash-attn-wheels/*.whl