Commit 949759a

Merge pull request #26 from Eta0/es/torch-extras
feat: PyTorch Extras Container
2 parents: a0333d3 + a8aa08c

6 files changed (+248 lines, -4 lines)

.github/workflows/build.yml

Lines changed: 14 additions & 0 deletions
@@ -15,11 +15,25 @@ on:
       tag-suffix:
         required: false
         type: string
+    outputs:
+      outcome:
+        description: "The outcome of the build"
+        value: ${{ jobs.build.outputs.outcome }}
+      tags:
+        description: "The resulting image tags"
+        value: ${{ jobs.build.outputs.tags }}
+      version:
+        description: "The resulting image version"
+        value: ${{ jobs.build.outputs.version }}
 
 jobs:
   build:
     name: Build Images
     runs-on: [self-hosted, Linux]
+    outputs:
+      outcome: ${{ steps.docker-build.outcome }}
+      tags: ${{ steps.meta.outputs.tags }}
+      version: ${{ steps.meta.outputs.version }}
     steps:
       - uses: actions/checkout@v3
       - name: Set up Docker Buildx

.github/workflows/torch-base.yml

Lines changed: 2 additions & 1 deletion
@@ -25,4 +25,5 @@ jobs:
       base-image: nvidia/cuda:${{ matrix.cuda }}-base-ubuntu20.04
       torch-version: ${{ matrix.torch }}
       torchvision-version: ${{ matrix.vision }}
-      torchaudio-version: ${{ matrix.audio }}
+      torchaudio-version: ${{ matrix.audio }}
+      build-extras: true

.github/workflows/torch-nccl.yml

Lines changed: 2 additions & 1 deletion
@@ -34,4 +34,5 @@ jobs:
       base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-cudnn8-devel-ubuntu20.04-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
       torch-version: ${{ matrix.torch }}
       torchvision-version: ${{ matrix.vision }}
-      torchaudio-version: ${{ matrix.audio }}
+      torchaudio-version: ${{ matrix.audio }}
+      build-extras: true

.github/workflows/torch.yml

Lines changed: 20 additions & 2 deletions
@@ -23,6 +23,10 @@ on:
         required: false
         type: string
         default: "7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+      build-extras:
+        required: false
+        type: boolean
+        default: false
 
   workflow_dispatch:
     inputs:
@@ -48,11 +52,15 @@ on:
         required: false
         type: string
         default: "7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+      build-extras:
+        required: false
+        type: boolean
+        default: false
 
 jobs:
   build:
     uses: ./.github/workflows/build.yml
-    with:
+    with:
       image-name: torch
       folder: torch
       tag-suffix: ${{ inputs.tag }}
@@ -63,4 +71,14 @@ jobs:
         BUILD_TORCH_VERSION=${{ inputs.torch-version }}
         BUILD_TORCH_VISION_VERSION=${{ inputs.torchvision-version }}
         BUILD_TORCH_AUDIO_VERSION=${{ inputs.torchaudio-version }}
-        ${{ inputs.cuda-arch-support && format('BUILD_TORCH_CUDA_ARCH_LIST={0}', inputs.cuda-arch-support) || '' }}
+        ${{ inputs.cuda-arch-support && format('BUILD_TORCH_CUDA_ARCH_LIST={0}', inputs.cuda-arch-support) || '' }}
+  build-extras:
+    if: inputs.build-extras
+    needs: build
+    uses: ./.github/workflows/build.yml
+    with:
+      image-name: torch-extras
+      folder: torch-extras
+      tag-suffix: ${{ inputs.tag }}
+      build-args: |
+        BASE_IMAGE=${{ needs.build.outputs.tags }}
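The new build-extras job chains off the torch build by feeding the tag produced by build.yml back in as the base image of the torch-extras Dockerfile. A rough local equivalent, sketched here with a placeholder TORCH_IMAGE tag standing in for needs.build.outputs.tags, might look like:

# Hypothetical local equivalent of the build-extras job; TORCH_IMAGE and the
# output tag are placeholders, not tags published by this repository.
TORCH_IMAGE="torch:example-tag"
docker build ./torch-extras \
    --build-arg BASE_IMAGE="${TORCH_IMAGE}" \
    -t torch-extras:example-tag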

torch-extras/Dockerfile

Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@
+# syntax=docker/dockerfile:1.2
+
+ARG BASE_IMAGE
+ARG DEEPSPEED_VERSION="0.9.4"
+ARG FLASH_ATTN_VERSION="1.0.7"
+
+FROM alpine/git:2.36.3 as flash-attn-downloader
+WORKDIR /git
+ARG FLASH_ATTN_VERSION
+RUN git clone --recurse-submodules --shallow-submodules -j8 --depth 1 \
+    https://github.com/HazyResearch/flash-attention -b v${FLASH_ATTN_VERSION} && \
+    rm -rf flash-attention/.git
+
+
+# Dependencies requiring NVCC are built ahead of time in a separate stage
+# so that the ~2 GiB dev library installations don't have to be included
+# in the final image.
+FROM ${BASE_IMAGE} as builder-base
+RUN export \
+    CUDA_MAJOR_VERSION=$(echo $CUDA_VERSION | cut -d. -f1) \
+    CUDA_MINOR_VERSION=$(echo $CUDA_VERSION | cut -d. -f2) && \
+    export \
+    CUDA_PACKAGE_VERSION="${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}" && \
+    apt-get -qq update && apt-get install -y --no-install-recommends \
+        cuda-nvcc-${CUDA_PACKAGE_VERSION} \
+        cuda-nvml-dev-${CUDA_PACKAGE_VERSION} \
+        libcurand-dev-${CUDA_PACKAGE_VERSION} \
+        libcublas-dev-${CUDA_PACKAGE_VERSION} \
+        libcusparse-dev-${CUDA_PACKAGE_VERSION} \
+        libcusolver-dev-${CUDA_PACKAGE_VERSION} \
+        cuda-nvprof-${CUDA_PACKAGE_VERSION} \
+        cuda-profiler-api-${CUDA_PACKAGE_VERSION} \
+        libaio-dev \
+        ninja-build \
+        parallel \
+        # gcc-10/g++-10/lld do not need to be installed here, but they improve the build.
+        # gfortran-10 is just for compiler_wrapper.f95.
+        gcc-10 g++-10 gfortran-10 lld && \
+    apt-get clean && \
+    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 10 && \
+    update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 10 && \
+    update-alternatives --install \
+        /usr/bin/gfortran gfortran /usr/bin/gfortran-10 10 && \
+    update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld 1
+
+RUN mkdir /wheels /build
+WORKDIR /build
+
+# DeepSpeed forces -march=native into the compiler options,
+# making the result dependent on the processor architecture
+# used on the builder machine.
+# The compiler wrapper normalizes -march=native to -march=skylake
+# along with a couple other transformations before invoking GCC.
+COPY compiler_wrapper.f95 .
+RUN gfortran -O3 ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95
+
+
+FROM builder-base as deepspeed-builder
+# DeepSpeed build flags
+# See: https://www.deepspeed.ai/tutorials/advanced-install
+ARG DS_BUILD_OPS="1"
+ARG DS_BUILD_CPU_ADAM=""
+ARG DS_BUILD_FUSED_ADAM=""
+ARG DS_BUILD_FUSED_LAMB=""
+# sparse_attn has issues with PyTorch >= 2.0.0 as of DeepSpeed 0.9.4
+ARG DS_BUILD_SPARSE_ATTN="0"
+ARG DS_BUILD_TRANSFORMER=""
+ARG DS_BUILD_TRANSFORMER_INFERENCE=""
+ARG DS_BUILD_STOCHASTIC_TRANSFORMER=""
+ARG DS_BUILD_UTILS=""
+ARG DS_BUILD_AIO=""
+
+ARG DEEPSPEED_VERSION
+
+SHELL ["/bin/bash", "-c"]
+RUN python3 -m pip install -U --no-cache-dir \
+        setuptools wheel pip && \
+    { \
+        # DeepSpeed doesn't handle blank environment variables
+        # in the same way as unset ones, so clear any blank ones.
+        for VAR in \
+            DS_BUILD_OPS \
+            DS_BUILD_CPU_ADAM \
+            DS_BUILD_FUSED_ADAM \
+            DS_BUILD_FUSED_LAMB \
+            DS_BUILD_SPARSE_ATTN \
+            DS_BUILD_TRANSFORMER \
+            DS_BUILD_TRANSFORMER_INFERENCE \
+            DS_BUILD_STOCHASTIC_TRANSFORMER \
+            DS_BUILD_UTILS \
+            DS_BUILD_AIO; \
+        do if [[ -z ${!VAR} ]]; then unset ${VAR}; fi; done; \
+    } && \
+    CC=$(realpath -e ./compiler) \
+    python3 -m pip wheel -w /wheels \
+        --no-cache-dir --no-build-isolation --no-deps \
+        deepspeed==${DEEPSPEED_VERSION} && \
+    rm ./*
+SHELL ["/bin/sh", "-c"]
+
+WORKDIR /wheels
+
+
+FROM builder-base as flash-attn-builder
+ARG FLASH_ATTN_VERSION
+
+RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,target=flash-attention/ \
+    python3 -m pip install -U --no-cache-dir \
+        packaging setuptools wheel pip && \
+    export CC=$(realpath -e ./compiler) && \
+    cd flash-attention && \
+    parallel 'cd {} && python3 setup.py bdist_wheel --dist-dir /wheels' ::: \
+        . \
+        csrc/ft_attention \
+        csrc/fused_dense_lib \
+        csrc/fused_softmax \
+        csrc/layer_norm \
+        csrc/rotary \
+        csrc/xentropy
+
+WORKDIR /wheels
+
+
+FROM ${BASE_IMAGE}
+
+RUN apt-get -qq update && \
+    apt-get install -y --no-install-recommends libaio-dev && \
+    apt-get clean
+
+RUN --mount=type=bind,from=deepspeed-builder,source=/wheels,target=/tmp/wheels \
+    python3 -m pip install --no-cache-dir /tmp/wheels/*.whl
+RUN --mount=type=bind,from=flash-attn-builder,source=/wheels,target=/tmp/wheels \
+    python3 -m pip install --no-cache-dir /tmp/wheels/*.whl
+RUN rm -r /tmp/wheels
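The DeepSpeed stage declares its DS_BUILD_* toggles and the dependency versions as build arguments, so individual ops can be switched on or off at image build time without editing the Dockerfile (see the linked DeepSpeed advanced-install page for what each flag does). A hedged sketch of such an override, with illustrative values only:

# Example override of the build arguments declared above; the flag values and
# TORCH_IMAGE are illustrative, not a recommended or published configuration.
docker build ./torch-extras \
    --build-arg BASE_IMAGE="${TORCH_IMAGE}" \
    --build-arg DEEPSPEED_VERSION="0.9.4" \
    --build-arg FLASH_ATTN_VERSION="1.0.7" \
    --build-arg DS_BUILD_SPARSE_ATTN="0" \
    --build-arg DS_BUILD_UTILS="1"

Leaving a flag blank behaves the same as leaving it unset, since the build step unsets any empty DS_BUILD_* variable before invoking pip.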

torch-extras/compiler_wrapper.f95

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
+PROGRAM compiler_wrapper
+    ! Wraps GCC invocations,
+    ! replacing -D__AVX512__ and -D__SCALAR__ preprocessor definitions
+    ! with -D__AVX256__, and -march=native with -march=skylake,
+    ! for better reproducibility and compatibility.
+    IMPLICIT NONE
+    INTEGER :: i, exitcode = 0, full_length = 0, truncated = 0
+    CHARACTER(len=:), ALLOCATABLE :: arg, command
+    ALLOCATE(CHARACTER(len=128) :: arg)
+    command = "gcc"
+
+    DO i = 1, COMMAND_ARGUMENT_COUNT()
+        DO
+            CALL GET_COMMAND_ARGUMENT(i, arg, full_length, truncated)
+            IF (truncated == 0) THEN
+                EXIT
+            ELSE IF (truncated == -1) THEN
+                DEALLOCATE(arg)
+                ALLOCATE(CHARACTER(len=full_length) :: arg)
+            ELSE
+                CALL EXIT(95)
+            END IF
+        END DO
+        IF (arg == "-march=native") THEN
+            command = command // " '-march=skylake'"
+        ELSE IF (arg == "-D__AVX512__" .OR. arg == "-D__SCALAR__") THEN
+            command = command // " '-D__AVX256__'"
+        ELSE
+            command = command // shell_escaped(arg)
+        END IF
+    END DO
+    CALL SYSTEM(command, exitcode)
+    IF (exitcode > 255) THEN
+        exitcode = MAX(IAND(exitcode, 255), 1)
+    END IF
+    CALL EXIT(exitcode)
+
+
+CONTAINS
+    FUNCTION shell_escaped(str) RESULT(out)
+        ! Turns [str] into [ 'str'] and replaces all
+        ! internal ['] characters with ['"'"']
+        IMPLICIT NONE
+        CHARACTER(len=*), INTENT(IN) :: str
+        CHARACTER(len=:), ALLOCATABLE :: out
+        INTEGER :: old_i, out_i, old_len, out_len
+
+        old_len = LEN_TRIM(str)
+        ! Figure out the new length to allocate by scanning `str`.
+        ! This always needs to add at least [ '] at the beginning
+        ! and ['] at the end, so the length increases by at least 3.
+        out_len = old_len + 3
+        DO old_i = 1, old_len
+            IF (str(old_i:old_i) == "'") THEN
+                out_len = out_len + 4
+            END IF
+        END DO
+        ALLOCATE(CHARACTER(len=out_len) :: out)
+
+        ! Copy over the string, performing necessary escapes.
+        out(1:2) = " '"
+        out_i = 3
+        DO old_i = 1, old_len
+            IF (str(old_i:old_i) == "'") THEN
+                ! Escape internal single-quotes
+                out(out_i:out_i + 4) = '''"''"'''
+                out_i = out_i + 5
+            ELSE
+                ! No escaping needed
+                out(out_i:out_i) = str(old_i:old_i)
+                out_i = out_i + 1
+            END IF
+        END DO
+        out(out_i:out_i) = "'"
+    END FUNCTION
+END PROGRAM
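Since the wrapper is an ordinary executable that forwards its arguments to gcc, its rewriting behavior can be checked outside the image build. A minimal sketch, assuming gfortran and gcc are installed locally (example.c is purely a placeholder source file):

# Build the wrapper the same way the Dockerfile does.
gfortran -O3 ./compiler_wrapper.f95 -o ./compiler
# Any source file works for a smoke test.
printf 'int main(void) { return 0; }\n' > example.c
# The wrapper substitutes -march=skylake for -march=native and
# -D__AVX256__ for -D__AVX512__ before running gcc.
./compiler -march=native -D__AVX512__ -O2 example.c -o example
# Equivalent to: gcc '-march=skylake' '-D__AVX256__' '-O2' 'example.c' '-o' 'example'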
