Skip to content

Commit 67ce3eb

Browse files
otaj, lexierule
authored and committed
[CI] fix horovod tests (#14382)
1 parent 7ccaa5c commit 67ce3eb

File tree

3 files changed

+22
-26
lines changed

3 files changed

+22
-26
lines changed

.azure/gpu-tests.yml

Lines changed: 7 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,7 @@ jobs:
4444

4545
- bash: |
4646
CHANGED_FILES=$(git diff --name-status origin/master -- . | awk '{print $2}')
47-
FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*'
47+
FILTER='.azure/gpu_*|src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*'
4848
echo $CHANGED_FILES > changed_files.txt
4949
MATCHES=$(cat changed_files.txt | grep -E $FILTER)
5050
echo $MATCHES
@@ -72,12 +72,15 @@ jobs:
7272
set -e
7373
python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
7474
python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'bagua' not in line] ; open(fname, 'w').writelines(lines)"
75+
TORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
7576
CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
7677
CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])")
78+
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${PYTORCH_VERSION}
79+
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt ${PYTORCH_VERSION}
80+
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${PYTORCH_VERSION}
7781
pip install "bagua-cuda$CUDA_VERSION_BAGUA>=0.9.0"
78-
pip install -e .[strategies]
79-
pip install -U deepspeed # TODO: remove when docker images are upgraded
80-
pip install --requirement requirements/pytorch/devel.txt
82+
pip install -e .[strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
83+
pip install --requirement requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
8184
pip list
8285
env:
8386
PACKAGE_NAME: pytorch

dockers/base-conda/Dockerfile

Lines changed: 8 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,10 @@ RUN \
3434
# https://github.com/NVIDIA/nvidia-docker/issues/1631
3535
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
3636
apt-get update -qq --fix-missing && \
37+
NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \
38+
CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \
39+
MAX_ALLOWED_NCCL=2.11.4 && \
40+
TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1)-1+cuda${CUDA_VERSION_MM} && \
3741
apt-get install -y --no-install-recommends \
3842
build-essential \
3943
cmake \
@@ -42,17 +46,15 @@ RUN \
4246
curl \
4347
unzip \
4448
ca-certificates \
45-
libopenmpi-dev
46-
47-
RUN \
49+
libopenmpi-dev \
50+
libnccl2=$TO_INSTALL_NCCL \
51+
libnccl-dev=$TO_INSTALL_NCCL && \
4852
# Install conda and python.
4953
# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
5054
curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_${CONDA_VERSION}-Linux-x86_64.sh && \
5155
chmod +x ~/miniconda.sh && \
5256
~/miniconda.sh -b && \
53-
rm ~/miniconda.sh
54-
55-
RUN \
57+
rm ~/miniconda.sh && \
5658
# Cleaning
5759
apt-get autoremove -y && \
5860
apt-get clean && \

dockers/base-cuda/Dockerfile

Lines changed: 7 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,11 @@ RUN \
3737
# https://github.com/NVIDIA/nvidia-docker/issues/1631
3838
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
3939
apt-get update -qq --fix-missing && \
40-
apt-get install -y --no-install-recommends \
40+
NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \
41+
CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \
42+
MAX_ALLOWED_NCCL=2.11.4 && \
43+
TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1)-1+cuda${CUDA_VERSION_MM} && \
44+
apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
4145
build-essential \
4246
pkg-config \
4347
cmake \
@@ -50,19 +54,17 @@ RUN \
5054
libopenmpi-dev \
5155
openmpi-bin \
5256
ssh \
53-
&& \
54-
57+
libnccl2=$TO_INSTALL_NCCL \
58+
libnccl-dev=$TO_INSTALL_NCCL && \
5559
# Install python
5660
add-apt-repository ppa:deadsnakes/ppa && \
5761
apt-get install -y \
5862
python${PYTHON_VERSION} \
5963
python${PYTHON_VERSION}-distutils \
6064
python${PYTHON_VERSION}-dev \
6165
&& \
62-
6366
update-alternatives --install /usr/bin/python${PYTHON_VERSION%%.*} python${PYTHON_VERSION%%.*} /usr/bin/python${PYTHON_VERSION} 1 && \
6467
update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 && \
65-
6668
# Cleaning
6769
apt-get autoremove -y && \
6870
apt-get clean && \
@@ -78,7 +80,6 @@ RUN \
7880
wget https://bootstrap.pypa.io/get-pip.py --progress=bar:force:noscroll --no-check-certificate && \
7981
python${PYTHON_VERSION} get-pip.py && \
8082
rm get-pip.py && \
81-
8283
pip install -q fire && \
8384
# Disable cache \
8485
CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \
@@ -91,16 +92,6 @@ RUN \
9192
pip install -r requirements/pytorch/devel.txt --no-cache-dir --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html && \
9293
rm assistant.py
9394

94-
RUN \
95-
apt-get purge -y cmake && \
96-
wget -q https://github.com/Kitware/CMake/releases/download/v3.20.2/cmake-3.20.2.tar.gz && \
97-
tar -zxvf cmake-3.20.2.tar.gz && \
98-
cd cmake-3.20.2 && \
99-
./bootstrap -- -DCMAKE_USE_OPENSSL=OFF && \
100-
make && \
101-
make install && \
102-
cmake --version
103-
10495
ENV \
10596
HOROVOD_CUDA_HOME=$CUDA_TOOLKIT_ROOT_DIR \
10697
HOROVOD_GPU_OPERATIONS=NCCL \

0 commit comments

Comments
 (0)