Skip to content

Commit 5f5b8da

Browse files
otaj, Borda
authored and committed
[CI] Bump CUDA in Docker images to 11.6.1 (#14348)
* bump cuda in docker images to 11.6.1
* PUSH TO HUB. REVERT THIS!
* conda forge for 11.6
* cuda 11.5
* revert conda changes
* 11.6 back again
* 11.6 back again, all of them
* maybe all passes now
* maybe all passes now
* final push
* Revert "PUSH TO HUB. REVERT THIS!" This reverts commit 602bfce.
* Apply suggestions from code review

Co-authored-by: Jirka Borovec <[email protected]>
1 parent cca9c71 commit 5f5b8da

File tree

8 files changed

+28
-24
lines changed

8 files changed

+28
-24
lines changed

.azure/gpu-benchmark.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ jobs:
2828
cancelTimeoutInMinutes: "2"
2929
pool: azure-jirka-spot
3030
container:
31-
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1"
31+
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
3232
options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g"
3333
workspace:
3434
clean: all

.azure/gpu-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ jobs:
2626
strategy:
2727
matrix:
2828
'PyTorch - stable':
29-
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1"
29+
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
3030
# how long to run the job before automatically cancelling
3131
timeoutInMinutes: "80"
3232
# how much time to give 'run always even if cancelled tasks' before stopping them

.github/checkgroup.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -81,21 +81,21 @@ subprojects:
8181
- ".github/workflows/*docker*.yml"
8282
- "setup.py"
8383
checks:
84-
- "build-conda (3.8, 1.10)"
85-
- "build-conda (3.8, 1.9)"
86-
- "build-conda (3.9, 1.11)"
87-
- "build-conda (3.9, 1.12)"
84+
- "build-conda (3.8, 1.9, 11.1.1)"
85+
- "build-conda (3.8, 1.10.1, 11.1.1)"
86+
- "build-conda (3.9, 1.11, 11.3.1)"
87+
- "build-conda (3.9, 1.12, 11.3.1)"
8888
- "build-cuda (3.8, 1.9, 11.1.1)"
8989
- "build-cuda (3.9, 1.10, 11.3.1)"
9090
- "build-cuda (3.9, 1.11, 11.3.1)"
91-
- "build-cuda (3.9, 1.12, 11.3.1)"
91+
- "build-cuda (3.9, 1.12, 11.6.1)"
9292
- "build-cuda (3.9, 1.9, 11.1.1)"
9393
- "build-hpu (1.5.0, 1.11.0)"
9494
- "build-ipu (3.9, 1.9)"
9595
- "build-NGC"
9696
- "build-pl (3.9, 1.10, 11.3.1)"
9797
- "build-pl (3.9, 1.11, 11.3.1)"
98-
- "build-pl (3.9, 1.12, 11.3.1)"
98+
- "build-pl (3.9, 1.12, 11.6.1)"
9999
- "build-pl (3.9, 1.9, 11.1.1)"
100100
- "build-xla (3.7, 1.12)"
101101

.github/workflows/ci-pytorch-dockers.yml

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ jobs:
3636
- {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"}
3737
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
3838
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
39-
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"}
39+
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"}
4040
steps:
4141
- uses: actions/checkout@v3
4242
- uses: docker/setup-buildx-action@v2
@@ -96,7 +96,7 @@ jobs:
9696
- {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"}
9797
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
9898
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
99-
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"}
99+
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"}
100100
# Used in Lightning-AI/tutorials
101101
- {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"}
102102
steps:
@@ -133,10 +133,10 @@ jobs:
133133
fail-fast: false
134134
matrix:
135135
include:
136-
- {python_version: "3.8", pytorch_version: "1.9"}
137-
- {python_version: "3.8", pytorch_version: "1.10"}
138-
- {python_version: "3.9", pytorch_version: "1.11"}
139-
- {python_version: "3.9", pytorch_version: "1.12"}
136+
- {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"}
137+
- {python_version: "3.8", pytorch_version: "1.10.1", cuda_version: "11.1.1"}
138+
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
139+
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"}
140140
steps:
141141
- uses: actions/checkout@v3
142142
- uses: docker/setup-buildx-action@v2
@@ -150,6 +150,7 @@ jobs:
150150
build-args: |
151151
PYTHON_VERSION=${{ matrix.python_version }}
152152
PYTORCH_VERSION=${{ matrix.pytorch_version }}
153+
CUDA_VERSION=${{ matrix.cuda_version }}
153154
file: dockers/base-conda/Dockerfile
154155
push: ${{ env.PUSH_TO_HUB }}
155156
tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}

.github/workflows/release-docker.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ jobs:
1919
- {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"}
2020
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
2121
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
22-
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"}
22+
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"}
2323
steps:
2424
- name: Checkout
2525
uses: actions/checkout@v2

dockers/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ git clone https://github.com/Lightning-AI/lightning.git
1111
docker image build -t pytorch-lightning:latest -f dockers/base-cuda/Dockerfile .
1212

1313
# build with specific arguments
14-
docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.11-cuda11.3.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.11 --build-arg CUDA_VERSION=11.3.1 .
14+
docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.12-cuda11.6.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.12 --build-arg CUDA_VERSION=11.6.1 .
1515
```
1616

1717
To run your docker use
@@ -45,7 +45,7 @@ sudo systemctl restart docker
4545
and later run the docker image with `--gpus all`. For example,
4646

4747
```
48-
docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11-cuda11.3.1
48+
docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1
4949
```
5050

5151
## Run Jupyter server

dockers/base-conda/Dockerfile

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,16 +42,17 @@ RUN \
4242
curl \
4343
unzip \
4444
ca-certificates \
45-
libopenmpi-dev \
46-
&& \
45+
libopenmpi-dev
4746

47+
RUN \
4848
# Install conda and python.
4949
# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
5050
curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_${CONDA_VERSION}-Linux-x86_64.sh && \
5151
chmod +x ~/miniconda.sh && \
5252
~/miniconda.sh -b && \
53-
rm ~/miniconda.sh && \
53+
rm ~/miniconda.sh
5454

55+
RUN \
5556
# Cleaning
5657
apt-get autoremove -y && \
5758
apt-get clean && \
@@ -73,9 +74,10 @@ COPY environment.yml environment.yml
7374
# conda init
7475
RUN \
7576
conda update -n base -c defaults conda && \
77+
CUDA_VERSION_MM=$(python -c "print('.'.join('$CUDA_VERSION'.split('.')[:2]))") && \
7678
conda create -y --name $CONDA_ENV \
77-
python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION} \
78-
-c nvidia -c pytorch -c pytorch-test -c pytorch-nightly && \
79+
python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION_MM} \
80+
-c nvidia -c pytorch -c pytorch-test && \
7981
conda init bash && \
8082
# NOTE: this requires that the channel is presented in the yaml before packages \
8183
printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchtext', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \

dockers/base-cuda/Dockerfile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,8 +140,9 @@ RUN \
140140
RUN \
141141
# install Bagua
142142
CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \
143-
pip install "bagua-cuda$CUDA_VERSION_MM==0.9.0" && \
144-
python -c "import bagua_core; bagua_core.install_deps()" && \
143+
CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])") && \
144+
pip install "bagua-cuda$CUDA_VERSION_BAGUA==0.9.0" && \
145+
if [[ "$CUDA_VERSION_MM" = "$CUDA_VERSION_BAGUA" ]]; then python -c "import bagua_core; bagua_core.install_deps()"; fi && \
145146
python -c "import bagua; print(bagua.__version__)"
146147

147148
COPY requirements/pytorch/check-avail-extras.py check-avail-extras.py

0 commit comments

Comments
 (0)