Skip to content

Commit 1d8965c

Browse files
authored
Switch to Python 3.12 for GPU images (#2640)
1 parent cc09df9 commit 1d8965c

File tree

9 files changed

+44
-46
lines changed

9 files changed

+44
-46
lines changed

.azure/docker-build.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,17 +38,17 @@ jobs:
3838
- job: build_push
3939
strategy:
4040
matrix:
41-
"cuda 12.6 | torch 2.8.0 | cudnn FE v1.10.0":
42-
{ CUDA_VERSION: "12.6.3", TORCH_VERSION: "2.8.0", TRITON_VERSION: "3.4.0", CUDNN_FRONTEND_VERSION: "1.10.0" }
43-
"cuda 12.6 | torch nightly | cudnn FE v1.10.0":
44-
{ CUDA_VERSION: "12.6.3", TORCH_VERSION: "main", TORCH_INSTALL: "source", CUDNN_FRONTEND_VERSION: "1.10.0" }
41+
"cuda 12.8 | torch 2.8.0 | cudnn FE v1.15.0":
42+
{ CUDA_VERSION: "12.8.1", TORCH_VERSION: "2.8.0", TRITON_VERSION: "3.4.0", CUDNN_FRONTEND_VERSION: "1.15.0" }
43+
"cuda 12.8 | torch nightly | cudnn FE v1.15.0":
44+
{ CUDA_VERSION: "12.8.1", TORCH_VERSION: "main", TORCH_INSTALL: "source", CUDNN_FRONTEND_VERSION: "1.15.0" }
4545
#'cuda 12.1': # this version - '8.9.5.29-1+cuda12.1' for 'libcudnn8' was not found
4646
# how much time to give 'run always even if cancelled tasks' before stopping them
4747
cancelTimeoutInMinutes: "2"
4848
timeoutInMinutes: "95"
4949
variables:
5050
UBUNTU_VERSION: "24.04"
51-
PYTHON_VERSION: "3.10"
51+
PYTHON_VERSION: "3.12"
5252
imageRepository: "pytorchlightning/lightning-thunder"
5353
imageTag: "ubuntu$(UBUNTU_VERSION)-cuda$(CUDA_VERSION)-cudnn-fe$(CUDNN_FRONTEND_VERSION)-py$(PYTHON_VERSION)-pt_${TORCH_VERSION/v/}"
5454
pool: "lit-rtx-3090"

.azure/gpu-coverage.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ jobs:
2121
strategy:
2222
matrix:
2323
"w/ torch 2.8.0":
24-
docker-image: "ubuntu24.04-cuda12.6.3-cudnn-fe1.10.0-py3.10-pt_2.7.1-dev"
24+
docker-image: "ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_2.8.0-dev"
2525
# how much time to give 'run always even if cancelled tasks' before stopping them
2626
cancelTimeoutInMinutes: "2"
2727
pool: "lit-rtx-3090"
@@ -65,7 +65,7 @@ jobs:
6565
chmod +x codecov
6666
6767
# install this package
68-
python setup.py develop
68+
pip install -e .
6969
displayName: "Install package & ..."
7070
7171
- bash: bash scripts/sanity-check.sh

.azure/gpu-tests.yml

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,28 +16,28 @@ jobs:
1616
strategy:
1717
matrix:
1818
"main w/ torch 2.8.0":
19-
docker-image: "ubuntu24.04-cuda12.6.3-cudnn-fe1.10.0-py3.10-pt_2.8.0-dev"
19+
docker-image: "ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_2.8.0-dev"
2020
testing: "main"
2121
"ops w/ torch 2.8.0":
22-
docker-image: "ubuntu24.04-cuda12.6.3-cudnn-fe1.10.0-py3.10-pt_2.8.0-dev"
22+
docker-image: "ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_2.8.0-dev"
2323
testing: "ops"
2424
"grads w/ torch 2.8.0":
25-
docker-image: "ubuntu24.04-cuda12.6.3-cudnn-fe1.10.0-py3.10-pt_2.8.0-dev"
25+
docker-image: "ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_2.8.0-dev"
2626
testing: "grads"
2727
"distributed w/ torch 2.8.0":
28-
docker-image: "ubuntu24.04-cuda12.6.3-cudnn-fe1.10.0-py3.10-pt_2.8.0-dev"
28+
docker-image: "ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_2.8.0-dev"
2929
testing: "distributed"
3030
"main w/ torch-nightly":
31-
docker-image: "ubuntu24.04-cuda12.6.3-cudnn-fe1.10.0-py3.10-pt_main-dev"
31+
docker-image: "ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_main-dev"
3232
testing: "main"
3333
"ops w/ torch-nightly":
34-
docker-image: "ubuntu24.04-cuda12.6.3-cudnn-fe1.10.0-py3.10-pt_main-dev"
34+
docker-image: "ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_main-dev"
3535
testing: "ops"
3636
"grads w/ torch-nightly":
37-
docker-image: "ubuntu24.04-cuda12.6.3-cudnn-fe1.10.0-py3.10-pt_main-dev"
37+
docker-image: "ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_main-dev"
3838
testing: "grads"
3939
"distributed w/ torch-nightly":
40-
docker-image: "ubuntu24.04-cuda12.6.3-cudnn-fe1.10.0-py3.10-pt_main-dev"
40+
docker-image: "ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_main-dev"
4141
testing: "distributed"
4242
# how much time to give 'run always even if cancelled tasks' before stopping them
4343
cancelTimeoutInMinutes: "2"
@@ -82,7 +82,7 @@ jobs:
8282
chmod +x codecov
8383
8484
# install this package
85-
python setup.py develop
85+
pip install -e .
8686
displayName: "Install package & ..."
8787
8888
- bash: bash scripts/sanity-check.sh

.azure/notebook-runs.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,9 @@ jobs:
1616
strategy:
1717
matrix:
1818
"notebooks w/ torch 2.8":
19-
docker-image: "ubuntu24.04-cuda12.6.3-cudnn-fe1.10.0-py3.10-pt_2.8.0-dev"
19+
docker-image: "ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_2.8.0-dev"
2020
"notebooks w/ torch-nightly":
21-
docker-image: "ubuntu24.04-cuda12.6.3-cudnn-fe1.10.0-py3.10-pt_main-dev"
21+
docker-image: "ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_main-dev"
2222
# how long to run the job before automatically cancelling
2323
timeoutInMinutes: "45"
2424
# how much time to give 'run always even if cancelled tasks' before stopping them
@@ -53,7 +53,7 @@ jobs:
5353
cat requirements/base.txt
5454
pip install -U -r requirements/notebooks.txt
5555
# install this package
56-
python setup.py develop
56+
pip install -e .
5757
# double check on test requirements
5858
echo "Install special requirements for notebooks"
5959
displayName: "Install package & ..."

.lightning/workflows/all-tests.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,24 +9,24 @@ interruptible: False
99
parametrize:
1010
matrix:
1111
image:
12-
- "pytorchlightning/lightning-thunder:ubuntu24.04-cuda12.6.3-cudnn-fe1.10.0-py3.10-pt_2.8.0-dev"
13-
- "pytorchlightning/lightning-thunder:ubuntu24.04-cuda12.6.3-cudnn-fe1.10.0-py3.10-pt_main-dev"
12+
- "pytorchlightning/lightning-thunder:ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_2.8.0-dev"
13+
- "pytorchlightning/lightning-thunder:ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_main-dev"
1414
testing: ["main", "ops", "grads"]
1515
machine: ["L4"]
1616
exclude: []
1717
include:
18-
- image: "pytorchlightning/lightning-thunder:ubuntu24.04-cuda12.6.3-cudnn-fe1.10.0-py3.10-pt_2.8.0-dev"
18+
- image: "pytorchlightning/lightning-thunder:ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_2.8.0-dev"
1919
testing: "distributed"
2020
machine: "L4_X_2"
21-
- image: "pytorchlightning/lightning-thunder:ubuntu24.04-cuda12.6.3-cudnn-fe1.10.0-py3.10-pt_main-dev"
21+
- image: "pytorchlightning/lightning-thunder:ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_main-dev"
2222
testing: "distributed"
2323
machine: "L4_X_2"
2424

2525
env:
2626
CI: "true" # skip some tests with CI
2727
NCCL_DEBUG: "INFO"
2828
NCCL_IGNORE_DISABLED_P2P: "1"
29-
TORCH_VERSION: "2.7.1"
29+
TORCH_VERSION: "2.8.0"
3030
CUDA_LAUNCH_BLOCKING: "1" # for debugging purposes, to get better stack traces
3131

3232
run: |
@@ -49,7 +49,7 @@ run: |
4949
chmod +x codecov
5050
5151
# install this package
52-
python setup.py develop
52+
pip install -e .
5353
5454
bash scripts/sanity-check.sh
5555

.lightning/workflows/notebooks.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ interruptible: False
1010
parametrize:
1111
matrix:
1212
image:
13-
- "pytorchlightning/lightning-thunder:ubuntu24.04-cuda12.6.3-cudnn-fe1.10.0-py3.10-pt_2.8.0-dev"
14-
- "pytorchlightning/lightning-thunder:ubuntu24.04-cuda12.6.3-cudnn-fe1.10.0-py3.10-pt_main-dev"
13+
- "pytorchlightning/lightning-thunder:ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_2.8.0-dev"
14+
- "pytorchlightning/lightning-thunder:ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_main-dev"
1515
exclude: []
1616
include: []
1717

@@ -29,7 +29,7 @@ run: |
2929
# double check on test requirements
3030
pip install -q -U -r requirements/base.txt -r requirements/notebooks.txt
3131
# install this package
32-
python setup.py develop
32+
pip install -e .
3333
3434
bash scripts/sanity-check.sh
3535

.lightning/workflows/transformer-engine.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ trigger:
77
timeout: "30" # minutes
88
machine: "L4"
99
interruptible: False
10-
image: "pytorchlightning/lightning-thunder:ubuntu24.04-cuda12.6.3-cudnn-fe1.10.0-py3.10-pt_2.8.0-dev"
10+
image: "pytorchlightning/lightning-thunder:ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_2.8.0-dev"
1111
parametrize:
1212
matrix:
1313
test_file:
@@ -20,11 +20,12 @@ run: |
2020
pip list
2121
set -ex
2222
23+
pip install wheel
2324
# conda install -c conda-forge libstdcxx-ng
2425
# sudo apt install libstdc++6 libstdc++-*-dev
2526
pip install . -U -q -r requirements/test.txt
2627
# Need to explicitly point to cudnn.h as it is installed at a non-standard location
2728
# Ref: https://github.com/NVIDIA/TransformerEngine/issues/918#issuecomment-2187703769
28-
CPLUS_INCLUDE_PATH="/usr/local/lib/python3.10/dist-packages/nvidia/cudnn/include/" pip install --no-build-isolation 'transformer_engine[pytorch]'
29+
CPLUS_INCLUDE_PATH="/usr/local/lib/python3.12/dist-packages/nvidia/cudnn/include/" pip install --no-build-isolation 'transformer_engine[pytorch]'
2930
pip list # for debugging purposes
3031
pytest thunder/tests/${test_file} -v -rs

dockers/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,14 @@ You can build it on your own, note it takes lots of time, be prepared.
66

77
```bash
88
# build with specific arguments
9-
docker image build -t lightning:ubuntu-cuda-py3.10-cuda12.1.1 -f dockers/ubuntu-cuda/Dockerfile --build-arg "CUDA_VERSION=12.1.1" .
9+
docker image build -t lightning:ubuntu-cuda-py3.12-cuda12.8 -f dockers/ubuntu-cuda/Dockerfile --build-arg "CUDA_VERSION=12.8.1" .
1010
```
1111

1212
To run your docker use
1313

1414
```bash
1515
docker image list
16-
docker run --rm -it pytorch-lightning:ubuntu-cuda-py3.10-cuda11.7.0 bash
16+
docker run --rm -it lightning:ubuntu-cuda-py3.12-cuda12.8 bash
1717
```
1818

1919
## Run docker image with GPUs
@@ -33,5 +33,5 @@ sudo systemctl restart docker
3333
and later run the docker image with `--gpus=all`. For example,
3434

3535
```bash
36-
docker run --rm -it --gpus=all pytorchlightning/lightning:ubuntu-cuda-py3.10-cuda12.1.0
36+
docker run --rm -it --gpus=all pytorchlightning/lightning:ubuntu-cuda-py3.12-cuda12.1.0
3737
```

dockers/ubuntu-cuda/Dockerfile

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -13,19 +13,18 @@
1313
# limitations under the License.
1414

1515
ARG UBUNTU_VERSION="24.04"
16-
ARG CUDA_VERSION="12.6.3"
16+
ARG CUDA_VERSION="12.8.1"
1717
# select devel | runtime
1818
ARG IMAGE_TYPE="devel"
1919

2020
FROM nvidia/cuda:${CUDA_VERSION}-${IMAGE_TYPE}-ubuntu${UBUNTU_VERSION}
2121

2222
ARG CUDNN_VERSION="9.8.0.87"
2323
ARG CUDNN_FRONTEND_VERSION="1.15.0"
24-
ARG PYTHON_VERSION="3.10"
25-
ARG TORCH_VERSION="2.2.1"
26-
ARG TRITON_VERSION="2.2.0"
24+
ARG PYTHON_VERSION="3.12"
25+
ARG TORCH_VERSION="2.8.0"
26+
ARG TRITON_VERSION="3.4.0"
2727
ARG TORCH_INSTALL="stable"
28-
ARG MAX_ALLOWED_NCCL=2.26.0
2928

3029
SHELL ["/bin/bash", "-c"]
3130
# https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/
@@ -41,10 +40,6 @@ ENV \
4140
RUN \
4241
apt-get update -qq --fix-missing && \
4342
CUDA_VERSION_MM=${CUDA_VERSION%.*} && \
44-
NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \
45-
echo "NCCL version found: $NCCL_VER" && \
46-
TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1)-1+cuda${CUDA_VERSION_MM} && \
47-
echo "NCCL version to install: $TO_INSTALL_NCCL" && \
4843
apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
4944
build-essential \
5045
ca-certificates \
@@ -60,21 +55,23 @@ RUN \
6055
liblapack-dev \
6156
openmpi-bin \
6257
graphviz \
63-
libnccl2=$TO_INSTALL_NCCL \
64-
libnccl-dev=$TO_INSTALL_NCCL \
58+
llvm-dev \
59+
libnccl2 \
60+
libnccl-dev \
61+
libzstd-dev \
6562
ssh \
6663
&& \
6764
# Install python
6865
add-apt-repository ppa:deadsnakes/ppa && \
6966
apt-get install -y \
7067
python${PYTHON_VERSION} \
71-
python${PYTHON_VERSION}-distutils \
7268
python${PYTHON_VERSION}-dev \
7369
&& \
7470
update-alternatives --install /usr/bin/python${PYTHON_VERSION%%.*} python${PYTHON_VERSION%%.*} /usr/bin/python${PYTHON_VERSION} 1 && \
7571
update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 && \
72+
echo -e "[global]\nbreak-system-packages = true" > /etc/pip.conf && \
7673
curl https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} && \
77-
pip install "numpy >=1.23.0,<2" && \
74+
pip install numpy && \
7875
# Cleaning
7976
apt-get autoremove -y && \
8077
apt-get clean && \

0 commit comments

Comments
 (0)