Skip to content

Commit d85c474

Browse files
authored
resolve failing tests with pt-2.1 (#21130)
* pip uninstall -y deepspeed
* TORCH_VER
* Apply suggestions from code review
* A100 & L4
* machine
1 parent 634e6e6 commit d85c474

File tree

3 files changed

+28
-14
lines changed

3 files changed

+28
-14
lines changed

.lightning/workflows/fabric.yml

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,22 @@ trigger:
44
pull_request:
55
branches: ["master"]
66

7-
timeout: "75" # minutes
8-
machine: "L4_X_2"
7+
timeout: "55" # minutes
98
parametrize:
109
matrix: {}
1110
include:
12-
# note that this is setting also all oldest requirements which is linked to Torch == 2.0
11+
# note that this also sets all requirements to their oldest versions, which is tied to Torch == 2.1
1312
- image: "pytorchlightning/pytorch_lightning:base-cuda12.1.1-py3.10-torch2.1"
1413
PACKAGE_NAME: "fabric"
15-
- image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7"
14+
machine: "A100_X_2"
15+
- image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8"
1616
PACKAGE_NAME: "fabric"
17+
machine: "L4_X_2"
1718
# - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7"
1819
# PACKAGE_NAME: "fabric"
19-
- image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7"
20+
- image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8"
2021
PACKAGE_NAME: "lightning"
22+
machine: "L4_X_2"
2123
exclude: []
2224

2325
env:
@@ -30,6 +32,7 @@ run: |
3032
python --version
3133
pip --version
3234
pip install -q fire wget packaging
35+
pip list
3336
set -ex
3437
3538
CUDA_VERSION="${image##*cuda}" # Remove everything up to and including "cuda"
@@ -40,12 +43,15 @@ run: |
4043
echo "Torch URL: ${TORCH_URL}"
4144
COVERAGE_SOURCE=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="lightning_fabric").get(n, n))')
4245
echo "collecting coverage for: ${COVERAGE_SOURCE}"
46+
TORCH_VER=$(python -c "import torch; print(torch.__version__.rsplit('.', 1)[0])")
4347
4448
if [ "${TORCH_VER}" == "2.1" ]; then
4549
echo "Set oldest versions"
46-
cd requirements/fabric
50+
pip uninstall -y deepspeed
4751
pip install -U "lightning-utilities[cli]"
52+
cd requirements/fabric
4853
python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'strategies.txt']"
54+
python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files strategies.txt
4955
cd ../..
5056
pip install "cython<3.0" wheel # for compatibility
5157
fi
@@ -92,6 +98,7 @@ run: |
9298
export PL_RUN_STANDALONE_TESTS=1
9399
wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
94100
bash ./run_standalone_tests.sh "tests_fabric"
101+
export PL_RUN_STANDALONE_TESTS=0
95102
96103
# echo "Reporting coverage" # todo
97104
# python -m coverage report

.lightning/workflows/pytorch.yml

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,22 @@ trigger:
44
pull_request:
55
branches: ["master"]
66

7-
timeout: "75" # minutes
8-
machine: "L4_X_2"
7+
timeout: "55" # minutes
98
parametrize:
109
matrix: {}
1110
include:
12-
# note that this is setting also all oldest requirements which is linked to Torch == 2.0
11+
# note that this also sets all requirements to their oldest versions, which is tied to Torch == 2.1
1312
- image: "pytorchlightning/pytorch_lightning:base-cuda12.1.1-py3.10-torch2.1"
1413
PACKAGE_NAME: "pytorch"
15-
- image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7"
14+
machine: "A100_X_2"
15+
- image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8"
1616
PACKAGE_NAME: "pytorch"
17+
machine: "L4_X_2"
1718
# - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7"
1819
# PACKAGE_NAME: "pytorch"
19-
- image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7"
20+
- image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8"
2021
PACKAGE_NAME: "lightning"
22+
machine: "L4_X_2"
2123
exclude: []
2224

2325
env:
@@ -30,6 +32,7 @@ run: |
3032
python --version
3133
pip --version
3234
pip install -q fire wget packaging
35+
pip list
3336
set -ex
3437
3538
CUDA_VERSION="${image##*cuda}" # Remove everything up to and including "cuda"
@@ -40,12 +43,15 @@ run: |
4043
echo "Torch URL: ${TORCH_URL}"
4144
COVERAGE_SOURCE=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="pytorch_lightning").get(n, n))')
4245
echo "collecting coverage for: ${COVERAGE_SOURCE}"
46+
TORCH_VER=$(python -c "import torch; print(torch.__version__.rsplit('.', 1)[0])")
4347
4448
if [ "${TORCH_VER}" == "2.1" ]; then
45-
recho "Set oldest versions"
46-
cd requirements/pytorch
49+
echo "Set oldest versions"
50+
pip uninstall -y deepspeed
4751
pip install -U "lightning-utilities[cli]"
52+
cd requirements/pytorch
4853
python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'extra.txt', 'strategies.txt', 'examples.txt']"
54+
python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files strategies.txt
4955
cd ../..
5056
pip install "cython<3.0" wheel # for compatibility
5157
fi
@@ -108,6 +114,7 @@ run: |
108114
export PL_RUN_STANDALONE_TESTS=1
109115
wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
110116
bash ./run_standalone_tests.sh "tests_pytorch"
117+
export PL_RUN_STANDALONE_TESTS=0
111118
112119
echo "Testing: PyTorch standalone tasks"
113120
cd tests_pytorch/

dockers/base-cuda/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
# limitations under the License.
1414

1515
ARG UBUNTU_VERSION=22.04
16-
ARG CUDA_VERSION=11.7.1
16+
ARG CUDA_VERSION=12.1.1
1717

1818

1919
FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

0 commit comments

Comments
 (0)