Skip to content

Commit 64b138c

Browse files
Merge branch 'Lightning-AI:master' into master
2 parents 9452771 + debcda0 commit 64b138c

File tree

33 files changed

+644
-131
lines changed

33 files changed

+644
-131
lines changed

.lightning/workflows/fabric.yml

Lines changed: 68 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,72 +1,116 @@
11
trigger:
22
push:
3-
branches: ["master"]
3+
branches: ["master", "release/stable"]
44
pull_request:
5-
branches: ["master"]
5+
branches: ["master", "release/stable"]
66

77
timeout: "55" # minutes
88
parametrize:
99
matrix: {}
1010
include:
11-
# note that this is setting also all oldest requirements which is linked to Torch == 2.1
12-
- image: "pytorchlightning/pytorch_lightning:base-cuda12.1.1-py3.10-torch2.1"
11+
# note that this also sets all oldest requirements, which are linked to Python == 3.10
12+
- image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
1313
PACKAGE_NAME: "fabric"
14+
python_version: "3.10"
1415
machine: "A100_X_2"
15-
- image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8"
16+
- image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
1617
PACKAGE_NAME: "fabric"
18+
python_version: "3.12"
1719
machine: "L4_X_2"
18-
# - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7"
20+
# - image: "nvidia/cuda:12.6-runtime-ubuntu22.04"
1921
# PACKAGE_NAME: "fabric"
20-
- image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8"
22+
- image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
2123
PACKAGE_NAME: "lightning"
24+
python_version: "3.12"
2225
machine: "L4_X_2"
2326
exclude: []
2427

2528
env:
29+
TZ: "Etc/UTC"
30+
DEBIAN_FRONTEND: "noninteractive"
31+
CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda"
32+
MKL_THREADING_LAYER: "GNU"
33+
CUDA_LAUNCH_BLOCKING: "1"
34+
NCCL_DEBUG: "INFO"
35+
TORCHDYNAMO_VERBOSE: "1"
2636
FREEZE_REQUIREMENTS: "1"
2737
RUN_ONLY_CUDA_TESTS: "1"
2838

2939
run: |
40+
# Install Python and UV
41+
apt-get update -qq --fix-missing
42+
apt-get install -q -y software-properties-common curl
43+
# Add deadsnakes PPA for newer Python versions if needed
44+
add-apt-repository ppa:deadsnakes/ppa -y
45+
apt-get update -qq --fix-missing
46+
apt-get install -q -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
47+
build-essential \
48+
pkg-config \
49+
cmake \
50+
ca-certificates \
51+
libopenmpi-dev \
52+
openmpi-bin \
53+
ninja-build \
54+
libnccl2 \
55+
libnccl-dev
56+
57+
apt-get install -y python${python_version} python${python_version}-venv python${python_version}-dev
58+
ln -sf /usr/bin/python${python_version} /usr/bin/python
59+
curl -LsSf https://astral.sh/uv/install.sh | sh
60+
61+
# Source the environment and ensure UV is in PATH
62+
[ -f "$HOME/.local/bin/env" ] && . "$HOME/.local/bin/env"
63+
export PATH="$HOME/.local/bin:$PATH"
64+
source $HOME/.cargo/env 2>/dev/null || true
65+
export PATH="$HOME/.cargo/bin:$PATH"
66+
67+
# Verify UV installation
68+
command -v uv || (echo "UV not found in PATH" && exit 1)
69+
# Create and activate a local uv virtual environment
70+
uv venv .venv -p "/usr/bin/python${python_version}" || uv venv .venv -p "python${python_version}" || uv venv .venv
71+
. .venv/bin/activate
72+
hash -r
73+
3074
whereis nvidia
3175
nvidia-smi
3276
python --version
33-
pip --version
34-
pip install -q fire wget packaging
35-
pip list
77+
uv --version
78+
uv pip list
3679
set -ex
3780
38-
CUDA_VERSION="${image##*cuda}" # Remove everything up to and including "cuda"
81+
# Parse CUDA version from image tag, e.g., "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
82+
IMAGE_TAG="${image##*:}" # "12.6.3-runtime-ubuntu22.04"
83+
CUDA_VERSION="${IMAGE_TAG%%-*}" # "12.6.3"
3984
echo "Using CUDA version: ${CUDA_VERSION}"
40-
CUDA_VERSION_M_M="${cuda_version%.*}" # Get major.minor by removing the last dot and everything after
41-
CUDA_VERSION_MM="${CUDA_VERSION_M_M//'.'/''}"
42-
TORCH_URL="https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html"
43-
echo "Torch URL: ${TORCH_URL}"
85+
CUDA_VERSION_M_M="${CUDA_VERSION%.*}" # "12.6"
86+
CUDA_VERSION_MM="${CUDA_VERSION_M_M//./}" # "126"
87+
export UV_TORCH_BACKEND=cu${CUDA_VERSION_MM}
4488
COVERAGE_SOURCE=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="lightning_fabric").get(n, n))')
4589
echo "collecting coverage for: ${COVERAGE_SOURCE}"
46-
TORCH_VER=$(python -c "import torch; print(torch.__version__.rsplit('.', 1)[0])")
4790
48-
if [ "${TORCH_VER}" == "2.1" ]; then
91+
uv pip install fire wget packaging "lightning-utilities[cli]"
92+
if [ "${python_version}" == "3.10" ]; then
4993
echo "Set oldest versions"
50-
pip uninstall -y deepspeed
51-
pip install -U "lightning-utilities[cli]"
5294
cd requirements/fabric
5395
python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'strategies.txt']"
5496
python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files strategies.txt
5597
cd ../..
56-
pip install "cython<3.0" wheel # for compatibility
98+
uv pip install "cython<3.0" wheel # for compatibility
5799
fi
58100
101+
# install the base so we can adjust other packages
102+
uv pip install .
59103
echo "Adjust torch versions in requirements files"
60104
PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
61-
pip install -q wget packaging
105+
uv pip install wget packaging
62106
python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
63107
for fpath in `ls requirements/**/*.txt`; do \
64108
python ./adjust-torch-versions.py $fpath ${PYTORCH_VERSION}; \
65109
done
66110
67111
if [ "${PACKAGE_NAME}" == "fabric" ]; then
68112
echo "Replaced PL imports"
69-
pip install -U -q -r .actions/requirements.txt
113+
uv pip install --upgrade -r .actions/requirements.txt
70114
python .actions/assistant.py copy_replace_imports --source_dir="./tests/tests_fabric" \
71115
--source_import="lightning.fabric" \
72116
--target_import="lightning_fabric"
@@ -76,11 +120,10 @@ run: |
76120
fi
77121
78122
extra=$(python -c "print({'lightning': 'fabric-'}.get('$(PACKAGE_NAME)', ''))")
79-
pip install -e ".[${extra}dev]" -U --upgrade-strategy=eager --extra-index-url="${TORCH_URL}"
123+
uv pip install ".[${extra}dev]" --upgrade
80124
81125
python requirements/collect_env_details.py
82126
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'"
83-
python requirements/pytorch/check-avail-extras.py
84127
python -c "import bitsandbytes"
85128
86129
echo "Testing: Fabric doctests"
@@ -96,7 +139,7 @@ run: |
96139
97140
echo "Testing: fabric standalone"
98141
export PL_RUN_STANDALONE_TESTS=1
99-
wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
142+
python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
100143
bash ./run_standalone_tests.sh "tests_fabric"
101144
export PL_RUN_STANDALONE_TESTS=0
102145

.lightning/workflows/pytorch.yml

Lines changed: 70 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,72 +1,116 @@
11
trigger:
22
push:
3-
branches: ["master"]
3+
branches: ["master", "release/stable"]
44
pull_request:
5-
branches: ["master"]
5+
branches: ["master", "release/stable"]
66

77
timeout: "55" # minutes
88
parametrize:
99
matrix: {}
1010
include:
11-
# note that this is setting also all oldest requirements which is linked to Torch == 2.1
12-
- image: "pytorchlightning/pytorch_lightning:base-cuda12.1.1-py3.10-torch2.1"
11+
# note that this also sets oldest requirements which are linked to Python == 3.10
12+
- image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
1313
PACKAGE_NAME: "pytorch"
14+
python_version: "3.10"
1415
machine: "A100_X_2"
15-
- image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8"
16+
- image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
1617
PACKAGE_NAME: "pytorch"
18+
python_version: "3.12"
1719
machine: "L4_X_2"
18-
# - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7"
20+
# - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
1921
# PACKAGE_NAME: "pytorch"
20-
- image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8"
22+
- image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
2123
PACKAGE_NAME: "lightning"
24+
python_version: "3.12"
2225
machine: "L4_X_2"
2326
exclude: []
2427

2528
env:
29+
TZ: "Etc/UTC"
30+
DEBIAN_FRONTEND: "noninteractive"
31+
CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda"
32+
MKL_THREADING_LAYER: "GNU"
33+
CUDA_LAUNCH_BLOCKING: "1"
34+
NCCL_DEBUG: "INFO"
35+
TORCHDYNAMO_VERBOSE: "1"
2636
FREEZE_REQUIREMENTS: "1"
2737
RUN_ONLY_CUDA_TESTS: "1"
2838

2939
run: |
40+
# Install Python and UV
41+
apt-get update -qq --fix-missing
42+
apt-get install -q -y software-properties-common curl
43+
# Add deadsnakes PPA for newer Python versions if needed
44+
add-apt-repository ppa:deadsnakes/ppa -y
45+
apt-get update -qq --fix-missing
46+
apt-get install -q -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
47+
build-essential \
48+
pkg-config \
49+
cmake \
50+
ca-certificates \
51+
libopenmpi-dev \
52+
openmpi-bin \
53+
ninja-build \
54+
libnccl2 \
55+
libnccl-dev
56+
57+
apt-get install -y python${python_version} python${python_version}-venv python${python_version}-dev
58+
ln -sf /usr/bin/python${python_version} /usr/bin/python
59+
curl -LsSf https://astral.sh/uv/install.sh | sh
60+
61+
# Source the environment and ensure UV is in PATH
62+
[ -f "$HOME/.local/bin/env" ] && . "$HOME/.local/bin/env"
63+
export PATH="$HOME/.local/bin:$PATH"
64+
source $HOME/.cargo/env 2>/dev/null || true
65+
export PATH="$HOME/.cargo/bin:$PATH"
66+
67+
# Verify UV installation
68+
command -v uv || (echo "UV not found in PATH" && exit 1)
69+
# Create and activate a local uv virtual environment
70+
uv venv .venv -p "/usr/bin/python${python_version}" || uv venv .venv -p "python${python_version}" || uv venv .venv
71+
. .venv/bin/activate
72+
hash -r
73+
3074
whereis nvidia
3175
nvidia-smi
3276
python --version
33-
pip --version
34-
pip install -q fire wget packaging
35-
pip list
77+
uv --version
78+
uv pip list
3679
set -ex
3780
38-
CUDA_VERSION="${image##*cuda}" # Remove everything up to and including "cuda"
81+
# Parse CUDA version from image tag, e.g., "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
82+
IMAGE_TAG="${image##*:}" # "12.6.3-runtime-ubuntu22.04"
83+
CUDA_VERSION="${IMAGE_TAG%%-*}" # "12.6.3"
3984
echo "Using CUDA version: ${CUDA_VERSION}"
40-
CUDA_VERSION_M_M="${cuda_version%.*}" # Get major.minor by removing the last dot and everything after
41-
CUDA_VERSION_MM="${CUDA_VERSION_M_M//'.'/''}"
42-
TORCH_URL="https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html"
43-
echo "Torch URL: ${TORCH_URL}"
85+
CUDA_VERSION_M_M="${CUDA_VERSION%.*}" # "12.6"
86+
CUDA_VERSION_MM="${CUDA_VERSION_M_M//./}" # "126"
87+
export UV_TORCH_BACKEND=cu${CUDA_VERSION_MM}
4488
COVERAGE_SOURCE=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="pytorch_lightning").get(n, n))')
4589
echo "collecting coverage for: ${COVERAGE_SOURCE}"
46-
TORCH_VER=$(python -c "import torch; print(torch.__version__.rsplit('.', 1)[0])")
4790
48-
if [ "${TORCH_VER}" == "2.1" ]; then
91+
uv pip install -q fire wget packaging "lightning-utilities[cli]"
92+
if [ "${python_version}" == "3.10" ]; then
4993
echo "Set oldest versions"
50-
pip uninstall -y deepspeed
51-
pip install -U "lightning-utilities[cli]"
5294
cd requirements/pytorch
5395
python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'extra.txt', 'strategies.txt', 'examples.txt']"
5496
python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files strategies.txt
5597
cd ../..
56-
pip install "cython<3.0" wheel # for compatibility
98+
uv pip install "cython<3.0" wheel # for compatibility
5799
fi
58100
101+
# install the base so we can adjust other packages
102+
uv pip install .
59103
echo "Adjust torch versions in requirements files"
60104
PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
61-
pip install -q wget packaging
105+
uv pip install -q wget packaging
62106
python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
63107
for fpath in `ls requirements/**/*.txt`; do \
64108
python ./adjust-torch-versions.py $fpath ${PYTORCH_VERSION}; \
65109
done
66110
67111
if [ "${PACKAGE_NAME}" == "pytorch" ]; then
68112
echo "Adjust PL imports"
69-
pip install -U -q -r .actions/requirements.txt
113+
uv pip install --upgrade -r .actions/requirements.txt
70114
python .actions/assistant.py copy_replace_imports --source_dir="./tests/tests_pytorch" \
71115
--source_import="lightning.fabric,lightning.pytorch" \
72116
--target_import="lightning_fabric,pytorch_lightning"
@@ -76,14 +120,14 @@ run: |
76120
fi
77121
78122
extra=$(python -c "print({'lightning': 'pytorch-'}.get('$(PACKAGE_NAME)', ''))")
79-
pip install -e ".[${extra}dev]" -U --upgrade-strategy=eager --extra-index-url="${TORCH_URL}"
123+
uv pip install -e ".[${extra}dev]" --upgrade
80124
81125
if [ "${PACKAGE_NAME}" == "pytorch" ]; then
82126
echo "uninstall lightning to have just single package"
83-
pip uninstall -y lightning
127+
uv pip uninstall lightning
84128
elif [ "${PACKAGE_NAME}" == "lightning" ]; then
85129
echo "uninstall PL to have just single package"
86-
pip uninstall -y pytorch-lightning
130+
uv pip uninstall pytorch-lightning
87131
fi
88132
89133
python requirements/collect_env_details.py
@@ -112,7 +156,7 @@ run: |
112156
echo "Testing: fabric standalone"
113157
export PL_USE_MOCKED_MNIST=1
114158
export PL_RUN_STANDALONE_TESTS=1
115-
wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
159+
python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
116160
bash ./run_standalone_tests.sh "tests_pytorch"
117161
export PL_RUN_STANDALONE_TESTS=0
118162

Makefile

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,8 @@ clean:
4545
rm -rf src/lightning_fabric/*/
4646
rm -rf src/pytorch_lightning/*/
4747

48-
test: clean
48+
test: clean setup
4949
# Review the CONTRIBUTING documentation for other ways to test.
50-
pip install -e . \
51-
-r requirements/pytorch/base.txt \
52-
-r requirements/fabric/base.txt \
53-
-r requirements/pytorch/test.txt \
5450

5551
# run tests with coverage
5652
python -m coverage run --source src/lightning/pytorch -m pytest src/lightning/pytorch tests/tests_pytorch -v
@@ -59,18 +55,18 @@ test: clean
5955

6056
docs: docs-pytorch
6157

62-
sphinx-theme:
63-
pip install -q awscli
58+
sphinx-theme: setup
59+
uv pip install -q awscli
6460
mkdir -p dist/
6561
aws s3 sync --no-sign-request s3://sphinx-packages/ dist/
66-
pip install lai-sphinx-theme -f dist/
62+
uv pip install lai-sphinx-theme -f dist/
6763

6864
docs-fabric: clean sphinx-theme
69-
pip install -e .[all] --quiet -r requirements/fabric/docs.txt
65+
uv pip install -e '.[all]' --quiet -r requirements/fabric/docs.txt
7066
cd docs/source-fabric && $(MAKE) html --jobs $(nproc)
7167

7268
docs-pytorch: clean sphinx-theme
73-
pip install -e .[all] --quiet -r requirements/pytorch/docs.txt
69+
uv pip install -e '.[all]' --quiet -r requirements/pytorch/docs.txt
7470
cd docs/source-pytorch && $(MAKE) html --jobs $(nproc)
7571

7672
update:

docs/source-pytorch/advanced/speed.rst

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -297,7 +297,8 @@ Validation Within Training Epoch
297297

298298
For large datasets, it's often desirable to check validation multiple times within a training epoch.
299299
Pass in a float to check that often within one training epoch. Pass in an int ``K`` to check every ``K`` training batches.
300-
Must use an ``int`` if using an :class:`~torch.utils.data.IterableDataset`.
300+
Must use an ``int`` if using an :class:`~torch.utils.data.IterableDataset`. Alternatively, pass a string ("DD:HH:MM:SS"),
301+
a dict of ``datetime.timedelta`` kwargs, or a ``datetime.timedelta`` to check validation after a given amount of wall-clock time.
301302

302303
.. testcode::
303304

@@ -310,6 +311,16 @@ Must use an ``int`` if using an :class:`~torch.utils.data.IterableDataset`.
310311
# check every 100 train batches (ie: for IterableDatasets or fixed frequency)
311312
trainer = Trainer(val_check_interval=100)
312313

314+
# check validation every 15 minutes of wall-clock time
315+
trainer = Trainer(val_check_interval="00:00:15:00")
316+
317+
# alternatively, pass a dict of timedelta kwargs
318+
trainer = Trainer(val_check_interval={"minutes": 1})
319+
320+
# or use a timedelta object directly
321+
from datetime import timedelta
322+
trainer = Trainer(val_check_interval=timedelta(hours=1))
323+
313324
Learn more in our :ref:`trainer_flags` guide.
314325

315326

0 commit comments

Comments
 (0)