Skip to content

Commit e0d4700

Browse files
authored
Merge branch 'master' into throughput-callback-variable-batch-size
2 parents 88b26be + 3998b5d commit e0d4700

File tree

659 files changed

+16410
-8038
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

659 files changed

+16410
-8038
lines changed

.actions/assistant.py

Lines changed: 54 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,11 @@
1818
import shutil
1919
import tempfile
2020
import urllib.request
21+
from collections.abc import Iterable, Iterator, Sequence
2122
from itertools import chain
2223
from os.path import dirname, isfile
2324
from pathlib import Path
24-
from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Tuple
25+
from typing import Any, Optional
2526

2627
from packaging.requirements import Requirement
2728
from packaging.version import Version
@@ -127,7 +128,7 @@ def _parse_requirements(lines: Iterable[str]) -> Iterator[_RequirementWithCommen
127128
pip_argument = None
128129

129130

130-
def load_requirements(path_dir: str, file_name: str = "base.txt", unfreeze: str = "all") -> List[str]:
131+
def load_requirements(path_dir: str, file_name: str = "base.txt", unfreeze: str = "all") -> list[str]:
131132
"""Loading requirements from a file.
132133
133134
>>> path_req = os.path.join(_PROJECT_ROOT, "requirements")
@@ -153,8 +154,8 @@ def load_readme_description(path_dir: str, homepage: str, version: str) -> str:
153154
154155
"""
155156
path_readme = os.path.join(path_dir, "README.md")
156-
with open(path_readme, encoding="utf-8") as fo:
157-
text = fo.read()
157+
with open(path_readme, encoding="utf-8") as fopen:
158+
text = fopen.read()
158159

159160
# drop images from readme
160161
text = text.replace(
@@ -222,7 +223,7 @@ def _load_aggregate_requirements(req_dir: str = "requirements", freeze_requireme
222223
fp.writelines([ln + os.linesep for ln in requires] + [os.linesep])
223224

224225

225-
def _retrieve_files(directory: str, *ext: str) -> List[str]:
226+
def _retrieve_files(directory: str, *ext: str) -> list[str]:
226227
all_files = []
227228
for root, _, files in os.walk(directory):
228229
for fname in files:
@@ -232,7 +233,7 @@ def _retrieve_files(directory: str, *ext: str) -> List[str]:
232233
return all_files
233234

234235

235-
def _replace_imports(lines: List[str], mapping: List[Tuple[str, str]], lightning_by: str = "") -> List[str]:
236+
def _replace_imports(lines: list[str], mapping: list[tuple[str, str]], lightning_by: str = "") -> list[str]:
236237
"""Replace imports of standalone package to lightning.
237238
238239
>>> lns = [
@@ -307,20 +308,20 @@ def copy_replace_imports(
307308
if ext in (".pyc",):
308309
continue
309310
# Try to parse everything else
310-
with open(fp, encoding="utf-8") as fo:
311+
with open(fp, encoding="utf-8") as fopen:
311312
try:
312-
lines = fo.readlines()
313+
lines = fopen.readlines()
313314
except UnicodeDecodeError:
314315
# a binary file, skip
315316
print(f"Skipped replacing imports for {fp}")
316317
continue
317318
lines = _replace_imports(lines, list(zip(source_imports, target_imports)), lightning_by=lightning_by)
318319
os.makedirs(os.path.dirname(fp_new), exist_ok=True)
319-
with open(fp_new, "w", encoding="utf-8") as fo:
320-
fo.writelines(lines)
320+
with open(fp_new, "w", encoding="utf-8") as fopen:
321+
fopen.writelines(lines)
321322

322323

323-
def create_mirror_package(source_dir: str, package_mapping: Dict[str, str]) -> None:
324+
def create_mirror_package(source_dir: str, package_mapping: dict[str, str]) -> None:
324325
"""Create a mirror package with adjusted imports."""
325326
# replace imports and copy the code
326327
mapping = package_mapping.copy()
@@ -340,47 +341,6 @@ def create_mirror_package(source_dir: str, package_mapping: Dict[str, str]) -> N
340341

341342

342343
class AssistantCLI:
343-
@staticmethod
344-
def requirements_prune_pkgs(packages: Sequence[str], req_files: Sequence[str] = REQUIREMENT_FILES_ALL) -> None:
345-
"""Remove some packages from given requirement files."""
346-
if isinstance(req_files, str):
347-
req_files = [req_files]
348-
for req in req_files:
349-
AssistantCLI._prune_packages(req, packages)
350-
351-
@staticmethod
352-
def _prune_packages(req_file: str, packages: Sequence[str]) -> None:
353-
"""Remove some packages from given requirement files."""
354-
path = Path(req_file)
355-
assert path.exists()
356-
text = path.read_text()
357-
lines = text.splitlines()
358-
final = []
359-
for line in lines:
360-
ln_ = line.strip()
361-
if not ln_ or ln_.startswith("#"):
362-
final.append(line)
363-
continue
364-
req = list(_parse_requirements([ln_]))[0]
365-
if req.name not in packages:
366-
final.append(line)
367-
print(final)
368-
path.write_text("\n".join(final) + "\n")
369-
370-
@staticmethod
371-
def _replace_min(fname: str) -> None:
372-
with open(fname, encoding="utf-8") as fo:
373-
req = fo.read().replace(">=", "==")
374-
with open(fname, "w", encoding="utf-8") as fw:
375-
fw.write(req)
376-
377-
@staticmethod
378-
def replace_oldest_ver(requirement_fnames: Sequence[str] = REQUIREMENT_FILES_ALL) -> None:
379-
"""Replace the min package version by fixed one."""
380-
for fname in requirement_fnames:
381-
print(fname)
382-
AssistantCLI._replace_min(fname)
383-
384344
@staticmethod
385345
def copy_replace_imports(
386346
source_dir: str,
@@ -470,15 +430,53 @@ def convert_version2nightly(ver_file: str = "src/version.info") -> None:
470430
"""Load the actual version and convert it to the nightly version."""
471431
from datetime import datetime
472432

473-
with open(ver_file) as fo:
474-
version = fo.read().strip()
433+
with open(ver_file) as fopen:
434+
version = fopen.read().strip()
475435
# parse X.Y.Z version and prune any suffix
476436
vers = re.match(r"(\d+)\.(\d+)\.(\d+).*", version)
477437
# create timestamp YYYYMMDD
478438
timestamp = datetime.now().strftime("%Y%m%d")
479439
version = f"{'.'.join(vers.groups())}.dev{timestamp}"
480-
with open(ver_file, "w") as fo:
481-
fo.write(version + os.linesep)
440+
with open(ver_file, "w") as fopen:
441+
fopen.write(version + os.linesep)
442+
443+
@staticmethod
444+
def generate_docker_tags(
    release_version: str,
    python_version: str,
    torch_version: str,
    cuda_version: str,
    docker_project: str = "pytorchlightning/pytorch_lightning",
    add_latest: bool = False,
) -> None:
    """Print the comma-separated docker image tags for one build configuration.

    Args:
        release_version: Release identifier; when empty, no release-specific tag is produced.
        python_version: Python version embedded in the tag name.
        torch_version: Torch version embedded in the tag name.
        cuda_version: CUDA version embedded in the tag name.
        docker_project: Repository name placed in front of every tag.
        add_latest: When true, a plain ``latest`` tag is appended as well.

    The result is written to stdout as a single line, not returned.

    """
    # every versioned tag shares the same python/torch/cuda suffix
    suffix = f"py{python_version}-torch{torch_version}-cuda{cuda_version}"
    names = [f"latest-{suffix}"]
    if release_version:
        names.append(f"{release_version}-{suffix}")
    if add_latest:
        names.append("latest")

    print(",".join(f"{docker_project}:{name}" for name in names))
461+
462+
@staticmethod
463+
def prune_pytest_as_errors(
    pyproject_toml: str = "pyproject.toml", errors: tuple = ("FutureWarning", "DeprecationWarning")
) -> None:
    """Drop ``error::<name>`` entries from pytest's ``filterwarnings`` in a pyproject file.

    Args:
        pyproject_toml: Path of the TOML file, which is read and then rewritten in place.
        errors: Warning names; every filter entry containing ``error::<name>`` for one of them is removed.

    The file is left untouched when ``tool.pytest.ini_options.filterwarnings`` is missing or empty.

    """
    import tomlkit

    with open(pyproject_toml, encoding="utf-8") as fopen:
        pyproject = tomlkit.parse(fopen.read())

    # walk down to the filter list, tolerating any missing intermediate table
    ini_options = pyproject.get("tool", {}).get("pytest", {}).get("ini_options", {})
    filterwarnings = ini_options.get("filterwarnings", [])
    if not filterwarnings:
        return

    kept = []
    for wrn in filterwarnings:
        if any(f"error::{err}" in wrn for err in errors):
            continue
        kept.append(wrn)
    pyproject["tool"]["pytest"]["ini_options"]["filterwarnings"] = kept

    with open(pyproject_toml, "w", encoding="utf-8") as fopen:
        fopen.write(tomlkit.dumps(pyproject))
482480

483481

484482
if __name__ == "__main__":

.actions/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
jsonargparse >=4.16.0, <4.28.0
1+
jsonargparse
22
requests
33
packaging

.azure/gpu-benchmarks.yml

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ jobs:
4646
variables:
4747
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
4848
container:
49-
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.4-cuda12.1.0"
49+
image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8"
5050
options: "--gpus=all --shm-size=32g"
5151
strategy:
5252
matrix:
@@ -75,6 +75,13 @@ jobs:
7575
pip list
7676
displayName: "Image info & NVIDIA"
7777
78+
- bash: |
79+
pip install -U -q -r .actions/requirements.txt
80+
python .actions/assistant.py copy_replace_imports --source_dir="./tests" \
81+
--source_import="lightning.fabric,lightning.pytorch" \
82+
--target_import="lightning_fabric,pytorch_lightning"
83+
displayName: "Adjust tests"
84+
7885
- bash: pip install -e .[dev] --find-links ${TORCH_URL}
7986
env:
8087
FREEZE_REQUIREMENTS: "1"
@@ -86,17 +93,10 @@ jobs:
8693
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
8794
displayName: "Env details"
8895
89-
- bash: |
90-
pip install -q -r .actions/requirements.txt
91-
python .actions/assistant.py copy_replace_imports --source_dir="./tests" \
92-
--source_import="lightning.fabric,lightning.pytorch" \
93-
--target_import="lightning_fabric,pytorch_lightning"
94-
displayName: "Adjust tests"
95-
9696
- bash: python -m pytest parity_$(PACKAGE_NAME) -v --durations=0
9797
env:
9898
PL_RUNNING_BENCHMARKS: "1"
99-
PL_RUN_CUDA_TESTS: "1"
99+
RUN_ONLY_CUDA_TESTS: "1"
100100
workingDirectory: tests/
101101
displayName: "Testing: benchmarks"
102102

@@ -105,6 +105,7 @@ jobs:
105105
# without succeeded this could run even if the job has already failed
106106
condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))
107107
env:
108-
PL_RUN_CUDA_TESTS: "1"
108+
RUN_ONLY_CUDA_TESTS: "1"
109+
PL_RUN_STANDALONE_TESTS: "1"
109110
displayName: "Testing: fabric standalone tasks"
110111
timeoutInMinutes: "10"

.azure/gpu-tests-fabric.yml

Lines changed: 47 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -48,19 +48,25 @@ jobs:
4848
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
4949
FREEZE_REQUIREMENTS: "1"
5050
PIP_CACHE_DIR: "/var/tmp/pip"
51-
PL_RUN_CUDA_TESTS: "1"
51+
RUN_ONLY_CUDA_TESTS: "1"
5252
container:
5353
image: $(image)
5454
# default shm size is 64m. Increase it to avoid:
5555
# 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
5656
options: "--gpus=all --shm-size=2gb -v /var/tmp:/var/tmp"
5757
strategy:
5858
matrix:
59+
"Fabric | oldest":
60+
image: "pytorchlightning/pytorch_lightning:base-cuda12.1.1-py3.10-torch2.1"
61+
PACKAGE_NAME: "fabric"
5962
"Fabric | latest":
60-
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0"
63+
image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8"
6164
PACKAGE_NAME: "fabric"
65+
#"Fabric | future":
66+
# image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7"
67+
# PACKAGE_NAME: "fabric"
6268
"Lightning | latest":
63-
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.4-cuda12.1.0"
69+
image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8"
6470
PACKAGE_NAME: "lightning"
6571
workspace:
6672
clean: all
@@ -72,14 +78,11 @@ jobs:
7278
echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html"
7379
scope=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="lightning_fabric").get(n, n))')
7480
echo "##vso[task.setvariable variable=COVERAGE_SOURCE]$scope"
75-
python_ver=$(python -c "import sys; print(f'{sys.version_info.major}{sys.version_info.minor}')")
76-
echo "##vso[task.setvariable variable=PYTHON_VERSION_MM]$python_ver"
7781
displayName: "set env. vars"
7882
- bash: |
7983
echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/test/cu${CUDA_VERSION_MM}"
80-
echo "##vso[task.setvariable variable=TORCHVISION_URL]https://download.pytorch.org/whl/test/cu124/torchvision-0.19.0%2Bcu124-cp${PYTHON_VERSION_MM}-cp${PYTHON_VERSION_MM}-linux_x86_64.whl"
8184
condition: endsWith(variables['Agent.JobName'], 'future')
82-
displayName: "set env. vars 4 future"
85+
displayName: "extend env. vars 4 future"
8386
8487
- bash: |
8588
echo $(DEVICES)
@@ -95,6 +98,20 @@ jobs:
9598
pip list
9699
displayName: "Image info & NVIDIA"
97100
101+
- bash: |
102+
set -ex
103+
pip install "cython<3.0" wheel # for compatibility
104+
pip install -U "lightning-utilities[cli]"
105+
cd requirements/fabric
106+
# replace version ranges by pinning the minimal requirements
107+
python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'strategies.txt']"
108+
# drop deepspeed since it is not supported by our minimal Torch requirements
109+
python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files strategies.txt
110+
# uninstall deepspeed since some older docker images have it pre-installed
111+
pip uninstall -y deepspeed
112+
condition: contains(variables['Agent.JobName'], 'oldest')
113+
displayName: "setting oldest dependencies"
114+
98115
- bash: |
99116
PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
100117
pip install -q wget packaging
@@ -105,14 +122,28 @@ jobs:
105122
displayName: "Adjust dependencies"
106123
107124
- bash: |
125+
pip install -U -q -r .actions/requirements.txt
126+
python .actions/assistant.py copy_replace_imports --source_dir="./tests/tests_fabric" \
127+
--source_import="lightning.fabric" \
128+
--target_import="lightning_fabric"
129+
python .actions/assistant.py copy_replace_imports --source_dir="./examples/fabric" \
130+
--source_import="lightning.fabric" \
131+
--target_import="lightning_fabric"
132+
# without succeeded this could run even if the job has already failed
133+
condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))
134+
displayName: "Adjust tests & examples"
135+
136+
- bash: |
137+
set -e
108138
extra=$(python -c "print({'lightning': 'fabric-'}.get('$(PACKAGE_NAME)', ''))")
109-
pip install -e ".[${extra}dev]" pytest-timeout -U --find-links="${TORCH_URL}" --find-links="${TORCHVISION_URL}"
139+
pip install -e ".[${extra}dev]" -U --upgrade-strategy=eager --extra-index-url="${TORCH_URL}"
110140
displayName: "Install package & dependencies"
111141
112142
- bash: |
113143
set -e
114144
python requirements/collect_env_details.py
115145
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
146+
python requirements/pytorch/check-avail-extras.py
116147
python -c "import bitsandbytes"
117148
displayName: "Env details"
118149
@@ -122,27 +153,17 @@ jobs:
122153
condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))
123154
displayName: "Testing: Fabric doctests"
124155

125-
- bash: |
126-
pip install -q -r .actions/requirements.txt
127-
python .actions/assistant.py copy_replace_imports --source_dir="./tests/tests_fabric" \
128-
--source_import="lightning.fabric" \
129-
--target_import="lightning_fabric"
130-
python .actions/assistant.py copy_replace_imports --source_dir="./examples/fabric" \
131-
--source_import="lightning.fabric" \
132-
--target_import="lightning_fabric"
133-
# without succeeded this could run even if the job has already failed
134-
condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))
135-
displayName: "Adjust tests & examples"
136-
137-
- bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest . -v --durations=50
138-
workingDirectory: tests/tests_fabric/
156+
- bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest tests_fabric/ -v --durations=50
157+
workingDirectory: tests/
139158
displayName: "Testing: fabric standard"
140159
timeoutInMinutes: "10"
141160

142-
- bash: bash ../run_standalone_tests.sh "."
143-
workingDirectory: tests/tests_fabric/
161+
- bash: |
162+
wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
163+
bash ./run_standalone_tests.sh "tests_fabric"
164+
workingDirectory: tests/
144165
env:
145-
PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE)
166+
PL_RUN_STANDALONE_TESTS: "1"
146167
displayName: "Testing: fabric standalone"
147168
timeoutInMinutes: "10"
148169
@@ -157,7 +178,7 @@ jobs:
157178
./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
158179
--flags=gpu,pytest,${COVERAGE_SOURCE} --name="GPU-coverage" --env=linux,azure
159180
ls -l
160-
workingDirectory: tests/tests_fabric/
181+
workingDirectory: tests/
161182
displayName: "Statistics"
162183
163184
- script: |

0 commit comments

Comments
 (0)