diff --git a/.lightning/workflows/pytorch.yml b/.lightning/workflows/pytorch.yml
index 21551565b1fed..a8557a70c4598 100644
--- a/.lightning/workflows/pytorch.yml
+++ b/.lightning/workflows/pytorch.yml
@@ -8,57 +8,101 @@ timeout: "55" # minutes
 parametrize:
   matrix: {}
   include:
-    # note that this is setting also all oldest requirements which is linked to Torch == 2.1
-    - image: "pytorchlightning/pytorch_lightning:base-cuda12.1.1-py3.10-torch2.1"
+    # note that this also sets oldest requirements which are linked to Python == 3.10
+    - image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
       PACKAGE_NAME: "pytorch"
+      python_version: "3.10"
       machine: "A100_X_2"
-    - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8"
+    - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
       PACKAGE_NAME: "pytorch"
+      python_version: "3.12"
       machine: "L4_X_2"
-    # - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7"
+    # - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
     #   PACKAGE_NAME: "pytorch"
-    - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8"
+    - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
       PACKAGE_NAME: "lightning"
+      python_version: "3.12"
       machine: "L4_X_2"
   exclude: []
 
 env:
+  TZ: "Etc/UTC"
+  DEBIAN_FRONTEND: "noninteractive"
+  CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda"
+  MKL_THREADING_LAYER: "GNU"
+  CUDA_LAUNCH_BLOCKING: "1"
+  NCCL_DEBUG: "INFO"
+  TORCHDYNAMO_VERBOSE: "1"
   FREEZE_REQUIREMENTS: "1"
   RUN_ONLY_CUDA_TESTS: "1"
 
 run: |
+  # Install Python and UV
+  apt-get update -qq --fix-missing
+  apt-get install -q -y software-properties-common curl
+  # Add deadsnakes PPA for newer Python versions if needed
+  add-apt-repository ppa:deadsnakes/ppa -y
+  apt-get update -qq --fix-missing
+  apt-get install -q -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
+    build-essential \
+    pkg-config \
+    cmake \
+    ca-certificates \
+    libopenmpi-dev \
+    openmpi-bin \
+    ninja-build \
+    libnccl2 \
+    libnccl-dev
+
+  apt-get install -y python${python_version} python${python_version}-venv python${python_version}-dev
+  ln -sf /usr/bin/python${python_version} /usr/bin/python
+  curl -LsSf https://astral.sh/uv/install.sh | sh
+
+  # Source the environment and ensure UV is in PATH
+  [ -f "$HOME/.local/bin/env" ] && . "$HOME/.local/bin/env"
+  export PATH="$HOME/.local/bin:$PATH"
+  source $HOME/.cargo/env 2>/dev/null || true
+  export PATH="$HOME/.cargo/bin:$PATH"
+
+  # Verify UV installation
+  command -v uv || (echo "UV not found in PATH" && exit 1)
+  # Create and activate a local uv virtual environment
+  uv venv .venv -p "/usr/bin/python${python_version}" || uv venv .venv -p "python${python_version}" || uv venv .venv
+  . .venv/bin/activate
+  hash -r
+
   whereis nvidia
   nvidia-smi
   python --version
-  pip --version
-  pip install -q fire wget packaging
-  pip list
+  uv --version
+  uv pip list
   set -ex
 
-  CUDA_VERSION="${image##*cuda}" # Remove everything up to and including "cuda"
+  # Parse CUDA version from image tag, e.g., "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
+  IMAGE_TAG="${image##*:}"  # "12.6.3-runtime-ubuntu22.04"
+  CUDA_VERSION="${IMAGE_TAG%%-*}"  # "12.6.3"
   echo "Using CUDA version: ${CUDA_VERSION}"
-  CUDA_VERSION_M_M="${cuda_version%.*}" # Get major.minor by removing the last dot and everything after
-  CUDA_VERSION_MM="${CUDA_VERSION_M_M//'.'/''}"
-  TORCH_URL="https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html"
-  echo "Torch URL: ${TORCH_URL}"
+  CUDA_VERSION_M_M="${CUDA_VERSION%.*}"  # "12.6"
+  CUDA_VERSION_MM="${CUDA_VERSION_M_M//./}"  # "126"
+  export UV_TORCH_BACKEND=cu${CUDA_VERSION_MM}
   COVERAGE_SOURCE=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="pytorch_lightning").get(n, n))')
   echo "collecting coverage for: ${COVERAGE_SOURCE}"
 
-  TORCH_VER=$(python -c "import torch; print(torch.__version__.rsplit('.', 1)[0])")
-  if [ "${TORCH_VER}" == "2.1" ]; then
+  uv pip install -q fire wget packaging "lightning-utilities[cli]"
+  if [ "${python_version}" == "3.10" ]; then
     echo "Set oldest versions"
-    pip uninstall -y deepspeed
-    pip install -U "lightning-utilities[cli]"
     cd requirements/pytorch
     python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'extra.txt', 'strategies.txt', 'examples.txt']"
     python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files strategies.txt
     cd ../..
-    pip install "cython<3.0" wheel # for compatibility
+    uv pip install "cython<3.0" wheel # for compatibility
   fi
 
+  # install the base so we can adjust other packages
+  uv pip install .
   echo "Adjust torch versions in requirements files"
   PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
-  pip install -q wget packaging
+  uv pip install -q wget packaging
   python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
   for fpath in `ls requirements/**/*.txt`; do \
     python ./adjust-torch-versions.py $fpath ${PYTORCH_VERSION}; \
@@ -66,7 +110,7 @@ run: |
 
   if [ "${PACKAGE_NAME}" == "pytorch" ]; then
     echo "Adjust PL imports"
-    pip install -U -q -r .actions/requirements.txt
+    uv pip install --upgrade -r .actions/requirements.txt
     python .actions/assistant.py copy_replace_imports --source_dir="./tests/tests_pytorch" \
       --source_import="lightning.fabric,lightning.pytorch" \
       --target_import="lightning_fabric,pytorch_lightning"
@@ -76,14 +120,14 @@ run: |
   fi
 
   extra=$(python -c "print({'lightning': 'pytorch-'}.get('$(PACKAGE_NAME)', ''))")
-  pip install -e ".[${extra}dev]" -U --upgrade-strategy=eager --extra-index-url="${TORCH_URL}"
+  uv pip install -e ".[${extra}dev]" --upgrade
 
   if [ "${PACKAGE_NAME}" == "pytorch" ]; then
     echo "uninstall lightning to have just single package"
-    pip uninstall -y lightning
+    uv pip uninstall lightning
   elif [ "${PACKAGE_NAME}" == "lightning" ]; then
     echo "uninstall PL to have just single package"
-    pip uninstall -y pytorch-lightning
+    uv pip uninstall pytorch-lightning
   fi
 
   python requirements/collect_env_details.py
@@ -112,7 +156,7 @@ run: |
   echo "Testing: fabric standalone"
   export PL_USE_MOCKED_MNIST=1
   export PL_RUN_STANDALONE_TESTS=1
-  wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
+  python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
   bash ./run_standalone_tests.sh "tests_pytorch"
   export PL_RUN_STANDALONE_TESTS=0
 
diff --git a/tests/tests_pytorch/plugins/precision/test_fsdp.py b/tests/tests_pytorch/plugins/precision/test_fsdp.py
index 0389d364dcb79..f8731aa424b38 100644
--- a/tests/tests_pytorch/plugins/precision/test_fsdp.py
+++ b/tests/tests_pytorch/plugins/precision/test_fsdp.py
@@ -74,8 +74,8 @@ def test_fsdp_precision_scaler_with_bf16():
 
 
 @RunIf(min_cuda_gpus=1)
-def test_fsdp_precision_forward_context():
-    """Test to ensure that the context manager correctly is set to bfloat16."""
+def test_fsdp_precision_forward_context_f16():
+    """Test to ensure that the context manager correctly is set to float16."""
     from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
 
     precision = FSDPPrecision(precision="16-mixed")
@@ -94,6 +94,10 @@ def test_fsdp_precision_forward_context():
     assert isinstance(precision.forward_context(), _DtypeContextManager)
     assert precision.forward_context()._new_dtype == torch.float16
 
+
+@RunIf(min_cuda_gpus=1, bf16_cuda=True)
+def test_fsdp_precision_forward_context_bf16():
+    """Test to ensure that the context manager correctly is set to bfloat16."""
     precision = FSDPPrecision(precision="bf16-mixed")
     assert precision.scaler is None
     with precision.forward_context():