diff --git a/.lightning/workflows/fabric.yml b/.lightning/workflows/fabric.yml index fa87e28dcd1f8..767b5588fcbb5 100644 --- a/.lightning/workflows/fabric.yml +++ b/.lightning/workflows/fabric.yml @@ -8,57 +8,101 @@ timeout: "55" # minutes parametrize: matrix: {} include: - # note that this is setting also all oldest requirements which is linked to Torch == 2.1 - - image: "pytorchlightning/pytorch_lightning:base-cuda12.1.1-py3.10-torch2.1" + # note that this entry also sets all oldest requirements, which is tied to python == 3.10 + - image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04" PACKAGE_NAME: "fabric" + python_version: "3.10" machine: "A100_X_2" - - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8" + - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04" PACKAGE_NAME: "fabric" + python_version: "3.12" machine: "L4_X_2" - # - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7" + # - image: "nvidia/cuda:12.6-runtime-ubuntu22.04" # PACKAGE_NAME: "fabric" - - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8" + - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04" PACKAGE_NAME: "lightning" + python_version: "3.12" machine: "L4_X_2" exclude: [] env: + TZ: "Etc/UTC" + DEBIAN_FRONTEND: "noninteractive" + CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda" + MKL_THREADING_LAYER: "GNU" + CUDA_LAUNCH_BLOCKING: "1" + NCCL_DEBUG: "INFO" + TORCHDYNAMO_VERBOSE: "1" FREEZE_REQUIREMENTS: "1" RUN_ONLY_CUDA_TESTS: "1" run: | + # Install Python and UV + apt-get update -qq --fix-missing + apt-get install -q -y software-properties-common curl + # Add the deadsnakes PPA as a package source for the requested Python version + add-apt-repository ppa:deadsnakes/ppa -y + apt-get update -qq --fix-missing + apt-get install -q -y --no-install-recommends --allow-downgrades --allow-change-held-packages \ + build-essential \ + pkg-config \ + cmake \ + ca-certificates \ + libopenmpi-dev \ + openmpi-bin \ + ninja-build \ + libnccl2 \ + libnccl-dev + + apt-get 
install -y python${python_version} python${python_version}-venv python${python_version}-dev + ln -sf /usr/bin/python${python_version} /usr/bin/python + curl -LsSf https://astral.sh/uv/install.sh | sh + + # Source the environment and ensure UV is in PATH + [ -f "$HOME/.local/bin/env" ] && . "$HOME/.local/bin/env" + export PATH="$HOME/.local/bin:$PATH" + source "$HOME/.cargo/env" 2>/dev/null || true + export PATH="$HOME/.cargo/bin:$PATH" + + # Verify UV installation; a brace group is used so that `exit` aborts this script, not just a subshell + command -v uv || { echo "UV not found in PATH"; exit 1; } + # Create and activate a local uv virtual environment + uv venv .venv -p "/usr/bin/python${python_version}" || uv venv .venv -p "python${python_version}" || uv venv .venv + . .venv/bin/activate + hash -r + whereis nvidia nvidia-smi python --version - pip --version - pip install -q fire wget packaging - pip list + uv --version + uv pip list set -ex - CUDA_VERSION="${image##*cuda}" # Remove everything up to and including "cuda" + # Parse CUDA version from image tag, e.g., "nvidia/cuda:12.6.3-runtime-ubuntu22.04" + IMAGE_TAG="${image##*:}" # "12.6.3-runtime-ubuntu22.04" + CUDA_VERSION="${IMAGE_TAG%%-*}" # "12.6.3" echo "Using CUDA version: ${CUDA_VERSION}" - CUDA_VERSION_M_M="${cuda_version%.*}" # Get major.minor by removing the last dot and everything after - CUDA_VERSION_MM="${CUDA_VERSION_M_M//'.'/''}" - TORCH_URL="https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html" - echo "Torch URL: ${TORCH_URL}" + CUDA_VERSION_M_M="${CUDA_VERSION%.*}" # "12.6" + CUDA_VERSION_MM="${CUDA_VERSION_M_M//./}" # "126" + export UV_TORCH_BACKEND=cu${CUDA_VERSION_MM} COVERAGE_SOURCE=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="lightning_fabric").get(n, n))') echo "collecting coverage for: ${COVERAGE_SOURCE}" - TORCH_VER=$(python -c "import torch; print(torch.__version__.rsplit('.', 1)[0])") - if [ "${TORCH_VER}" == "2.1" ]; then + uv pip install fire wget packaging "lightning-utilities[cli]" + if [ "${python_version}" == "3.10" ]; then echo 
"Set oldest versions" - pip uninstall -y deepspeed - pip install -U "lightning-utilities[cli]" cd requirements/fabric python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'strategies.txt']" python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files strategies.txt cd ../.. - pip install "cython<3.0" wheel # for compatibility + uv pip install "cython<3.0" wheel # for compatibility fi + # install the base package first so that torch is importable for the version adjustment below + uv pip install . echo "Adjust torch versions in requirements files" PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") - pip install -q wget packaging + uv pip install wget packaging python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py for fpath in `ls requirements/**/*.txt`; do \ python ./adjust-torch-versions.py $fpath ${PYTORCH_VERSION}; \ @@ -66,7 +110,7 @@ run: | if [ "${PACKAGE_NAME}" == "fabric" ]; then echo "Replaced PL imports" - pip install -U -q -r .actions/requirements.txt + uv pip install --upgrade -r .actions/requirements.txt python .actions/assistant.py copy_replace_imports --source_dir="./tests/tests_fabric" \ --source_import="lightning.fabric" \ --target_import="lightning_fabric" @@ -76,11 +120,10 @@ run: | fi extra=$(python -c "print({'lightning': 'fabric-'}.get('$(PACKAGE_NAME)', ''))") - pip install -e ".[${extra}dev]" -U --upgrade-strategy=eager --extra-index-url="${TORCH_URL}" + uv pip install ".[${extra}dev]" --upgrade python requirements/collect_env_details.py python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'" - python requirements/pytorch/check-avail-extras.py python -c "import bitsandbytes" echo "Testing: Fabric doctests" @@ -96,7 +139,7 @@ run: | echo "Testing: fabric standalone" export PL_RUN_STANDALONE_TESTS=1 - wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh + 
python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh bash ./run_standalone_tests.sh "tests_fabric" export PL_RUN_STANDALONE_TESTS=0 diff --git a/tests/tests_fabric/plugins/precision/test_fsdp.py b/tests/tests_fabric/plugins/precision/test_fsdp.py index 3b8d916e20c8f..b15e8e6c65f57 100644 --- a/tests/tests_fabric/plugins/precision/test_fsdp.py +++ b/tests/tests_fabric/plugins/precision/test_fsdp.py @@ -56,7 +56,7 @@ def test_fsdp_precision_scaler_with_bf16(): @RunIf(min_cuda_gpus=1) -def test_fsdp_precision_forward_context(): - """Test to ensure that the context manager correctly is set to bfloat16.""" +def test_fsdp_precision_forward_context_f16(): + """Test to ensure that the context manager is correctly set to float16.""" from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler @@ -76,6 +76,10 @@ def test_fsdp_precision_forward_context(): assert isinstance(precision.forward_context(), _DtypeContextManager) assert precision.forward_context()._new_dtype == torch.float16 + +@RunIf(min_cuda_gpus=1, bf16_cuda=True) +def test_fsdp_precision_forward_context_bf16(): + """Test to ensure that the context manager is correctly set to bfloat16.""" precision = FSDPPrecision(precision="bf16-mixed") assert precision.scaler is None with precision.forward_context():