diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index e8b54873c657c..5d9b03a08052a 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -47,7 +47,7 @@ subprojects: - "!*.md" - "!**/*.md" checks: - - "pytorch.yml / Lit Job (nvidia/cuda:12.1.1-runtime-ubuntu22.04, pytorch, 3.10, A100_X_2)" + - "pytorch.yml / Lit Job (nvidia/cuda:12.1.1-runtime-ubuntu22.04, pytorch, 3.10, L4_X_2)" - "pytorch.yml / Lit Job (nvidia/cuda:12.6.3-runtime-ubuntu22.04, lightning, 3.12, L4_X_2)" - "pytorch.yml / Lit Job (nvidia/cuda:12.6.3-runtime-ubuntu22.04, pytorch, 3.12, L4_X_2)" @@ -148,7 +148,7 @@ subprojects: - "!*.md" - "!**/*.md" checks: - - "fabric.yml / Lit Job (nvidia/cuda:12.1.1-runtime-ubuntu22.04, fabric, 3.10, A100_X_2)" + - "fabric.yml / Lit Job (nvidia/cuda:12.1.1-runtime-ubuntu22.04, fabric, 3.10, L4_X_2)" - "fabric.yml / Lit Job (nvidia/cuda:12.6.3-runtime-ubuntu22.04, fabric, 3.12, L4_X_2)" - "fabric.yml / Lit Job (nvidia/cuda:12.6.3-runtime-ubuntu22.04, lightning, 3.12, L4_X_2)" diff --git a/.lightning/workflows/benchmark.yml b/.lightning/workflows/benchmark.yml index aaa786e264a95..1f3da911c38c9 100644 --- a/.lightning/workflows/benchmark.yml +++ b/.lightning/workflows/benchmark.yml @@ -22,12 +22,13 @@ env: RUN_ONLY_CUDA_TESTS: "1" run: | - # Install Python and UV - apt-get update -qq --fix-missing + echo "Installing dependencies" + apt-get update -qq --fix-missing -o=Dpkg::Use-Pty=0 &> /dev/null apt-get install -q -y software-properties-common curl - # Add deadsnakes PPA for newer Python versions if needed + echo "Add deadsnakes PPA for newer Python versions if needed" add-apt-repository ppa:deadsnakes/ppa -y - apt-get update -qq --fix-missing + apt-get update -qq --fix-missing -o=Dpkg::Use-Pty=0 &> /dev/null + echo "Install Python ${python_version} and other dependencies" apt-get install -q -y --no-install-recommends --allow-downgrades --allow-change-held-packages \ build-essential \ pkg-config \ @@ -36,23 +37,25 @@ run: | libopenmpi-dev \ openmpi-bin + echo "Install Python ${python_version} and UV" apt-get install -y python${python_version} python${python_version}-venv python${python_version}-dev ln -sf /usr/bin/python${python_version} /usr/bin/python curl -LsSf https://astral.sh/uv/install.sh | sh - # Source the environment and ensure UV is in PATH + echo "Source the environment and ensure UV is in PATH" [ -f "$HOME/.local/bin/env" ] && . "$HOME/.local/bin/env" export PATH="$HOME/.local/bin:$PATH" source $HOME/.cargo/env 2>/dev/null || true export PATH="$HOME/.cargo/bin:$PATH" - # Verify UV installation + echo "Verify UV installation" command -v uv || (echo "UV not found in PATH" && exit 1) # Create and activate a local uv virtual environment uv venv .venv -p "/usr/bin/python${python_version}" || uv venv .venv -p "python${python_version}" || uv venv .venv . .venv/bin/activate hash -r + echo "Show system information" whereis nvidia nvidia-smi python --version @@ -68,13 +71,13 @@ run: | CUDA_VERSION_MM="${CUDA_VERSION_M_M//./}" # "126" export UV_TORCH_BACKEND=cu${CUDA_VERSION_MM} - # Adjust tests + echo "Adjust tests" uv pip install -q -r .actions/requirements.txt python .actions/assistant.py copy_replace_imports --source_dir="./tests" \ --source_import="lightning.fabric,lightning.pytorch" \ --target_import="lightning_fabric,pytorch_lightning" - # Install package + echo "Install package" uv pip install ".[dev]" # Env details @@ -82,12 +85,12 @@ run: | python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'" cd tests/ - # Testing: benchmarks + echo "Testing: benchmarks" export PL_RUNNING_BENCHMARKS=1 python -m pytest parity_${PACKAGE_NAME} -v --durations=0 export PL_RUNNING_BENCHMARKS=0 - # Testing: fabric standalone tasks + echo "Testing: fabric standalone tasks" export PL_RUN_STANDALONE_TESTS=1 if [ "${PACKAGE_NAME}" == "fabric" ]; then cd parity_fabric/ diff --git a/.lightning/workflows/fabric.yml b/.lightning/workflows/fabric.yml index 767b5588fcbb5..380be4bbcb195 100644 --- a/.lightning/workflows/fabric.yml +++ b/.lightning/workflows/fabric.yml @@ -12,7 +12,7 @@ parametrize: - image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04" PACKAGE_NAME: "fabric" python_version: "3.10" - machine: "A100_X_2" + machine: "L4_X_2" - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04" PACKAGE_NAME: "fabric" python_version: "3.12" @@ -37,12 +37,13 @@ env: RUN_ONLY_CUDA_TESTS: "1" run: | - # Install Python and UV - apt-get update -qq --fix-missing + echo "Installing dependencies" + apt-get update -qq --fix-missing -o=Dpkg::Use-Pty=0 &> /dev/null apt-get install -q -y software-properties-common curl - # Add deadsnakes PPA for newer Python versions if needed + echo "Add deadsnakes PPA for newer Python versions if needed" add-apt-repository ppa:deadsnakes/ppa -y - apt-get update -qq --fix-missing + apt-get update -qq --fix-missing -o=Dpkg::Use-Pty=0 &> /dev/null + echo "Install Python ${python_version} and other dependencies" apt-get install -q -y --no-install-recommends --allow-downgrades --allow-change-held-packages \ build-essential \ pkg-config \ @@ -54,23 +55,25 @@ run: | libnccl2 \ libnccl-dev + echo "Install Python ${python_version} and UV" apt-get install -y python${python_version} python${python_version}-venv python${python_version}-dev ln -sf /usr/bin/python${python_version} /usr/bin/python curl -LsSf https://astral.sh/uv/install.sh | sh - # Source the environment and ensure UV is in PATH + echo "Source the environment and ensure UV is in PATH" [ -f "$HOME/.local/bin/env" ] && . "$HOME/.local/bin/env" export PATH="$HOME/.local/bin:$PATH" source $HOME/.cargo/env 2>/dev/null || true export PATH="$HOME/.cargo/bin:$PATH" - # Verify UV installation + echo "Verify UV installation" command -v uv || (echo "UV not found in PATH" && exit 1) # Create and activate a local uv virtual environment uv venv .venv -p "/usr/bin/python${python_version}" || uv venv .venv -p "python${python_version}" || uv venv .venv . .venv/bin/activate hash -r + echo "Show system information" whereis nvidia nvidia-smi python --version @@ -98,7 +101,7 @@ run: | uv pip install "cython<3.0" wheel # for compatibility fi - # install the base so we can adjust other packages + echo "Install the base so we can adjust other packages" uv pip install . echo "Adjust torch versions in requirements files" PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") @@ -119,6 +122,7 @@ run: | --target_import="lightning_fabric" fi + echo "Install package with [${PACKAGE_NAME}] extras" extra=$(python -c "print({'lightning': 'fabric-'}.get('$(PACKAGE_NAME)', ''))") uv pip install ".[${extra}dev]" --upgrade diff --git a/.lightning/workflows/pytorch.yml b/.lightning/workflows/pytorch.yml index bbd47a8431fef..aa3b6c6020df8 100644 --- a/.lightning/workflows/pytorch.yml +++ b/.lightning/workflows/pytorch.yml @@ -12,7 +12,7 @@ parametrize: - image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04" PACKAGE_NAME: "pytorch" python_version: "3.10" - machine: "A100_X_2" + machine: "L4_X_2" - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04" PACKAGE_NAME: "pytorch" python_version: "3.12" @@ -37,12 +37,13 @@ env: RUN_ONLY_CUDA_TESTS: "1" run: | - # Install Python and UV - apt-get update -qq --fix-missing + echo "Installing dependencies" + apt-get update -qq --fix-missing -o=Dpkg::Use-Pty=0 &> /dev/null apt-get install -q -y software-properties-common curl - # Add deadsnakes PPA for newer Python versions if needed + echo "Add deadsnakes PPA for newer Python versions if needed" add-apt-repository ppa:deadsnakes/ppa -y - apt-get update -qq --fix-missing + apt-get update -qq --fix-missing -o=Dpkg::Use-Pty=0 &> /dev/null + echo "Install Python ${python_version} and other dependencies" apt-get install -q -y --no-install-recommends --allow-downgrades --allow-change-held-packages \ build-essential \ pkg-config \ @@ -54,23 +55,25 @@ run: | libnccl2 \ libnccl-dev + echo "Install Python ${python_version} and UV" apt-get install -y python${python_version} python${python_version}-venv python${python_version}-dev ln -sf /usr/bin/python${python_version} /usr/bin/python curl -LsSf https://astral.sh/uv/install.sh | sh - # Source the environment and ensure UV is in PATH + echo "Source the environment and ensure UV is in PATH" [ -f "$HOME/.local/bin/env" ] && . "$HOME/.local/bin/env" export PATH="$HOME/.local/bin:$PATH" source $HOME/.cargo/env 2>/dev/null || true export PATH="$HOME/.cargo/bin:$PATH" - # Verify UV installation + echo "Verify UV installation" command -v uv || (echo "UV not found in PATH" && exit 1) # Create and activate a local uv virtual environment uv venv .venv -p "/usr/bin/python${python_version}" || uv venv .venv -p "python${python_version}" || uv venv .venv . .venv/bin/activate hash -r + echo "Show system information" whereis nvidia nvidia-smi python --version @@ -98,7 +101,7 @@ run: | uv pip install "cython<3.0" wheel # for compatibility fi - # install the base so we can adjust other packages + echo "Install the base so we can adjust other packages" uv pip install . echo "Adjust torch versions in requirements files" PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") @@ -119,9 +122,11 @@ run: | --target_import="lightning_fabric,pytorch_lightning" fi + echo "Install package" extra=$(python -c "print({'lightning': 'pytorch-'}.get('$(PACKAGE_NAME)', ''))") uv pip install -e ".[${extra}dev]" --upgrade + echo "Ensure only a single package is installed" if [ "${PACKAGE_NAME}" == "pytorch" ]; then echo "uninstall lightning to have just single package" uv pip uninstall lightning