diff --git a/.lightning/workflows/benchmark.yml b/.lightning/workflows/benchmark.yml
new file mode 100644
index 0000000000000..aaa786e264a95
--- /dev/null
+++ b/.lightning/workflows/benchmark.yml
@@ -0,0 +1,103 @@
+# Parity benchmarks: for each PACKAGE_NAME in the matrix, run `tests/parity_<PACKAGE_NAME>` with pytest
+# and, for "fabric" only, the standalone parity tasks; the run asserts at least two CUDA devices are visible.
+trigger:
+  push:
+    branches: ["master", "release/stable"]
+  pull_request:
+    branches: ["master", "release/stable"]
+
+timeout: "90" # minutes
+parametrize:
+  matrix:
+    PACKAGE_NAME: ["fabric", "pytorch"]
+image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
+machine: "L4_X_2"
+env:
+  TZ: "Etc/UTC"
+  DEBIAN_FRONTEND: "noninteractive"
+  python_version: "3.12"
+  MKL_THREADING_LAYER: "GNU"
+  CUDA_LAUNCH_BLOCKING: "1"
+  NCCL_DEBUG: "INFO"
+  TORCHDYNAMO_VERBOSE: "1"
+  FREEZE_REQUIREMENTS: "1"
+  RUN_ONLY_CUDA_TESTS: "1"
+
+run: |
+  # Install Python and UV
+  apt-get update -qq --fix-missing
+  apt-get install -q -y software-properties-common curl
+  # Add deadsnakes PPA for newer Python versions if needed
+  add-apt-repository ppa:deadsnakes/ppa -y
+  apt-get update -qq --fix-missing
+  apt-get install -q -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
+    build-essential \
+    pkg-config \
+    cmake \
+    ca-certificates \
+    libopenmpi-dev \
+    openmpi-bin
+
+  apt-get install -y python${python_version} python${python_version}-venv python${python_version}-dev
+  ln -sf /usr/bin/python${python_version} /usr/bin/python
+  curl -LsSf https://astral.sh/uv/install.sh | sh
+
+  # Source the environment and ensure UV is in PATH
+  [ -f "$HOME/.local/bin/env" ] && . "$HOME/.local/bin/env"
+  export PATH="$HOME/.local/bin:$PATH"
+  source "$HOME/.cargo/env" 2>/dev/null || true
+  export PATH="$HOME/.cargo/bin:$PATH"
+
+  # Verify UV installation
+  command -v uv || (echo "UV not found in PATH" && exit 1)
+  # Create and activate a local uv virtual environment
+  uv venv .venv -p "/usr/bin/python${python_version}" || uv venv .venv -p "python${python_version}" || uv venv .venv
+  . .venv/bin/activate
+  hash -r
+
+  whereis nvidia
+  nvidia-smi
+  python --version
+  uv --version
+  uv pip list
+  set -ex
+
+  # Parse CUDA version from image tag, e.g., "nvidia/cuda:12.6.3-devel-ubuntu22.04"
+  # NOTE(review): assumes the runner exports the top-level `image` value into this shell - confirm
+  IMAGE_TAG="${image##*:}" # "12.6.3-devel-ubuntu22.04"
+  CUDA_VERSION="${IMAGE_TAG%%-*}" # "12.6.3"
+  echo "Using CUDA version: ${CUDA_VERSION}"
+  CUDA_VERSION_M_M="${CUDA_VERSION%.*}" # "12.6"
+  CUDA_VERSION_MM="${CUDA_VERSION_M_M//./}" # "126"
+  export UV_TORCH_BACKEND=cu${CUDA_VERSION_MM}
+
+  # Adjust tests
+  uv pip install -q -r .actions/requirements.txt
+  python .actions/assistant.py copy_replace_imports --source_dir="./tests" \
+    --source_import="lightning.fabric,lightning.pytorch" \
+    --target_import="lightning_fabric,pytorch_lightning"
+
+  # Install package
+  uv pip install ".[dev]"
+
+  # Env details
+  python requirements/collect_env_details.py
+  python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'"
+
+  cd tests/
+  # Testing: benchmarks
+  export PL_RUNNING_BENCHMARKS=1
+  python -m pytest parity_${PACKAGE_NAME} -v --durations=0
+  export PL_RUNNING_BENCHMARKS=0
+
+  # Testing: fabric standalone tasks
+  export PL_RUN_STANDALONE_TESTS=1
+  if [ "${PACKAGE_NAME}" = "fabric" ]; then
+    cd parity_fabric/
+    bash run_standalone_tasks.sh cuda
+    cd ..
+  fi
+  export PL_RUN_STANDALONE_TESTS=0
+
+  cd ..
+  echo "Benchmarks completed successfully"
diff --git a/tests/parity_fabric/run_standalone_tasks.sh b/tests/parity_fabric/run_standalone_tasks.sh
index bf87b0713f002..cb816b5982b0c 100644
--- a/tests/parity_fabric/run_standalone_tasks.sh
+++ b/tests/parity_fabric/run_standalone_tasks.sh
@@ -18,6 +18,14 @@ export PYTHONPATH="${PYTHONPATH}:$(pwd)"
 export PYTHONPATH="${PYTHONPATH}:$(pwd)/.."
 
 MAX_RETRIES=3
+# parsing argument from call like `bash run_standalone_tasks.sh cuda`; abort with a usage hint when it is missing
+ACCELERATOR="${1:?usage: bash run_standalone_tasks.sh ACCELERATOR [TOLERANCE]}"
+# optional tolerance argument; the default keeps the values used before (cpu: 0.02, anything else: 0.01)
+case "${ACCELERATOR}" in
+  cpu) DEFAULT_TOLERANCE=0.02 ;;
+  *) DEFAULT_TOLERANCE=0.01 ;;
+esac
+TOLERANCE="${2:-${DEFAULT_TOLERANCE}}"
 
 retry_command() {
   local command="$@"
@@ -39,5 +47,4 @@ retry_command() {
   return $exit_code
 }
 
-retry_command "python -m test_parity_ddp --accelerator="cpu" --devices=2 --tolerance=0.02"
-retry_command "python -m test_parity_ddp --accelerator="cuda" --devices=2 --tolerance=0.01"
+retry_command "python -m test_parity_ddp --accelerator=${ACCELERATOR} --devices=2 --tolerance=${TOLERANCE}"