Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions .lightning/workflows/benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# Benchmark workflow configuration.
# Runs on pushes and pull requests that target the listed branches.
trigger:
  push:
    branches: ["master", "release/stable"]
  pull_request:
    branches: ["master", "release/stable"]

timeout: "90"  # minutes
# One job per matrix entry; the run script reads the value as ${PACKAGE_NAME}.
parametrize:
  matrix:
    PACKAGE_NAME: ["fabric", "pytorch"]
# NOTE(review): `image` and `machine` are reconstructed as top-level keys;
# the original nesting was lost - confirm against the workflow schema.
# The run script derives the CUDA version from the tag of `image`.
image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
machine: "L4_X_2"
# Environment variables for the job. All values are quoted so that they stay
# strings (e.g. "3.12" must not be read as a float, "1" not as an integer).
env:
  TZ: "Etc/UTC"
  DEBIAN_FRONTEND: "noninteractive"
  # Interpreter version installed and symlinked by the run script.
  python_version: "3.12"
  MKL_THREADING_LAYER: "GNU"
  CUDA_LAUNCH_BLOCKING: "1"
  NCCL_DEBUG: "INFO"
  TORCHDYNAMO_VERBOSE: "1"
  FREEZE_REQUIREMENTS: "1"
  RUN_ONLY_CUDA_TESTS: "1"

# Job script: provisions Python and uv on the bare CUDA image, installs the
# package, then runs the parity benchmarks for ${PACKAGE_NAME}.
run: |
  # Install Python and UV
  apt-get update -qq --fix-missing
  apt-get install -q -y software-properties-common curl
  # Add deadsnakes PPA for newer Python versions if needed
  add-apt-repository ppa:deadsnakes/ppa -y
  apt-get update -qq --fix-missing
  apt-get install -q -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
    build-essential \
    pkg-config \
    cmake \
    ca-certificates \
    libopenmpi-dev \
    openmpi-bin

  apt-get install -y python${python_version} python${python_version}-venv python${python_version}-dev
  ln -sf /usr/bin/python${python_version} /usr/bin/python
  curl -LsSf https://astral.sh/uv/install.sh | sh

  # Source the environment and ensure UV is in PATH
  [ -f "$HOME/.local/bin/env" ] && . "$HOME/.local/bin/env"
  export PATH="$HOME/.local/bin:$PATH"
  source "$HOME/.cargo/env" 2>/dev/null || true
  export PATH="$HOME/.cargo/bin:$PATH"

  # Verify UV installation.
  # `exit` must run in the current shell: inside a `( ... )` subshell it would
  # only terminate the subshell, and errexit is not enabled yet at this point,
  # so the script would carry on without uv.
  if ! command -v uv; then
    echo "UV not found in PATH" >&2
    exit 1
  fi
  # Create and activate a local uv virtual environment
  # (falls back from the absolute interpreter path to a name lookup, then to uv's default).
  uv venv .venv -p "/usr/bin/python${python_version}" || uv venv .venv -p "python${python_version}" || uv venv .venv
  . .venv/bin/activate
  hash -r

  # Diagnostics
  whereis nvidia
  nvidia-smi
  python --version
  uv --version
  uv pip list
  # From here on: abort on the first failing command and echo each command.
  set -ex

  # Parse CUDA version from image tag, e.g., "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
  # NOTE(review): relies on the runner exposing `image` as a shell variable - confirm.
  IMAGE_TAG="${image##*:}"  # "12.1.1-runtime-ubuntu22.04"
  CUDA_VERSION="${IMAGE_TAG%%-*}"  # "12.1.1"
  echo "Using CUDA version: ${CUDA_VERSION}"
  CUDA_VERSION_M_M="${CUDA_VERSION%.*}"  # "12.1"
  CUDA_VERSION_MM="${CUDA_VERSION_M_M//./}"  # "121"
  export UV_TORCH_BACKEND="cu${CUDA_VERSION_MM}"

  # Adjust tests
  uv pip install -q -r .actions/requirements.txt
  python .actions/assistant.py copy_replace_imports --source_dir="./tests" \
    --source_import="lightning.fabric,lightning.pytorch" \
    --target_import="lightning_fabric,pytorch_lightning"

  # Install package
  uv pip install ".[dev]"

  # Env details
  python requirements/collect_env_details.py
  # The benchmarks need at least two visible GPUs.
  python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'"

  cd tests/
  # Testing: benchmarks
  export PL_RUNNING_BENCHMARKS=1
  python -m pytest parity_${PACKAGE_NAME} -v --durations=0
  export PL_RUNNING_BENCHMARKS=0

  # Testing: fabric standalone tasks
  export PL_RUN_STANDALONE_TESTS=1
  if [ "${PACKAGE_NAME}" = "fabric" ]; then
    cd parity_fabric/
    bash run_standalone_tasks.sh cuda
    cd ..
  fi
  export PL_RUN_STANDALONE_TESTS=0

  cd ..
  echo "Benchmarks completed successfully"
7 changes: 5 additions & 2 deletions tests/parity_fabric/run_standalone_tasks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ export PYTHONPATH="${PYTHONPATH}:$(pwd)"
export PYTHONPATH="${PYTHONPATH}:$(pwd)/.."

# Retry budget; presumably consumed by retry_command below - confirm.
MAX_RETRIES=3
# Positional argument 1: accelerator name, as in `bash run_standalone_tasks.sh cuda`.
ACCELERATOR="${1}"
# Positional argument 2 (optional): tolerance; 0.01 is used when it is missing or empty.
TOLERANCE="${2:-0.01}"

retry_command() {
local command="$@"
Expand All @@ -39,5 +43,4 @@ retry_command() {
return $exit_code
}

# Run the DDP parity check through the retry wrapper.
# Each command is passed as ONE fully quoted string: nesting `"` inside the
# outer quotes would close the string and leave the expansions unquoted,
# exposing them to word splitting and globbing.
retry_command "python -m test_parity_ddp --accelerator=cpu --devices=2 --tolerance=0.02"
retry_command "python -m test_parity_ddp --accelerator=cuda --devices=2 --tolerance=0.01"
retry_command "python -m test_parity_ddp --accelerator=${ACCELERATOR} --devices=2 --tolerance=${TOLERANCE}"
Loading