
Commit 8705265

Authored by amorehead and committed with co-authors pre-commit-ci[bot], Borda, and lantiga.

Seed NumPy using `np.random.SeedSequence()` in `pl_worker_init_function()` to robustly seed NumPy-dependent dataloader workers (#20369)

* Update seed.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* Update seed.py
* Update seed.py
* Update seed.py

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jirka Borovec <[email protected]>
Co-authored-by: Luca Antiga <[email protected]>

0 parents, commit 8705265 (root commit)
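To illustrate why `np.random.SeedSequence` is a robust choice for per-worker seeding, here is a minimal sketch of the idea. This is not Lightning's actual `pl_worker_init_function`; the helper name and signature are hypothetical:

```python
import numpy as np

def seed_numpy_for_worker(base_seed: int, worker_id: int) -> np.random.Generator:
    # SeedSequence mixes its entropy inputs, so nearby (base_seed, worker_id)
    # pairs still yield statistically independent, non-overlapping streams,
    # unlike naive schemes such as np.random.seed(base_seed + worker_id).
    ss = np.random.SeedSequence([base_seed, worker_id])
    return np.random.default_rng(ss)
```

Two workers with the same base seed get different streams, while re-seeding the same worker reproduces its stream exactly.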

File tree

704 files changed: +132496 additions, -0 deletions

.actions/assistant.py

Lines changed: 488 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 11 additions & 0 deletions
```bash
#!/bin/bash

# Run this script from the project root.
URL="https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip"
mkdir -p tests/legacy
# wget is simpler but does not work on Windows
python -c "from urllib.request import urlretrieve; urlretrieve('$URL', 'tests/legacy/checkpoints.zip')"
ls -l tests/legacy/

unzip -o tests/legacy/checkpoints.zip -d tests/legacy/
ls -l tests/legacy/checkpoints/
```

.actions/requirements.txt

Lines changed: 3 additions & 0 deletions
```
jsonargparse >=4.16.0, <4.28.0
requests
packaging
```

.azure/README.md

Lines changed: 70 additions & 0 deletions
# Creating a GPU self-hosted agent pool

## Prepare the machine

This is a slightly modified version of the script from
https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/docker

```bash
apt-get update
apt-get install -y --no-install-recommends \
    ca-certificates \
    curl \
    jq \
    git \
    iputils-ping \
    libcurl4 \
    libunwind8 \
    netcat \
    libssl1.0

curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
mkdir /azp
```

## Starting the agents

```bash
export TARGETARCH=linux-x64
export AZP_URL="https://dev.azure.com/Lightning-AI"
export AZP_TOKEN="xxxxxxxxxxxxxxxxxxxxxxxxxx"
export AZP_POOL="lit-rtx-3090"

for i in {0..7..2}
do
    nohup bash .azure/start.sh \
        "AZP_AGENT_NAME=litGPU-YX_$i,$((i+1))" \
        "CUDA_VISIBLE_DEVICES=$i,$((i+1))" \
        > "agent-$i.log" &
done
```

## Check running agents

```bash
ps aux | grep start.sh
```

# Machine maintenance

Since most of our jobs/checks run in Docker containers, the OS/machine can become polluted and fail with errors such as:

```
No space left on device : '/azp/agent-litGPU-21_0,1/_diag/pages/8bb191f4-a8c2-419a-8788-66e3f0522bea_1.log'
```

In such cases, log in to the machine and run `docker system prune`.

## Automated ways

Let's explore adding a cron job for periodically removing all Docker caches:

1. Open your user's crontab for editing: `crontab -e`
1. Schedule/add the command with the `--force` flag to force pruning without interactive confirmation:
   ```bash
   # every day at 2:00 AM clean docker caches
   0 2 * * * docker system prune --force
   ```
1. Verify the entry: `crontab -l`

Note: You may need to add yourself to the Docker group by running `sudo usermod -aG docker <your_username>` so you can run this command without `sudo` and a password prompt.
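The start loop above assigns each agent a pair of adjacent GPUs. As a small sketch of the pairing it produces (the helper name is hypothetical, for illustration only):

```python
def gpu_pairs(num_gpus: int = 8) -> list[tuple[int, int]]:
    # Mirrors `for i in {0..7..2}`: step through GPU indices two at a time,
    # so an 8-GPU machine hosts four agents, each pinned to two GPUs via
    # CUDA_VISIBLE_DEVICES=$i,$((i+1)).
    return [(i, i + 1) for i in range(0, num_gpus, 2)]
```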

.azure/gpu-benchmarks.yml

Lines changed: 110 additions & 0 deletions
```yaml
# Python package
# Create and test a Python package on multiple Python versions.
# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more:
# https://docs.microsoft.com/azure/devops/pipelines/languages/python

trigger:
  tags:
    include: ["*"]
  branches:
    include:
      - "master"
      - "release/*"
      - "refs/tags/*"

pr:
  branches:
    include:
      - "master"
      - "release/*"
  paths:
    include:
      - ".azure/gpu-benchmarks.yml"
      - "requirements/fabric/**"
      - "requirements/pytorch/**"
      - "src/lightning/fabric/**"
      - "src/lightning/pytorch/**"
      - "tests/parity_fabric/**"
      - "tests/parity_pytorch/**"
    exclude:
      - "requirements/*/docs.txt"
      - "*.md"
      - "**/*.md"

schedules:
  - cron: "0 0 * * *" # At the end of every day
    displayName: Daily midnight benchmark
    branches:
      include:
        - "master"

jobs:
  - job: benchmarks
    timeoutInMinutes: "90"
    cancelTimeoutInMinutes: "2"
    pool: lit-rtx-3090
    variables:
      DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
    container:
      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.5-cuda12.1.0"
      options: "--gpus=all --shm-size=32g"
    strategy:
      matrix:
        "pkg: Fabric":
          PACKAGE_NAME: "fabric"
        "pkg: Pytorch":
          PACKAGE_NAME: "pytorch"
    workspace:
      clean: all

    steps:
      - bash: |
          echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
          cuda_ver=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
          echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html"
        displayName: "set env. vars"

      - bash: |
          echo $CUDA_VISIBLE_DEVICES
          echo $TORCH_URL
          whereis nvidia
          nvidia-smi
          which python && which pip
          python --version
          pip --version
          pip list
        displayName: "Image info & NVIDIA"

      - bash: pip install -e .[dev] --find-links ${TORCH_URL}
        env:
          FREEZE_REQUIREMENTS: "1"
        displayName: "Install package"

      - bash: |
          set -e
          python requirements/collect_env_details.py
          python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
        displayName: "Env details"

      - bash: |
          pip install -q -r .actions/requirements.txt
          python .actions/assistant.py copy_replace_imports --source_dir="./tests" \
            --source_import="lightning.fabric,lightning.pytorch" \
            --target_import="lightning_fabric,pytorch_lightning"
        displayName: "Adjust tests"

      - bash: python -m pytest parity_$(PACKAGE_NAME) -v --durations=0
        env:
          PL_RUNNING_BENCHMARKS: "1"
          PL_RUN_CUDA_TESTS: "1"
        workingDirectory: tests/
        displayName: "Testing: benchmarks"

      - bash: bash run_standalone_tasks.sh
        workingDirectory: tests/parity_fabric
        # without succeeded this could run even if the job has already failed
        condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))
        env:
          PL_RUN_CUDA_TESTS: "1"
        displayName: "Testing: fabric standalone tasks"
        timeoutInMinutes: "10"
```
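The `DEVICES` variable above extracts the GPU list from the agent name at runtime. In plain Python the inline expression amounts to the following (the helper name is hypothetical):

```python
def devices_from_agent_name(agent_name: str) -> str:
    # Agents are named like "litGPU-YX_0,1" (see .azure/README.md), so the
    # text after the last underscore is the comma-separated GPU list that
    # the pipeline exports as CUDA_VISIBLE_DEVICES.
    return agent_name.split("_")[-1]
```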

.azure/gpu-tests-fabric.yml

Lines changed: 168 additions & 0 deletions
```yaml
# Python package
# Create and test a Python package on multiple Python versions.
# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more:
# https://docs.microsoft.com/azure/devops/pipelines/languages/python

trigger:
  tags:
    include: ["*"]
  branches:
    include:
      - "master"
      - "release/*"
      - "refs/tags/*"

pr:
  branches:
    include:
      - "master"
      - "release/*"
  paths:
    include:
      - ".actions/*"
      - ".azure/gpu-tests-fabric.yml"
      - "examples/fabric/**"
      - "examples/run_fabric_examples.sh"
      - "tests/run_standalone_*.sh"
      - "requirements/fabric/**"
      - "src/lightning/__init__.py"
      - "src/lightning/__setup__.py"
      - "src/lightning/__version__.py"
      - "src/lightning/fabric/**"
      - "src/lightning_fabric/*"
      - "tests/tests_fabric/**"
      - "pyproject.toml" # includes pytest config
    exclude:
      - "requirements/*/docs.txt"
      - "*.md"
      - "**/*.md"

jobs:
  - job: testing
    # how long to run the job before automatically cancelling
    timeoutInMinutes: "20"
    # how much time to give 'run always even if cancelled tasks' before stopping them
    cancelTimeoutInMinutes: "2"
    pool: lit-rtx-3090
    variables:
      DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
      FREEZE_REQUIREMENTS: "1"
      PIP_CACHE_DIR: "/var/tmp/pip"
      PL_RUN_CUDA_TESTS: "1"
    container:
      image: $(image)
      # default shm size is 64m. Increase it to avoid:
      # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
      options: "--gpus=all --shm-size=2gb -v /var/tmp:/var/tmp"
    strategy:
      matrix:
        "Fabric | latest":
          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0"
          PACKAGE_NAME: "fabric"
        "Lightning | latest":
          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.5-cuda12.1.0"
          PACKAGE_NAME: "lightning"
    workspace:
      clean: all
    steps:
      - bash: |
          echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
          cuda_ver=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
          echo "##vso[task.setvariable variable=CUDA_VERSION_MM]$cuda_ver"
          echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html"
          scope=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="lightning_fabric").get(n, n))')
          echo "##vso[task.setvariable variable=COVERAGE_SOURCE]$scope"
          python_ver=$(python -c "import sys; print(f'{sys.version_info.major}{sys.version_info.minor}')")
          echo "##vso[task.setvariable variable=PYTHON_VERSION_MM]$python_ver"
        displayName: "set env. vars"
      - bash: |
          echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/test/cu${CUDA_VERSION_MM}"
          echo "##vso[task.setvariable variable=TORCHVISION_URL]https://download.pytorch.org/whl/test/cu124/torchvision-0.19.0%2Bcu124-cp${PYTHON_VERSION_MM}-cp${PYTHON_VERSION_MM}-linux_x86_64.whl"
        condition: endsWith(variables['Agent.JobName'], 'future')
        displayName: "set env. vars 4 future"

      - bash: |
          echo $(DEVICES)
          echo $CUDA_VISIBLE_DEVICES
          echo $CUDA_VERSION_MM
          echo $TORCH_URL
          echo $COVERAGE_SOURCE
          whereis nvidia
          nvidia-smi
          which python && which pip
          python --version
          pip --version
          pip list
        displayName: "Image info & NVIDIA"

      - bash: |
          PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
          pip install -q wget packaging
          python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
          for fpath in `ls requirements/**/*.txt`; do \
            python ./adjust-torch-versions.py $fpath ${PYTORCH_VERSION}; \
          done
        displayName: "Adjust dependencies"

      - bash: |
          extra=$(python -c "print({'lightning': 'fabric-'}.get('$(PACKAGE_NAME)', ''))")
          pip install -e ".[${extra}dev]" pytest-timeout -U --find-links="${TORCH_URL}" --find-links="${TORCHVISION_URL}"
        displayName: "Install package & dependencies"

      - bash: |
          set -e
          python requirements/collect_env_details.py
          python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
          python -c "import bitsandbytes"
        displayName: "Env details"

      - bash: python -m pytest lightning_fabric
        workingDirectory: src
        # without succeeded this could run even if the job has already failed
        condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))
        displayName: "Testing: Fabric doctests"

      - bash: |
          pip install -q -r .actions/requirements.txt
          python .actions/assistant.py copy_replace_imports --source_dir="./tests/tests_fabric" \
            --source_import="lightning.fabric" \
            --target_import="lightning_fabric"
          python .actions/assistant.py copy_replace_imports --source_dir="./examples/fabric" \
            --source_import="lightning.fabric" \
            --target_import="lightning_fabric"
        # without succeeded this could run even if the job has already failed
        condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))
        displayName: "Adjust tests & examples"

      - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest tests_fabric/ -v --durations=50
        workingDirectory: tests/
        displayName: "Testing: fabric standard"
        timeoutInMinutes: "10"

      - bash: bash ./run_standalone_tests.sh "tests_fabric"
        workingDirectory: tests/
        env:
          PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE)
        displayName: "Testing: fabric standalone"
        timeoutInMinutes: "10"

      - bash: |
          python -m coverage report
          python -m coverage xml
          python -m coverage html

          # https://docs.codecov.com/docs/codecov-uploader
          curl -Os https://uploader.codecov.io/latest/linux/codecov
          chmod +x codecov
          ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
            --flags=gpu,pytest,${COVERAGE_SOURCE} --name="GPU-coverage" --env=linux,azure
          ls -l
        workingDirectory: tests/
        displayName: "Statistics"

      - script: |
          set -e
          bash run_fabric_examples.sh --accelerator=cuda --devices=1
          bash run_fabric_examples.sh --accelerator=cuda --devices=2 --strategy ddp
        workingDirectory: examples/
        displayName: "Testing: fabric examples"
```