feat: add GoldenSwag task (#175) #1390
name: CI

on:
  push:
    branches: [main]
  pull_request:
    types: [opened, reopened, synchronize, labeled, unlabeled, edited]
  # Manually trigger a workflow for a branch
  workflow_dispatch:
  # Merge queue trigger
  merge_group:

permissions:
  contents: read

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
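  # A newer run in the same group cancels any run still in progress, so only the
  # latest commit of a PR (or of a pushed branch) keeps consuming runners.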

env:
  REGISTRY: registry.gitlab.aleph-alpha.de
  REPO_OWNER: research/public-registry
  IMAGE_NAME: eval_framework
  HF_DATASET_CACHE_DIR: /tmp/huggingface_datasets # <- single source of truth
  UV_LINK_MODE: symlink
  UV_LOCKED: 1
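  # UV_LOCKED=1 makes every `uv run` fail if uv.lock is out of date with pyproject.toml,
  # and UV_LINK_MODE=symlink links packages from the uv cache instead of copying them.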

jobs:
  lint-pr-title:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Skip PR title check (non-PR)
        if: github.event_name != 'pull_request'
        run: |
          echo "Not a PR, skipping PR title check"
      - name: Set up Python
        if: github.event_name == 'pull_request'
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"
      - name: Install commitizen
        if: github.event_name == 'pull_request'
        run: |
          python -m pip install --upgrade pip
          pip install commitizen
      - name: Validate PR Title (only on PRs)
        if: github.event_name == 'pull_request'
        id: pr-check
        run: |
          cz check --message "${{ github.event.pull_request.title }}"
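      # With the default commitizen rules this enforces Conventional Commits, so a PR
      # title like "feat: add GoldenSwag task (#175)" passes while a bare
      # "Add GoldenSwag task" would fail the check.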

  lint:
    runs-on: ubuntu-latest # default runner runs out of disk space due to hf cache
    needs: [lint-pr-title]
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4
      - name: Setup uv
        uses: astral-sh/setup-uv@v6
        with:
          version: "~=0.8.16"
      - name: Run Pre-Commit
        run: uvx pre-commit run --all-files
      - name: Dependency check
        run: ./utils/dependency_check.sh
      - name: Run MyPy
        run: uv run --all-extras mypy

  hf-datasets-cache:
    runs-on: cpu-runner-8c-32gb-01 # default runner runs out of disk space, unfortunately
    steps:
      - uses: actions/checkout@v4
      - name: Setup uv
        uses: astral-sh/setup-uv@v6
        with:
          version: "~=0.8.16"
      - name: Restore and save cache
        uses: actions/cache@v4
        with:
          path: ${{ env.HF_DATASET_CACHE_DIR }}
          key: hf-datasets-${{ github.run_id }}
          restore-keys: |
            hf-datasets-
      - name: Download only updated datasets
        env:
          HF_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
        run: |
          echo "Updating changed datasets"
          uv run --extra=comet --extra=openai python tests/tests_eval_framework/utils/update_datasets.py update
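      # The cache key embeds ${{ github.run_id }}, so every run saves a fresh snapshot of
      # HF_DATASET_CACHE_DIR; the `hf-datasets-` restore-key lets this job and the later
      # test jobs fall back to the most recently saved snapshot when no exact key matches.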

  tag:
    # Set Docker Tag and Image Name for Docker Build and Push (GPU Runs)
    runs-on: ubuntu-latest
    outputs:
      tag: ${{ steps.set-tag.outputs.tag }}
      image: ${{ steps.set-tag.outputs.image }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set Tag
        id: set-tag
        run: |
          if [ "${{ github.ref }}" == "refs/heads/main" ]; then
            TAG='latest'
          else
            # head_ref is the correct branch name for PRs
            BRANCH_NAME=${{ github.head_ref || github.ref_name }}
            # Replace slashes and other invalid characters with hyphens and truncate to a valid Docker tag
            TAG=$(echo "${BRANCH_NAME}" | sed 's/[^a-zA-Z0-9._-]/-/g' | cut -c1-20)
          fi
          echo "tag=$TAG" >> $GITHUB_OUTPUT
          echo "image=${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ env.IMAGE_NAME }}:$TAG" >> $GITHUB_OUTPUT
      - name: Output Docker Tag
        run: |
          echo "Docker Tag: ${{ steps.set-tag.outputs.tag }}"
          echo "Docker image: ${{ steps.set-tag.outputs.image }}"

  build:
    # Build and Push Docker Image (GPU Runs)
    needs: [lint, tag]
    runs-on: cpu-runner-8c-32gb-01
    container: docker:dind
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4
      - name: Registry Authentication
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: token
          password: ${{ secrets.GL_PUBLIC_REGISTRY_READ_WRITE_TOKEN }}
      - name: Setup Docker BuildX
        uses: docker/setup-buildx-action@v3
      - name: Build and Push Image
        uses: docker/build-push-action@v6
        with:
          context: .
          file: Dockerfile
          push: true
          tags: ${{ needs.tag.outputs.image }}
          build-args: BUILDKIT_INLINE_CACHE=1
          cache-from: |
            type=gha,scope=shared
            type=registry,ref=${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ env.IMAGE_NAME }}:buildcache
            type=registry,ref=${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ env.IMAGE_NAME }}:latest
            type=registry,ref=${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ env.IMAGE_NAME }}:${{ needs.tag.outputs.tag }}
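          # Layer cache can be pulled from any of these sources: the shared GitHub Actions
          # cache, the dedicated :buildcache image, :latest, and the image previously pushed
          # for this branch tag, so rebuilds reuse layers even on a cold runner.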

  test-extras:
    # Test uv installs (CPU)
    runs-on: ubuntu-latest
    needs: [lint]
    strategy:
      fail-fast: false
      matrix:
        extras: ['', 'determined', 'api', 'openai', 'transformers', 'accelerate', 'comet', 'optional']
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4
      - name: Setup uv
        uses: astral-sh/setup-uv@v6
        with:
          version: "~=0.8.16"
      - name: Verify install and functionality via uv --exact
        run: |
          if [ "${{ matrix.extras }}" != "" ]; then
            echo "Testing extra: ${{ matrix.extras }}"
            uv run --exact --extra ${{ matrix.extras }} pytest -v --noconftest tests/tests_eval_framework/installs/test_${{ matrix.extras }}.py
          else
            echo "Testing core install"
            uv run --exact pytest --noconftest -v tests/tests_eval_framework/installs/test_core.py
          fi
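      # `--exact` makes uv sync the environment to exactly the requested extras (removing
      # anything else), so each matrix entry such as 'openai' is verified in isolation
      # against tests/tests_eval_framework/installs/test_openai.py; the empty entry covers
      # the core install without extras.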

  test-cpu:
    runs-on: cpu-runner-8c-32gb-01
    container: derskythe/github-runner-base:ubuntu-noble
    needs: [hf-datasets-cache, test-extras]
    steps:
      - uses: actions/checkout@v4
      - name: Setup uv
        uses: astral-sh/setup-uv@v6
        with:
          version: "~=0.8.16"
      - name: Huggingface datasets cache
        uses: actions/cache/restore@v4
        with:
          path: ${{ env.HF_DATASET_CACHE_DIR }} # <- shared path
          key: hf-datasets-${{ github.run_id }}
          restore-keys: |
            hf-datasets-
      - name: Run tests
        env:
          HF_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
        run: |
          uv run --all-extras python -c "import nltk; nltk.download('punkt_tab')"
          uv run --all-extras pytest -n auto --max-worker-restart=0 --durations=30 -v -m "not gpu and not cpu_slow and not external_api and not formatter_hash"
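      # The marker expressions partition the suite: this job runs the fast CPU tests,
      # test-cpu-slow selects "cpu_slow", test-formatter-hash selects "formatter_hash",
      # and test-docker-gpu covers "gpu" and "vllm"; tests marked "external_api" are
      # excluded from the CPU and GPU runs.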

  test-cpu-slow:
    runs-on: cpu-runner-8c-32gb-01
    container: derskythe/github-runner-base:ubuntu-noble
    needs: [hf-datasets-cache, test-extras]
    steps:
      - uses: actions/checkout@v4
      - name: Setup uv
        uses: astral-sh/setup-uv@v6
        with:
          version: "~=0.8.16"
      - name: Huggingface datasets cache
        uses: actions/cache/restore@v4
        with:
          path: ${{ env.HF_DATASET_CACHE_DIR }}
          key: hf-datasets-${{ github.run_id }}
          restore-keys: |
            hf-datasets-
      - name: Run tests
        env:
          HF_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
        run: |
          uv run --all-extras python -c "import nltk; nltk.download('punkt_tab')"
          uv run --all-extras pytest -n auto --max-worker-restart=0 --durations=30 -v -m "not gpu and cpu_slow and not external_api and not formatter_hash"

  test-formatter-hash:
    runs-on: cpu-runner-8c-32gb-01
    container: derskythe/github-runner-base:ubuntu-noble
    needs: [hf-datasets-cache, test-extras]
    steps:
      - uses: actions/checkout@v4
      - name: Setup uv
        uses: astral-sh/setup-uv@v6
        with:
          version: "~=0.8.16"
      - name: Huggingface datasets cache
        uses: actions/cache/restore@v4
        with:
          path: ${{ env.HF_DATASET_CACHE_DIR }}
          key: hf-datasets-${{ github.run_id }}
          restore-keys: |
            hf-datasets-
      - name: Run formatter hashing tests
        env:
          HF_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
        run: |
          uv run --all-extras python -c "import nltk; nltk.download('punkt_tab')"
          uv run --all-extras pytest -n auto --max-worker-restart=0 --durations=30 -v -m "formatter_hash"

  test-docker-gpu:
    # Run full test suite in Docker Container with GPU
    runs-on: EvalFrameworkGPURunner
    needs: [tag, build, test-cpu, test-cpu-slow, test-formatter-hash]
    container:
      image: "${{ needs.tag.outputs.image }}"
      credentials:
        username: token
        password: ${{ secrets.GL_PUBLIC_REGISTRY_READ_WRITE_TOKEN }}
      options: --gpus all
    defaults:
      run:
        working-directory: /eval_framework
    steps:
      - name: Verify GPU installs via uv --exact
        run: |
          set -e # fail fast if any test fails
          echo "Testing vllm extra"
          uv run --exact --extra vllm pytest -v --noconftest tests/tests_eval_framework/installs/test_vllm.py
          echo "Testing mistral extra"
          uv run --exact --extra mistral pytest -v --noconftest tests/tests_eval_framework/installs/test_mistral.py
          echo "Testing all extras together"
          uv run --exact --all-extras pytest -v --noconftest tests/tests_eval_framework/installs/
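      # Note that this job has no checkout step: commands run against the source baked
      # into the freshly built branch image at /eval_framework (the `working-directory`
      # default above), which is what the build job pushed.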
      - name: Huggingface datasets cache
        uses: actions/cache/restore@v4
        with:
          path: ${{ env.HF_DATASET_CACHE_DIR }}
          key: hf-datasets-${{ github.run_id }}
          restore-keys: |
            hf-datasets-
      - name: Test GPU
        timeout-minutes: 20
        env:
          HF_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
        run: uv run --all-extras pytest -n auto --max-worker-restart=0 --durations=30 -v -m "gpu and not cpu_slow and not external_api and not vllm"
      - name: Test VLLM
        timeout-minutes: 20
        env:
          HF_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
          VLLM_LOGGING_LEVEL: DEBUG
          VLLM_WORKER_MULTIPROC_METHOD: spawn
          VLLM_USE_MODELSCOPE: False
          VLLM_NCCL_SO_PATH: ""
          VLLM_USE_TRITON_FLASH_ATTN: 0
          VLLM_DISABLE_CUSTOM_ALL_REDUCE: 1
        run: pytest --log-cli-level=INFO -v -m "vllm"
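        # The vLLM tests call pytest directly (no `uv run`), presumably because the image's
        # default environment already provides the vllm extra. VLLM_WORKER_MULTIPROC_METHOD=spawn
        # avoids CUDA re-initialisation problems in forked worker processes; the remaining
        # VLLM_* toggles keep logging verbose and disable optional code paths, presumably to
        # reduce flakiness on the CI GPU runner.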