releasing 1.8.3.post2 #22022
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Workflow that runs the PyTorch tests on TPUs by submitting a job to a GKE cluster. | |
| name: Test PyTorch - TPU | |
| on: | |
| # Run on pushes to master and release branches. | |
| push: | |
| branches: [master, "release/*"] | |
| # NOTE(review): `pull_request_target` is used together with a checkout of the PR head SHA and repository secrets further down — confirm this is acceptable for pull requests coming from forks. | |
| pull_request_target: | |
| branches: [master, "release/*"] | |
| types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped | |
| # Only trigger when one of these paths changes; entries starting with `!` exclude docs requirements and markdown files. | |
| paths: | |
| - ".github/workflows/tpu-tests.yml" | |
| - "dockers/base-xla/*" | |
| - "requirements/lite/**" | |
| - "src/lightning_lite/**" | |
| - "tests/tests_lite/**" | |
| - "requirements/pytorch/**" | |
| - "src/pytorch_lightning/**" | |
| - "tests/tests_pytorch/**" | |
| - "setup.cfg" # includes pytest config | |
| - ".actions/**" | |
| - "!requirements/**/docs.txt" | |
| - "!*.md" | |
| - "!**/*.md" | |
| # Group runs by workflow, ref and PR head ref; in-progress runs are cancelled unless the ref is master or a release branch. | |
| concurrency: | |
| group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} | |
| cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} | |
| # Workflow-level settings: GKE project (from secrets), cluster name and zone. | |
| env: | |
| PROJECT_ID: ${{ secrets.GKE_PROJECT }} | |
| GKE_CLUSTER: lightning-cluster | |
| GKE_ZONE: us-central1-a | |
| jobs: | |
| # TODO: package parametrization | |
| # Single job that renders the TPU job spec, submits it to the GKE cluster and reports the outcome. | |
| test-on-tpus: | |
| runs-on: ubuntu-22.04 | |
| # Skip draft pull requests. | |
| if: github.event.pull_request.draft == false | |
| env: | |
| # Quoted so YAML keeps the version a string; an unquoted value such as 3.10 would be parsed as the number 3.1. | |
| PYTHON_VER: "3.7" | |
| timeout-minutes: 100 # should match the timeout in `tpu_workflow.jsonnet` | |
| steps: | |
| # Check out the pull request's head commit. | |
| - uses: actions/checkout@v3 | |
| with: | |
| ref: ${{ github.event.pull_request.head.sha }} | |
| # Install the Python version configured in the job-level env. | |
| - uses: actions/setup-python@v4 | |
| with: | |
| python-version: ${{ env.PYTHON_VER }} | |
| # Clone ml-testing-accelerators and pin it to a fixed commit via a local `stable` branch. | |
| - name: Checkout ml-testing-accelerators | |
| run: | | |
| git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git | |
| cd ml-testing-accelerators | |
| git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable | |
| git checkout stable | |
| # Go toolchain, needed to build the jsonnet CLI below. | |
| - uses: actions/setup-go@v3 | |
| with: | |
| go-version: '1.19' | |
| # NOTE(review): `@latest` is unpinned, so the installed jsonnet version can change between runs. | |
| - name: Install jsonnet | |
| run: go install github.com/google/go-jsonnet/cmd/jsonnet@latest | |
| # Fill the placeholders in the jsonnet job template with the XLA/Python versions, PR number and commit SHA, then print the result. | |
| - name: Update jsonnet | |
| env: | |
| # Quoted so YAML keeps the version a string (an unquoted 1.10 would be parsed as the number 1.1). | |
| XLA_VER: "1.12" | |
| PR_NUMBER: ${{ github.event.pull_request.number }} | |
| SHA: ${{ github.event.pull_request.head.sha }} | |
| run: | | |
| python -c "fname = 'dockers/base-xla/tpu_workflow.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER') | |
| data = data.replace('{PYTHON_VERSION}', '$PYTHON_VER').replace('{PR_NUMBER}', '$PR_NUMBER').replace('{SHA}', '$SHA') ; open(fname, 'w').write(data)" | |
| cat dockers/base-xla/tpu_workflow.jsonnet | |
| shell: bash | |
| # Authenticate to Google Cloud with the credentials stored in the `GKE_SA_KEY_BASE64` secret. | |
| - uses: google-github-actions/auth@v1 | |
| with: | |
| credentials_json: ${{ secrets.GKE_SA_KEY_BASE64 }} | |
| # https://docs.github.com/en/actions/deployment/deploying-to-your-cloud-provider/deploying-to-google-kubernetes-engine | |
| # Fetch credentials for the cluster and zone defined in the workflow-level env. | |
| - uses: google-github-actions/get-gke-credentials@v0 | |
| with: | |
| cluster_name: ${{ env.GKE_CLUSTER }} | |
| location: ${{ env.GKE_ZONE }} | |
| # Submit the rendered job to the cluster, poll until it fails or succeeds, then collect its logs and coverage report. | |
| - name: Deploy cluster | |
| run: | | |
| export PATH=$PATH:$HOME/go/bin | |
| # The create output is expected to look like "job.batch/<name> created"; strip the prefix and suffix to keep the job name. | |
| job_name=$(jsonnet -J ml-testing-accelerators/ dockers/base-xla/tpu_workflow.jsonnet | kubectl create -f -) | |
| job_name=${job_name#job.batch/} | |
| job_name=${job_name% created} | |
| # Find the pod that belongs to the job via its controller-uid label. | |
| # `--no-headers` drops the NAME header row, so the first column of every remaining row is a pod name. | |
| controller_uid=$(kubectl get job "$job_name" -o "jsonpath={.metadata.labels.controller-uid}") | |
| pod_name=$(kubectl get po -l "controller-uid=$controller_uid" --no-headers | awk '{print $1}') | |
| echo "GKE pod name: $pod_name" | |
| echo "Waiting on kubernetes job: $job_name" | |
| # Overwritten by the loop below with 1 on failure or 0 on success. | |
| status_code=2 | |
| # Check on the job periodically. Set the status code depending on what happened to the job in Kubernetes. | |
| printf "Waiting for job to finish: " | |
| while true; do | |
| if kubectl get jobs "$job_name" -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then | |
| status_code=1 && break; | |
| elif kubectl get jobs "$job_name" -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1"; then | |
| status_code=0 && break; | |
| else | |
| printf "."; | |
| fi; | |
| sleep 5; | |
| done | |
| echo "Done waiting. Job status code: $status_code" | |
| # Dump the `train` container's output; the coverage XML, if any, is appended after the test logs. | |
| kubectl logs -f "$pod_name" --container=train > /tmp/full_output.txt | |
| if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt; then | |
| # successful run. split the output into logs + coverage report | |
| csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'; | |
| cat xx00 # test logs | |
| mv xx01 coverage.xml | |
| else | |
| # failed run, print everything | |
| cat /tmp/full_output.txt; | |
| fi | |
| # Propagate the job result as the step's exit code. | |
| exit $status_code | |
| shell: bash | |
| # Upload the `coverage.xml` extracted from the job logs; upload problems are not allowed to fail the workflow. | |
| - name: Upload coverage to Codecov | |
| uses: codecov/codecov-action@v3 | |
| # see: https://github.com/actions/toolkit/issues/399 | |
| continue-on-error: true | |
| with: | |
| token: ${{ secrets.CODECOV_TOKEN }} | |
| file: coverage.xml | |
| flags: tpu,pytest,python${{ env.PYTHON_VER }} | |
| name: TPU-coverage | |
| fail_ci_if_error: false | |