releasing 1.8.3.post2 #22022

Summary
Jobs
- test-on-tpus
Run details
- Usage
- Workflow file

Workflow file for this run

.github/workflows/tpu-tests.yml at cae23c1

	name: Test PyTorch - TPU

	# Triggers: pushes to master / release branches, and pull requests that target
	# those branches and touch one of the listed paths.
	on:
	  push:
	    branches: [master, "release/*"]
	  pull_request_target:
	    branches: [master, "release/*"]
	    types: [opened, reopened, ready_for_review, synchronize]  # added `ready_for_review` since draft is skipped
	    # Only run when TPU-relevant files change; entries starting with `!` are exclusions.
	    paths:
	      - ".github/workflows/tpu-tests.yml"
	      - "dockers/base-xla/*"
	      - "requirements/lite/**"
	      - "src/lightning_lite/**"
	      - "tests/tests_lite/**"
	      - "requirements/pytorch/**"
	      - "src/pytorch_lightning/**"
	      - "tests/tests_pytorch/**"
	      - "setup.cfg"  # includes pytest config
	      - ".actions/**"
	      - "!requirements/**/docs.txt"
	      # Ignore Markdown files both at the repository root and in any sub-directory.
	      - "!*.md"
	      - "!**/*.md"

	# One concurrency group per workflow + ref + PR head branch. A newer run cancels
	# an in-progress one, except on `master` and `release/*` branch refs, where
	# every run is allowed to finish.
	concurrency:
	  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
	  cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }}

	# Workflow-level environment. GKE_CLUSTER and GKE_ZONE are passed to the
	# `get-gke-credentials` step of the job below; PROJECT_ID comes from a
	# repository secret.
	env:
	  PROJECT_ID: ${{ secrets.GKE_PROJECT }}
	  GKE_CLUSTER: lightning-cluster
	  GKE_ZONE: us-central1-a

	jobs:
	  # TODO: package parametrization
	  # Renders the TPU test job from a jsonnet template, submits it to the GKE
	  # cluster, waits for it to finish, and uploads the coverage report it printed.
	  test-on-tpus:
	    runs-on: ubuntu-22.04
	    # Skip draft pull requests.
	    if: github.event.pull_request.draft == false
	    env:
	      PYTHON_VER: "3.7"  # quoted so YAML keeps the version a string, not a float
	    timeout-minutes: 100  # should match the timeout in `tpu_workflow.jsonnet`

	    steps:
	      # NOTE(review): this workflow is triggered by `pull_request_target` and
	      # checks out the pull request head commit, while later steps use
	      # repository secrets. Confirm that untrusted changes cannot reach them.
	      - uses: actions/checkout@v3
	        with:
	          ref: ${{ github.event.pull_request.head.sha }}

	      - uses: actions/setup-python@v4
	        with:
	          python-version: ${{ env.PYTHON_VER }}

	      # Clone the jsonnet template library and pin it to a fixed commit,
	      # exposed locally as the `stable` branch.
	      - name: Checkout ml-testing-accelerators
	        run: |
	          git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git
	          cd ml-testing-accelerators
	          git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable
	          git checkout stable

	      - uses: actions/setup-go@v3
	        with:
	          go-version: '1.19'

	      - name: Install jsonnet
	        run: go install github.com/google/go-jsonnet/cmd/jsonnet@latest

	      # Fill the `{PYTORCH_VERSION}`, `{PYTHON_VERSION}`, `{PR_NUMBER}` and `{SHA}`
	      # placeholders of the job template in place, then print the result.
	      - name: Update jsonnet
	        env:
	          XLA_VER: "1.12"  # quoted so YAML keeps the version a string, not a float
	          PR_NUMBER: ${{ github.event.pull_request.number }}
	          SHA: ${{ github.event.pull_request.head.sha }}
	        run: |
	          python -c "fname = 'dockers/base-xla/tpu_workflow.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER')
	          data = data.replace('{PYTHON_VERSION}', '$PYTHON_VER').replace('{PR_NUMBER}', '$PR_NUMBER').replace('{SHA}', '$SHA') ; open(fname, 'w').write(data)"
	          cat dockers/base-xla/tpu_workflow.jsonnet
	        shell: bash

	      - uses: google-github-actions/auth@v1
	        with:
	          credentials_json: ${{ secrets.GKE_SA_KEY_BASE64 }}

	      # https://docs.github.com/en/actions/deployment/deploying-to-your-cloud-provider/deploying-to-google-kubernetes-engine
	      - uses: google-github-actions/get-gke-credentials@v0
	        with:
	          cluster_name: ${{ env.GKE_CLUSTER }}
	          location: ${{ env.GKE_ZONE }}

	      - name: Deploy cluster
	        run: |
	          export PATH=$PATH:$HOME/go/bin
	          # Render the template and create the job; kubectl prints "job.batch/<name> created".
	          job_name=$(jsonnet -J ml-testing-accelerators/ dockers/base-xla/tpu_workflow.jsonnet | kubectl create -f -)
	          job_name=${job_name#job.batch/}
	          job_name=${job_name% created}
	          # Look up the pod created for this job via its controller-uid label,
	          # dropping the header row of the `kubectl get po` table.
	          pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk '!/NAME/ {print $1}')
	          echo "GKE pod name: $pod_name"
	          echo "Waiting on kubernetes job: $job_name"
	          # Check on the job periodically. Set the status code depending on what happened to the job in Kubernetes.
	          status_code=2
	          printf "Waiting for job to finish: "
	          while true; do
	            if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then
	              status_code=1 && break;
	            elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1"; then
	              status_code=0 && break;
	            else
	              printf ".";
	            fi;
	            sleep 5;
	          done
	          echo "Done waiting. Job status code: $status_code"
	          kubectl logs -f $pod_name --container=train > /tmp/full_output.txt
	          if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt; then
	            # successful run. split the output into logs + coverage report
	            csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/';
	            cat xx00  # test logs
	            mv xx01 coverage.xml
	          else
	            # failed run, print everything
	            cat /tmp/full_output.txt;
	          fi
	          # Propagate the Kubernetes job result as this step's exit status.
	          exit $status_code
	        shell: bash

	      - name: Upload coverage to Codecov
	        uses: codecov/codecov-action@v3
	        # see: https://github.com/actions/toolkit/issues/399
	        continue-on-error: true
	        with:
	          token: ${{ secrets.CODECOV_TOKEN }}
	          file: coverage.xml
	          flags: tpu,pytest,python${{ env.PYTHON_VER }}
	          name: TPU-coverage
	          fail_ci_if_error: false

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

releasing 1.8.3.post2 #22022

Workflow file

releasing 1.8.3.post2 #22022

Uh oh!

Jobs

Run details

Workflow file for this run