Skip to content

Commit 59ec7a0

Browse files
Merge pull request #7894 from GeorgianaElena/more-health-checks
Run health checks when infrastructure code changes
2 parents 3ac26ec + 8e82d06 commit 59ec7a0

File tree

10 files changed

+466
-96
lines changed

10 files changed

+466
-96
lines changed

.github/workflows/pd-triggered-health-check.yaml

Lines changed: 30 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,14 @@ name: PD triggered health-check
22
on:
33
repository_dispatch:
44
types: [health-check]
5-
outputs:
65

76
jobs:
8-
run-health-check:
7+
get_incident_details:
98
runs-on: ubuntu-latest
9+
outputs:
10+
cluster: ${{ steps.details.outputs.cluster }}
11+
hub: ${{ steps.details.outputs.hub }}
12+
provider: ${{ steps.details.outputs.provider }}
1013
steps:
1114
- name: Get the incident details
1215
id: details
@@ -29,67 +32,54 @@ jobs:
2932
f.write(f"{cluster}\n")
3033
f.write(f"{hub}\n")
3134
f.write(f"{provider}")
32-
- uses: actions/checkout@v6
33-
with:
34-
submodules: true
35-
- uses: actions/setup-python@v6
36-
with:
37-
python-version: '3.13'
38-
- name: Save pip's install cache on job completion
39-
uses: actions/cache@v5
40-
with:
41-
path: ~/.cache/pip
42-
key: ${{ github.run_id }}
4335
44-
- name: Install deployer script's Python dependencies
45-
run: |
46-
pip install --editable .
47-
go install github.com/google/go-jsonnet/cmd/jsonnet@v0.20.0
48-
49-
- name: Setup deploy for ${{ steps.details.outputs.cluster }} cluster
50-
uses: ./.github/actions/setup-deploy
51-
with:
52-
provider: ${{ steps.details.outputs.provider }}
53-
GCP_KMS_DECRYPTOR_KEY: ${{ secrets.GCP_KMS_DECRYPTOR_KEY }}
54-
55-
- name: Run health check against ${{ steps.details.outputs.cluster }} ${{ steps.details.outputs.hub }}
56-
uses: nick-fields/retry@v3
57-
continue-on-error: true
58-
id: health_check
59-
with:
60-
timeout_minutes: 10
61-
max_attempts: 3
62-
command: |
63-
echo "health_check_output<<EOF" | tee --append "$GITHUB_OUTPUT"
64-
deployer run-hub-health-check ${{ steps.details.outputs.cluster }} ${{ steps.details.outputs.hub }} | tee --append "$GITHUB_OUTPUT"
65-
echo "EOF" | tee --append "$GITHUB_OUTPUT"
36+
health_check:
37+
needs: get_incident_details
38+
uses: ./.github/workflows/reusable-health-check.yaml
39+
with:
40+
cluster: ${{ needs.get_incident_details.outputs.cluster }}
41+
hub: ${{ needs.get_incident_details.outputs.hub }}
42+
provider: ${{ needs.get_incident_details.outputs.provider }}
43+
secrets: inherit
6644

45+
report_status:
46+
needs: [get_incident_details, health_check]
47+
runs-on: ubuntu-latest
48+
steps:
6749
- name: Report Status
6850
if: always()
6951
uses: ravsamhq/notify-slack-action@v2
7052
with:
7153
notify_when: success,failure
72-
status: ${{ job.status }} # required
73-
notification_title: Health check status for <${{ env.URL }}|${{ steps.details.outputs.cluster }} ${{ steps.details.outputs.hub }}>
74-
message_format: ':point_right: ${{ job.status }}'
54+
status: ${{ needs.health_check.result }}
55+
notification_title: Health check status for <${{ env.URL }}|${{ needs.get_incident_details.outputs.cluster }} ${{ needs.get_incident_details.outputs.hub }}>
56+
message_format: ':point_right: ${{ needs.health_check.result }}'
7557
mention_groups: '!channel'
7658
mention_groups_when: failure
7759
footer: <{run_url}|GitHub Run>
7860
env:
7961
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_PD_NOTIFICATIONS_WEBHOOK_URL }}
8062
URL: https://2i2c-org.pagerduty.com/incidents/${{ github.event.client_payload.id }}
8163

64+
- name: Set note content
65+
id: note
66+
run: |
67+
if [[ "${{ needs.health_check.outputs.health_check_output }}" == *"Skipping"* ]]; then
68+
echo "note_content=Health check skipped" >> "$GITHUB_OUTPUT"
69+
else
70+
echo "note_content=Health check status was: ${{ needs.health_check.result }}" >> "$GITHUB_OUTPUT"
71+
fi
8272
8373
- name: Leave a note to the incident
8474
uses: ./.github/actions/pagerduty-note
8575
with:
8676
incident_id: ${{ github.event.client_payload.id }}
8777
token: ${{ secrets.PD_TOKEN }}
88-
note_content: 'The health checks status was: ${{ job.status }}'
78+
note_content: ${{ steps.note.outputs.note_content }}
8979

9080
- name: Resolve the incident if successful run
9181
# We don't yet run health checks against binders
92-
if: steps.health_check.outcome == 'success' && !contains(steps.health_check.outputs.health_check_output, 'Skipping')
82+
if: needs.health_check.result == 'success' && !contains(needs.health_check.outputs.health_check_output, 'Skipping')
9383
env:
9484
ID: ${{ github.event.client_payload.id }}
9585
PD_TOKEN: ${{ secrets.PD_TOKEN }}
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Call this reusable workflow like
2+
# jobs:
3+
# job1:
4+
# uses: ./.github/workflows/reusable-health-check.yaml
5+
# with:
6+
# cluster: cluster-name
7+
# hub: hub-name
8+
# provider: provider
9+
#
10+
# https://docs.github.com/en/actions/how-tos/reuse-automations/reuse-workflows
11+
#
12+
name: Run a hub health check
13+
14+
on:
15+
workflow_call:
16+
inputs:
17+
cluster:
18+
required: true
19+
type: string
20+
hub:
21+
required: true
22+
type: string
23+
provider:
24+
required: true
25+
type: string
26+
secrets:
27+
GCP_KMS_DECRYPTOR_KEY:
28+
required: true
29+
outputs:
30+
health_check_output:
31+
value: ${{ jobs.run_health_check.outputs.output1 }}
32+
jobs:
33+
run_health_check:
34+
runs-on: ubuntu-latest
35+
outputs:
36+
output1: ${{ steps.health_check.outputs.health_check_output }}
37+
steps:
38+
- uses: actions/checkout@v6
39+
with:
40+
submodules: true
41+
- uses: actions/setup-python@v6
42+
with:
43+
python-version: '3.13'
44+
- name: Save pip's install cache on job completion
45+
uses: actions/cache@v5
46+
with:
47+
path: ~/.cache/pip
48+
key: ${{ github.run_id }}
49+
50+
- name: Install deployer script's Python dependencies
51+
run: |
52+
pip install --editable .
53+
go install github.com/google/go-jsonnet/cmd/jsonnet@v0.20.0
54+
55+
- name: Setup deploy for ${{ inputs.cluster }} cluster
56+
uses: ./.github/actions/setup-deploy
57+
with:
58+
provider: ${{ inputs.provider }}
59+
GCP_KMS_DECRYPTOR_KEY: ${{ secrets.GCP_KMS_DECRYPTOR_KEY }}
60+
61+
- name: Run health check against ${{ inputs.cluster }} ${{ inputs.hub }}
62+
uses: nick-fields/retry@v3
63+
id: health_check
64+
with:
65+
timeout_minutes: 10
66+
max_attempts: 3
67+
command: |
68+
echo "health_check_output<<EOF" >> "$GITHUB_OUTPUT"
69+
deployer run-hub-health-check ${{ inputs.cluster }} ${{ inputs.hub }} | tee --append "$GITHUB_OUTPUT"
70+
echo "EOF" >> "$GITHUB_OUTPUT"
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
name: Run a health check on hubs
2+
3+
on:
4+
push:
5+
branches:
6+
- main
7+
paths:
8+
- terraform/**
9+
- eksctl/**
10+
pull_request:
11+
branches:
12+
- main
13+
paths:
14+
- terraform/**
15+
- eksctl/**
16+
17+
# ref: https://docs.github.com/en/actions/using-jobs/using-concurrency
18+
concurrency:
19+
group: ${{ github.workflow }}-${{ github.head_ref || 'not-a-pr' }}
20+
cancel-in-progress: false
21+
22+
env:
23+
TERM: xterm
24+
USE_GKE_GCLOUD_AUTH_PLUGIN: 'True'
25+
26+
jobs:
27+
generate-jobs:
28+
runs-on: ubuntu-latest
29+
outputs:
30+
support-jobs: ${{ steps.generate-jobs.outputs.support-jobs }}
31+
staging-jobs: ${{ steps.generate-jobs.outputs.staging-jobs }}
32+
prod-jobs: ${{ steps.generate-jobs.outputs.prod-jobs }}
33+
34+
steps:
35+
- uses: actions/checkout@v6
36+
with:
37+
submodules: true
38+
39+
- name: Identify files that have been added or modified
40+
# Action repo: https://github.com/dorny/paths-filter
41+
uses: dorny/paths-filter@v3
42+
id: changed-files
43+
with:
44+
token: ''
45+
list-files: csv
46+
filters: |
47+
changed:
48+
- added|modified: terraform/**
49+
- added|modified: eksctl/**
50+
51+
# This step will create a comment-body.txt file containing the jobs to be run in a
52+
# Markdown table format to be posted on a Pull Request
53+
- name: Generate matrix jobs
54+
id: generate-jobs
55+
run: |
56+
deployer plan-health-check "${{ steps.changed-files.outputs.changed_files }}"
57+
58+
# The comment-deployment-plan-pr.yaml workflow won't have the correct context to
59+
# know the PR number, so we save it to a file to pass to that workflow
60+
- name: Save Pull Request number to a file
61+
if: github.event_name == 'pull_request'
62+
run: |
63+
echo "${{ github.event.number }}" > pr-number.txt
64+
65+
# Upload the pr-number.txt and comment-body.txt files as artifacts for the
66+
# comment-deployment-plan-pr.yaml workflow to access
67+
- name: Upload artifacts
68+
if: >
69+
github.event_name == 'pull_request' &&
70+
(steps.generate-jobs.outputs.support-jobs != '[]' ||
71+
steps.generate-jobs.outputs.staging-jobs != '[]' ||
72+
steps.generate-jobs.outputs.prod-jobs != '[]')
73+
uses: actions/upload-artifact@v6
74+
with:
75+
name: pr
76+
path: |
77+
pr-number.txt
78+
comment-body.txt
79+
upgrade-staging:
80+
runs-on: ubuntu-latest
81+
needs: [generate-jobs]
82+
name: ${{ matrix.jobs.cluster_name }}-${{ matrix.jobs.hub_name }}-${{ matrix.jobs.provider }}
83+
if: |
84+
!cancelled() &&
85+
(github.event_name == 'push' && contains(github.ref, 'main')) &&
86+
needs.generate-jobs.result == 'success' &&
87+
needs.generate-jobs.outputs.staging-jobs != '[]'
88+
strategy:
89+
fail-fast: false
90+
matrix:
91+
jobs: ${{ fromJson(needs.generate-jobs.outputs.staging-jobs) }}
92+
uses: ./.github/workflows/reusable-health-check.yaml
93+
with:
94+
cluster: ${{ matrix.jobs.cluster_name }}
95+
hub: ${{ matrix.jobs.hub_name }}
96+
provider: ${{ matrix.jobs.provider }}
97+
secrets: inherit
98+
99+
upgrade-prod:
100+
runs-on: ubuntu-latest
101+
needs: [generate-jobs]
102+
name: ${{ matrix.jobs.cluster_name }}-${{ matrix.jobs.hub_name }}-${{ matrix.jobs.provider }}
103+
if: |
104+
!cancelled() &&
105+
(github.event_name == 'push' && contains(github.ref, 'main')) &&
106+
needs.generate-jobs.result == 'success' &&
107+
needs.generate-jobs.outputs.prod-jobs != '[]'
108+
strategy:
109+
fail-fast: false
110+
matrix:
111+
jobs: ${{ fromJson(needs.generate-jobs.outputs.prod-jobs) }}
112+
uses: ./.github/workflows/reusable-health-check.yaml
113+
with:
114+
cluster: ${{ matrix.jobs.cluster_name }}
115+
hub: ${{ matrix.jobs.hub_name }}
116+
provider: ${{ matrix.jobs.provider }}
117+
secrets: inherit

config/clusters/2i2c-jetstream2/cluster.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
name: 2i2c-jetstream2
22
provider: kubeconfig # allocation CIS250031_IU in https://js2.jetstream-cloud.org/project/
3+
provider_url: https://js2.jetstream-cloud.org/project/
34
kubeconfig:
45
file: enc-deployer-credentials.secret.yaml
56
support:

config/clusters/projectpythia-binder/cluster.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
name: projectpythia-binder
22
provider: kubeconfig # allocation SEE240014_IU in https://js2.jetstream-cloud.org/project/
3+
provider_url: https://js2.jetstream-cloud.org/project/
34
kubeconfig:
45
file: enc-deployer-credentials.secret.yaml
56
support:

config/clusters/utoronto/cluster.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
name: utoronto
22
provider: kubeconfig # azure based, cloud infra work requires a dedicated utoronto account
3+
provider_url: https://portal.azure.com/?l=en.en-gb#@utoronto.onmicrosoft.com/resource/subscriptions/ead3521a-d994-4a44-a68d-b16e35642d5b/resourceGroups/2i2c-utoronto-cluster/providers/Microsoft.ContainerService/managedClusters/hub-cluster/overview
34
kubeconfig:
45
file: enc-deployer-credentials.secret.yaml
56
support:

0 commit comments

Comments
 (0)