Commit 420cc97
ci: add rudimentary CI check for WorkloadAllowlists
This check fails if the Dash0 operator workloads are changed in a way that makes an update of the related WorkloadAllowlists necessary. The script does not currently check every possible combination of Helm settings; it only checks a typical installation, including the target-allocator, so there might be cases that we miss. Testing every possible combination seems excessive at the moment. The check currently runs only after the release (via the deployment_status trigger). Running it against a pre-release before actually releasing officially would be an obvious improvement, but is out of scope for now.
1 parent 0b98946 commit 420cc97

2 files changed: +232, -1 lines changed
Lines changed: 77 additions & 1 deletion (GitHub Actions workflow)

@@ -1,12 +1,88 @@
 name: GKE Autopilot WorkloadAllowlist Check
 
 on:
+  deployment_status:
   workflow_dispatch:
 
+env:
+  PROJECT_ID: ${{ secrets.GKE_PROJECT }}
+  GKE_CLUSTER: ci-workload-allowlist-check
+  GKE_ZONE: europe-central2
+
+concurrency:
+  # This workflow uses a shared resource, allow only one execution at a time. Alternatively, we would need to add
+  # steps to create a GKE Autopilot cluster on the fly and discard it after the GH action run.
+  group: gke-ap-workload-allowlist-check-concurrency-group
+
+
+# Note: We could potentially optimize this check by only running it when the Helm chart has changed.
 jobs:
   check_workload_allowlists:
     name: Check WorkloadAllowlists
     runs-on: ubuntu-latest
+    timeout-minutes: 8
 
     steps:
-      - run: echo "ohai"
+      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
+
+      - id: 'auth'
+        uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093
+        with:
+          credentials_json: '${{ secrets.GKE_SA_KEY }}'
+
+      # get the GKE credentials so we can deploy to the cluster
+      - uses: google-github-actions/get-gke-credentials@3da1e46a907576cefaa90c484278bb5b259dd395
+        with:
+          project_id: ${{ secrets.GKE_PROJECT }}
+          cluster_name: ${{ env.GKE_CLUSTER }}
+          location: ${{ env.GKE_ZONE }}
+
+      - name: run check
+        run: |-
+          .github/workflows/scripts/gke-ap-workload-allowlist-check.sh
+
+      - name: send Slack notification on success
+        if: success()
+        uses: slackapi/slack-github-action@91efab103c0de0a537f72a35f6b8cda0ee76bf0a
+        with:
+          errors: true
+          method: chat.postMessage
+          token: ${{ secrets.SLACK_BOT_TOKEN }}
+          # language=YAML
+          payload: |
+            channel: ${{ secrets.SLACK_CHANNEL_ID }}
+            text: 'GKE Autopilot WorkloadAllowlist check successful'
+            blocks:
+              - type: header
+                text:
+                  type: plain_text
+                  text: ':white_check_mark: GKE Autopilot WorkloadAllowlist check successful'
+              - type: context
+                elements:
+                  - type: mrkdwn
+                    text: ':github: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|Workflow run #${{ github.run_id }}>'
+                  - type: mrkdwn
+                    text: ':diff: <${{ github.event.head_commit.url }}|Commit>'
+
+      - name: send Slack notification on failure
+        if: failure()
+        uses: slackapi/slack-github-action@91efab103c0de0a537f72a35f6b8cda0ee76bf0a
+        with:
+          errors: true
+          method: chat.postMessage
+          token: ${{ secrets.SLACK_BOT_TOKEN }}
+          # language=YAML
+          payload: |
+            channel: ${{ secrets.SLACK_CHANNEL_ID }}
+            text: 'GKE Autopilot WorkloadAllowlist check has failed'
+            blocks:
+              - type: header
+                text:
+                  type: plain_text
+                  text: ':x: GKE Autopilot WorkloadAllowlist check has failed'
+              - type: context
+                elements:
+                  - type: mrkdwn
+                    text: ':github: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|Workflow run #${{ github.run_id }}>'
+                  - type: mrkdwn
+                    text: ':diff: <${{ github.event.head_commit.url }}|Commit>'
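
Besides running on deployment_status events, the workflow declares a workflow_dispatch trigger, so the check can also be started by hand. A minimal sketch using the GitHub CLI, assuming gh is installed and authenticated; the workflow is addressed by its name from the YAML above:

    # Trigger the check manually via the workflow_dispatch trigger:
    gh workflow run 'GKE Autopilot WorkloadAllowlist Check'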
Lines changed: 155 additions & 0 deletions (new file; presumably .github/workflows/scripts/gke-ap-workload-allowlist-check.sh, the script the workflow invokes)

@@ -0,0 +1,155 @@
#!/usr/bin/env bash

# SPDX-FileCopyrightText: Copyright 2025 Dash0 Inc.
# SPDX-License-Identifier: Apache-2.0

# A smoke test to check whether the most recently published Helm chart has been modified in a way that would require
# updating the GKE Autopilot WorkloadAllowlists.
# The test tries to deploy the chart to a GKE Autopilot cluster.

set -xeuo pipefail

cd "$(dirname "${BASH_SOURCE[0]}")"/../../..

operator_namespace=gke-ap-workload-allowlist-check-operator
monitored_namespace=gke-ap-workload-allowlist-check-monitored
chart="${OPERATOR_HELM_CHART:-dash0-operator/dash0-operator}"
helm_release_name=dash0-operator

cleanup() {
  set +e

  helm uninstall \
    --namespace "$operator_namespace" \
    --ignore-not-found \
    --wait \
    "$helm_release_name"

  # If the WorkloadAllowlist for the pre-delete hook does not match, it might stick around as a zombie job; force-delete it.
  kubectl delete job --namespace "$operator_namespace" --ignore-not-found "${helm_release_name}-pre-delete" --wait --grace-period=0 --force

  kubectl delete namespace "$operator_namespace" --ignore-not-found --grace-period=0 --force

  kubectl delete namespace "$monitored_namespace" --ignore-not-found --grace-period=0 --force

  helm uninstall --namespace ensure-at-least-one-node podinfo || true
  kubectl delete namespace ensure-at-least-one-node --ignore-not-found --grace-period=0 --force || true

  return 0
}

retry_command() {
  local max_retries=10
  local retry_delay=5
  local attempt=1

  while [[ $attempt -le $max_retries ]]; do
    if "$@"; then
      return 0
    fi

    if [[ $attempt -eq $max_retries ]]; then
      echo "Command failed after $max_retries attempts: $*"
      return 1
    fi

    echo "Attempt $attempt failed, retrying in ${retry_delay} seconds..."
    sleep $retry_delay
    attempt=$((attempt + 1))
  done
}

# Deploy a dummy pod to ensure the cluster is scaled up to at least one node. GKE AP has the infuriating UX problem
# that, for example, a namespace can be stuck in state "Terminating" forever if the cluster is scaled down to zero nodes.
helm repo add podinfo https://stefanprodan.github.io/podinfo
kubectl create namespace ensure-at-least-one-node || true
helm install --namespace ensure-at-least-one-node podinfo podinfo/podinfo || true

if [[ "$chart" != "helm-chart/dash0-operator" ]]; then
  echo "installing the operator helm repo"
  helm repo add dash0-operator https://dash0hq.github.io/dash0-operator
  helm repo update dash0-operator
fi

# Install a trap to make sure we clean up after ourselves, no matter the outcome of the check.
trap cleanup HUP INT TERM EXIT

# Create the namespace and a dummy auth token secret.
echo "creating operator namespace $operator_namespace and auth token secret"
kubectl create namespace "$operator_namespace"
kubectl create secret \
  generic \
  dash0-authorization-secret \
  --namespace "$operator_namespace" \
  --from-literal=token=dummy-token

# Try to install the Helm chart:
helm_command="helm install --namespace $operator_namespace"
helm_command+=" --set operator.gke.autopilot.enabled=true"
helm_command+=" --set operator.dash0Export.enabled=true"
helm_command+=" --set operator.dash0Export.endpoint=ingress.dummy-url.aws.dash0.com:4317"
helm_command+=" --set operator.dash0Export.secretRef.name=dash0-authorization-secret"
helm_command+=" --set operator.dash0Export.secretRef.key=token"
helm_command+=" --set operator.dash0Export.apiEndpoint=https://api.dummy-url.aws.dash0.com"
helm_command+=" --set operator.prometheusCrdSupportEnabled=true"
helm_command+=" --set operator.clusterName=dummy-cluster-name"
if [[ "$chart" = "helm-chart/dash0-operator" ]]; then
  # When using a local Helm chart, the test repositories ghcr.io/dash0hq/gke-ap-xxx will be used. Make sure they have
  # up-to-date images; this script does not build or push images.
  helm_command+=" --set operator.image.repository=ghcr.io/dash0hq/gke-ap-operator-controller"
  helm_command+=" --set operator.image.tag=latest"
  helm_command+=" --set operator.initContainerImage.repository=ghcr.io/dash0hq/gke-ap-instrumentation"
  helm_command+=" --set operator.initContainerImage.tag=latest"
  helm_command+=" --set operator.collectorImage.repository=ghcr.io/dash0hq/gke-ap-collector"
  helm_command+=" --set operator.collectorImage.tag=latest"
  helm_command+=" --set operator.configurationReloaderImage.repository=ghcr.io/dash0hq/gke-ap-configuration-reloader"
  helm_command+=" --set operator.configurationReloaderImage.tag=latest"
  helm_command+=" --set operator.filelogOffsetSyncImage.repository=ghcr.io/dash0hq/gke-ap-filelog-offset-sync"
  helm_command+=" --set operator.filelogOffsetSyncImage.tag=latest"
  helm_command+=" --set operator.filelogOffsetVolumeOwnershipImage.repository=ghcr.io/dash0hq/gke-ap-filelog-offset-volume-ownership"
  helm_command+=" --set operator.filelogOffsetVolumeOwnershipImage.tag=latest"
  helm_command+=" --set operator.targetAllocatorImage.repository=ghcr.io/dash0hq/gke-ap-target-allocator"
  helm_command+=" --set operator.targetAllocatorImage.tag=latest"
fi
helm_command+=" $helm_release_name"
helm_command+=" $chart"

echo "running helm install"
$helm_command

echo "helm install has been successful, waiting for the collectors to become ready"

# Wait for the OTel collector workloads to become ready; this ensures that the WorkloadAllowlists for those match as well.
kubectl \
  rollout status \
  daemonset "${helm_release_name}-opentelemetry-collector-agent-daemonset" \
  --namespace "$operator_namespace" \
  --timeout 90s

echo "the daemonset collector is ready now"

kubectl \
  rollout status \
  deployment "${helm_release_name}-cluster-metrics-collector-deployment" \
  --namespace "$operator_namespace" \
  --timeout 60s

echo "the deployment collector is ready now"

echo "deploying a monitoring resource to trigger deploying the target-allocator"

kubectl create namespace "$monitored_namespace"
kubectl apply --namespace "$monitored_namespace" -f - <<EOF
apiVersion: operator.dash0.com/v1beta1
kind: Dash0Monitoring
metadata:
  name: dash0-monitoring-resource
EOF
retry_command kubectl get --namespace "$monitored_namespace" dash0monitorings.operator.dash0.com/dash0-monitoring-resource
kubectl wait --namespace "$monitored_namespace" dash0monitorings.operator.dash0.com/dash0-monitoring-resource --for condition=Available --timeout 30s

kubectl \
  rollout status \
  deployment "${helm_release_name}-opentelemetry-target-allocator-deployment" \
  --namespace "$operator_namespace" \
  --timeout 60s
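
For local experimentation, the script's behavior can be switched via the OPERATOR_HELM_CHART environment variable it reads (see above): by default it checks the most recently published chart, while the special value helm-chart/dash0-operator installs the local chart sources. A sketch, assuming helm and kubectl are installed and kubectl is already authenticated against a GKE Autopilot cluster:

    # Check the most recently published Helm chart (the default):
    .github/workflows/scripts/gke-ap-workload-allowlist-check.sh

    # Check the local chart sources instead; as noted in the script, this assumes
    # the ghcr.io/dash0hq/gke-ap-* test images are already up to date.
    OPERATOR_HELM_CHART=helm-chart/dash0-operator .github/workflows/scripts/gke-ap-workload-allowlist-check.sh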
