#!/usr/bin/env bash

# SPDX-FileCopyrightText: Copyright 2025 Dash0 Inc.
# SPDX-License-Identifier: Apache-2.0

# A smoke test to check whether the most recently published Helm chart has been modified in a way that would require
# updating the GKE Autopilot WorkloadAllowlists.
# The test tries to deploy the chart to a GKE Autopilot cluster.

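# Trace each command (-x), abort on errors (-e) and on unset variables (-u), and fail pipelines if any stage fails (pipefail).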
set -xeuo pipefail

cd "$(dirname "${BASH_SOURCE[0]}")"/../../..

operator_namespace=gke-ap-workload-allowlist-check-operator
monitored_namespace=gke-ap-workload-allowlist-check-monitored
chart="${OPERATOR_HELM_CHART:-dash0-operator/dash0-operator}"
helm_release_name=dash0-operator

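# Best-effort cleanup, installed as a trap below: errexit is disabled inside the function so that every resource is
# deleted even if individual deletions fail.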
cleanup() {
  set +e

  helm uninstall \
    --namespace "$operator_namespace" \
    --ignore-not-found \
    --wait \
    "$helm_release_name"

  # If the WorkloadAllowlist for the pre-delete hook does not match, the hook might stick around as a zombie job; force-delete it.
  kubectl delete job --namespace "$operator_namespace" --ignore-not-found "${helm_release_name}-pre-delete" --wait --grace-period=0 --force

  kubectl delete namespace "$operator_namespace" --ignore-not-found --grace-period=0 --force

  kubectl delete namespace "$monitored_namespace" --ignore-not-found --grace-period=0 --force

  helm uninstall --namespace ensure-at-least-one-node podinfo || true
  kubectl delete namespace ensure-at-least-one-node --ignore-not-found --grace-period=0 --force || true

  return 0
}

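# Retries the given command up to max_retries times, sleeping retry_delay seconds between attempts; fails only if
# every attempt fails.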
retry_command() {
  local max_retries=10
  local retry_delay=5
  local attempt=1

  while [[ $attempt -le $max_retries ]]; do
    if "$@"; then
      return 0
    fi

    if [[ $attempt -eq $max_retries ]]; then
      echo "Command failed after $max_retries attempts: $*"
      return 1
    fi

    echo "Attempt $attempt failed, retrying in ${retry_delay} seconds..."
    sleep "$retry_delay"
    attempt=$((attempt + 1))
  done
}

# Deploy a dummy pod to ensure the cluster is scaled up to at least one node. GKE Autopilot has the infuriating UX
# problem that, for example, a namespace can be stuck in the Terminating state forever if the cluster is scaled down
# to zero nodes.
helm repo add podinfo https://stefanprodan.github.io/podinfo
kubectl create namespace ensure-at-least-one-node || true
helm install --namespace ensure-at-least-one-node podinfo podinfo/podinfo || true

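# Unless the check runs against the local chart sources (OPERATOR_HELM_CHART=helm-chart/dash0-operator), install the
# published Helm repository.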
if [[ "$chart" != "helm-chart/dash0-operator" ]]; then
  echo "installing the operator helm repo"
  helm repo add dash0-operator https://dash0hq.github.io/dash0-operator
  helm repo update dash0-operator
fi

# Install a trap to make sure we clean up after ourselves, no matter the outcome of the check.
trap cleanup HUP INT TERM EXIT

# Create the namespace and a dummy auth token secret.
echo "creating operator namespace $operator_namespace and auth token secret"
kubectl create namespace "$operator_namespace"
kubectl create secret \
  generic \
  dash0-authorization-secret \
  --namespace "$operator_namespace" \
  --from-literal=token=dummy-token

# Try to install the Helm chart:
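# The export settings below point at dummy endpoints and a dummy token: the check only needs the workloads to be
# admitted by GKE Autopilot and become ready, not to actually export telemetry.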
helm_command="helm install --namespace $operator_namespace"
helm_command+=" --set operator.gke.autopilot.enabled=true"
helm_command+=" --set operator.dash0Export.enabled=true"
helm_command+=" --set operator.dash0Export.endpoint=ingress.dummy-url.aws.dash0.com:4317"
helm_command+=" --set operator.dash0Export.secretRef.name=dash0-authorization-secret"
helm_command+=" --set operator.dash0Export.secretRef.key=token"
helm_command+=" --set operator.dash0Export.apiEndpoint=https://api.dummy-url.aws.dash0.com"
helm_command+=" --set operator.prometheusCrdSupportEnabled=true"
helm_command+=" --set operator.clusterName=dummy-cluster-name"
if [[ "$chart" = "helm-chart/dash0-operator" ]]; then
  # When using a local Helm chart, the test repositories ghcr.io/dash0hq/gke-ap-xxx will be used. Make sure they have
  # up-to-date images; this script does not build or push images.
  helm_command+=" --set operator.image.repository=ghcr.io/dash0hq/gke-ap-operator-controller"
  helm_command+=" --set operator.image.tag=latest"
  helm_command+=" --set operator.initContainerImage.repository=ghcr.io/dash0hq/gke-ap-instrumentation"
  helm_command+=" --set operator.initContainerImage.tag=latest"
  helm_command+=" --set operator.collectorImage.repository=ghcr.io/dash0hq/gke-ap-collector"
  helm_command+=" --set operator.collectorImage.tag=latest"
  helm_command+=" --set operator.configurationReloaderImage.repository=ghcr.io/dash0hq/gke-ap-configuration-reloader"
  helm_command+=" --set operator.configurationReloaderImage.tag=latest"
  helm_command+=" --set operator.filelogOffsetSyncImage.repository=ghcr.io/dash0hq/gke-ap-filelog-offset-sync"
  helm_command+=" --set operator.filelogOffsetSyncImage.tag=latest"
  helm_command+=" --set operator.filelogOffsetVolumeOwnershipImage.repository=ghcr.io/dash0hq/gke-ap-filelog-offset-volume-ownership"
  helm_command+=" --set operator.filelogOffsetVolumeOwnershipImage.tag=latest"
  helm_command+=" --set operator.targetAllocatorImage.repository=ghcr.io/dash0hq/gke-ap-target-allocator"
  helm_command+=" --set operator.targetAllocatorImage.tag=latest"
fi
helm_command+=" $helm_release_name"
helm_command+=" $chart"

echo "running helm install"
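# helm_command is expanded unquoted on purpose: none of the values contain whitespace, so word splitting yields the
# intended argument list.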
$helm_command

echo "helm install has been successful, waiting for the collectors to become ready"

# Wait for the OTel collector workloads to become ready; this ensures that the WorkloadAllowlists for those also match.
kubectl \
  rollout status \
  daemonset "${helm_release_name}-opentelemetry-collector-agent-daemonset" \
  --namespace "$operator_namespace" \
  --timeout 90s

echo "the daemonset collector is ready now"

kubectl \
  rollout status \
  deployment "${helm_release_name}-cluster-metrics-collector-deployment" \
  --namespace "$operator_namespace" \
  --timeout 60s

echo "the deployment collector is ready now"

echo "deploying a monitoring resource to trigger deploying the target-allocator"

kubectl create namespace "$monitored_namespace"
kubectl apply --namespace "$monitored_namespace" -f - <<EOF
apiVersion: operator.dash0.com/v1beta1
kind: Dash0Monitoring
metadata:
  name: dash0-monitoring-resource
EOF
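# Retry the get until the monitoring resource is retrievable, then wait until the operator marks it as available.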
retry_command kubectl get --namespace "$monitored_namespace" dash0monitorings.operator.dash0.com/dash0-monitoring-resource
kubectl wait --namespace "$monitored_namespace" dash0monitorings.operator.dash0.com/dash0-monitoring-resource --for condition=Available --timeout 30s

kubectl \
  rollout status \
  deployment "${helm_release_name}-opentelemetry-target-allocator-deployment" \
  --namespace "$operator_namespace" \
  --timeout 60s
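
echo "the target allocator deployment is ready now"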