From 381e252aa4cf42b8a085d7049dddcff7fbe16240 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Thu, 24 Oct 2024 17:01:24 +0200 Subject: [PATCH 01/40] Add Prime install Signed-off-by: Jeroen van Erp --- scripts/README.md | 6 +++++ scripts/rancher/manager_lifecycle.sh | 33 ++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/scripts/README.md b/scripts/README.md index f27ab74..c6e864d 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -53,6 +53,12 @@ Name | Source `rancher_update_serverurl` | [rancher/manager_settings.sh](rancher/manager_settings.sh) `rancher_wait_capiready` | [rancher/manager_lifecycle.sh](rancher/manager_lifecycle.sh) +### Rancher Prime + +Name | Source +-----------------------------------------------|------------------------------------------------------------- +`rancherprime_install_withcertmanagerclusterissuer` | [rancher/manager_lifecycle.sh](rancher/manager_lifecycle.sh) + ### SUSE Observability Name | Source diff --git a/scripts/rancher/manager_lifecycle.sh b/scripts/rancher/manager_lifecycle.sh index 180ee48..2e24026 100644 --- a/scripts/rancher/manager_lifecycle.sh +++ b/scripts/rancher/manager_lifecycle.sh @@ -36,6 +36,39 @@ rancher_install_withcertmanagerclusterissuer() { sleep 10 } +####################################### +# Installs Rancher Prime with a certificate generated by a cluster issuer +# Arguments: +# Version +# Number of replicas +# Hostname +# Cluster issuer name (managed by cert-manager) +# Examples: +# rancher_install_withcertmanagerclusterissuer latest "2.8.2" 1 rancher.random_string.geek letsencrypt-prod +####################################### +rancherprime_install_withcertmanagerclusterissuer() { + local version=$2 + local replicas=$3 + local hostname=$4 + local clusterissuer=$5 + + echo "Installing Rancher..." + helm repo add rancher-prime https://charts.rancher.com/server-charts/prime + helm repo update + helm upgrade --install rancher rancher-prime/rancher --namespace cattle-system --create-namespace \ + --version ${version} \ + --set replicas=${replicas} \ + --set hostname=${hostname} \ + --set ingress.extraAnnotations.'cert-manager\.io/cluster-issuer'=${clusterissuer} \ + --set ingress.tls.source=secret \ + --set ingress.tls.secretName=rancher-tls \ + --set agentTLSMode="system-store" + kubectl wait pods -n cattle-system -l app=rancher --for condition=Ready --timeout=180s + echo "Waiting for Rancher web app to be running with a valid certificate..." + while ! 
kubectl get secret rancher-tls --namespace cattle-system 2>/dev/null; do sleep 1; done + sleep 10 +} + ####################################### # Do the first log in Rancher (will update admin password and set server URL) # Arguments: From 6f2f4d35e05a6fafdbf5da77077530edd2d140c1 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Thu, 24 Oct 2024 17:15:50 +0200 Subject: [PATCH 02/40] Fix param order Signed-off-by: Jeroen van Erp --- scripts/rancher/manager_lifecycle.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/rancher/manager_lifecycle.sh b/scripts/rancher/manager_lifecycle.sh index 2e24026..6822b5e 100644 --- a/scripts/rancher/manager_lifecycle.sh +++ b/scripts/rancher/manager_lifecycle.sh @@ -47,10 +47,10 @@ rancher_install_withcertmanagerclusterissuer() { # rancher_install_withcertmanagerclusterissuer latest "2.8.2" 1 rancher.random_string.geek letsencrypt-prod ####################################### rancherprime_install_withcertmanagerclusterissuer() { - local version=$2 - local replicas=$3 - local hostname=$4 - local clusterissuer=$5 + local version=$1 + local replicas=$2 + local hostname=$3 + local clusterissuer=$4 echo "Installing Rancher..." helm repo add rancher-prime https://charts.rancher.com/server-charts/prime From b73ad1d30ea570ea6c4bbdbc8665da108b0ac7f0 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Fri, 25 Oct 2024 11:17:37 +0200 Subject: [PATCH 03/40] Add observability service token methods Signed-off-by: Jeroen van Erp --- scripts/README.md | 2 + scripts/observability/service_token.sh | 51 ++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 scripts/observability/service_token.sh diff --git a/scripts/README.md b/scripts/README.md index c6e864d..fbe7c65 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -70,6 +70,8 @@ Name | Source `observability_get_component_snapshot` | [observability/stql.sh](observability/stql.sh) `observability_get_component_state` | [observability/stql.sh](observability/stql.sh) `observability_install_cli` | [observability/cli.sh](observability/cli.sh) +`observability_create_service_token` | [observability/service_token.sh](observability/service_token.sh) +`observability_delete_service_token` | [observability/service_token.sh](observability/service_token.sh) ### SUSE Linux (previously SLES, SLE Micro) diff --git a/scripts/observability/service_token.sh b/scripts/observability/service_token.sh new file mode 100644 index 0000000..451a1e7 --- /dev/null +++ b/scripts/observability/service_token.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +####################################### +# Create a service token for SUSE Observability +# Output: +# The service token +# Arguments: +# url (SUSE Observability) +# service_token (SUSE Observability) +# cluster_name +# role +# Examples: +# observability_create_service_token https://obs.suse.com/ xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx lab-dsu37834 stackstate-k8s-troubleshooter +####################################### +observability_create_service_token() { + local url=$1 + local service_token=$2 + local cluster_name=$3 + local role=$4 + + local resp + resp=$(/usr/local/bin/sts service-token create --name $cluster_name --roles $role -o json --url $url --service-token $service_token) + + echo $resp | jq -r '."service-token".token' +} + +####################################### +# Delete a service token for SUSE Observability +# Arguments: +# url (SUSE Observability) +# service_token (SUSE Observability) +# cluster_name +# Examples: +# 
observability_delete_service_token https://obs.suse.com/ xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx lab-dsu37834 +####################################### +observability_delete_service_token() { + local url=$1 + local service_token=$2 + local cluster_name=$3 + + local tokens token_id + + tokens=$(/usr/local/bin/sts service-token list -o json --url $url --service-token $service_token) + token_id=$(echo $tokens | jq -r '."service-tokens"[] | select(.name == "'$cluster_name'") | .id') + if [ -n "$token_id" ]; then + /usr/local/bin/sts service-token delete --id $token_id --url $url --service-token $service_token + echo ">>> Service token named '${cluster_name}' deleted" + else + echo ">>> Service token named '${cluster_name}' not found" + fi +} From 4f55817b3dd4a8ad8177d66332485936ff344162 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Mon, 4 Nov 2024 23:08:31 +0100 Subject: [PATCH 04/40] Add ai-model workload Signed-off-by: Jeroen van Erp --- charts/ai-model/Chart.yaml | 9 +++ charts/ai-model/templates/_helpers.tpl | 62 +++++++++++++++++++ charts/ai-model/templates/ai-model-cm.yaml | 32 ++++++++++ .../templates/ai-model-deployment.yaml | 41 ++++++++++++ .../ai-model/templates/ai-model-ingress.yaml | 27 ++++++++ charts/ai-model/templates/ai-model-svc.yaml | 16 +++++ charts/ai-model/values.yaml | 12 ++++ 7 files changed, 199 insertions(+) create mode 100644 charts/ai-model/Chart.yaml create mode 100644 charts/ai-model/templates/_helpers.tpl create mode 100644 charts/ai-model/templates/ai-model-cm.yaml create mode 100644 charts/ai-model/templates/ai-model-deployment.yaml create mode 100644 charts/ai-model/templates/ai-model-ingress.yaml create mode 100644 charts/ai-model/templates/ai-model-svc.yaml create mode 100644 charts/ai-model/values.yaml diff --git a/charts/ai-model/Chart.yaml b/charts/ai-model/Chart.yaml new file mode 100644 index 0000000..17216aa --- /dev/null +++ b/charts/ai-model/Chart.yaml @@ -0,0 +1,9 @@ +apiVersion: v2 +name: ai-model +description: A Helm chart for ai-model Mackroservices +type: application +version: 0.1.0 +appVersion: "0.1.0" +keywords: +- challenge +- observability diff --git a/charts/ai-model/templates/_helpers.tpl b/charts/ai-model/templates/_helpers.tpl new file mode 100644 index 0000000..5c3f420 --- /dev/null +++ b/charts/ai-model/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "common.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "common.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "common.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "common.labels" -}} +helm.sh/chart: {{ include "common.chart" . }} +{{ include "common.selectorLabels" . 
}} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "common.selectorLabels" -}} +app.kubernetes.io/name: {{ include "common.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "common.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "common.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/charts/ai-model/templates/ai-model-cm.yaml b/charts/ai-model/templates/ai-model-cm.yaml new file mode 100644 index 0000000..da05242 --- /dev/null +++ b/charts/ai-model/templates/ai-model-cm.yaml @@ -0,0 +1,32 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: ai-model-cm + labels: + {{- include "common.labels" . | nindent 4 }} +data: + config.toml: | + # General configuration + port = 11434 + address = "0.0.0.0" + serviceName = "AI Model" + logLevel = "info" + + # Endpoints + [[endpoints]] + uri = "/api/chat" + delay = "1000ms" + body.status = "success" + body.msg = "Your dino is a T-Rex" + + [endpoints.logging] + before = "Processing [[.Endpoint.Uri]] request" + beforeLevel = "Info" + after = "Completed [[.Endpoint.Uri]] request" + afterLevel = "Info" + + # OpenTelemetry + [otel.trace] + enabled = false + tracer-name = "ai-model" + diff --git a/charts/ai-model/templates/ai-model-deployment.yaml b/charts/ai-model/templates/ai-model-deployment.yaml new file mode 100644 index 0000000..fe5616e --- /dev/null +++ b/charts/ai-model/templates/ai-model-deployment.yaml @@ -0,0 +1,41 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ai-model + labels: + service: ai-model + {{- include "common.labels" . | nindent 4 }} +spec: + replicas: 1 + selector: + matchLabels: + service: ai-model + {{- include "common.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "common.labels" . | nindent 8 }} + service: ai-model + annotations: + checksum/config: '{{ include (print $.Template.BasePath "/ai-model-cm.yaml") . | sha256sum}}' + spec: + containers: + - name: ai-model + image: {{.Values.image}} + env: + - name: CONFIG_FILE + value: /etc/app/config.toml + ports: + - containerPort: 8080 + resources: + {{- toYaml .Values.resources | nindent 12 }} + volumeMounts: + - name: config-volume + mountPath: /etc/app + volumes: + - name: config-volume + configMap: + name: ai-model-cm + items: + - key: config.toml + path: config.toml diff --git a/charts/ai-model/templates/ai-model-ingress.yaml b/charts/ai-model/templates/ai-model-ingress.yaml new file mode 100644 index 0000000..a162c0b --- /dev/null +++ b/charts/ai-model/templates/ai-model-ingress.yaml @@ -0,0 +1,27 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + nginx.ingress.kubernetes.io/proxy-body-size: 50m + labels: + service: ai-model + {{- include "common.labels" . 
| nindent 4 }} + name: ai-model +spec: + ingressClassName: traefik + rules: + - host: {{ .Values.ingress.host }} + http: + paths: + - backend: + service: + name: ai-model + port: + number: 11434 + path: / + pathType: Prefix + tls: + - hosts: + - {{ .Values.ingress.host }} + secretName: tls-secret diff --git a/charts/ai-model/templates/ai-model-svc.yaml b/charts/ai-model/templates/ai-model-svc.yaml new file mode 100644 index 0000000..5613d2a --- /dev/null +++ b/charts/ai-model/templates/ai-model-svc.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: ai-model + labels: + service: ai-model + {{- include "common.labels" . | nindent 4 }} +spec: + selector: + service: ai-model + {{- include "common.selectorLabels" . | nindent 4 }} + ports: + - protocol: TCP + port: 80 # Service port + targetPort: 8080 # Container port + type: ClusterIP # Internal service within the Kubernetes cluster diff --git a/charts/ai-model/values.yaml b/charts/ai-model/values.yaml new file mode 100644 index 0000000..d33b91c --- /dev/null +++ b/charts/ai-model/values.yaml @@ -0,0 +1,12 @@ +nameOverride: '' +fullnameOverride: '' +image: ravan/mockroservice:0.0.23 +resources: + requests: + memory: '8Mi' + cpu: '5m' + limits: + memory: '10Mi' + cpu: '10m' +ingress: + host: From 82fea318ba7fe9f1664ad53b6e60d669eac8a320 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Tue, 5 Nov 2024 11:07:15 +0100 Subject: [PATCH 05/40] Fixup helm charts and add cli install Signed-off-by: Jeroen van Erp --- charts/ai-model/templates/ai-model-deployment.yaml | 6 +++--- charts/ai-model/templates/ai-model-svc.yaml | 2 +- scripts/observability/cli.sh | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/charts/ai-model/templates/ai-model-deployment.yaml b/charts/ai-model/templates/ai-model-deployment.yaml index fe5616e..bf0c4e7 100644 --- a/charts/ai-model/templates/ai-model-deployment.yaml +++ b/charts/ai-model/templates/ai-model-deployment.yaml @@ -14,8 +14,8 @@ spec: template: metadata: labels: - {{- include "common.labels" . | nindent 8 }} - service: ai-model + {{- include "common.labels" . | nindent 8 }} + service: ai-model annotations: checksum/config: '{{ include (print $.Template.BasePath "/ai-model-cm.yaml") . | sha256sum}}' spec: @@ -28,7 +28,7 @@ spec: ports: - containerPort: 8080 resources: - {{- toYaml .Values.resources | nindent 12 }} + {{- toYaml .Values.resources | nindent 12 }} volumeMounts: - name: config-volume mountPath: /etc/app diff --git a/charts/ai-model/templates/ai-model-svc.yaml b/charts/ai-model/templates/ai-model-svc.yaml index 5613d2a..cd33a02 100644 --- a/charts/ai-model/templates/ai-model-svc.yaml +++ b/charts/ai-model/templates/ai-model-svc.yaml @@ -8,7 +8,7 @@ metadata: spec: selector: service: ai-model - {{- include "common.selectorLabels" . | nindent 4 }} + {{- include "common.selectorLabels" . | nindent 4 }} ports: - protocol: TCP port: 80 # Service port diff --git a/scripts/observability/cli.sh b/scripts/observability/cli.sh index 40c2030..a228b30 100644 --- a/scripts/observability/cli.sh +++ b/scripts/observability/cli.sh @@ -5,7 +5,7 @@ ####################################### observability_install_cli() { if ! 
[ -x "$(command -v sts)" ]; then - curl -o- https://dl.stackstate.com/stackstate-cli/install.sh | STS_CLI_LOCATION=/usr/local/bin bash + curl -s -o- https://dl.stackstate.com/stackstate-cli/install.sh | STS_CLI_LOCATION=/usr/local/bin bash else echo ">>> sts CLI already installed" fi From afe86ea1969ac6789a389e320b2b6e7c0bdf15ba Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Tue, 5 Nov 2024 11:28:35 +0100 Subject: [PATCH 06/40] Make lint happy Signed-off-by: Jeroen van Erp --- charts/ai-model/Chart.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/charts/ai-model/Chart.yaml b/charts/ai-model/Chart.yaml index 17216aa..d35c96a 100644 --- a/charts/ai-model/Chart.yaml +++ b/charts/ai-model/Chart.yaml @@ -4,6 +4,9 @@ description: A Helm chart for ai-model Mackroservices type: application version: 0.1.0 appVersion: "0.1.0" +maintainers: + - name: hierynomus + email: jeroen.vanerp@suse.com keywords: - challenge - observability From 24d8f60329946ff872983a254f6caa875619d29f Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Tue, 5 Nov 2024 11:31:18 +0100 Subject: [PATCH 07/40] Remove hardcoded clusterissuer Signed-off-by: Jeroen van Erp --- charts/ai-model/templates/ai-model-ingress.yaml | 2 +- charts/ai-model/values.yaml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/charts/ai-model/templates/ai-model-ingress.yaml b/charts/ai-model/templates/ai-model-ingress.yaml index a162c0b..c9d5bd5 100644 --- a/charts/ai-model/templates/ai-model-ingress.yaml +++ b/charts/ai-model/templates/ai-model-ingress.yaml @@ -2,7 +2,7 @@ apiVersion: networking.k8s.io/v1 kind: Ingress metadata: annotations: - cert-manager.io/cluster-issuer: letsencrypt-prod + cert-manager.io/cluster-issuer: {{ .Values.ingress.certmanager.issuer }} nginx.ingress.kubernetes.io/proxy-body-size: 50m labels: service: ai-model diff --git a/charts/ai-model/values.yaml b/charts/ai-model/values.yaml index d33b91c..70f1596 100644 --- a/charts/ai-model/values.yaml +++ b/charts/ai-model/values.yaml @@ -10,3 +10,5 @@ resources: cpu: '10m' ingress: host: + certmanager: + issuer: From dae759c008cc1bf19032bb31a4637ed02f8a4448 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Tue, 5 Nov 2024 11:32:42 +0100 Subject: [PATCH 08/40] Remove hardcoded annotation Signed-off-by: Jeroen van Erp --- charts/ai-model/templates/ai-model-ingress.yaml | 3 +-- charts/ai-model/values.yaml | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/charts/ai-model/templates/ai-model-ingress.yaml b/charts/ai-model/templates/ai-model-ingress.yaml index c9d5bd5..03476ab 100644 --- a/charts/ai-model/templates/ai-model-ingress.yaml +++ b/charts/ai-model/templates/ai-model-ingress.yaml @@ -2,8 +2,7 @@ apiVersion: networking.k8s.io/v1 kind: Ingress metadata: annotations: - cert-manager.io/cluster-issuer: {{ .Values.ingress.certmanager.issuer }} - nginx.ingress.kubernetes.io/proxy-body-size: 50m + {{- .Values.ingress.annotations | toYaml | nindent 4 }} labels: service: ai-model {{- include "common.labels" . 
| nindent 4 }} diff --git a/charts/ai-model/values.yaml b/charts/ai-model/values.yaml index 70f1596..37ecdbd 100644 --- a/charts/ai-model/values.yaml +++ b/charts/ai-model/values.yaml @@ -9,6 +9,5 @@ resources: memory: '10Mi' cpu: '10m' ingress: + annotations: host: - certmanager: - issuer: From 37edf3291dd615b548eb4df889c1726cd12e1e04 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Thu, 7 Nov 2024 12:28:09 +0100 Subject: [PATCH 09/40] Added monitor and assets Signed-off-by: Jeroen van Erp --- assets/monitors/pods-in-waiting-state.yaml | 236 +++++++++++++++++++++ scripts/download.sh | 2 + scripts/observability/monitors.sh | 23 ++ 3 files changed, 261 insertions(+) create mode 100644 assets/monitors/pods-in-waiting-state.yaml create mode 100644 scripts/observability/monitors.sh diff --git a/assets/monitors/pods-in-waiting-state.yaml b/assets/monitors/pods-in-waiting-state.yaml new file mode 100644 index 0000000..6b62c11 --- /dev/null +++ b/assets/monitors/pods-in-waiting-state.yaml @@ -0,0 +1,236 @@ +nodes: +- _type: Monitor + arguments: + failureState: CRITICAL + loggingLevel: WARN + description: | + If a pod is within a waiting state and contains a reason of CreateContainerConfigError, CreateContainerError, + CrashLoopBackOff, or ImagePullBackOff it will be seen as deviating. + function: {{ get "urn:stackpack:kubernetes-v2:shared:monitor-function:pods-in-waiting-state" }} + id: -6 + identifier: urn:custom:monitor:pods-in-waiting-state-v2 + intervalSeconds: 30 + name: Pods in Waiting State V2 + remediationHint: |- + \{{#if reasons\}} + \{{#if reasons.CreateContainerConfigError\}} + ## CreateContainerConfigError + + In case of CreateContainerConfigError common causes are a secret or ConfigMap that is referenced in [your pod](/#/components/\{{ componentUrnForUrl \}}), but doesn’t exist. + + ### Missing ConfigMap + + If case of a missing ConfigMap you see an error like `Error: configmap "mydb-config" not found` you see the error mention in the message of this monitor. + + To solve this you should reference an existing ConfigMap. + + An example: + + ```markdown + # See if the configmap exists + kubectl get configmap mydb-config + + # Create the correct configmap, this is just an example + kubectl create configmap mydb-config --from-literal=database_name=mydb + + # Delete and recreate the pod using this configmag + kubectl delete -f mydb_pod.yaml + kubectl create -f mydb_pod.yaml + + # After recreating the pod this pod should be in a running state. + # This is visible because the waiting pod monitor will not trigger anymore on this condition. + ``` + + ### Missing Secret + + If case of a missing Secret you see an error like `Error from server (NotFound): secrets "my-secret" not found` + you see the error mention in the message of this monitor. + + To solve this you should reference an existing ConfigMap. + + An example: + + ```markdown + # See if the secret exists + kubectl get secret mydb-secret + + # Create the correct configmap, this is just an example + kubectl create secret mydb-secret --from-literal=password=mysupersecretpassword + + # Delete and recreate the pod using this configmag + kubectl delete -f mydb_pod.yaml + kubectl create -f mydb_pod.yaml + + # After recreating the pod this pod should be in a running state. + # This is visible because the waiting pod monitor will not trigger anymore on this condition. 
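+    # Optional follow-up check; the pod name 'mydb' is assumed from the example mydb_pod.yaml above.
+    # It confirms the recreated pod no longer reports a missing secret in its events.
+    kubectl describe pod mydb | grep -i secret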
+ ``` + \{{/if\}} + \{{#if reasons.CreateContainerError\}} + ## CreateContainerError + + Common causes for a CreateContainerError are: + + - Command Not Available + - Issues Mounting a Volume + - Container Runtime Not Cleaning Up Old Containers + + ### Command Not Available + + In case of ‘`Command Not Available`’ you will find this in the reason field at the top of this monitor (full screen). + If this is the case, the first thing you need to investigate is to check that you have a valid ENTRYPOINT in the Dockerfile + used to build your container image. + + If you don’t have access to the Dockerfile, you can configure your pod object by using + a valid command in the command attribute of the object. + + Check if your pod has a command set by inspecting the [Configuration"](/#/components/\{{ componentUrnForUrl \}}#configuration) on the pod, e.g.: + + ```markdown + apiVersion: v1 + kind: Pod + metadata: + name: nodeapp + labels: + app: nodeapp + spec: + containers: + - image: myimage/wrong-node-app + name: nodeapp + ports: + - containerPort: 80 + **command: ["node", "index.js"]** + ``` + + If the pod does not have a command set, check the container definition to see if an ENTRYPOINT is set, here you see an example without an existing ENTRYPOINT. + + if no exisiting ENTRYPOINT is set and the pod does not have a command the solution is to use a valid command in the pod definition: + + ```markdown + FROM ****node:16.3.0-alpine + WORKDIR /usr/src/app + COPY package*.json ./ + + RUN npm install + COPY . . + + EXPOSE 8080 + + **ENTRYPOINT []** + ``` + + ### Issues Mounting a Volume + + In the case of a `volume mount problem` the message of this monitor will give you a hint. For example, if you have a message like: + + ``` + Error: Error response from daemon: create \mnt\data: "\\mnt\\data" includes invalid characters for a local volume name, only "[a-zA-Z0-9][a-zA-Z0-9_.-]" are allowed. If you intended to pass a host directory, use absolute path + ``` + + In this case you should use a change the path in the PersistentVolume definition to a valid path. e.g. /mnt/data + + ### Container Runtime Not Cleaning Up Old Containers + + In this case you will see a message like: + + ``` + The container name "/myapp_ed236ae738" is already in use by container "22f4edaec41cb193857aefcead3b86cdb69edfd69b2ab57486dff63102b24d29". You have to remove (or rename) that container to be able to reuse that name. + ``` + + This is an indication that the [container runtime](https://kubernetes.io/docs/setup/production-environment/container-runtimes/) + doesn’t clean up old containers. + In this case the node should be removed from the cluster and the node container runtime should be reinstalled + (or be recreated). After that the node should be (re)assigned to the cluster. + + \{{/if\}} + \{{#if reasons.CrashLoopBackOff\}} + ## CrashLoopBackOff + + When a Kubernetes container has errors, it can enter into a state called CrashLoopBackOff, where Kubernetes attempts to restart the container to resolve the issue. + + The container will continue to restart until the problem is resolved. + + Take the following steps to diagnose the problem: + + ### Container Logs + Check the container logs for any explicit errors or warnings + + 1. Inspect the [Logs](/#/components/\{{ componentUrnForUrl \}}#logs) of all the containers in this pod. + 2. Scroll through it and validate if there is an excessive amount of errors. + 1. 
if a container is crashing due to an out of memory error, the logs may show errors related to memory allocation or exhaustion. + - If this is the case check if the memory limits are too low in which case you can make them higher. + - If the memory problem is not resolved you might have introduced an memory leak in which case you want to take a look at the last deployment. + - If there are no limits you might have a proble with the physical memory on the node running the pod. + 2. if a container is crashing due to a configuration error, the logs may show errors related to the incorrect configuration. + + ### Understand application + + It is important to understand what the intended behaviour of the application should be. + A good place to start is the [configuration](/#/components/\{{ componentUrnForUrl\}}#configuration). + Pay attention to environment variables and volume mounts as these are mechanism to configure the application. + We can use references to configmaps and secrets to futher explore configuration information. + + ### Pod Events + Check the pod events to identify any explicit errors or warnings. + 1. Go to the [Pod events page](/#/components/\{{ componentUrnForUrl \}}/events). + 2. Check if there is a large amount of events like `BackOff`, `FailedScheduling` or `FailedAttachVolume` + 3. If this is the case, see if the event details (click on the event) contains more information about this issue. + + ### Recent Deployment + Look at the pod age in the "About" section on the [Pod highlight page](/#/components/\{{ componentUrnForUrl \}}) to identify any recent deployments that might have caused the issue + + 1. The "Age" is shown in the "About" section on the left side of the screen + 2. If the "Age" and the time that the monitor was triggered are in close proximity then take a look at the most recent deployment by clicking on [Show last change](/#/components/\{{ componentUrnForUrl \}}#lastChange). + \{{/if\}} + \{{#if reasons.ImagePullBackOff\}} + ## ImagePullBackOff + + If you see the "ImagePullBackOff" error message while trying to pull a container image from a registry, it means that + the Docker engine was unable to pull the requested image for some reason. + + The reason field at the top of this monitor (full screen) might give you more information about the specific issue at hand. + + ## Diagnose + + To diagnose the problem, try the following actions: + + - Go to the [pod events page filtered by failed or unhealthy events](/#/components/\{{ componentUrnForUrl \}}/events?view=eventTypes--Unhealthy,Created,FailedMount,Failed) + + If there are no "Failed" events shown increase the time-range by clicking on the Zoom-out button on next to the telemetry-time-interval on the bottom left of the timeline. + + Click on the left side of the [Pod highlight page](/#/components/\{{ componentUrnForUrl \}}) on "Containers" in the "Related resources" + to view the `containers` and the `Image URL`. + + ## Common causes + + ### Rate Limit + A docker hub rate limit has been reached. + + Typical resolution is to authenticate using docker hub credentials (it will increase the rate limit from 100 to 200 pulls per 6 hours) + or to get a paid account and authenticate with that (bumping the limit to 5000 pulls per day). + + ### Network connectivity issues + Check your internet connection or the connection to the registry where the image is hosted. 
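+
+    A minimal reachability check that can be run from a cluster node is sketched below; the Docker Hub endpoint is only an example, so substitute the registry your image is actually pulled from. A 401 or 404 response still proves the registry is reachable, while a DNS error or timeout points to a network problem.
+
+    ```
+    # Replace the endpoint below with your own registry if you do not pull from Docker Hub
+    curl -v https://registry-1.docker.io/v2/
+    ```
+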
+ + ### Authentication problems + If the registry requires authentication, make sure that your credentials are correct and that + you have the necessary permissions to access the image. + + ### Image availability + Verify that the image you are trying to pull exists in the registry and that you have specified the correct image name and tag. + + Here are some steps you can take to resolve the "ImagePullBackOff" error: + + 1. Check the registry logs for any error messages that might provide more information about the issue. + 2. Verify that the image exists in the registry and that you have the correct image name and tag. + 3. Check your network connectivity to ensure that you can reach the registry. + 4. Check the authentication credentials to ensure that they are correct and have the necessary permissions. + + If none of these steps work, you may need to consult the Docker documentation or contact support for the registry or Docker + itself for further assistance. + \{{/if\}} + \{{/if\}} + status: ENABLED + tags: + - pods + - containers +timestamp: 2024-10-17T10:15:31.714348Z[Etc/UTC] diff --git a/scripts/download.sh b/scripts/download.sh index 45717e8..7612b77 100644 --- a/scripts/download.sh +++ b/scripts/download.sh @@ -68,6 +68,8 @@ download() { rm -rf ${OUTPUT_FOLDER}/scripts fi mv ${GIT_REPO_NAME}-${GIT_FOLDER}/scripts ${OUTPUT_FOLDER} + mkdir -p ${OUTPUT_FOLDER}/assets + mv ${GIT_REPO_NAME}-${GIT_FOLDER}/assets ${OUTPUT_FOLDER}/assets } cleanup() { diff --git a/scripts/observability/monitors.sh b/scripts/observability/monitors.sh new file mode 100644 index 0000000..7bd8e23 --- /dev/null +++ b/scripts/observability/monitors.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +observability_disable_monitor() { + local url=$1 + local service_token=$2 + local monitor_identifier=$3 + /usr/local/bin/sts monitor disable --identifier $monitor_identifier --service-token $service_token --url $url +} + +observability_deploy_monitor() { + local url=$1 + local service_token=$2 + local file $3 + /usr/local/bin/sts monitor apply -f $file --service-token $service_token --url $url +} + +observability_enable_monitor() { + local url=$1 + local service_token=$2 + local monitor_identifier=$3 + /usr/local/bin/sts monitor enable --identifier $monitor_identifier --service-token $service_token --url $url +} + From 1affbcbaf1d4243c39e8179b236e12c042e45c1a Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Thu, 7 Nov 2024 12:31:22 +0100 Subject: [PATCH 10/40] Switched parameters Signed-off-by: Jeroen van Erp --- scripts/observability/monitors.sh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/observability/monitors.sh b/scripts/observability/monitors.sh index 7bd8e23..0aae47c 100644 --- a/scripts/observability/monitors.sh +++ b/scripts/observability/monitors.sh @@ -1,23 +1,23 @@ #!/bin/bash observability_disable_monitor() { - local url=$1 - local service_token=$2 - local monitor_identifier=$3 + local monitor_identifier=$1 + local url=$2 + local service_token=$3 /usr/local/bin/sts monitor disable --identifier $monitor_identifier --service-token $service_token --url $url } observability_deploy_monitor() { - local url=$1 - local service_token=$2 - local file $3 + local file $1 + local url=$2 + local service_token=$3 /usr/local/bin/sts monitor apply -f $file --service-token $service_token --url $url } observability_enable_monitor() { - local url=$1 - local service_token=$2 - local monitor_identifier=$3 + local monitor_identifier=$1 + local url=$2 + local service_token=$3 
/usr/local/bin/sts monitor enable --identifier $monitor_identifier --service-token $service_token --url $url } From a78615b8f891d0bd59e553a914d06e0e6c20088c Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Thu, 7 Nov 2024 15:30:33 +0100 Subject: [PATCH 11/40] Fix download script Signed-off-by: Jeroen van Erp --- scripts/download.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/download.sh b/scripts/download.sh index 7612b77..9d54438 100644 --- a/scripts/download.sh +++ b/scripts/download.sh @@ -68,8 +68,7 @@ download() { rm -rf ${OUTPUT_FOLDER}/scripts fi mv ${GIT_REPO_NAME}-${GIT_FOLDER}/scripts ${OUTPUT_FOLDER} - mkdir -p ${OUTPUT_FOLDER}/assets - mv ${GIT_REPO_NAME}-${GIT_FOLDER}/assets ${OUTPUT_FOLDER}/assets + mv ${GIT_REPO_NAME}-${GIT_FOLDER}/assets ${OUTPUT_FOLDER} } cleanup() { From 4b159679cd01fe3b51259fed56b9d354581f16e3 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Thu, 7 Nov 2024 17:59:32 +0100 Subject: [PATCH 12/40] Add fleet assets Signed-off-by: Jeroen van Erp --- assets/fleet/clustergroup.yaml | 16 ++++++++++++++++ assets/fleet/gitrepo.yaml | 24 ++++++++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 assets/fleet/clustergroup.yaml create mode 100644 assets/fleet/gitrepo.yaml diff --git a/assets/fleet/clustergroup.yaml b/assets/fleet/clustergroup.yaml new file mode 100644 index 0000000..b51a22b --- /dev/null +++ b/assets/fleet/clustergroup.yaml @@ -0,0 +1,16 @@ +apiVersion: fleet.cattle.io/v1alpha1 +kind: ClusterGroup +metadata: + name: build-a-dino + annotations: + {} + # key: string + labels: + {} + # key: string + namespace: fleet-default +spec: + selector: + matchLabels: + gpu-enabled: 'true' + app: build-a-dino diff --git a/assets/fleet/gitrepo.yaml b/assets/fleet/gitrepo.yaml new file mode 100644 index 0000000..7d3702e --- /dev/null +++ b/assets/fleet/gitrepo.yaml @@ -0,0 +1,24 @@ +apiVersion: fleet.cattle.io/v1alpha1 +kind: GitRepo +metadata: + name: build-a-dino + annotations: + {} + # key: string + labels: + {} + # key: string + namespace: fleet-default +spec: + branch: main + correctDrift: + enabled: true +# force: boolean +# keepFailHistory: boolean + insecureSkipTLSVerify: false + paths: + - /fleet/build-a-dino +# - string + repo: https://github.com/wiredquill/prime-rodeo + targets: + - clusterGroup: build-a-dino From 05e0290f168c6c60308f1eda75644c71d05db235 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Tue, 12 Nov 2024 06:50:11 -0700 Subject: [PATCH 13/40] Added cpu throttling monitor in assets Signed-off-by: Jeroen van Erp --- assets/monitors/cpu-throttling.yaml | 85 +++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 assets/monitors/cpu-throttling.yaml diff --git a/assets/monitors/cpu-throttling.yaml b/assets/monitors/cpu-throttling.yaml new file mode 100644 index 0000000..0164a01 --- /dev/null +++ b/assets/monitors/cpu-throttling.yaml @@ -0,0 +1,85 @@ +nodes: +- _type: Monitor + arguments: + comparator: GT + failureState: DEVIATING + metric: + aliasTemplate: CPU Throttling for ${container} of ${pod_name} + query: 100 * sum by (cluster_name, namespace, pod_name, container) (container_cpu_throttled_periods{}) + / sum by (cluster_name, namespace, pod_name, container) (container_cpu_elapsed_periods{}) + unit: percent + threshold: 95.0 + urnTemplate: urn:kubernetes:/${cluster_name}:${namespace}:pod/${pod_name} + description: |- + In Kubernetes, CPU throttling refers to the process where limits are applied to the amount of CPU resources a container can use. 
+ This typically occurs when a container approaches the maximum CPU resources allocated to it, causing the system to throttle or restrict + its CPU usage to prevent a crash. + + While CPU throttling can help maintain system stability by avoiding crashes due to CPU exhaustion, it can also significantly slow down workload + performance. Ideally, CPU throttling should be avoided by ensuring that containers have access to sufficient CPU resources. + This proactive approach helps maintain optimal performance and prevents the slowdown associated with throttling. + function: {{ get "urn:stackpack:common:monitor-function:threshold" }} + id: -13 + identifier: urn:custom:monitor:pod-cpu-throttling-v2 + intervalSeconds: 60 + name: CPU Throttling V2 + remediationHint: |- + + ### Application behaviour + + Check the container [Logs](/#/components/\{{ componentUrnForUrl \}}#logs) for any hints on how the application is behaving under CPU Throttling + + ### Understanding CPU Usage and CPU Throttling + + On the [pod metrics page](/#/components/\{{ componentUrnForUrl \}}/metrics) you will find the CPU Usage and CPU Throttling charts. + + #### CPU Trottling + + The percentage of CPU throttling over time. CPU throttling occurs when a container reaches its CPU limit, restricting its CPU usage to + prevent it from exceeding the specified limit. The higher the percentage, the more throttling is occurring, which means the container's + performance is being constrained. + + #### CPU Usage + + This chart shows three key CPU metrics over time: + + 1. Request: The amount of CPU the container requests as its minimum requirement. This sets the baseline CPU resources the container is guaranteed to receive. + 2. Limit: The maximum amount of CPU the container can use. If the container's usage reaches this limit, throttling will occur. + 3. Current: The actual CPU usage of the container in real-time. + + The `Request` and `Limit` settings in the container can be seen in `Resource` section in [configuration](/#/components/\{{ componentUrnForUrl\}}#configuration) + + #### Correlation + + The two charts are correlated in the following way: + + - As the `Current` CPU usage approaches the CPU `Limit`, the CPU throttling percentage increases. This is because the container tries to use more CPU than it is allowed, and the system restricts it, causing throttling. + - The aim is to keep the `Current` usage below the `Limit` to minimize throttling. If you see frequent high percentages in the CPU throttling chart, it suggests that you may need to adjust the CPU limits or optimize the container's workload to reduce CPU demand. + + + ### Adjust CPU Requests and Limits + + On the [pod highlights page](/#/components/\{{ componentUrnForUrl \}}/highlights) and checking whether a `Deployment` event happened recently after which the cpu usage behaviour changed. + + You can investigate which change led to the cpu throttling by checking the [Show last change](/#/components/\{{ componentUrnForUrl \}}#lastChange), + which will highlight the latest changeset for the deployment. You can then revert the change or fix the cpu request and limit. + + + Review the pod's resource requests and limits to ensure they are set appropriately. + Show component [configuration](/#/components/\{{ componentUrnForUrl \}}#configuration) + + If the CPU usage consistently hits the limit, consider increasing the CPU limit of the pod.
+ Edit the pod or deployment configuration file to modify the `resources.limits.cpu` and `resources.requests.cpu` as needed. + ``` + resources: + requests: + cpu: "500m" # Adjust this value based on analysis + limits: + cpu: "1" # Adjust this value based on analysis + ``` + If CPU throttling persists, consider horizontal pod autoscaling to distribute the workload across more pods, or adjust the cluster's node resources to meet the demands. Continuously monitor and fine-tune resource settings to optimize performance and prevent further throttling issues. + status: ENABLED + tags: + - cpu + - performance + - pod From b092e60a22b34e7021664b02c1b45da8bbbb26ba Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Thu, 16 Jan 2025 14:21:45 +0100 Subject: [PATCH 14/40] New monitors Signed-off-by: Jeroen van Erp --- assets/monitors/certificate-expiration.yaml | 48 +++++++++++ .../http-error-ratio-for-service.yaml | 70 ++++++++++++++++ assets/monitors/oom-for-containers.yaml | 81 +++++++++++++++++++ 3 files changed, 199 insertions(+) create mode 100644 assets/monitors/certificate-expiration.yaml create mode 100644 assets/monitors/http-error-ratio-for-service.yaml create mode 100644 assets/monitors/oom-for-containers.yaml diff --git a/assets/monitors/certificate-expiration.yaml b/assets/monitors/certificate-expiration.yaml new file mode 100644 index 0000000..80f689c --- /dev/null +++ b/assets/monitors/certificate-expiration.yaml @@ -0,0 +1,48 @@ +nodes: +- _type: Monitor + arguments: + criticalThreshold: 1w + deviatingThreshold: 30d + query: type = "secret" AND label = "secret-type:certificate" + resourceName: Certificate + timestampProperty: certificateExpiration + description: Verify certificates that are close to it's expiration date + function: {{ get "urn:stackpack:common:monitor-function:topology-timestamp-threshold-monitor" }} + id: -12 + identifier: urn:custom:monitor:certificate-expiration-v2 + intervalSeconds: 60 + name: Certificate Expiration V2 + remediationHint: | + + Certificate expiration date `\{{certificateExpiration\}}`. + + ### Obtain new TLS certificates + + If you're using a Certificate Authority (CA) or a third-party provider, follow their procedures to obtain a new TLS certificate. + Once validated, download the new TLS certificate and the corresponding private key from the third-party provider's dashboard or via their API. + When you have downloaded these two files, you can update the Secret with the new certificate and key data. + + ``` + kubectl create secret tls \{{name\}} --cert=path/to/new/certificate.crt --key=path/to/new/private.key + ``` + + 2. **Generate new self-signed certificates**: + + If you're using self-signed certificates, you can generate new ones locally and update the Secret with the new certificate and key data. + Use tools like OpenSSL to generate new self-signed certificates. + + ``` + openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout path/to/new/private.key -out path/to/new/certificate.crt + ``` + + Update the Secret with the new certificate and key data. + + ``` + kubectl create secret tls \{{name\}} --cert=path/to/new/certificate.crt --key=path/to/new/private.key + ``` + + Alternatively you can edit the existing secret with **`kubectl edit secret \{{name\}}`** and replace the certificate and key data with the new ones obtained from the third-party provider or generated locally. 
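+
+    After the Secret has been updated you can verify the new expiration date directly, as a quick sanity check (this assumes the `openssl` CLI is available where you run `kubectl`):
+
+    ```
+    kubectl get secret \{{name\}} -o jsonpath='{.data.tls\.crt}' | base64 -d | openssl x509 -noout -enddate
+    ```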
+ status: ENABLED + tags: + - certificate + - secret diff --git a/assets/monitors/http-error-ratio-for-service.yaml b/assets/monitors/http-error-ratio-for-service.yaml new file mode 100644 index 0000000..66b8b6f --- /dev/null +++ b/assets/monitors/http-error-ratio-for-service.yaml @@ -0,0 +1,70 @@ +_version: 1.0.85 +nodes: +- _type: Monitor + arguments: + deviatingThreshold: 0.05 + loggingLevel: WARN + timeWindow: 2 minutes + description: |- + HTTP responses with a status code in the 5xx range indicate server-side errors such as a misconfiguration, overload or internal server errors. + To ensure a good user experience, the percentage of 5xx responses should be less than the configured percentage (5% is the default) of the total HTTP responses for a Kubernetes (K8s) service. + To understand the full monitor definition check the details. + Because the exact threshold and severity might be application dependent, the thresholds can be overriden via a Kubernetes annotation on the service. For example to override the pre-configured deviating threshold and instead only have a critical threshold at 6% put this annotation on your service: + ``` + monitor.kubernetes-v2.stackstate.io/http-error-ratio-for-service: | + { + "criticalThreshold": 0.06, + "deviatingThreshold": null + } + ``` + Omitting the deviating threshold from this json snippet would have kept it at the configured 5%, with the critical threshold at 6% that means that the monitor would only result in a deviating state for an error ratio between 5% and 6%. + function: {{ get "urn:stackpack:prime-kubernetes:shared:monitor-function:http-error-ratio-for-service" }} + id: -8 + identifier: urn:stackpack:custom:shared:monitor:http-error-ratio-for-service-v2 + intervalSeconds: 10 + name: HTTP - 5xx error ratio + remediationHint: |- + We have detected that more than 5% of the total responses from your Kubernetes service have a 5xx status code, + this signals that a significant number of users are experiencing downtime and service interruptions. + Take the following steps to diagnose the problem: + ## Possible causes + - Slow dependency or dependency serving errors + - Recent update of the application + - Load on the application has increased + - Code has memory leaks + - Environment issues (e.g. certain nodes, database or services that the service depends on) + ### Slow dependency or dependency serving errors + Check, in the related health violations of this monitor (which can be found in the expanded version if you read this in the pinned minimised version) if there are any health violations on one of the services or pods that this service depends on (focus on the lowest dependency). If you find a violation (deviating or critical health), click on that component to see the related health violations in table next to it. You can than click on those health violations to follow the instructions to resolve the issue. + ### Configuration error + It could be that there is a configuration error in the pod backing this service. Go to the related resources for this service on the [service highlight page](/#/components/\{{ componentUrnForUrl \}}) and click on the "Pods of this service". Select the pod(s) and investigate their logs to see if they're returning any errors. If you find any errors, resolve them. 
+ ### Recent update of the service + Check if the service was recently updated: + - See the Age in the "About" section to identify on the [service highlight page](/#/components/\{{ componentUrnForUrl \}}) + is this is recently deployed + - Check if any of the pods are recently updated by clicking on "Pods of this service" in "Related resource" section of + the [service highlight page](/#/components/\{{ componentUrnForUrl \}}) and look if their Age is recent. + - If application has just started, it might be that the service has not warmed up yet. Compare the response time metrics + for the current deployment with the previous deployment by checking the response time metric chart with a time interval including both. + - Check if application is using more resources than before, consider scaling it up or giving it more resources. + - If increased latency is crucial, consider rolling back the service to the previous version: + - if that helps, then the issue is likely with new deployment + - if that does not help, then the issue may be in the environment (e.g. network issues or issues with the underlying infrastructure, e. g. database) + ### Load on the service has increased + - Check if the amount of requests to the service has increased by looking at the "Throughput (HTTP responses/s)" chart for the "HTTP response metrics for all clients (incoming requests)" on the [service highlight page](/#/components/\{{ componentUrnForUrl \}}). + If so, consider scaling up the service or giving it more resources. + ### Code has memory leaks + - Check if memory or CPU usage have been increasing over time. If so, there might be a memory leak. + You can find the pods supporting this service by clicking on "Pods of this service" in "Related resource" + section of the [service highlight page](/#/components/\{{ componentUrnForUrl \}}). + Check which pods are using the most disk space by clicking on the left side of the [service highlight page](/#/components/\{{ componentUrnForUrl \}}) on "Pods of this service" + - Check all the pods supporting this service by clicking on the pod name + - Check the resource usage on the "Resource usage" section + - Restart the pod(s) of this service that is having the issue or add more memory/cpu + ### Environment issues + - Check latency of particular pods of the service. If only certain pods are having issues, might be an issue with the node the pod is running on: + - Try to move the pod to another node + - Check if other pods of other services are also having latency increased on that node. Drain the node if that is the case. + status: ENABLED + tags: + - services +timestamp: 2025-01-16T13:16:53.208687Z[Etc/UTC] diff --git a/assets/monitors/oom-for-containers.yaml b/assets/monitors/oom-for-containers.yaml new file mode 100644 index 0000000..d24bc40 --- /dev/null +++ b/assets/monitors/oom-for-containers.yaml @@ -0,0 +1,81 @@ +nodes: +- _type: Monitor + arguments: + comparator: GTE + failureState: DEVIATING + metric: + aliasTemplate: OOM Killed count + query: max(increase(kubernetes_containers_last_state_terminated{reason="OOMKilled"}[10m])) + by (cluster_name, namespace, pod_name, container) + unit: short + threshold: 1.0 + urnTemplate: urn:kubernetes:/${cluster_name}:${namespace}:pod/${pod_name} + description: |- + It is important to ensure that the containers running in your Kubernetes cluster have enough memory to function properly. Out of memory (OOM) conditions can cause containers to crash or become unresponsive, leading to restarts and potential data loss. 
+ To monitor for these conditions, we set up a check that detects and reports OOM events in the containers running in the cluster. This check will help you identify any containers that are running out of memory and allow you to take action to prevent issues before they occur. + To understand the full monitor definition check the details. + function: {{ get "urn:stackpack:common:monitor-function:threshold" }} + id: -13 + identifier: urn:custom:monitor:out-of-memory-for-containers-v2 + intervalSeconds: 30 + name: Out of memory for containers V2 + remediationHint: |- + An Out of Memory (OOM) event in Kubernetes occurs when a container's memory usage exceeds the limit set for it. + The Linux kernel's OOM killer process is triggered, which attempts to free up memory by killing one or more processes. + This can cause the container to terminate, leading to issues such as lost data, service interruption, and increased + resource usage. + + Check the container [Logs](/#/components/\{{ componentUrnForUrl \}}#logs) for any hints on how the application is behaving. + + ### Recognize a memory leak + + A memory leak can be recognized by looking at the "Memory Usage" metric on the [pod metrics page](/#/components/\{{ componentUrnForUrl \}}/metrics). + + If the metric resembles a `saw-tooth` pattern that is a clear indication of a slow memory leak being present in your application. + The memory usage increases over time, but the memory is not released until the container is restarted. + + If the metric resembles a `dash` pattern that is an indication of a memory leak via a spike. + The memory usage suddenly increases that causes the limit to be violated and the container killed. + + You will notice that the container continually restarts. + + Common issues that can cause this problem include: + 1. New deployments that introduce a memory leak. + 2. Elevated traffic that causes a temporary increase of memory usage. + 3. Incorrectly configured memory limits. + + ### 1. New deployments that introduce a memory leak + + If the memory leak behaviour is new, it is likely that a new deployment introduced a memory leak. + + This can be checked by looking at the Events shown on the [pod highlights page](/#/components/\{{ componentUrnForUrl \}}/highlights) and checking whether a `Deployment` event happened recently after which the memory usage behaviour changed. + + If the memory leak is caused by a deployment, you can investigate which change led to the memory leak by checking the [Show last change](/#/components/\{{ componentUrnForUrl \}}#lastChange), which will highlight the latest changeset for the deployment. You can then revert the change or fix the memory leak. + + ### 2. Elevated traffic that causes a temporary increase of memory usage + This can be checked by looking at the "Network Throughput for pods (received)" metric on the [pod metrics page](/#/components/\{{ componentUrnForUrl \}}/metrics) and comparing the usage to the "Memory Usage" metric. If the memory usage increases at the same time as the network throughput, it is likely that the memory usage is caused by the increased traffic. + + As a temporary fix you can elevate the memory limit for the container. However, this is not a long-term solution as the memory usage will likely increase again in the future. You can also consider using Kubernetes autoscaling feature to scale up and down the number of replicas based on resource usage. + + ### 3. 
Incorrectly configured memory Limits + This can be checked by looking at the "Memory Usage" metric on the [pod metrics page](/#/components/\{{ componentUrnForUrl \}}/metrics) and comparing the usage to the requests and limits set for the pod. If the memory usage is higher than the limit set for the pod, the container will be terminated by the OOM killer. + + To fix this issue, you can increate the memory limit for the pod, by changing the Kubernetes resource YAML and increasing the memory limit values e.g. + ``` + metadata: + … + spec: + containers: + … + resources: + limits: + cpu: "2" + memory: "3Gi" + requests: + cpu: "2" + memory: "3Gi" + ``` + status: ENABLED + tags: + - containers + - pods From 01a3da0e2afad969f5c8e2d59eeb38e763526b53 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Thu, 16 Jan 2025 15:51:17 +0100 Subject: [PATCH 15/40] Fix monitor naming Signed-off-by: Jeroen van Erp --- .../{oom-for-containers.yaml => out-of-memory-containers.yaml} | 2 +- .../monitors/{cpu-throttling.yaml => pod-cpu-throttling.yaml} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename assets/monitors/{oom-for-containers.yaml => out-of-memory-containers.yaml} (98%) rename assets/monitors/{cpu-throttling.yaml => pod-cpu-throttling.yaml} (100%) diff --git a/assets/monitors/oom-for-containers.yaml b/assets/monitors/out-of-memory-containers.yaml similarity index 98% rename from assets/monitors/oom-for-containers.yaml rename to assets/monitors/out-of-memory-containers.yaml index d24bc40..95507af 100644 --- a/assets/monitors/oom-for-containers.yaml +++ b/assets/monitors/out-of-memory-containers.yaml @@ -16,7 +16,7 @@ nodes: To understand the full monitor definition check the details. function: {{ get "urn:stackpack:common:monitor-function:threshold" }} id: -13 - identifier: urn:custom:monitor:out-of-memory-for-containers-v2 + identifier: urn:custom:monitor:out-of-memory-containers-v2 intervalSeconds: 30 name: Out of memory for containers V2 remediationHint: |- diff --git a/assets/monitors/cpu-throttling.yaml b/assets/monitors/pod-cpu-throttling.yaml similarity index 100% rename from assets/monitors/cpu-throttling.yaml rename to assets/monitors/pod-cpu-throttling.yaml From 4cd3229ec06b737164d2ff82389a5b2679c8e7ac Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Fri, 17 Jan 2025 17:44:19 +0100 Subject: [PATCH 16/40] New remediation guide Signed-off-by: Jeroen van Erp --- assets/monitors/http-error-ratio-for-service.yaml | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/assets/monitors/http-error-ratio-for-service.yaml b/assets/monitors/http-error-ratio-for-service.yaml index 66b8b6f..97920b3 100644 --- a/assets/monitors/http-error-ratio-for-service.yaml +++ b/assets/monitors/http-error-ratio-for-service.yaml @@ -27,16 +27,27 @@ nodes: We have detected that more than 5% of the total responses from your Kubernetes service have a 5xx status code, this signals that a significant number of users are experiencing downtime and service interruptions. Take the following steps to diagnose the problem: + ## Possible causes - Slow dependency or dependency serving errors - Recent update of the application - Load on the application has increased - Code has memory leaks - Environment issues (e.g. 
certain nodes, database or services that the service depends on) + ### Slow dependency or dependency serving errors Check, in the related health violations of this monitor (which can be found in the expanded version if you read this in the pinned minimised version) if there are any health violations on one of the services or pods that this service depends on (focus on the lowest dependency). If you find a violation (deviating or critical health), click on that component to see the related health violations in table next to it. You can than click on those health violations to follow the instructions to resolve the issue. - ### Configuration error - It could be that there is a configuration error in the pod backing this service. Go to the related resources for this service on the [service highlight page](/#/components/\{{ componentUrnForUrl \}}) and click on the "Pods of this service". Select the pod(s) and investigate their logs to see if they're returning any errors. If you find any errors, resolve them. + + ### New behavior of the service + If there are no dependencies that have health violations, it could be that the pod backing this service is returning errors. If this behavior is new, it could be caused by a recent deployment. + + This can be checked by looking at the Events shown on the [service highlights page](/#/components/\{{ componentUrnForUrl \}}/highlights) and checking whether a `Deployment` event happened recently after which the HTTP Error ratio behaviour changed. + + To troubleshoot further, you can have a look at the pod(s) backing this service. + - Click on the "Pods of this service" in the "Related resource" section of the [service highlight page](/#/components/\{{ componentUrnForUrl \}}) + - Click on the pod name(s) to go to their highlights pages + - Check the logs of the pod(s) to see if they're returning any errors. 
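+
+    For example, once you know the pod and namespace from the steps above (both names below are placeholders):
+
+    ```
+    # Recent logs of the current container, filtered for errors
+    kubectl logs <pod-name> -n <namespace> --since=15m | grep -iE "error|exception"
+    # If the pod restarted after a recent deployment, also inspect the previous container instance
+    kubectl logs <pod-name> -n <namespace> --previous --tail=100
+    ```
+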
+ ### Recent update of the service Check if the service was recently updated: - See the Age in the "About" section to identify on the [service highlight page](/#/components/\{{ componentUrnForUrl \}}) From e72cd45065eb45558291fe5f3a7b3e98d9862d78 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Tue, 21 Jan 2025 10:37:59 +0100 Subject: [PATCH 17/40] 30s interval for cert expiration Signed-off-by: Jeroen van Erp --- assets/monitors/certificate-expiration.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/monitors/certificate-expiration.yaml b/assets/monitors/certificate-expiration.yaml index 80f689c..bf20abd 100644 --- a/assets/monitors/certificate-expiration.yaml +++ b/assets/monitors/certificate-expiration.yaml @@ -10,7 +10,7 @@ nodes: function: {{ get "urn:stackpack:common:monitor-function:topology-timestamp-threshold-monitor" }} id: -12 identifier: urn:custom:monitor:certificate-expiration-v2 - intervalSeconds: 60 + intervalSeconds: 30 name: Certificate Expiration V2 remediationHint: | From d933f57cb02b510dfd4d0d5bd06692af75f9cba9 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Fri, 24 Jan 2025 08:53:48 +0100 Subject: [PATCH 18/40] Added Observability platform install functions Signed-off-by: Jeroen van Erp --- scripts/observability/platform.sh | 85 +++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 scripts/observability/platform.sh diff --git a/scripts/observability/platform.sh b/scripts/observability/platform.sh new file mode 100644 index 0000000..c8c2748 --- /dev/null +++ b/scripts/observability/platform.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +observability_platform_generate_values() { + local host=$1 + local license=$2 + local password=$3 + local values_dir=$4 + helm template --set license=$license \ + --set baseUrl=$host \ + --set adminPassword=$password \ + --set sizing.profile=trial \ + suse-observability-values suse-observability/suse-observability-values \ + --output-dir $values_dir + + cat << EOF > $values_dir/suse-observability-values/templates/ingress_values.yaml +ingress: + enabled: true + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + nginx.ingress.kubernetes.io/proxy-body-size: "50m" + hosts: + - host: $host + tls: + - hosts: + - $host + secretName: tls-secret +EOF +} + +observability_platform_bootstrap_token() { + local token=$1 + local values_dir=$2 + + cat << EOF > $values_dir/suse-observability-values/templates/bootstrap_token.yaml +stackstate: + authentication: + serviceToken: + bootstrap: + token: $token + roles: ["stackstate-k8s-troubleshooter", "stackstate-admin", "stackstate-k8s-admin"] +EOF +} + +observability_platform_install() { + local values_dir=$1 + helm upgrade --install --namespace suse-observability --create-namespace \ + --values $values_dir/suse-observability-values/templates/baseConfig_values.yaml \ + --values $values_dir/suse-observability-values/templates/sizing_values.yaml \ + --values $values_dir/suse-observability-values/templates/ingress_values.yaml \ + --values $values_dir/suse-observability-values/templates/bootstrap_token.yaml \ + suse-observability suse-observability/suse-observability +} + +observability_platform_wait_ready() { + echo ">>> Waiting for SUSE Observability to be ready" + kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=suse-observability -n suse-observability --timeout=300s + if [ $? 
-ne 0 ]; then + echo ">>> SUSE Observability is not ready" + NON_RUNNING=$(kubectl get pods -n suse-observability -o json | jq -r '[.items[] | select(.status.phase != "Running" and .status.phase != "Succeeded") | {name: .metadata.name, status: .status.phase}]') + echo "Pods not running yet: $NON_RUNNING" + else + # # Wait for Observability URL available + _counter=0 + while [[ $_counter -lt 50 ]]; do + curl -sSfk $OBSERVABILITY_URL/api > /dev/null + if [ $? -eq 0 ]; then + break + fi + ((_counter++)) + echo "Waiting for Observability URL to be available... attempt ${_counter}/50" + sleep 5 + done + + if [[ $_counter -ge 50 ]] + then + # Exit with error should be uncommented for production labs. + echo ">>> TIME OUT for Observability URL to be available" + # exit 69 + else + echo ">>> Observability at '$OBSERVABILITY_URL' is available!" + fi + fi + echo ">>> SUSE Observability is ready" + +} From 9e048feef545fdafe6c1923d4b8d1b4345ad12a0 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Fri, 24 Jan 2025 12:02:47 +0100 Subject: [PATCH 19/40] Add longhorn install Signed-off-by: Jeroen van Erp --- scripts/longhorn/install.sh | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 scripts/longhorn/install.sh diff --git a/scripts/longhorn/install.sh b/scripts/longhorn/install.sh new file mode 100644 index 0000000..ed3a78c --- /dev/null +++ b/scripts/longhorn/install.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +longhorn_install() { + local hostname=$1 + echo '>>> Setup prerequisites for Longhorn install' + helm repo add longhorn https://charts.longhorn.io + helm repo update + zypper install -y open-iscsi cryptsetup + systemctl enable --now iscsid.service + modprobe iscsi_tcp + echo '=== Check prerequisites' + curl -k https://raw.githubusercontent.com/longhorn/longhorn/refs/heads/master/scripts/environment_check.sh 2>/dev/null | bash || exit 1 + echo '=== Install LongHorn' + helm upgrade -i longhorn longhorn/longhorn --namespace longhorn-system --create-namespace --set ingress.enabled=true --set ingress.host=$hostname --set persistence.migratable=true --set longhornUI.replicas=1 + echo '<<< Longhorn should be available in a few minutes in: $hostname" +} From 742f7ebbeb3374e72205f3e98cf83067e94b6243 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Fri, 24 Jan 2025 14:08:06 +0100 Subject: [PATCH 20/40] Fix quo5te Signed-off-by: Jeroen van Erp --- scripts/longhorn/install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/longhorn/install.sh b/scripts/longhorn/install.sh index ed3a78c..4d8365d 100644 --- a/scripts/longhorn/install.sh +++ b/scripts/longhorn/install.sh @@ -12,5 +12,5 @@ longhorn_install() { curl -k https://raw.githubusercontent.com/longhorn/longhorn/refs/heads/master/scripts/environment_check.sh 2>/dev/null | bash || exit 1 echo '=== Install LongHorn' helm upgrade -i longhorn longhorn/longhorn --namespace longhorn-system --create-namespace --set ingress.enabled=true --set ingress.host=$hostname --set persistence.migratable=true --set longhornUI.replicas=1 - echo '<<< Longhorn should be available in a few minutes in: $hostname" + echo "<<< Longhorn should be available in a few minutes in: $hostname" } From 58346bbe0cb3a290137211e884be398c2f532575 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Thu, 30 Jan 2025 16:09:52 +0100 Subject: [PATCH 21/40] Add checks for setup Signed-off-by: Jeroen van Erp --- scripts/kubernetes/certificate_management.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git 
a/scripts/kubernetes/certificate_management.sh b/scripts/kubernetes/certificate_management.sh index ec8bb86..f31266c 100644 --- a/scripts/kubernetes/certificate_management.sh +++ b/scripts/kubernetes/certificate_management.sh @@ -15,9 +15,17 @@ k8s_install_certmanager() { helm repo add jetstack https://charts.jetstack.io helm repo update kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/${version}/cert-manager.crds.yaml + if $? -ne 0; then + echo "Failed to install cert-manager CRDs" + exit 1 + fi helm upgrade --install cert-manager jetstack/cert-manager \ --namespace cert-manager --create-namespace \ --version ${version} + if $? -ne 0; then + echo "Failed to install cert-manager" + exit 1 + fi kubectl wait pods -n cert-manager -l app.kubernetes.io/instance=cert-manager --for condition=Ready 2>/dev/null } @@ -40,6 +48,10 @@ k8s_create_letsencryptclusterissuer() { --namespace cert-manager \ --set ingress.className=${ingressClassname} \ --set registration.emailAddress=${emailAddress} + if $? -ne 0; then + echo "Failed to create Let's Encrypt cluster issuer" + exit 1 + fi sleep 5 while kubectl get clusterissuers -o json | jq -e '.items[] | select(.status.conditions[] | select(.type == "Ready" and .status != "True"))' > /dev/null; do sleep 1 From 15d4f11f6e0eda46f820ffbf169a0f0dad026c85 Mon Sep 17 00:00:00 2001 From: rmahique-github <> Date: Fri, 31 Jan 2025 16:59:44 +0100 Subject: [PATCH 22/40] added retry limit --- scripts/rancher/manager_lifecycle.sh | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/scripts/rancher/manager_lifecycle.sh b/scripts/rancher/manager_lifecycle.sh index 180ee48..65973d9 100644 --- a/scripts/rancher/manager_lifecycle.sh +++ b/scripts/rancher/manager_lifecycle.sh @@ -65,16 +65,29 @@ rancher_first_login() { # rancher_wait_capiready ####################################### rancher_wait_capiready() { - while true; do + +_counter=0 +while [[ $_counter -lt 50 ]]; do + _counter=0 + while [[ $_counter -lt 25 ]]; do status=$(kubectl get deployment capi-controller-manager -n cattle-provisioning-capi-system -o jsonpath='{.status.conditions[?(@.type=="Available")].status}' 2>/dev/null) if [ "$status" == 'True' ]; then echo 'Deployment capi-controller-manager is available' break fi sleep 10 + ((_counter++)) + echo "Waiting for capi-controller-manager to become available... attempt ${_counter}/25" done - while [[ $(kubectl get endpoints capi-webhook-service -n cattle-provisioning-capi-system -o jsonpath='{.subsets}' 2>/dev/null) == '' ]]; do + _counter=0 + while [[ $_counter -lt 25 ]]; do + if [[ ! $(kubectl get endpoints capi-webhook-service -n cattle-provisioning-capi-system -o jsonpath='{.subsets}' 2>/dev/null) == '' ]]; then + echo 'Endpoint is ready' + break + fi sleep 10 + ((_counter++)) + echo "Waiting for endpoint capi-webhook-service to be ready... 
attempt ${_counter}/25" done echo 'Service capi-webhook-service is ready' } From 67294053dc5de8880d25a68e1ec13aa99cc9d545 Mon Sep 17 00:00:00 2001 From: rmahique-github <> Date: Fri, 31 Jan 2025 20:10:03 +0100 Subject: [PATCH 23/40] fix --- scripts/rancher/manager_lifecycle.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/rancher/manager_lifecycle.sh b/scripts/rancher/manager_lifecycle.sh index 65973d9..a757510 100644 --- a/scripts/rancher/manager_lifecycle.sh +++ b/scripts/rancher/manager_lifecycle.sh @@ -66,8 +66,6 @@ rancher_first_login() { ####################################### rancher_wait_capiready() { -_counter=0 -while [[ $_counter -lt 50 ]]; do _counter=0 while [[ $_counter -lt 25 ]]; do status=$(kubectl get deployment capi-controller-manager -n cattle-provisioning-capi-system -o jsonpath='{.status.conditions[?(@.type=="Available")].status}' 2>/dev/null) From 6ea79dc1f39f4b178831a8fd2ca085693ce8f222 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Mon, 3 Feb 2025 09:46:00 +0100 Subject: [PATCH 24/40] Fix for if-statement cert-manager Signed-off-by: Jeroen van Erp --- scripts/kubernetes/certificate_management.sh | 6 +++--- scripts/rancher/manager_lifecycle.sh | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/kubernetes/certificate_management.sh b/scripts/kubernetes/certificate_management.sh index f31266c..4c1326a 100644 --- a/scripts/kubernetes/certificate_management.sh +++ b/scripts/kubernetes/certificate_management.sh @@ -15,14 +15,14 @@ k8s_install_certmanager() { helm repo add jetstack https://charts.jetstack.io helm repo update kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/${version}/cert-manager.crds.yaml - if $? -ne 0; then + if [ $? -ne 0 ]; then echo "Failed to install cert-manager CRDs" exit 1 fi helm upgrade --install cert-manager jetstack/cert-manager \ --namespace cert-manager --create-namespace \ --version ${version} - if $? -ne 0; then + if [ $? -ne 0 ]; then echo "Failed to install cert-manager" exit 1 fi @@ -48,7 +48,7 @@ k8s_create_letsencryptclusterissuer() { --namespace cert-manager \ --set ingress.className=${ingressClassname} \ --set registration.emailAddress=${emailAddress} - if $? -ne 0; then + if [ $? 
-ne 0 ]; then echo "Failed to create Let's Encrypt cluster issuer" exit 1 fi diff --git a/scripts/rancher/manager_lifecycle.sh b/scripts/rancher/manager_lifecycle.sh index f9d07e1..47a97c1 100644 --- a/scripts/rancher/manager_lifecycle.sh +++ b/scripts/rancher/manager_lifecycle.sh @@ -98,7 +98,6 @@ rancher_first_login() { # rancher_wait_capiready ####################################### rancher_wait_capiready() { - _counter=0 while [[ $_counter -lt 25 ]]; do status=$(kubectl get deployment capi-controller-manager -n cattle-provisioning-capi-system -o jsonpath='{.status.conditions[?(@.type=="Available")].status}' 2>/dev/null) From 52fda4dbcaae0a060dde3824b9f68fe03488114d Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Mon, 3 Feb 2025 10:13:42 +0100 Subject: [PATCH 25/40] Remove dependency on opensource.suse.com --- scripts/kubernetes/certificate_management.sh | 38 ++++++++++++++++---- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/scripts/kubernetes/certificate_management.sh b/scripts/kubernetes/certificate_management.sh index 4c1326a..0cf02c0 100644 --- a/scripts/kubernetes/certificate_management.sh +++ b/scripts/kubernetes/certificate_management.sh @@ -42,12 +42,38 @@ k8s_create_letsencryptclusterissuer() { local emailAddress=$2 echo "Creating certificate issuers using Let's Encrypt..." - helm repo add suse-lab-setup https://opensource.suse.com/lab-setup - helm repo update - helm upgrade --install letsencrypt suse-lab-setup/letsencrypt \ - --namespace cert-manager \ - --set ingress.className=${ingressClassname} \ - --set registration.emailAddress=${emailAddress} + kubectl apply -f - < Date: Tue, 4 Feb 2025 16:26:30 +0100 Subject: [PATCH 26/40] added nowait Signed-off-by: Jeroen van Erp --- scripts/observability/agent.sh | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/scripts/observability/agent.sh b/scripts/observability/agent.sh index b6a4c1c..6ba2762 100644 --- a/scripts/observability/agent.sh +++ b/scripts/observability/agent.sh @@ -1,15 +1,15 @@ #!/bin/bash ####################################### -# Install the Observability agent in the cluster +# Install the Observability agent in the cluster and not wait for the pods to be ready # Arguments: # url (SUSE Observability) # cluster_name # ingestion_api_key # Examples: -# observability_agent_install https://obs.suse.com demo xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx +# observability_agent_install_nowait https://obs.suse.com demo xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx ####################################### -observability_agent_install() { +observability_agent_install_nowait() { local url=$1 local cluster_name=$2 local ingestion_api_key=$3 @@ -22,6 +22,23 @@ observability_agent_install() { --set stackstate.apiKey=${ingestion_api_key} \ --set stackstate.url="${url%/}/receiver/stsAgent" \ --set stackstate.cluster.name=${cluster_name} +} + +####################################### +# Install the Observability agent in the cluster +# Arguments: +# url (SUSE Observability) +# cluster_name +# ingestion_api_key +# Examples: +# observability_agent_install https://obs.suse.com demo xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx +####################################### +observability_agent_install() { + local url=$1 + local cluster_name=$2 + local ingestion_api_key=$3 + + observability_agent_install_nowait $url $cluster_name $ingestion_api_key - kubectl wait pods -n suse-observability -l app.kubernetes.io/instance=suse-observability-agent --for condition=Ready 2>/dev/null + kubectl wait pods -n 
suse-observability -l app.kubernetes.io/instance=suse-observability-agent --for condition=Ready } From f038e5b627b0917a7771ffda38b797d072595206 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Thu, 6 Feb 2025 09:32:26 +0100 Subject: [PATCH 27/40] use kubectl wait to wait for pods Signed-off-by: Jeroen van Erp --- scripts/kubernetes/cluster_status.sh | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/scripts/kubernetes/cluster_status.sh b/scripts/kubernetes/cluster_status.sh index 1fa95ee..39cd31d 100644 --- a/scripts/kubernetes/cluster_status.sh +++ b/scripts/kubernetes/cluster_status.sh @@ -25,17 +25,11 @@ k8s_wait_fornodesandpods() { done # checks pods are completed or running - while ! kubectl get pods --all-namespaces --no-headers 2>/dev/null | grep -q .; do - echo 'Waiting for pods to be available...' - sleep 5 - done - while true; do - NOT_READY_PODS=$(kubectl get pods --all-namespaces --field-selector=status.phase!=Running,status.phase!=Succeeded --no-headers 2>/dev/null | wc -l) - if [ "$NOT_READY_PODS" -eq 0 ]; then - echo 'All pods are in Running or Completed status.' - break - else - sleep 5 - fi - done + kubectl wait --for=condition=Ready pods --all --timeout=300s + if [ $? -ne 0 ]; then + NOT_READY_PODS=$(kubectl get pods --all-namespaces --field-selector=status.phase!=Running,status.phase!=Succeeded --no-headers) + echo 'Error: pods are not in Running or Completed status.' + echo "Not ready pods: $NOT_READY_PODS" + exit 1 + fi } From c9e9924e7aefbf3a286c7bbc292db143241682f5 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Thu, 6 Feb 2025 09:45:39 +0100 Subject: [PATCH 28/40] Fix script Signed-off-by: Jeroen van Erp --- scripts/kubernetes/cluster_status.sh | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/scripts/kubernetes/cluster_status.sh b/scripts/kubernetes/cluster_status.sh index 39cd31d..6806c29 100644 --- a/scripts/kubernetes/cluster_status.sh +++ b/scripts/kubernetes/cluster_status.sh @@ -25,11 +25,24 @@ k8s_wait_fornodesandpods() { done # checks pods are completed or running - kubectl wait --for=condition=Ready pods --all --timeout=300s + while ! kubectl get pods --all-namespaces --no-headers 2>/dev/null | grep -q .; do + echo 'Waiting for pods to be available...' + sleep 5 + done + kubectl wait --for=condition=Ready pods --all-namespaces --timeout=300s if [ $? -ne 0 ]; then NOT_READY_PODS=$(kubectl get pods --all-namespaces --field-selector=status.phase!=Running,status.phase!=Succeeded --no-headers) - echo 'Error: pods are not in Running or Completed status.' - echo "Not ready pods: $NOT_READY_PODS" + echo 'Some pods are not ready.' + echo "$NOT_READY_PODS" exit 1 fi + + while true; do + if [ "$NOT_READY_PODS" -eq 0 ]; then + echo 'All pods are in Running or Completed status.' + break + else + sleep 5 + fi + done } From b8f269b62765e33b8e1a6d882cef9050fa4e729c Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Thu, 6 Feb 2025 09:46:35 +0100 Subject: [PATCH 29/40] fix on fix Signed-off-by: Jeroen van Erp --- scripts/kubernetes/cluster_status.sh | 9 --------- 1 file changed, 9 deletions(-) diff --git a/scripts/kubernetes/cluster_status.sh b/scripts/kubernetes/cluster_status.sh index 6806c29..9ee3642 100644 --- a/scripts/kubernetes/cluster_status.sh +++ b/scripts/kubernetes/cluster_status.sh @@ -36,13 +36,4 @@ k8s_wait_fornodesandpods() { echo "$NOT_READY_PODS" exit 1 fi - - while true; do - if [ "$NOT_READY_PODS" -eq 0 ]; then - echo 'All pods are in Running or Completed status.' 
-      break
-    else
-      sleep 5
-    fi
-  done
 }

From 62cfd823f318f3f5fe14c06137abc56cad572b65 Mon Sep 17 00:00:00 2001
From: Jeroen van Erp
Date: Thu, 6 Feb 2025 09:59:04 +0100
Subject: [PATCH 30/40] More logging for create cluster

Signed-off-by: Jeroen van Erp
---
 scripts/kubernetes/cluster_status.sh | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/scripts/kubernetes/cluster_status.sh b/scripts/kubernetes/cluster_status.sh
index 9ee3642..0121280 100644
--- a/scripts/kubernetes/cluster_status.sh
+++ b/scripts/kubernetes/cluster_status.sh
@@ -29,11 +29,16 @@ k8s_wait_fornodesandpods() {
     echo 'Waiting for pods to be available...'
     sleep 5
   done
-  kubectl wait --for=condition=Ready pods --all-namespaces --timeout=300s
-  if [ $? -ne 0 ]; then
-    NOT_READY_PODS=$(kubectl get pods --all-namespaces --field-selector=status.phase!=Running,status.phase!=Succeeded --no-headers)
-    echo 'Some pods are not ready.'
-    echo "$NOT_READY_PODS"
-    exit 1
-  fi
+  while true; do
+    NOT_READY_PODS=$(kubectl get pods --all-namespaces --field-selector=status.phase!=Running,status.phase!=Succeeded --no-headers 2>/dev/null | wc -l)
+    if [ "$NOT_READY_PODS" -eq 0 ]; then
+      echo 'All pods are in Running or Completed status.'
+      break
+    else
+      # print pods not in Running or Completed status
+      kubectl get pods --all-namespaces --field-selector=status.phase!=Running,status.phase!=Succeeded --no-headers
+      echo "Sleeping..."
+      sleep 10
+    fi
+  done
 }

From f844ccf48f67713ab57b5e048c3ff4521a9faff8 Mon Sep 17 00:00:00 2001
From: rmahique-github <>
Date: Fri, 7 Feb 2025 16:37:33 +0100
Subject: [PATCH 31/40] added functions to retrieve rancher and kubernetes versions

---
 scripts/rancher/cluster_actions.sh | 47 ++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/scripts/rancher/cluster_actions.sh b/scripts/rancher/cluster_actions.sh
index fd4374b..d6d3f7a 100644
--- a/scripts/rancher/cluster_actions.sh
+++ b/scripts/rancher/cluster_actions.sh
@@ -11,6 +11,53 @@ rancher_list_clusters() {
   kubectl get clusters.provisioning.cattle.io --all-namespaces -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}'
 }
+
+
+#######################################
+# Shows Kubernetes version
+# Examples:
+#   get_k8s_version
+#######################################
+get_k8s_version() {
+  # echo 'Retrieving Kubernetes version'
+  kubectl version -o yaml | yq .serverVersion.gitVersion
+}
+
+
+
+#######################################
+# Shows Rancher version
+# Globals:
+#   RANCHER_CLUSTER_URL
+# Arguments:
+#   rancher_cluster_url - optional if RANCHER_CLUSTER_URL is defined
+# Examples:
+#   get_rancher_version https://rancher.clustername.domain.name/
+# Return format:
+#   {"Version":"v2.10.2","GitCommit":"a8208b7884a5115d31bfda65de78e3a65798179f","RancherPrime":"true"}
+#######################################
+get_rancher_version() {
+  # echo 'Retrieving Rancher version'
+  # Thanks Eduardo Mínguez and Josh Meranda
+  # other options:
+  #   kubectl get po -n cattle-system -l app=rancher -o jsonpath='{.items[0].spec.containers[0].image}'
+  #   R: registry.rancher.com/rancher/rancher:v2.10.1
+  #   kubectl exec -it -n cattle-system $(kubectl get po -n cattle-system -l app=rancher -o name) -- rancher --version
+  #   R: rancher version v2.10.1 (daaa287448fe866f141beead10ae93ffc2400469)
+  if [[ "$1" != "" ]]
+  then
+    local _rancher_url=${1}
+  else
+    local _rancher_url=${RANCHER_CLUSTER_URL}
+  fi
+  if [[ "${_rancher_url}" != "" ]] && [[ "${_rancher_url}" =~ "https://" ]]; then
+    curl -k ${_rancher_url}/rancherversion
+  else
+    echo 
"ERROR: Missing or incorrect rancher URL" + exit 1 + fi +} + + ####################################### # Create downstream custom cluster in Rancher (don't wait and retrieve name) # Globals: From 6510ef0f754102d8b6d79e8cddb8f3fe8f0295b1 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Fri, 21 Feb 2025 10:44:23 +0100 Subject: [PATCH 32/40] Make download and unzip quieter Signed-off-by: Jeroen van Erp --- scripts/download.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/download.sh b/scripts/download.sh index 9d54438..7089314 100644 --- a/scripts/download.sh +++ b/scripts/download.sh @@ -60,8 +60,8 @@ setup_env() { download() { info 'Download scripts' - wget https://github.com/SUSE/${GIT_REPO_NAME}/archive/${GIT_REVISION}.zip -O ${GIT_REPO_NAME}.zip - unzip -o ${GIT_REPO_NAME}.zip + wget -nv https://github.com/SUSE/${GIT_REPO_NAME}/archive/${GIT_REVISION}.zip -O ${GIT_REPO_NAME}.zip + unzip -q -o ${GIT_REPO_NAME}.zip mkdir -p ${OUTPUT_FOLDER} if [ -d ${OUTPUT_FOLDER}/scripts ]; then info "Delete ${OUTPUT_FOLDER}/scripts" From 6307586f0f22be5e6fb6f4a34bd78668e95c5733 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Thu, 20 Mar 2025 11:36:50 +0100 Subject: [PATCH 33/40] Fix longhorn install Signed-off-by: Jeroen van Erp --- scripts/longhorn/install.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/longhorn/install.sh b/scripts/longhorn/install.sh index 4d8365d..e5b949f 100644 --- a/scripts/longhorn/install.sh +++ b/scripts/longhorn/install.sh @@ -9,7 +9,9 @@ longhorn_install() { systemctl enable --now iscsid.service modprobe iscsi_tcp echo '=== Check prerequisites' - curl -k https://raw.githubusercontent.com/longhorn/longhorn/refs/heads/master/scripts/environment_check.sh 2>/dev/null | bash || exit 1 + curl -k -sSfL -o longhornctl https://github.com/longhorn/cli/releases/download/v1.8.1/longhornctl-linux-amd64 + chmod +x longhornctl + ./longhornctl check preflight echo '=== Install LongHorn' helm upgrade -i longhorn longhorn/longhorn --namespace longhorn-system --create-namespace --set ingress.enabled=true --set ingress.host=$hostname --set persistence.migratable=true --set longhornUI.replicas=1 echo "<<< Longhorn should be available in a few minutes in: $hostname" From ec33d045779b85258975d5b9838996e76e2e1c96 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Mon, 24 Mar 2025 15:12:43 +0100 Subject: [PATCH 34/40] Break if CAPI not ready Signed-off-by: Jeroen van Erp --- scripts/rancher/manager_lifecycle.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/rancher/manager_lifecycle.sh b/scripts/rancher/manager_lifecycle.sh index 47a97c1..7af31e9 100644 --- a/scripts/rancher/manager_lifecycle.sh +++ b/scripts/rancher/manager_lifecycle.sh @@ -109,6 +109,10 @@ rancher_wait_capiready() { ((_counter++)) echo "Waiting for capi-controller-manager to become available... attempt ${_counter}/25" done + if [[ $_counter -eq 25 ]]; then + echo 'Deployment capi-controller-manager is not available' + exit 1 + fi _counter=0 while [[ $_counter -lt 25 ]]; do if [[ ! $(kubectl get endpoints capi-webhook-service -n cattle-provisioning-capi-system -o jsonpath='{.subsets}' 2>/dev/null) == '' ]]; then @@ -119,5 +123,7 @@ rancher_wait_capiready() { ((_counter++)) echo "Waiting for endpoint capi-webhook-service to be ready... 
attempt ${_counter}/25" done + + echo 'Service capi-webhook-service is ready' } From 56aed3f8c967db53963195d0c339df52bee60646 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Tue, 25 Mar 2025 10:34:45 +0100 Subject: [PATCH 35/40] Try to get a hand on why CAPI sometimes is not ready Signed-off-by: Jeroen van Erp --- scripts/rancher/manager_lifecycle.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/rancher/manager_lifecycle.sh b/scripts/rancher/manager_lifecycle.sh index 7af31e9..26145f7 100644 --- a/scripts/rancher/manager_lifecycle.sh +++ b/scripts/rancher/manager_lifecycle.sh @@ -111,6 +111,7 @@ rancher_wait_capiready() { done if [[ $_counter -eq 25 ]]; then echo 'Deployment capi-controller-manager is not available' + kubectl get deployment capi-controller-manager -n cattle-provisioning-capi-system -o jsonpath='{.status}' exit 1 fi _counter=0 From 23475fe6aa6d4f3544254b196acc526b6b997346 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Thu, 24 Jul 2025 23:45:56 +0200 Subject: [PATCH 36/40] Fix ingestion-api-key -> service-token --- scripts/observability/api_key.sh | 32 ++++---------------------------- 1 file changed, 4 insertions(+), 28 deletions(-) diff --git a/scripts/observability/api_key.sh b/scripts/observability/api_key.sh index b01ebda..4e7febf 100644 --- a/scripts/observability/api_key.sh +++ b/scripts/observability/api_key.sh @@ -17,33 +17,9 @@ observability_create_ingestion_api_key() { local cluster_name=$3 local resp - resp=$(/usr/local/bin/sts ingestion-api-key create --name $cluster_name -o json --url $url --service-token $service_token) + /usr/local/bin/sts rbac create-subject --subject $cluster_name-agent + /usr/local/bin/sts rbac grant --subject $cluster_name-agent --permission update-metrics + resp=$(/usr/local/bin/sts service-token create --name $PROD_CLUSTER_NAME --roles $PROD_CLUSTER_NAME-agent --service-token $service_token --url $url -o json) - echo $resp | jq -r '."ingestion-api-key".apiKey' -} - -####################################### -# Delete an Ingestion API key for SUSE Observability -# Arguments: -# url (SUSE Observability) -# service_token (SUSE Observability) -# cluster_name -# Examples: -# observability_delete_ingestion_api_key https://obs.suse.com/ xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx demo -####################################### -observability_delete_ingestion_api_key() { - local url=$1 - local service_token=$2 - local cluster_name=$3 - - local keys key_id - - keys=$(/usr/local/bin/sts ingestion-api-key list -o json --url $url --service-token $service_token) - key_id=$(echo $keys | jq -r '."ingestion-api-keys"[] | select(.name == "'$cluster_name'") | .id') - if [ -n "$key_id" ]; then - /usr/local/bin/sts ingestion-api-key delete --id $key_id --url $url --service-token $service_token - echo ">>> Ingestion API key for cluster '${cluster_name}' deleted" - else - echo ">>> Ingestion API key for cluster '${cluster_name}' not found" - fi + echo $resp | jq -r '."service-token".token' } From 8dde096a4a387efa79984034b76916bab0e19aeb Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Wed, 30 Jul 2025 22:44:18 +0200 Subject: [PATCH 37/40] Fix auth of sts command Signed-off-by: Jeroen van Erp --- scripts/observability/api_key.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/observability/api_key.sh b/scripts/observability/api_key.sh index 4e7febf..bd0be5e 100644 --- a/scripts/observability/api_key.sh +++ b/scripts/observability/api_key.sh @@ -17,8 +17,8 @@ observability_create_ingestion_api_key() { local 
cluster_name=$3 local resp - /usr/local/bin/sts rbac create-subject --subject $cluster_name-agent - /usr/local/bin/sts rbac grant --subject $cluster_name-agent --permission update-metrics + /usr/local/bin/sts rbac create-subject --subject $cluster_name-agent --service-token $service_token --url $url + /usr/local/bin/sts rbac grant --subject $cluster_name-agent --permission update-metrics --service-token $service_token --url $url resp=$(/usr/local/bin/sts service-token create --name $PROD_CLUSTER_NAME --roles $PROD_CLUSTER_NAME-agent --service-token $service_token --url $url -o json) echo $resp | jq -r '."service-token".token' From a9362e342c9ac973c4d5c271b5e5a4a4e590be1d Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Fri, 3 Oct 2025 09:36:40 +0200 Subject: [PATCH 38/40] Fixes for rodeo --- scripts/longhorn/install.sh | 5 +++-- scripts/observability/api_key.sh | 2 +- scripts/observability/cli.sh | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/longhorn/install.sh b/scripts/longhorn/install.sh index e5b949f..7e6dcee 100644 --- a/scripts/longhorn/install.sh +++ b/scripts/longhorn/install.sh @@ -2,6 +2,7 @@ longhorn_install() { local hostname=$1 + local version=${2:-1.8.1} echo '>>> Setup prerequisites for Longhorn install' helm repo add longhorn https://charts.longhorn.io helm repo update @@ -9,10 +10,10 @@ longhorn_install() { systemctl enable --now iscsid.service modprobe iscsi_tcp echo '=== Check prerequisites' - curl -k -sSfL -o longhornctl https://github.com/longhorn/cli/releases/download/v1.8.1/longhornctl-linux-amd64 + curl -k -sSfL -o longhornctl https://github.com/longhorn/cli/releases/download/v${version}/longhornctl-linux-amd64 chmod +x longhornctl ./longhornctl check preflight echo '=== Install LongHorn' - helm upgrade -i longhorn longhorn/longhorn --namespace longhorn-system --create-namespace --set ingress.enabled=true --set ingress.host=$hostname --set persistence.migratable=true --set longhornUI.replicas=1 + helm upgrade -i --version $version longhorn longhorn/longhorn --namespace longhorn-system --create-namespace --set ingress.enabled=true --set ingress.host=$hostname --set persistence.migratable=true --set longhornUI.replicas=1 echo "<<< Longhorn should be available in a few minutes in: $hostname" } diff --git a/scripts/observability/api_key.sh b/scripts/observability/api_key.sh index bd0be5e..f39317d 100644 --- a/scripts/observability/api_key.sh +++ b/scripts/observability/api_key.sh @@ -19,7 +19,7 @@ observability_create_ingestion_api_key() { local resp /usr/local/bin/sts rbac create-subject --subject $cluster_name-agent --service-token $service_token --url $url /usr/local/bin/sts rbac grant --subject $cluster_name-agent --permission update-metrics --service-token $service_token --url $url - resp=$(/usr/local/bin/sts service-token create --name $PROD_CLUSTER_NAME --roles $PROD_CLUSTER_NAME-agent --service-token $service_token --url $url -o json) + resp=$(/usr/local/bin/sts service-token create --name $cluster_name --roles $cluster_name-agent --service-token $service_token --url $url -o json) echo $resp | jq -r '."service-token".token' } diff --git a/scripts/observability/cli.sh b/scripts/observability/cli.sh index a228b30..fa0cf89 100644 --- a/scripts/observability/cli.sh +++ b/scripts/observability/cli.sh @@ -4,8 +4,9 @@ # Install the SUSE Observability CLI ####################################### observability_install_cli() { + local version=${1:-3.1.1} if ! 
[ -x "$(command -v sts)" ]; then - curl -s -o- https://dl.stackstate.com/stackstate-cli/install.sh | STS_CLI_LOCATION=/usr/local/bin bash + curl -s -o- https://dl.stackstate.com/stackstate-cli/install.sh | STS_CLI_LOCATION=/usr/local/bin STS_CLI_VERSION=$version bash else echo ">>> sts CLI already installed" fi From ad058c51a033daddec678993f5e6da1c75c679f4 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Fri, 3 Oct 2025 10:27:13 +0200 Subject: [PATCH 39/40] Echo resp to see whether it's OK --- scripts/observability/api_key.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/observability/api_key.sh b/scripts/observability/api_key.sh index f39317d..0cf5ba0 100644 --- a/scripts/observability/api_key.sh +++ b/scripts/observability/api_key.sh @@ -20,6 +20,6 @@ observability_create_ingestion_api_key() { /usr/local/bin/sts rbac create-subject --subject $cluster_name-agent --service-token $service_token --url $url /usr/local/bin/sts rbac grant --subject $cluster_name-agent --permission update-metrics --service-token $service_token --url $url resp=$(/usr/local/bin/sts service-token create --name $cluster_name --roles $cluster_name-agent --service-token $service_token --url $url -o json) - + echo $resp echo $resp | jq -r '."service-token".token' } From 1206e0e368cf392da0e4ab103bc4fd725e9bc390 Mon Sep 17 00:00:00 2001 From: Jeroen van Erp Date: Fri, 3 Oct 2025 11:23:48 +0200 Subject: [PATCH 40/40] Added logging --- scripts/observability/agent.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/observability/agent.sh b/scripts/observability/agent.sh index 6ba2762..3264e8d 100644 --- a/scripts/observability/agent.sh +++ b/scripts/observability/agent.sh @@ -14,14 +14,18 @@ observability_agent_install_nowait() { local cluster_name=$2 local ingestion_api_key=$3 echo "Installing Observability agent..." + echo " URL: $url" + echo " Cluster name: $cluster_name" + echo " Ingestion API key: $ingestion_api_key" + helm repo add suse-observability https://charts.rancher.com/server-charts/prime/suse-observability helm repo update helm upgrade --install suse-observability-agent suse-observability/suse-observability-agent \ --namespace suse-observability --create-namespace \ - --set stackstate.apiKey=${ingestion_api_key} \ + --set stackstate.apiKey="${ingestion_api_key}" \ --set stackstate.url="${url%/}/receiver/stsAgent" \ - --set stackstate.cluster.name=${cluster_name} + --set stackstate.cluster.name="${cluster_name}" } #######################################
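
To make it easier to see how the helpers touched in this patch series fit together, here is a minimal, hypothetical usage sketch; it is not part of any patch. It assumes the scripts are sourced from the repository layout shown in the diffs, that `k8s_install_certmanager` takes a cert-manager version and `k8s_create_letsencryptclusterissuer` takes an ingress class and an e-mail address (argument order inferred from the diffs), and that every URL, token, version and cluster name below is a placeholder to be replaced.

```bash
#!/bin/bash
# Hypothetical lab flow chaining the helpers from this patch series.
# All values below are placeholders, not defaults shipped by the scripts.
source scripts/kubernetes/certificate_management.sh
source scripts/observability/cli.sh
source scripts/observability/api_key.sh
source scripts/observability/agent.sh

OBSERVABILITY_URL="https://obs.example.com"   # placeholder SUSE Observability URL
OBSERVABILITY_TOKEN="xxxxxxxx-xxxx"           # placeholder service token
CLUSTER_NAME="demo"                           # placeholder cluster name

# cert-manager plus a Let's Encrypt ClusterIssuer (version, class and e-mail are examples)
k8s_install_certmanager "v1.15.3"
k8s_create_letsencryptclusterissuer "nginx" "admin@example.com"

# SUSE Observability CLI, ingestion key and agent for this cluster
observability_install_cli
INGESTION_API_KEY=$(observability_create_ingestion_api_key "$OBSERVABILITY_URL" "$OBSERVABILITY_TOKEN" "$CLUSTER_NAME")
observability_agent_install "$OBSERVABILITY_URL" "$CLUSTER_NAME" "$INGESTION_API_KEY"
```

The same pattern extends to `longhorn_install` and the `observability_platform_*` helpers when the full platform, rather than only the agent, has to be installed on the cluster.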