diff --git a/assets/fleet/clustergroup.yaml b/assets/fleet/clustergroup.yaml
new file mode 100644
index 0000000..b51a22b
--- /dev/null
+++ b/assets/fleet/clustergroup.yaml
@@ -0,0 +1,16 @@
+apiVersion: fleet.cattle.io/v1alpha1
+kind: ClusterGroup
+metadata:
+  name: build-a-dino
+  annotations:
+    {}
+    # key: string
+  labels:
+    {}
+    # key: string
+  namespace: fleet-default
+spec:
+  selector:
+    matchLabels:
+      gpu-enabled: 'true'
+      app: build-a-dino
diff --git a/assets/fleet/gitrepo.yaml b/assets/fleet/gitrepo.yaml
new file mode 100644
index 0000000..7d3702e
--- /dev/null
+++ b/assets/fleet/gitrepo.yaml
@@ -0,0 +1,24 @@
+apiVersion: fleet.cattle.io/v1alpha1
+kind: GitRepo
+metadata:
+  name: build-a-dino
+  annotations:
+    {}
+    # key: string
+  labels:
+    {}
+    # key: string
+  namespace: fleet-default
+spec:
+  branch: main
+  correctDrift:
+    enabled: true
+#    force: boolean
+#    keepFailHistory: boolean
+  insecureSkipTLSVerify: false
+  paths:
+    - /fleet/build-a-dino
+#    - string
+  repo: https://github.com/wiredquill/prime-rodeo
+  targets:
+    - clusterGroup: build-a-dino
diff --git a/assets/monitors/certificate-expiration.yaml b/assets/monitors/certificate-expiration.yaml
new file mode 100644
index 0000000..bf20abd
--- /dev/null
+++ b/assets/monitors/certificate-expiration.yaml
@@ -0,0 +1,48 @@
+nodes:
+- _type: Monitor
+  arguments:
+    criticalThreshold: 1w
+    deviatingThreshold: 30d
+    query: type = "secret" AND label = "secret-type:certificate"
+    resourceName: Certificate
+    timestampProperty: certificateExpiration
+  description: Verify certificates that are close to their expiration date
+  function: {{ get "urn:stackpack:common:monitor-function:topology-timestamp-threshold-monitor" }}
+  id: -12
+  identifier: urn:custom:monitor:certificate-expiration-v2
+  intervalSeconds: 30
+  name: Certificate Expiration V2
+  remediationHint: |
+
+    Certificate expiration date `\{{certificateExpiration\}}`.
+
+    ### Obtain new TLS certificates
+
+    If you're using a Certificate Authority (CA) or a third-party provider, follow their procedures to obtain a new TLS certificate.
+    Once validated, download the new TLS certificate and the corresponding private key from the third-party provider's dashboard or via their API.
+    When you have downloaded these two files, you can update the Secret with the new certificate and key data.
+
+    ```
+    kubectl create secret tls \{{name\}} --cert=path/to/new/certificate.crt --key=path/to/new/private.key
+    ```
+
+    ### Generate new self-signed certificates
+
+    If you're using self-signed certificates, you can generate new ones locally and update the Secret with the new certificate and key data.
+    Use tools like OpenSSL to generate new self-signed certificates.
+
+    ```
+    openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout path/to/new/private.key -out path/to/new/certificate.crt
+    ```
+
+    Update the Secret with the new certificate and key data.
+
+    ```
+    kubectl create secret tls \{{name\}} --cert=path/to/new/certificate.crt --key=path/to/new/private.key
+    ```
+
+    Alternatively, you can edit the existing secret with **`kubectl edit secret \{{name\}}`** and replace the certificate and key data with the new ones obtained from the third-party provider or generated locally.
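+
+    As a quick sanity check (assuming the Secret uses the standard `tls.crt` key), you can print the expiry date of the certificate currently stored in the Secret and compare it with the new one:
+
+    ```
+    kubectl get secret \{{name\}} -o jsonpath='{.data.tls\.crt}' | base64 -d | openssl x509 -noout -enddate
+    ```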
+  status: ENABLED
+  tags:
+  - certificate
+  - secret
diff --git a/assets/monitors/http-error-ratio-for-service.yaml b/assets/monitors/http-error-ratio-for-service.yaml
new file mode 100644
index 0000000..97920b3
--- /dev/null
+++ b/assets/monitors/http-error-ratio-for-service.yaml
@@ -0,0 +1,81 @@
+_version: 1.0.85
+nodes:
+- _type: Monitor
+  arguments:
+    deviatingThreshold: 0.05
+    loggingLevel: WARN
+    timeWindow: 2 minutes
+  description: |-
+    HTTP responses with a status code in the 5xx range indicate server-side errors such as a misconfiguration, overload or internal server errors.
+    To ensure a good user experience, the percentage of 5xx responses should be less than the configured percentage (5% is the default) of the total HTTP responses for a Kubernetes (K8s) service.
+    To understand the full monitor definition, check the details.
+    Because the exact threshold and severity might be application dependent, the thresholds can be overridden via a Kubernetes annotation on the service. For example, to override the pre-configured deviating threshold and instead only have a critical threshold at 6%, put this annotation on your service:
+    ```
+    monitor.kubernetes-v2.stackstate.io/http-error-ratio-for-service: |
+      {
+        "criticalThreshold": 0.06,
+        "deviatingThreshold": null
+      }
+    ```
+    Omitting the deviating threshold from this JSON snippet would have kept it at the configured 5%; with the critical threshold at 6%, the monitor would then only result in a deviating state for an error ratio between 5% and 6%.
+  function: {{ get "urn:stackpack:prime-kubernetes:shared:monitor-function:http-error-ratio-for-service" }}
+  id: -8
+  identifier: urn:stackpack:custom:shared:monitor:http-error-ratio-for-service-v2
+  intervalSeconds: 10
+  name: HTTP - 5xx error ratio
+  remediationHint: |-
+    We have detected that more than 5% of the total responses from your Kubernetes service have a 5xx status code,
+    which signals that a significant number of users are experiencing downtime and service interruptions.
+    Take the following steps to diagnose the problem:
+
+    ## Possible causes
+    - Slow dependency or dependency serving errors
+    - Recent update of the application
+    - Load on the application has increased
+    - Code has memory leaks
+    - Environment issues (e.g. certain nodes, database or services that the service depends on)
+
+    ### Slow dependency or dependency serving errors
+    Check, in the related health violations of this monitor (which can be found in the expanded version if you read this in the pinned minimised version), if there are any health violations on one of the services or pods that this service depends on (focus on the lowest dependency). If you find a violation (deviating or critical health), click on that component to see the related health violations in the table next to it. You can then click on those health violations to follow the instructions to resolve the issue.
+
+    ### New behavior of the service
+    If there are no dependencies that have health violations, it could be that the pod backing this service is returning errors. If this behavior is new, it could be caused by a recent deployment.
+
+    This can be checked by looking at the Events shown on the [service highlights page](/#/components/\{{ componentUrnForUrl \}}/highlights) and checking whether a `Deployment` event happened recently after which the HTTP error ratio behaviour changed.
+
+    To troubleshoot further, you can have a look at the pod(s) backing this service.
+    - Click on the "Pods of this service" in the "Related resource" section of the [service highlight page](/#/components/\{{ componentUrnForUrl \}})
+    - Click on the pod name(s) to go to their highlights pages
+    - Check the logs of the pod(s) to see if they're returning any errors.
+
+    ### Recent update of the service
+    Check if the service was recently updated:
+    - See the Age in the "About" section on the [service highlight page](/#/components/\{{ componentUrnForUrl \}})
+      to identify if it was recently deployed.
+    - Check if any of the pods are recently updated by clicking on "Pods of this service" in the "Related resource" section of
+      the [service highlight page](/#/components/\{{ componentUrnForUrl \}}) and check if their Age is recent.
+    - If the application has just started, it might be that the service has not warmed up yet. Compare the response time metrics
+      for the current deployment with the previous deployment by checking the response time metric chart with a time interval including both.
+    - Check if the application is using more resources than before; if so, consider scaling it up or giving it more resources.
+    - If the increased error ratio is critical, consider rolling back the service to the previous version:
+      - if that helps, then the issue is likely with the new deployment
+      - if that does not help, then the issue may be in the environment (e.g. network issues or issues with the underlying infrastructure, e.g. a database)
+    ### Load on the service has increased
+    - Check if the amount of requests to the service has increased by looking at the "Throughput (HTTP responses/s)" chart for the "HTTP response metrics for all clients (incoming requests)" on the [service highlight page](/#/components/\{{ componentUrnForUrl \}}).
+      If so, consider scaling up the service or giving it more resources.
+    ### Code has memory leaks
+    - Check if memory or CPU usage has been increasing over time. If so, there might be a memory leak.
+      You can find the pods supporting this service by clicking on "Pods of this service" in the "Related resource"
+      section of the [service highlight page](/#/components/\{{ componentUrnForUrl \}}).
+      Check which pods are using the most memory by clicking on the left side of the [service highlight page](/#/components/\{{ componentUrnForUrl \}}) on "Pods of this service"
+    - Check all the pods supporting this service by clicking on the pod name
+    - Check the resource usage on the "Resource usage" section
+    - Restart the pod(s) of this service that are having the issue, or add more memory/CPU
+    ### Environment issues
+    - Check the latency of particular pods of the service. If only certain pods are having issues, it might be an issue with the node the pod is running on:
+      - Try to move the pod to another node
+      - Check if pods of other services on that node also show increased latency. Drain the node if that is the case.
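+
+    ### Quick manual check
+    As a rough manual check (a sketch; it assumes the pods log one line per HTTP request containing the numeric status code, and `app=<my-app>` is a placeholder for your service's pod selector), you can count recent 5xx responses straight from the pod logs:
+
+    ```
+    kubectl logs -l app=<my-app> --since=10m --prefix | grep -cE ' 5[0-9]{2} '
+    ```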
+ status: ENABLED + tags: + - services +timestamp: 2025-01-16T13:16:53.208687Z[Etc/UTC] diff --git a/assets/monitors/out-of-memory-containers.yaml b/assets/monitors/out-of-memory-containers.yaml new file mode 100644 index 0000000..95507af --- /dev/null +++ b/assets/monitors/out-of-memory-containers.yaml @@ -0,0 +1,81 @@ +nodes: +- _type: Monitor + arguments: + comparator: GTE + failureState: DEVIATING + metric: + aliasTemplate: OOM Killed count + query: max(increase(kubernetes_containers_last_state_terminated{reason="OOMKilled"}[10m])) + by (cluster_name, namespace, pod_name, container) + unit: short + threshold: 1.0 + urnTemplate: urn:kubernetes:/${cluster_name}:${namespace}:pod/${pod_name} + description: |- + It is important to ensure that the containers running in your Kubernetes cluster have enough memory to function properly. Out of memory (OOM) conditions can cause containers to crash or become unresponsive, leading to restarts and potential data loss. + To monitor for these conditions, we set up a check that detects and reports OOM events in the containers running in the cluster. This check will help you identify any containers that are running out of memory and allow you to take action to prevent issues before they occur. + To understand the full monitor definition check the details. + function: {{ get "urn:stackpack:common:monitor-function:threshold" }} + id: -13 + identifier: urn:custom:monitor:out-of-memory-containers-v2 + intervalSeconds: 30 + name: Out of memory for containers V2 + remediationHint: |- + An Out of Memory (OOM) event in Kubernetes occurs when a container's memory usage exceeds the limit set for it. + The Linux kernel's OOM killer process is triggered, which attempts to free up memory by killing one or more processes. + This can cause the container to terminate, leading to issues such as lost data, service interruption, and increased + resource usage. + + Check the container [Logs](/#/components/\{{ componentUrnForUrl \}}#logs) for any hints on how the application is behaving. + + ### Recognize a memory leak + + A memory leak can be recognized by looking at the "Memory Usage" metric on the [pod metrics page](/#/components/\{{ componentUrnForUrl \}}/metrics). + + If the metric resembles a `saw-tooth` pattern that is a clear indication of a slow memory leak being present in your application. + The memory usage increases over time, but the memory is not released until the container is restarted. + + If the metric resembles a `dash` pattern that is an indication of a memory leak via a spike. + The memory usage suddenly increases that causes the limit to be violated and the container killed. + + You will notice that the container continually restarts. + + Common issues that can cause this problem include: + 1. New deployments that introduce a memory leak. + 2. Elevated traffic that causes a temporary increase of memory usage. + 3. Incorrectly configured memory limits. + + ### 1. New deployments that introduce a memory leak + + If the memory leak behaviour is new, it is likely that a new deployment introduced a memory leak. + + This can be checked by looking at the Events shown on the [pod highlights page](/#/components/\{{ componentUrnForUrl \}}/highlights) and checking whether a `Deployment` event happened recently after which the memory usage behaviour changed. 
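+
+    A quick way to confirm the OOM kill and spot a recent rollout from the command line (a sketch; `<namespace>`, `<pod>` and `<deployment>` are placeholders):
+
+    ```
+    # Show the last terminated state of the containers in the pod (look for reason OOMKilled)
+    kubectl get pod <pod> -n <namespace> -o jsonpath='{range .status.containerStatuses[*]}{.name}{": "}{.lastState.terminated.reason}{"\n"}{end}'
+
+    # List recent revisions of the owning deployment
+    kubectl rollout history deployment/<deployment> -n <namespace>
+    ```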
+
+    If the memory leak is caused by a deployment, you can investigate which change led to the memory leak by checking the [Show last change](/#/components/\{{ componentUrnForUrl \}}#lastChange), which will highlight the latest changeset for the deployment. You can then revert the change or fix the memory leak.
+
+    ### 2. Elevated traffic that causes a temporary increase of memory usage
+    This can be checked by looking at the "Network Throughput for pods (received)" metric on the [pod metrics page](/#/components/\{{ componentUrnForUrl \}}/metrics) and comparing the usage to the "Memory Usage" metric. If the memory usage increases at the same time as the network throughput, it is likely that the memory usage is caused by the increased traffic.
+
+    As a temporary fix you can raise the memory limit for the container. However, this is not a long-term solution as the memory usage will likely increase again in the future. You can also consider using the Kubernetes autoscaling feature to scale the number of replicas up and down based on resource usage.
+
+    ### 3. Incorrectly configured memory limits
+    This can be checked by looking at the "Memory Usage" metric on the [pod metrics page](/#/components/\{{ componentUrnForUrl \}}/metrics) and comparing the usage to the requests and limits set for the pod. If the memory usage is higher than the limit set for the pod, the container will be terminated by the OOM killer.
+
+    To fix this issue, you can increase the memory limit for the pod by changing the Kubernetes resource YAML and increasing the memory limit values, e.g.
+    ```
+    metadata:
+      …
+    spec:
+      containers:
+        …
+        resources:
+          limits:
+            cpu: "2"
+            memory: "3Gi"
+          requests:
+            cpu: "2"
+            memory: "3Gi"
+    ```
+  status: ENABLED
+  tags:
+  - containers
+  - pods
diff --git a/assets/monitors/pod-cpu-throttling.yaml b/assets/monitors/pod-cpu-throttling.yaml
new file mode 100644
index 0000000..0164a01
--- /dev/null
+++ b/assets/monitors/pod-cpu-throttling.yaml
@@ -0,0 +1,85 @@
+nodes:
+- _type: Monitor
+  arguments:
+    comparator: GT
+    failureState: DEVIATING
+    metric:
+      aliasTemplate: CPU Throttling for ${container} of ${pod_name}
+      query: 100 * sum by (cluster_name, namespace, pod_name, container) (container_cpu_throttled_periods{})
+        / sum by (cluster_name, namespace, pod_name, container) (container_cpu_elapsed_periods{})
+      unit: percent
+    threshold: 95.0
+    urnTemplate: urn:kubernetes:/${cluster_name}:${namespace}:pod/${pod_name}
+  description: |-
+    In Kubernetes, CPU throttling refers to the process where limits are applied to the amount of CPU resources a container can use.
+    This typically occurs when a container approaches the maximum CPU resources allocated to it, causing the system to throttle or restrict
+    its CPU usage to prevent a crash.
+
+    While CPU throttling can help maintain system stability by avoiding crashes due to CPU exhaustion, it can also significantly slow down workload
+    performance. Ideally, CPU throttling should be avoided by ensuring that containers have access to sufficient CPU resources.
+    This proactive approach helps maintain optimal performance and prevents the slowdown associated with throttling.
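+
+    If you want to inspect throttling directly in Prometheus, a roughly equivalent expression (a sketch assuming the standard cAdvisor metric names exposed by the kubelet; the agent-side metric names used in the query above differ) is:
+    ```
+    100 * sum by (namespace, pod, container) (rate(container_cpu_cfs_throttled_periods_total[5m]))
+      / sum by (namespace, pod, container) (rate(container_cpu_cfs_periods_total[5m]))
+    ```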
+  function: {{ get "urn:stackpack:common:monitor-function:threshold" }}
+  id: -13
+  identifier: urn:custom:monitor:pod-cpu-throttling-v2
+  intervalSeconds: 60
+  name: CPU Throttling V2
+  remediationHint: |-
+
+    ### Application behaviour
+
+    Check the container [Logs](/#/components/\{{ componentUrnForUrl \}}#logs) for any hints on how the application is behaving under CPU throttling.
+
+    ### Understanding CPU Usage and CPU Throttling
+
+    On the [pod metrics page](/#/components/\{{ componentUrnForUrl \}}/metrics) you will find the CPU Usage and CPU Throttling charts.
+
+    #### CPU Throttling
+
+    The percentage of CPU throttling over time. CPU throttling occurs when a container reaches its CPU limit, restricting its CPU usage to
+    prevent it from exceeding the specified limit. The higher the percentage, the more throttling is occurring, which means the container's
+    performance is being constrained.
+
+    #### CPU Usage
+
+    This chart shows three key CPU metrics over time:
+
+    1. Request: The amount of CPU the container requests as its minimum requirement. This sets the baseline CPU resources the container is guaranteed to receive.
+    2. Limit: The maximum amount of CPU the container can use. If the container's usage reaches this limit, throttling will occur.
+    3. Current: The actual CPU usage of the container in real-time.
+
+    The `Request` and `Limit` settings of the container can be seen in the `Resource` section of the [configuration](/#/components/\{{ componentUrnForUrl\}}#configuration).
+
+    #### Correlation
+
+    The two charts are correlated in the following way:
+
+    - As the `Current` CPU usage approaches the CPU `Limit`, the CPU throttling percentage increases. This is because the container tries to use more CPU than it is allowed, and the system restricts it, causing throttling.
+    - The aim is to keep the `Current` usage below the `Limit` to minimize throttling. If you see frequent high percentages in the CPU throttling chart, it suggests that you may need to adjust the CPU limits or optimize the container's workload to reduce CPU demand.
+
+
+    ### Adjust CPU Requests and Limits
+
+    Check the Events shown on the [pod highlights page](/#/components/\{{ componentUrnForUrl \}}/highlights) to see whether a `Deployment` event happened recently after which the CPU usage behaviour changed.
+
+    You can investigate which change led to the CPU throttling by checking the [Show last change](/#/components/\{{ componentUrnForUrl \}}#lastChange),
+    which will highlight the latest changeset for the deployment. You can then revert the change or fix the CPU request and limit.
+
+
+    Review the pod's resource requests and limits to ensure they are set appropriately.
+    Show component [configuration](/#/components/\{{ componentUrnForUrl \}}#configuration)
+
+    If the CPU usage consistently hits the limit, consider increasing the CPU limit of the pod.
+    Edit the pod or deployment configuration file to modify the `resources.limits.cpu` and `resources.requests.cpu` as needed.
+    ```
+    resources:
+      requests:
+        cpu: "500m" # Adjust this value based on analysis
+      limits:
+        cpu: "1" # Adjust this value based on analysis
+    ```
+    If CPU throttling persists, consider horizontal pod autoscaling to distribute the workload across more pods, or adjust the cluster's node resources to meet the demands. Continuously monitor and fine-tune resource settings to optimize performance and prevent further throttling issues.
+  status: ENABLED
+  tags:
+  - cpu
+  - performance
+  - pod
diff --git a/assets/monitors/pods-in-waiting-state.yaml b/assets/monitors/pods-in-waiting-state.yaml
new file mode 100644
index 0000000..6b62c11
--- /dev/null
+++ b/assets/monitors/pods-in-waiting-state.yaml
@@ -0,0 +1,236 @@
+nodes:
+- _type: Monitor
+  arguments:
+    failureState: CRITICAL
+    loggingLevel: WARN
+  description: |
+    If a pod is in a waiting state with a reason of CreateContainerConfigError, CreateContainerError,
+    CrashLoopBackOff, or ImagePullBackOff, it will be seen as deviating.
+  function: {{ get "urn:stackpack:kubernetes-v2:shared:monitor-function:pods-in-waiting-state" }}
+  id: -6
+  identifier: urn:custom:monitor:pods-in-waiting-state-v2
+  intervalSeconds: 30
+  name: Pods in Waiting State V2
+  remediationHint: |-
+    \{{#if reasons\}}
+    \{{#if reasons.CreateContainerConfigError\}}
+    ## CreateContainerConfigError
+
+    In case of CreateContainerConfigError, a common cause is a Secret or ConfigMap that is referenced in [your pod](/#/components/\{{ componentUrnForUrl \}}) but doesn't exist.
+
+    ### Missing ConfigMap
+
+    In case of a missing ConfigMap you will see an error like `Error: configmap "mydb-config" not found` mentioned in the message of this monitor.
+
+    To solve this you should reference an existing ConfigMap.
+
+    An example:
+
+    ```
+    # See if the configmap exists
+    kubectl get configmap mydb-config
+
+    # Create the correct configmap, this is just an example
+    kubectl create configmap mydb-config --from-literal=database_name=mydb
+
+    # Delete and recreate the pod using this configmap
+    kubectl delete -f mydb_pod.yaml
+    kubectl create -f mydb_pod.yaml
+
+    # After recreating the pod, it should be in a running state.
+    # This is visible because the waiting pod monitor will not trigger anymore on this condition.
+    ```
+
+    ### Missing Secret
+
+    In case of a missing Secret you will see an error like `Error from server (NotFound): secrets "my-secret" not found`
+    mentioned in the message of this monitor.
+
+    To solve this you should reference an existing Secret.
+
+    An example:
+
+    ```
+    # See if the secret exists
+    kubectl get secret mydb-secret
+
+    # Create the correct secret, this is just an example
+    kubectl create secret generic mydb-secret --from-literal=password=mysupersecretpassword
+
+    # Delete and recreate the pod using this secret
+    kubectl delete -f mydb_pod.yaml
+    kubectl create -f mydb_pod.yaml
+
+    # After recreating the pod, it should be in a running state.
+    # This is visible because the waiting pod monitor will not trigger anymore on this condition.
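+
+    # As an extra (optional) check, confirm the pod defined in mydb_pod.yaml is Running
+    # again and no longer stuck in a waiting state
+    kubectl get -f mydb_pod.yaml -o wide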
+    ```
+    \{{/if\}}
+    \{{#if reasons.CreateContainerError\}}
+    ## CreateContainerError
+
+    Common causes for a CreateContainerError are:
+
+    - Command Not Available
+    - Issues Mounting a Volume
+    - Container Runtime Not Cleaning Up Old Containers
+
+    ### Command Not Available
+
+    In case of `Command Not Available` you will find this in the reason field at the top of this monitor (full screen).
+    If this is the case, the first thing you need to investigate is to check that you have a valid ENTRYPOINT in the Dockerfile
+    used to build your container image.
+
+    If you don't have access to the Dockerfile, you can configure your pod object by using
+    a valid command in the command attribute of the object.
+
+    Check if your pod has a command set by inspecting the [Configuration](/#/components/\{{ componentUrnForUrl \}}#configuration) on the pod, e.g.:
+
+    ```
+    apiVersion: v1
+    kind: Pod
+    metadata:
+      name: nodeapp
+      labels:
+        app: nodeapp
+    spec:
+      containers:
+      - image: myimage/wrong-node-app
+        name: nodeapp
+        ports:
+        - containerPort: 80
+        command: ["node", "index.js"]
+    ```
+
+    If the pod does not have a command set, check the container definition to see if an ENTRYPOINT is set. Below you see an example without a valid ENTRYPOINT.
+
+    If no existing ENTRYPOINT is set and the pod does not have a command, the solution is to use a valid command in the pod definition:
+
+    ```
+    FROM node:16.3.0-alpine
+    WORKDIR /usr/src/app
+    COPY package*.json ./
+
+    RUN npm install
+    COPY . .
+
+    EXPOSE 8080
+
+    ENTRYPOINT []
+    ```
+
+    ### Issues Mounting a Volume
+
+    In the case of a `volume mount problem` the message of this monitor will give you a hint. For example, if you have a message like:
+
+    ```
+    Error: Error response from daemon: create \mnt\data: "\\mnt\\data" includes invalid characters for a local volume name, only "[a-zA-Z0-9][a-zA-Z0-9_.-]" are allowed. If you intended to pass a host directory, use absolute path
+    ```
+
+    In this case you should change the path in the PersistentVolume definition to a valid path, e.g. /mnt/data.
+
+    ### Container Runtime Not Cleaning Up Old Containers
+
+    In this case you will see a message like:
+
+    ```
+    The container name "/myapp_ed236ae738" is already in use by container "22f4edaec41cb193857aefcead3b86cdb69edfd69b2ab57486dff63102b24d29". You have to remove (or rename) that container to be able to reuse that name.
+    ```
+
+    This is an indication that the [container runtime](https://kubernetes.io/docs/setup/production-environment/container-runtimes/)
+    doesn't clean up old containers.
+    In this case the node should be removed from the cluster and the node container runtime should be reinstalled
+    (or be recreated). After that the node should be (re)assigned to the cluster.
+
+    \{{/if\}}
+    \{{#if reasons.CrashLoopBackOff\}}
+    ## CrashLoopBackOff
+
+    When a Kubernetes container has errors, it can enter into a state called CrashLoopBackOff, where Kubernetes attempts to restart the container to resolve the issue.
+
+    The container will continue to restart until the problem is resolved.
+
+    Take the following steps to diagnose the problem:
+
+    ### Container Logs
+    Check the container logs for any explicit errors or warnings.
+
+    1. Inspect the [Logs](/#/components/\{{ componentUrnForUrl \}}#logs) of all the containers in this pod.
+    2. Scroll through them and validate whether there is an excessive amount of errors.
+       1. If a container is crashing due to an out of memory error, the logs may show errors related to memory allocation or exhaustion.
+          - If this is the case, check if the memory limits are too low, in which case you can raise them.
+          - If the memory problem is not resolved, you might have introduced a memory leak, in which case you want to take a look at the last deployment.
+          - If there are no limits, you might have a problem with the physical memory on the node running the pod.
+       2. If a container is crashing due to a configuration error, the logs may show errors related to the incorrect configuration.
+
+    ### Understand application
+
+    It is important to understand what the intended behaviour of the application should be.
+    A good place to start is the [configuration](/#/components/\{{ componentUrnForUrl\}}#configuration).
+    Pay attention to environment variables and volume mounts as these are mechanisms to configure the application.
+    We can use references to configmaps and secrets to further explore configuration information.
+
+    ### Pod Events
+    Check the pod events to identify any explicit errors or warnings.
+    1. Go to the [Pod events page](/#/components/\{{ componentUrnForUrl \}}/events).
+    2. Check if there is a large number of events like `BackOff`, `FailedScheduling` or `FailedAttachVolume`
+    3. If this is the case, see if the event details (click on the event) contain more information about this issue.
+
+    ### Recent Deployment
+    Look at the pod age in the "About" section on the [Pod highlight page](/#/components/\{{ componentUrnForUrl \}}) to identify any recent deployments that might have caused the issue.
+
+    1. The "Age" is shown in the "About" section on the left side of the screen
+    2. If the "Age" and the time that the monitor was triggered are in close proximity, then take a look at the most recent deployment by clicking on [Show last change](/#/components/\{{ componentUrnForUrl \}}#lastChange).
+    \{{/if\}}
+    \{{#if reasons.ImagePullBackOff\}}
+    ## ImagePullBackOff
+
+    If you see the "ImagePullBackOff" error message while trying to pull a container image from a registry, it means that
+    the Docker engine was unable to pull the requested image for some reason.
+
+    The reason field at the top of this monitor (full screen) might give you more information about the specific issue at hand.
+
+    ## Diagnose
+
+    To diagnose the problem, try the following actions:
+
+    - Go to the [pod events page filtered by failed or unhealthy events](/#/components/\{{ componentUrnForUrl \}}/events?view=eventTypes--Unhealthy,Created,FailedMount,Failed)
+
+    If there are no "Failed" events shown, increase the time range by clicking on the Zoom-out button next to the telemetry time interval on the bottom left of the timeline.
+
+    Click on the left side of the [Pod highlight page](/#/components/\{{ componentUrnForUrl \}}) on "Containers" in the "Related resources"
+    to view the `containers` and the `Image URL`.
+
+    ## Common causes
+
+    ### Rate Limit
+    A Docker Hub rate limit has been reached.
+
+    Typical resolution is to authenticate using Docker Hub credentials (it will increase the rate limit from 100 to 200 pulls per 6 hours)
+    or to get a paid account and authenticate with that (bumping the limit to 5000 pulls per day).
+
+    ### Network connectivity issues
+    Check your internet connection or the connection to the registry where the image is hosted.
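+
+    As a quick connectivity check (a sketch; `registry.example.com/myimage:tag` is a placeholder for the image URL shown under "Containers"), you can test the registry from a node:
+
+    ```
+    # Check that the registry host resolves and answers over HTTPS
+    curl -v https://registry.example.com/v2/
+
+    # Try pulling the image manually with crictl (on containerd-based nodes) to see the full error
+    crictl pull registry.example.com/myimage:tag
+    ```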
+ + ### Authentication problems + If the registry requires authentication, make sure that your credentials are correct and that + you have the necessary permissions to access the image. + + ### Image availability + Verify that the image you are trying to pull exists in the registry and that you have specified the correct image name and tag. + + Here are some steps you can take to resolve the "ImagePullBackOff" error: + + 1. Check the registry logs for any error messages that might provide more information about the issue. + 2. Verify that the image exists in the registry and that you have the correct image name and tag. + 3. Check your network connectivity to ensure that you can reach the registry. + 4. Check the authentication credentials to ensure that they are correct and have the necessary permissions. + + If none of these steps work, you may need to consult the Docker documentation or contact support for the registry or Docker + itself for further assistance. + \{{/if\}} + \{{/if\}} + status: ENABLED + tags: + - pods + - containers +timestamp: 2024-10-17T10:15:31.714348Z[Etc/UTC] diff --git a/charts/ai-model/Chart.yaml b/charts/ai-model/Chart.yaml new file mode 100644 index 0000000..d35c96a --- /dev/null +++ b/charts/ai-model/Chart.yaml @@ -0,0 +1,12 @@ +apiVersion: v2 +name: ai-model +description: A Helm chart for ai-model Mackroservices +type: application +version: 0.1.0 +appVersion: "0.1.0" +maintainers: + - name: hierynomus + email: jeroen.vanerp@suse.com +keywords: +- challenge +- observability diff --git a/charts/ai-model/templates/_helpers.tpl b/charts/ai-model/templates/_helpers.tpl new file mode 100644 index 0000000..5c3f420 --- /dev/null +++ b/charts/ai-model/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "common.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "common.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "common.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "common.labels" -}} +helm.sh/chart: {{ include "common.chart" . }} +{{ include "common.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "common.selectorLabels" -}} +app.kubernetes.io/name: {{ include "common.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "common.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "common.fullname" .) 
.Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/charts/ai-model/templates/ai-model-cm.yaml b/charts/ai-model/templates/ai-model-cm.yaml new file mode 100644 index 0000000..da05242 --- /dev/null +++ b/charts/ai-model/templates/ai-model-cm.yaml @@ -0,0 +1,32 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: ai-model-cm + labels: + {{- include "common.labels" . | nindent 4 }} +data: + config.toml: | + # General configuration + port = 11434 + address = "0.0.0.0" + serviceName = "AI Model" + logLevel = "info" + + # Endpoints + [[endpoints]] + uri = "/api/chat" + delay = "1000ms" + body.status = "success" + body.msg = "Your dino is a T-Rex" + + [endpoints.logging] + before = "Processing [[.Endpoint.Uri]] request" + beforeLevel = "Info" + after = "Completed [[.Endpoint.Uri]] request" + afterLevel = "Info" + + # OpenTelemetry + [otel.trace] + enabled = false + tracer-name = "ai-model" + diff --git a/charts/ai-model/templates/ai-model-deployment.yaml b/charts/ai-model/templates/ai-model-deployment.yaml new file mode 100644 index 0000000..bf0c4e7 --- /dev/null +++ b/charts/ai-model/templates/ai-model-deployment.yaml @@ -0,0 +1,41 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ai-model + labels: + service: ai-model + {{- include "common.labels" . | nindent 4 }} +spec: + replicas: 1 + selector: + matchLabels: + service: ai-model + {{- include "common.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "common.labels" . | nindent 8 }} + service: ai-model + annotations: + checksum/config: '{{ include (print $.Template.BasePath "/ai-model-cm.yaml") . | sha256sum}}' + spec: + containers: + - name: ai-model + image: {{.Values.image}} + env: + - name: CONFIG_FILE + value: /etc/app/config.toml + ports: + - containerPort: 8080 + resources: + {{- toYaml .Values.resources | nindent 12 }} + volumeMounts: + - name: config-volume + mountPath: /etc/app + volumes: + - name: config-volume + configMap: + name: ai-model-cm + items: + - key: config.toml + path: config.toml diff --git a/charts/ai-model/templates/ai-model-ingress.yaml b/charts/ai-model/templates/ai-model-ingress.yaml new file mode 100644 index 0000000..03476ab --- /dev/null +++ b/charts/ai-model/templates/ai-model-ingress.yaml @@ -0,0 +1,26 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + {{- .Values.ingress.annotations | toYaml | nindent 4 }} + labels: + service: ai-model + {{- include "common.labels" . | nindent 4 }} + name: ai-model +spec: + ingressClassName: traefik + rules: + - host: {{ .Values.ingress.host }} + http: + paths: + - backend: + service: + name: ai-model + port: + number: 11434 + path: / + pathType: Prefix + tls: + - hosts: + - {{ .Values.ingress.host }} + secretName: tls-secret diff --git a/charts/ai-model/templates/ai-model-svc.yaml b/charts/ai-model/templates/ai-model-svc.yaml new file mode 100644 index 0000000..cd33a02 --- /dev/null +++ b/charts/ai-model/templates/ai-model-svc.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: ai-model + labels: + service: ai-model + {{- include "common.labels" . | nindent 4 }} +spec: + selector: + service: ai-model + {{- include "common.selectorLabels" . 
| nindent 4 }} + ports: + - protocol: TCP + port: 80 # Service port + targetPort: 8080 # Container port + type: ClusterIP # Internal service within the Kubernetes cluster diff --git a/charts/ai-model/values.yaml b/charts/ai-model/values.yaml new file mode 100644 index 0000000..37ecdbd --- /dev/null +++ b/charts/ai-model/values.yaml @@ -0,0 +1,13 @@ +nameOverride: '' +fullnameOverride: '' +image: ravan/mockroservice:0.0.23 +resources: + requests: + memory: '8Mi' + cpu: '5m' + limits: + memory: '10Mi' + cpu: '10m' +ingress: + annotations: + host: diff --git a/scripts/README.md b/scripts/README.md index f27ab74..fbe7c65 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -53,6 +53,12 @@ Name | Source `rancher_update_serverurl` | [rancher/manager_settings.sh](rancher/manager_settings.sh) `rancher_wait_capiready` | [rancher/manager_lifecycle.sh](rancher/manager_lifecycle.sh) +### Rancher Prime + +Name | Source +-----------------------------------------------|------------------------------------------------------------- +`rancherprime_install_withcertmanagerclusterissuer` | [rancher/manager_lifecycle.sh](rancher/manager_lifecycle.sh) + ### SUSE Observability Name | Source @@ -64,6 +70,8 @@ Name | Source `observability_get_component_snapshot` | [observability/stql.sh](observability/stql.sh) `observability_get_component_state` | [observability/stql.sh](observability/stql.sh) `observability_install_cli` | [observability/cli.sh](observability/cli.sh) +`observability_create_service_token` | [observability/service_token.sh](observability/service_token.sh) +`observability_delete_service_token` | [observability/service_token.sh](observability/service_token.sh) ### SUSE Linux (previously SLES, SLE Micro) diff --git a/scripts/download.sh b/scripts/download.sh index 45717e8..7089314 100644 --- a/scripts/download.sh +++ b/scripts/download.sh @@ -60,14 +60,15 @@ setup_env() { download() { info 'Download scripts' - wget https://github.com/SUSE/${GIT_REPO_NAME}/archive/${GIT_REVISION}.zip -O ${GIT_REPO_NAME}.zip - unzip -o ${GIT_REPO_NAME}.zip + wget -nv https://github.com/SUSE/${GIT_REPO_NAME}/archive/${GIT_REVISION}.zip -O ${GIT_REPO_NAME}.zip + unzip -q -o ${GIT_REPO_NAME}.zip mkdir -p ${OUTPUT_FOLDER} if [ -d ${OUTPUT_FOLDER}/scripts ]; then info "Delete ${OUTPUT_FOLDER}/scripts" rm -rf ${OUTPUT_FOLDER}/scripts fi mv ${GIT_REPO_NAME}-${GIT_FOLDER}/scripts ${OUTPUT_FOLDER} + mv ${GIT_REPO_NAME}-${GIT_FOLDER}/assets ${OUTPUT_FOLDER} } cleanup() { diff --git a/scripts/kubernetes/certificate_management.sh b/scripts/kubernetes/certificate_management.sh index ec8bb86..0cf02c0 100644 --- a/scripts/kubernetes/certificate_management.sh +++ b/scripts/kubernetes/certificate_management.sh @@ -15,9 +15,17 @@ k8s_install_certmanager() { helm repo add jetstack https://charts.jetstack.io helm repo update kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/${version}/cert-manager.crds.yaml + if [ $? -ne 0 ]; then + echo "Failed to install cert-manager CRDs" + exit 1 + fi helm upgrade --install cert-manager jetstack/cert-manager \ --namespace cert-manager --create-namespace \ --version ${version} + if [ $? -ne 0 ]; then + echo "Failed to install cert-manager" + exit 1 + fi kubectl wait pods -n cert-manager -l app.kubernetes.io/instance=cert-manager --for condition=Ready 2>/dev/null } @@ -34,12 +42,42 @@ k8s_create_letsencryptclusterissuer() { local emailAddress=$2 echo "Creating certificate issuers using Let's Encrypt..." 
- helm repo add suse-lab-setup https://opensource.suse.com/lab-setup - helm repo update - helm upgrade --install letsencrypt suse-lab-setup/letsencrypt \ - --namespace cert-manager \ - --set ingress.className=${ingressClassname} \ - --set registration.emailAddress=${emailAddress} + kubectl apply -f - < /dev/null; do sleep 1 diff --git a/scripts/kubernetes/cluster_status.sh b/scripts/kubernetes/cluster_status.sh index 1fa95ee..0121280 100644 --- a/scripts/kubernetes/cluster_status.sh +++ b/scripts/kubernetes/cluster_status.sh @@ -35,7 +35,10 @@ k8s_wait_fornodesandpods() { echo 'All pods are in Running or Completed status.' break else - sleep 5 + # print pods not in Running or Completed status + kubectl get pods --all-namespaces --field-selector=status.phase!=Running,status.phase!=Succeeded --no-headers + echo "Sleeping..." + sleep 10 fi done } diff --git a/scripts/longhorn/install.sh b/scripts/longhorn/install.sh new file mode 100644 index 0000000..7e6dcee --- /dev/null +++ b/scripts/longhorn/install.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +longhorn_install() { + local hostname=$1 + local version=${2:-1.8.1} + echo '>>> Setup prerequisites for Longhorn install' + helm repo add longhorn https://charts.longhorn.io + helm repo update + zypper install -y open-iscsi cryptsetup + systemctl enable --now iscsid.service + modprobe iscsi_tcp + echo '=== Check prerequisites' + curl -k -sSfL -o longhornctl https://github.com/longhorn/cli/releases/download/v${version}/longhornctl-linux-amd64 + chmod +x longhornctl + ./longhornctl check preflight + echo '=== Install LongHorn' + helm upgrade -i --version $version longhorn longhorn/longhorn --namespace longhorn-system --create-namespace --set ingress.enabled=true --set ingress.host=$hostname --set persistence.migratable=true --set longhornUI.replicas=1 + echo "<<< Longhorn should be available in a few minutes in: $hostname" +} diff --git a/scripts/observability/agent.sh b/scripts/observability/agent.sh index b6a4c1c..3264e8d 100644 --- a/scripts/observability/agent.sh +++ b/scripts/observability/agent.sh @@ -1,27 +1,48 @@ #!/bin/bash ####################################### -# Install the Observability agent in the cluster +# Install the Observability agent in the cluster and not wait for the pods to be ready # Arguments: # url (SUSE Observability) # cluster_name # ingestion_api_key # Examples: -# observability_agent_install https://obs.suse.com demo xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx +# observability_agent_install_nowait https://obs.suse.com demo xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx ####################################### -observability_agent_install() { +observability_agent_install_nowait() { local url=$1 local cluster_name=$2 local ingestion_api_key=$3 echo "Installing Observability agent..." 
+ echo " URL: $url" + echo " Cluster name: $cluster_name" + echo " Ingestion API key: $ingestion_api_key" + helm repo add suse-observability https://charts.rancher.com/server-charts/prime/suse-observability helm repo update helm upgrade --install suse-observability-agent suse-observability/suse-observability-agent \ --namespace suse-observability --create-namespace \ - --set stackstate.apiKey=${ingestion_api_key} \ + --set stackstate.apiKey="${ingestion_api_key}" \ --set stackstate.url="${url%/}/receiver/stsAgent" \ - --set stackstate.cluster.name=${cluster_name} + --set stackstate.cluster.name="${cluster_name}" +} + +####################################### +# Install the Observability agent in the cluster +# Arguments: +# url (SUSE Observability) +# cluster_name +# ingestion_api_key +# Examples: +# observability_agent_install https://obs.suse.com demo xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx +####################################### +observability_agent_install() { + local url=$1 + local cluster_name=$2 + local ingestion_api_key=$3 + + observability_agent_install_nowait $url $cluster_name $ingestion_api_key - kubectl wait pods -n suse-observability -l app.kubernetes.io/instance=suse-observability-agent --for condition=Ready 2>/dev/null + kubectl wait pods -n suse-observability -l app.kubernetes.io/instance=suse-observability-agent --for condition=Ready } diff --git a/scripts/observability/api_key.sh b/scripts/observability/api_key.sh index b01ebda..0cf5ba0 100644 --- a/scripts/observability/api_key.sh +++ b/scripts/observability/api_key.sh @@ -17,33 +17,9 @@ observability_create_ingestion_api_key() { local cluster_name=$3 local resp - resp=$(/usr/local/bin/sts ingestion-api-key create --name $cluster_name -o json --url $url --service-token $service_token) - - echo $resp | jq -r '."ingestion-api-key".apiKey' -} - -####################################### -# Delete an Ingestion API key for SUSE Observability -# Arguments: -# url (SUSE Observability) -# service_token (SUSE Observability) -# cluster_name -# Examples: -# observability_delete_ingestion_api_key https://obs.suse.com/ xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx demo -####################################### -observability_delete_ingestion_api_key() { - local url=$1 - local service_token=$2 - local cluster_name=$3 - - local keys key_id - - keys=$(/usr/local/bin/sts ingestion-api-key list -o json --url $url --service-token $service_token) - key_id=$(echo $keys | jq -r '."ingestion-api-keys"[] | select(.name == "'$cluster_name'") | .id') - if [ -n "$key_id" ]; then - /usr/local/bin/sts ingestion-api-key delete --id $key_id --url $url --service-token $service_token - echo ">>> Ingestion API key for cluster '${cluster_name}' deleted" - else - echo ">>> Ingestion API key for cluster '${cluster_name}' not found" - fi + /usr/local/bin/sts rbac create-subject --subject $cluster_name-agent --service-token $service_token --url $url + /usr/local/bin/sts rbac grant --subject $cluster_name-agent --permission update-metrics --service-token $service_token --url $url + resp=$(/usr/local/bin/sts service-token create --name $cluster_name --roles $cluster_name-agent --service-token $service_token --url $url -o json) + echo $resp + echo $resp | jq -r '."service-token".token' } diff --git a/scripts/observability/cli.sh b/scripts/observability/cli.sh index 40c2030..fa0cf89 100644 --- a/scripts/observability/cli.sh +++ b/scripts/observability/cli.sh @@ -4,8 +4,9 @@ # Install the SUSE Observability CLI ####################################### observability_install_cli() 
{
+  local version=${1:-3.1.1}
   if ! [ -x "$(command -v sts)" ]; then
-    curl -o- https://dl.stackstate.com/stackstate-cli/install.sh | STS_CLI_LOCATION=/usr/local/bin bash
+    curl -s -o- https://dl.stackstate.com/stackstate-cli/install.sh | STS_CLI_LOCATION=/usr/local/bin STS_CLI_VERSION=$version bash
   else
     echo ">>> sts CLI already installed"
   fi
diff --git a/scripts/observability/monitors.sh b/scripts/observability/monitors.sh
new file mode 100644
index 0000000..0aae47c
--- /dev/null
+++ b/scripts/observability/monitors.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+observability_disable_monitor() {
+  local monitor_identifier=$1
+  local url=$2
+  local service_token=$3
+  /usr/local/bin/sts monitor disable --identifier $monitor_identifier --service-token $service_token --url $url
+}
+
+observability_deploy_monitor() {
+  local file=$1
+  local url=$2
+  local service_token=$3
+  /usr/local/bin/sts monitor apply -f $file --service-token $service_token --url $url
+}
+
+observability_enable_monitor() {
+  local monitor_identifier=$1
+  local url=$2
+  local service_token=$3
+  /usr/local/bin/sts monitor enable --identifier $monitor_identifier --service-token $service_token --url $url
+}
+
diff --git a/scripts/observability/platform.sh b/scripts/observability/platform.sh
new file mode 100644
index 0000000..c8c2748
--- /dev/null
+++ b/scripts/observability/platform.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+
+observability_platform_generate_values() {
+  local host=$1
+  local license=$2
+  local password=$3
+  local values_dir=$4
+  helm template --set license=$license \
+    --set baseUrl=$host \
+    --set adminPassword=$password \
+    --set sizing.profile=trial \
+    suse-observability-values suse-observability/suse-observability-values \
+    --output-dir $values_dir
+
+  cat << EOF > $values_dir/suse-observability-values/templates/ingress_values.yaml
+ingress:
+  enabled: true
+  annotations:
+    cert-manager.io/cluster-issuer: letsencrypt-prod
+    nginx.ingress.kubernetes.io/proxy-body-size: "50m"
+  hosts:
+    - host: $host
+  tls:
+    - hosts:
+        - $host
+      secretName: tls-secret
+EOF
+}
+
+observability_platform_bootstrap_token() {
+  local token=$1
+  local values_dir=$2
+
+  cat << EOF > $values_dir/suse-observability-values/templates/bootstrap_token.yaml
+stackstate:
+  authentication:
+    serviceToken:
+      bootstrap:
+        token: $token
+        roles: ["stackstate-k8s-troubleshooter", "stackstate-admin", "stackstate-k8s-admin"]
+EOF
+}
+
+observability_platform_install() {
+  local values_dir=$1
+  helm upgrade --install --namespace suse-observability --create-namespace \
+    --values $values_dir/suse-observability-values/templates/baseConfig_values.yaml \
+    --values $values_dir/suse-observability-values/templates/sizing_values.yaml \
+    --values $values_dir/suse-observability-values/templates/ingress_values.yaml \
+    --values $values_dir/suse-observability-values/templates/bootstrap_token.yaml \
+    suse-observability suse-observability/suse-observability
+}
+
+observability_platform_wait_ready() {
+  echo ">>> Waiting for SUSE Observability to be ready"
+  kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=suse-observability -n suse-observability --timeout=300s
+  if [ $?
-ne 0 ]; then + echo ">>> SUSE Observability is not ready" + NON_RUNNING=$(kubectl get pods -n suse-observability -o json | jq -r '[.items[] | select(.status.phase != "Running" and .status.phase != "Succeeded") | {name: .metadata.name, status: .status.phase}]') + echo "Pods not running yet: $NON_RUNNING" + else + # # Wait for Observability URL available + _counter=0 + while [[ $_counter -lt 50 ]]; do + curl -sSfk $OBSERVABILITY_URL/api > /dev/null + if [ $? -eq 0 ]; then + break + fi + ((_counter++)) + echo "Waiting for Observability URL to be available... attempt ${_counter}/50" + sleep 5 + done + + if [[ $_counter -ge 50 ]] + then + # Exit with error should be uncommented for production labs. + echo ">>> TIME OUT for Observability URL to be available" + # exit 69 + else + echo ">>> Observability at '$OBSERVABILITY_URL' is available!" + fi + fi + echo ">>> SUSE Observability is ready" + +} diff --git a/scripts/observability/service_token.sh b/scripts/observability/service_token.sh new file mode 100644 index 0000000..451a1e7 --- /dev/null +++ b/scripts/observability/service_token.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +####################################### +# Create a service token for SUSE Observability +# Output: +# The service token +# Arguments: +# url (SUSE Observability) +# service_token (SUSE Observability) +# cluster_name +# role +# Examples: +# observability_create_service_token https://obs.suse.com/ xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx lab-dsu37834 stackstate-k8s-troubleshooter +####################################### +observability_create_service_token() { + local url=$1 + local service_token=$2 + local cluster_name=$3 + local role=$4 + + local resp + resp=$(/usr/local/bin/sts service-token create --name $cluster_name --roles $role -o json --url $url --service-token $service_token) + + echo $resp | jq -r '."service-token".token' +} + +####################################### +# Delete a service token for SUSE Observability +# Arguments: +# url (SUSE Observability) +# service_token (SUSE Observability) +# cluster_name +# Examples: +# observability_delete_service_token https://obs.suse.com/ xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx lab-dsu37834 +####################################### +observability_delete_service_token() { + local url=$1 + local service_token=$2 + local cluster_name=$3 + + local tokens token_id + + tokens=$(/usr/local/bin/sts service-token list -o json --url $url --service-token $service_token) + token_id=$(echo $tokens | jq -r '."service-tokens"[] | select(.name == "'$cluster_name'") | .id') + if [ -n "$token_id" ]; then + /usr/local/bin/sts service-token delete --id $token_id --url $url --service-token $service_token + echo ">>> Service token named '${cluster_name}' deleted" + else + echo ">>> Service token named '${cluster_name}' not found" + fi +} diff --git a/scripts/rancher/cluster_actions.sh b/scripts/rancher/cluster_actions.sh index fd4374b..d6d3f7a 100644 --- a/scripts/rancher/cluster_actions.sh +++ b/scripts/rancher/cluster_actions.sh @@ -11,6 +11,53 @@ rancher_list_clusters() { kubectl get clusters.provisioning.cattle.io --all-namespaces -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' } + +####################################### +# Shows Kubernetes version +# Examples: +# get_k8s_version +####################################### +get_k8s_version() { + # echo 'Retrieving Kubernetes version' + kubectl version -o yaml | yq .serverVersion.gitVersion +} + + + +####################################### +# Shows Rancher version +# Globals: +# 
RANCHER_CLUSTER_URL
+# Arguments:
+#   rancher_cluster_url - optional if RANCHER_CLUSTER_URL is defined
+# Examples:
+#   get_rancher_version https://rancher.clustername.domain.name/
+# Return format:
+#   {"Version":"v2.10.2","GitCommit":"a8208b7884a5115d31bfda65de78e3a65798179f","RancherPrime":"true"}
+#######################################
+get_rancher_version() {
+  # echo 'Retrieving Rancher version'
+  # Thanks Eduardo Mínguez and Josh Meranda
+  # other options:
+  # kubectl get po -n cattle-system -l app=rancher -o jsonpath='{.items[0].spec.containers[0].image}'
+  # R: registry.rancher.com/rancher/rancher:v2.10.1
+  # kubectl exec -it -n cattle-system $(kubectl get po -n cattle-system -l app=rancher -o name) -- rancher --version
+  # R: rancher version v2.10.1 (daaa287448fe866f141beead10ae93ffc2400469)
+  if [[ "$1" != "" ]]
+  then
+    local _rancher_url=${1}
+  else
+    local _rancher_url=${RANCHER_CLUSTER_URL}
+  fi
+  if [[ "${_rancher_url}" != "" ]] && [[ "${_rancher_url}" =~ "https://" ]]; then
+    curl -k ${_rancher_url}/rancherversion
+  else
+    echo "ERROR: Missing or incorrect rancher URL"
+    exit 1
+  fi
+}
+
+
 #######################################
 # Create downstream custom cluster in Rancher (don't wait and retrieve name)
 # Globals:
diff --git a/scripts/rancher/manager_lifecycle.sh b/scripts/rancher/manager_lifecycle.sh
index 180ee48..26145f7 100644
--- a/scripts/rancher/manager_lifecycle.sh
+++ b/scripts/rancher/manager_lifecycle.sh
@@ -36,6 +36,39 @@ rancher_install_withcertmanagerclusterissuer() {
   sleep 10
 }
 
+#######################################
+# Installs Rancher Prime with a certificate generated by a cluster issuer
+# Arguments:
+#   Version
+#   Number of replicas
+#   Hostname
+#   Cluster issuer name (managed by cert-manager)
+# Examples:
+#   rancherprime_install_withcertmanagerclusterissuer "2.8.2" 1 rancher.random_string.geek letsencrypt-prod
+#######################################
+rancherprime_install_withcertmanagerclusterissuer() {
+  local version=$1
+  local replicas=$2
+  local hostname=$3
+  local clusterissuer=$4
+
+  echo "Installing Rancher..."
+  helm repo add rancher-prime https://charts.rancher.com/server-charts/prime
+  helm repo update
+  helm upgrade --install rancher rancher-prime/rancher --namespace cattle-system --create-namespace \
+    --version ${version} \
+    --set replicas=${replicas} \
+    --set hostname=${hostname} \
+    --set ingress.extraAnnotations.'cert-manager\.io/cluster-issuer'=${clusterissuer} \
+    --set ingress.tls.source=secret \
+    --set ingress.tls.secretName=rancher-tls \
+    --set agentTLSMode="system-store"
+  kubectl wait pods -n cattle-system -l app=rancher --for condition=Ready --timeout=180s
+  echo "Waiting for Rancher web app to be running with a valid certificate..."
+  while ! kubectl get secret rancher-tls --namespace cattle-system 2>/dev/null; do sleep 1; done
+  sleep 10
+}
+
 #######################################
 # Do the first log in Rancher (will update admin password and set server URL)
 # Arguments:
@@ -65,16 +98,33 @@ rancher_first_login() {
 # rancher_wait_capiready
 #######################################
 rancher_wait_capiready() {
-  while true; do
+  _counter=0
+  while [[ $_counter -lt 25 ]]; do
     status=$(kubectl get deployment capi-controller-manager -n cattle-provisioning-capi-system -o jsonpath='{.status.conditions[?(@.type=="Available")].status}' 2>/dev/null)
     if [ "$status" == 'True' ]; then
       echo 'Deployment capi-controller-manager is available'
      break
     fi
     sleep 10
+    ((_counter++))
+    echo "Waiting for capi-controller-manager to become available... attempt ${_counter}/25"
   done
-  while [[ $(kubectl get endpoints capi-webhook-service -n cattle-provisioning-capi-system -o jsonpath='{.subsets}' 2>/dev/null) == '' ]]; do
+  if [[ $_counter -eq 25 ]]; then
+    echo 'Deployment capi-controller-manager is not available'
+    kubectl get deployment capi-controller-manager -n cattle-provisioning-capi-system -o jsonpath='{.status}'
+    exit 1
+  fi
+  _counter=0
+  while [[ $_counter -lt 25 ]]; do
+    if [[ ! $(kubectl get endpoints capi-webhook-service -n cattle-provisioning-capi-system -o jsonpath='{.subsets}' 2>/dev/null) == '' ]]; then
+      echo 'Endpoint is ready'
+      break
+    fi
     sleep 10
+    ((_counter++))
+    echo "Waiting for endpoint capi-webhook-service to be ready... attempt ${_counter}/25"
   done
+
+  echo 'Service capi-webhook-service is ready'
 }
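A minimal usage sketch tying the new helpers together (hostname, issuer, version and token values below are placeholders, not values taken from this change; it assumes the touched scripts have been sourced):

```bash
#!/bin/bash
# Hypothetical end-to-end flow using functions added or changed in this diff.
source scripts/rancher/manager_lifecycle.sh
source scripts/observability/service_token.sh

# Install Rancher Prime behind a cert-manager issued certificate and wait for CAPI
rancherprime_install_withcertmanagerclusterissuer "2.10.2" 1 rancher.example.com letsencrypt-prod
rancher_wait_capiready

# Create a scoped service token for a lab cluster, then remove it during teardown
token=$(observability_create_service_token https://obs.example.com "$ADMIN_SERVICE_TOKEN" lab-cluster stackstate-k8s-troubleshooter)
echo "Agent service token: $token"
observability_delete_service_token https://obs.example.com "$ADMIN_SERVICE_TOKEN" lab-cluster
```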