Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions .github/workflows/alerts-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# CI workflow: renders the alert rules from the ZooKeeper Helm chart and runs
# the vmalert-tool unit tests against the rendered rules file.
name: Alerts-test-zookeeper-operator
on:
  # Runs whenever a "Build Artifacts" workflow run completes.
  workflow_run:
    workflows: ["Build Artifacts"]
    types:
      - completed
  pull_request:
    # '**' matches every base branch. A literal `all` entry would only match
    # a branch that is actually named "all", so pull requests would never
    # trigger this workflow.
    branches:
      - '**'

# Exported to every step as environment variables. No step in this file
# references them directly.
# NOTE(review): presumably consumed by tests-checker.sh - confirm.
env:
  max_attempts: 30
  delay: 10

permissions:
  contents: read

jobs:
  Run-Alerts-Test:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

      - name: Check yq version
        run: yq --version

      - name: Install Helm
        # pipefail + `curl -f` make a failed download fail the step instead of
        # piping an empty or error body into bash and reporting success.
        run: |
          set -o pipefail
          curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash

      - name: Render rules file from helm chart
        # 1. Render the whole chart with the v2 alerts pack enabled.
        # 2. Keep only the document that originates from prometheus_rules.yaml.
        # 3. Drop everything up to and including `spec:` so the file starts at
        #    the rule groups.
        # NOTE(review): other chart templates read `.Values.backupDaemon.install`
        # (lower-case b) - confirm the `BackupDaemon` casing below is intended.
        run: |
          helm template zookeeper-montemplate ./operator/charts/helm/zookeeper-service/ \
            --set monitoring.install=true \
            --set BackupDaemon.install=true \
            --set monitoring.backupDaemonAlertsInstall=true \
            --set monitoring.alertsPackVersion=v2 \
            > ./operator/tests/alerts-tests/rules.yaml
          sed -n '/prometheus_rules.yaml/,/---/p' -i ./operator/tests/alerts-tests/rules.yaml
          sed '0,/spec:/d' -i ./operator/tests/alerts-tests/rules.yaml

      - name: Check that all necessary tests exists
        run: |
          chmod +x ./operator/tests/alerts-tests/tests-checker.sh
          cd ./operator/tests/alerts-tests/
          ./tests-checker.sh
        # Best-effort check: a failure here is reported but does not stop the job.
        continue-on-error: true

      - name: Install vmalert-tool
        run: |
          wget https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/v1.122.4/vmutils-linux-amd64-v1.122.4-enterprise.tar.gz
          tar -xvf vmutils-linux-amd64-v1.122.4-enterprise.tar.gz
          chmod +x vmalert-tool-prod

      - name: Run test
        run: |
          ./vmalert-tool-prod unittest --files ./operator/tests/alerts-tests/test.yaml
7 changes: 7 additions & 0 deletions operator/charts/helm/zookeeper-service/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,10 @@ version: 0.1.0
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application.
appVersion: 1.16.0

dependencies:
  # Subchart with the Prometheus alert rules, loaded from the local
  # charts/monitoring directory and toggled by the monitoring.install value.
  - name: monitoring
    version: "~0"
    repository: "file://charts/monitoring"
    condition: monitoring.install
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Chart metadata for the "monitoring" chart.
apiVersion: v2
name: monitoring
description: A Helm chart for Kubernetes

# Chart type. "application" charts bundle templates that can be packaged into
# versioned archives and deployed. "library" charts only provide utilities or
# functions to other charts, define no templates and cannot be deployed.
type: application

# Chart version. Increment it on every change to the chart or its templates,
# including the app version. Expected to follow Semantic Versioning
# (https://semver.org/).
version: 0.1.0

# Version of the application being deployed. Increment it on every change to
# the application. It need not follow Semantic Versioning and should reflect
# the version the application is using. Quoting the value is recommended.
appVersion: "1.16.0"
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
{{- /*
defaultAlerts: emits a YAML mapping with one alert group, keyed
"<release namespace>-<release name>". Under `rules`, every key is an alert
name whose value holds the alert's expr, for, labels and annotations.
The backup alert is included only when .Values.backupDaemonAlertsInstall is set.
*/ -}}
{{- define "defaultAlerts" -}}
{{ .Release.Namespace }}-{{ .Release.Name }}:
  rules:
    # Fires while the status code metric equals 5 for 3 minutes.
    ZooKeeper_Is_Degraded_Alarm:
      expr: zookeeper_status_code{host=~"^.*",project_name="{{ .Release.Namespace }}"} == 5
      for: 3m
      labels:
        severity: warning
        namespace: {{ .Release.Namespace }}
        service: {{ .Release.Name }}
      annotations:
        description: 'ZooKeeper is Degraded.'
        summary: Some of ZooKeeper Service pods are down
    # Fires while the status code metric equals 10 for 3 minutes.
    ZooKeeper_Is_Down_Alarm:
      expr: zookeeper_status_code{host=~"^.*",project_name="{{ .Release.Namespace }}"} == 10
      for: 3m
      labels:
        severity: critical
        namespace: {{ .Release.Namespace }}
        service: {{ .Release.Name }}
      annotations:
        description: 'ZooKeeper is Down.'
        summary: All of ZooKeeper Service pods are down
    # Max 1m CPU usage rate divided by max CPU limit, above 0.95 for 3 minutes.
    ZooKeeper_CPU_Load_Alarm:
      expr: max(rate(container_cpu_usage_seconds_total{namespace="{{ .Release.Namespace }}", pod=~".*zookeeper-[0-9].*"}[1m])) / max(kube_pod_container_resource_limits_cpu_cores{exported_namespace="{{ .Release.Namespace }}", exported_pod=~".*zookeeper-[0-9].*"}) > 0.95
      for: 3m
      labels:
        severity: warning
        namespace: {{ .Release.Namespace }}
        service: {{ .Release.Name }}
      annotations:
        description: 'ZooKeeper CPU load is higher than 95 percents'
        summary: Some of ZooKeeper Service pod loads CPU higher then 95 percents
    # Max working-set memory divided by max memory limit, above 0.95 for 3 minutes.
    ZooKeeper_Memory_Usage_Alarm:
      expr: max(container_memory_working_set_bytes{namespace="{{ .Release.Namespace }}",container!~"POD|",pod=~".*zookeeper-[0-9].*"}) / max(kube_pod_container_resource_limits_memory_bytes{exported_namespace="{{ .Release.Namespace }}",exported_pod=~".*zookeeper-[0-9].*"}) >0.95
      for: 3m
      labels:
        severity: warning
        namespace: {{ .Release.Namespace }}
        service: {{ .Release.Name }}
      annotations:
        description: 'ZooKeeper memory usage is higher than 95 percents'
        summary: Some of ZooKeeper Service pod uses memory higher then 95 percents
    {{- if .Values.backupDaemonAlertsInstall }}
    # Fires while the last-backup status metric equals 4 for 3 minutes.
    ZooKeeper_Last_Backup_Has_Failed_Alarm:
      expr: zookeeper_backup_metric_last_backup_status{host=~"^.*",project_name="{{ .Release.Namespace }}"} == 4
      for: 3m
      labels:
        severity: warning
        namespace: {{ .Release.Namespace }}
        service: {{ .Release.Name }}
      annotations:
        description: 'ZooKeeper Last Backup Has Failed'
        summary: ZooKeeper Last Backup Has Failed
    {{- end }}
{{- end }}


Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
{{- /*
Renders a VMRule named "prometheusrules" when .Values.install is set and
.Values.alertsPackVersion is "v2".

Inputs:
  - "defaultAlerts" template : built-in alert groups (YAML mapping).
  - .Values.alerts           : optional overrides, merged over the defaults
                               (override values take precedence).
  - .Values.ruleGroups       : optional list of group names to render; when
                               empty or unset every group is rendered.
*/ -}}
{{- if and ( .Values.install) (eq .Values.alertsPackVersion "v2") }}
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: prometheusrules
spec:
  groups:

  {{- $defaultConfig := fromYaml (include "defaultAlerts" . ) -}}
  {{- $overrideConfig := .Values.alerts -}}
  {{- $finalConfig := merge $overrideConfig $defaultConfig -}}
  {{- $alertGroups := .Values.ruleGroups -}}

  {{- range $defaultGroupName, $defaultGroup := $finalConfig }}
    {{- /*
    Decide whether this group is selected. `=` (assignment) is required in
    the nested scopes: `:=` would declare a new shadowed variable and leave
    the outer $found permanently true, disabling the ruleGroups filter.
    */}}
    {{- $found := true }}
    {{- if $alertGroups }}
      {{- $found = false }}
      {{- range $alertGroups }}
        {{- if eq $defaultGroupName . }}
          {{- $found = true }}
        {{- end }}
      {{- end }}
    {{- end }}

    {{- if $found }}
    - name: {{ $defaultGroupName }}
      {{- if $defaultGroup.labels }}
      labels:
        {{- range $defaultLabelName, $defaultLabelValue := $defaultGroup.labels }}
        {{- /* quote keeps numeric/boolean-looking label values as strings */}}
        {{ $defaultLabelName }}: {{ $defaultLabelValue | quote }}
        {{- end }}
      {{- end }}
      {{- if $defaultGroup.interval }}
      interval: {{ $defaultGroup.interval }}
      {{- end }}
      {{- if $defaultGroup.concurrency }}
      concurrency: {{ $defaultGroup.concurrency }}
      {{- end }}
      rules:
        {{- range $defaultRuleName, $defaultRule := $defaultGroup.rules }}
        - alert: {{ $defaultRuleName }}
          expr: {{ $defaultRule.expr }}
          {{- if $defaultRule.for }}
          for: {{ $defaultRule.for }}
          {{- end }}
          labels:
            {{- range $defaultLabelName, $defaultLabelValue := $defaultRule.labels }}
            {{ $defaultLabelName }}: {{ $defaultLabelValue | quote }}
            {{- end }}
          annotations:
            {{- range $defaultAnnotationName, $defaultAnnotationValue := $defaultRule.annotations }}
            {{- /*
            toString instead of `printf <value>`: the annotation text must not
            be used as a format string, otherwise any "%" in it is mangled.
            toJson escapes "&" and ">" as \u0026 / \u003e; the replaces undo that.
            */}}
            {{ $defaultAnnotationName }}: {{ $defaultAnnotationValue | toString | trimAll "\n" | toJson | replace "\\u0026" "&" | replace "\\u003e" ">" | nindent 14 }}
            {{- end }}
        {{- end }}
    {{- end }}
  {{- end }}
{{- end }}
Empty file.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{{- if (and (eq (include "monitoring.install" .) "true") .Values.monitoring.installGrafanaDashboard) }}
{{- if (and (eq (include "monitoring.install" .) "true") .Values.monitoring.installGrafanaDashboard (ne .Values.monitoring.alertsPackVersion "v2"))}}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
Expand All @@ -20,7 +20,7 @@ spec:
expr: zookeeper_status_code{host=~"^.*",project_name="{{ .Release.Namespace }}"} == 5
for: 3m
labels:
severity: high
severity: warning
namespace: {{ .Release.Namespace }}
service: {{ .Release.Name }}
- alert: ZooKeeper_Is_Down_Alarm
Expand All @@ -30,7 +30,7 @@ spec:
expr: zookeeper_status_code{host=~"^.*",project_name="{{ .Release.Namespace }}"} == 10
for: 3m
labels:
severity: disaster
severity: critical
namespace: {{ .Release.Namespace }}
service: {{ .Release.Name }}
- alert: ZooKeeper_CPU_Load_Alarm
Expand All @@ -40,7 +40,7 @@ spec:
expr: max(rate(container_cpu_usage_seconds_total{namespace="{{ .Release.Namespace }}", pod=~".*zookeeper-[0-9].*"}[1m])) / max(kube_pod_container_resource_limits_cpu_cores{exported_namespace="{{ .Release.Namespace }}", exported_pod=~".*zookeeper-[0-9].*"}) > 0.95
for: 3m
labels:
severity: high
severity: warning
namespace: {{ .Release.Namespace }}
service: {{ .Release.Name }}
- alert: ZooKeeper_Memory_Usage_Alarm
Expand All @@ -50,7 +50,7 @@ spec:
expr: max(container_memory_working_set_bytes{namespace="{{ .Release.Namespace }}",container!~"POD|",pod=~".*zookeeper-[0-9].*"}) / max(kube_pod_container_resource_limits_memory_bytes{exported_namespace="{{ .Release.Namespace }}",exported_pod=~".*zookeeper-[0-9].*"}) >0.95
for: 3m
labels:
severity: high
severity: warning
namespace: {{ .Release.Namespace }}
service: {{ .Release.Name }}
{{- if .Values.backupDaemon.install }}
Expand Down
5 changes: 4 additions & 1 deletion operator/charts/helm/zookeeper-service/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,10 @@ zooKeeper:

## Values for ZooKeeper Monitoring deployment
monitoring:
install: true
install: false
alertsPackVersion: v1
backupDaemonAlertsInstall: false

dockerImage: "ghcr.io/netcracker/qubership-zookeeper-monitoring:main"
# affinity: {
# "podAffinity": {
Expand Down
Loading