diff --git a/monitoring/rules/viya/beta-rules-viya-health.yaml b/monitoring/rules/viya/beta-rules-viya-health.yaml
deleted file mode 100644
index 6c68d351..00000000
--- a/monitoring/rules/viya/beta-rules-viya-health.yaml
+++ /dev/null
@@ -1,149 +0,0 @@
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
-  name: viya-alerts
-  namespace: monitoring
-  labels:
-    sas.com/monitoring-base: kube-viya-monitoring
-spec:
-  groups:
-  - name: prod
-    rules:
-    - alert: cas-restart
-      annotations:
-        description:
-          Checks that the CAS pod has existed for only a short time. This
-          implies that the CAS pod has restarted for some reason. The cause
-          will need further investigation.
-        summary:
-          The current CAS (sas-cas-server-default-controller) pod has existed
-          for < 15 minutes, most likely because the CAS pod restarted.
-      expr: cas_grid_uptime_seconds_total
-      for: 5m
-      labels:
-        severity: warning
-    - alert: viya-readiness
-      annotations:
-        description:
-          Checks for the Ready state of the sas-readiness pod. Check the
-          status of the Viya pods, since the sas-readiness pod reflects the
-          health of the Viya services.
-        summary:
-          The sas-readiness pod is not in the Ready state. This means that
-          one or more of the Viya services are not in a good state.
-      expr: kube_pod_container_status_ready{container="sas-readiness"}
-      for: 5m
-      labels:
-        severity: warning
-    - alert: rabbitmq-readymessages
-      annotations:
-        description:
-          Checks for an accumulation of RabbitMQ ready messages > 10,000.
-          This could impact Model Studio pipelines.
-        summary:
-          RabbitMQ ready messages > 10,000. This means there is a large
-          backlog of messages, either from high activity (which can be
-          temporary) or because something has gone wrong.
-      expr: rabbitmq_queue_messages_ready
-      for: 5m
-      labels:
-        severity: warning
-    - alert: NFS-share
-      annotations:
-        description:
-          Checks whether the NFS share attached to CAS is > 85% full. Use
-          the command "du -h -d 1" to find where the large files are located
-          in the NFS shares. Most likely it will be one of the home
-          directories, due to the runaway size of a casuser table or Viya
-          backups.
-        summary:
-          NFS share > 85% full. Typically, this is due to users filling
-          their own home directories or to backups.
-      expr:
-        ((kubelet_volume_stats_capacity_bytes{persistentvolumeclaim="cas-default-data"}
-        - kubelet_volume_stats_available_bytes{persistentvolumeclaim="cas-default-data"})
-        / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim="cas-default-data"})
-        * 100
-      for: 5m
-      labels:
-        severity: warning
-    - alert: cas-memory
-      annotations:
-        description:
-          Checks the CAS memory usage and alerts if it is > 300GB.
-          Currently, the max. memory is 512GB. This alert is intended as an
-          early warning to investigate large memory usage, since typical
-          usage is below the threshold, and to prevent an OOMKill of CAS.
-        summary:
-          CAS memory > 300GB. This can be due to a program or pipeline
-          taking all the available memory.
-      expr: (cas_node_mem_size_bytes{type="physical"} - cas_node_mem_free_bytes{type="physical"})/1073741824
-      for: 5m
-      labels:
-        severity: warning
-    - alert: catalog-dbconn
-      annotations:
-        description:
-          Checks whether the in-use catalog database connections are > 21.
-          The default db connection pool is 22. If the limit is reached, the
-          RabbitMQ queues start to fill up with ready messages, causing
-          issues with Model Studio pipelines.
-        summary:
-          Active catalog database connections > 21. If the max. db
-          connections is reached, the RabbitMQ queues will be impacted.
-      expr: sas_db_pool_connections{container="sas-catalog-services", state="inUse"}
-      for: 5m
-      labels:
-        severity: warning
-    - alert: compute-age
-      annotations:
-        description:
-          Looks for compute pods > 1 day old. Most likely this is an
-          orphaned compute pod that is lingering. Consider killing it.
-        summary: SAS compute-server pods > 1 day old.
-      expr: (time() - kube_pod_created{pod=~"sas-compute-server-.*"})/60/60/24
-      for: 5m
-      labels:
-        severity: warning
-    - alert: crunchy-pgdata
-      annotations:
-        description: "Checks whether the /pgdata filesystem is more than 50% full."
-        summary:
-          /pgdata storage > 50% full. This typically happens when the WAL
-          logs are growing and not being cleared.
-      expr:
-        ((kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"sas-crunchy-platform-postgres-00-.*"}
-        - kubelet_volume_stats_available_bytes{persistentvolumeclaim=~"sas-crunchy-platform-postgres-00-.*"})
-        / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"sas-crunchy-platform-postgres-00-.*"})
-        * 100
-      for: 5m
-      labels:
-        severity: warning
-    - alert: crunchy-backrest-repo
-      annotations:
-        description:
-          Checks whether the /pgbackrest/repo1 filesystem is more than 50%
-          full.
-        summary:
-          /pgbackrest/repo1 storage > 50% full in the pgBackRest repo. This
-          typically happens when the archived WAL logs are growing and not
-          being expired and cleared.
-      expr:
-        ((kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"sas-crunchy-platform-postgres-repo1"}
-        - kubelet_volume_stats_available_bytes{persistentvolumeclaim=~"sas-crunchy-platform-postgres-repo1"})
-        / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"sas-crunchy-platform-postgres-repo1"})
-        * 100
-      for: 5m
-      labels:
-        severity: warning
-    - alert: viya-pod-restarts
-      annotations:
-        description:
-          Checks the restart count of the pod(s). Check why the pod(s) have
-          restarted so many times. One possible cause is an OOMKill, which
-          means the memory limit needs to be increased.
-        summary:
-          The number of pod restarts is > 20. The service pod(s) have
-          restarted many times due to issues.
-      expr: kube_pod_container_status_restarts_total{namespace="viya"}
-      for: 5m
-      labels:
-        severity: warning
diff --git a/v4m-chart/Chart.yaml b/v4m-chart/Chart.yaml
index dba9231e..e019633d 100644
--- a/v4m-chart/Chart.yaml
+++ b/v4m-chart/Chart.yaml
@@ -2,5 +2,5 @@ apiVersion: v2
 name: v4m
 description: SAS Viya 4 Monitoring for Kubernetes (https://github.com/sassoftware/viya4-monitoring-kubernetes)
 type: application
-version: "1.2.40"
-appVersion: "1.2.40"
+version: "1.2.41-SNAPSHOT"
+appVersion: "1.2.41-SNAPSHOT"