
Commit 37d0bcd

begin cleanup of alerts
1 parent ccd5905 commit 37d0bcd

1 file changed (+11, −297 lines)

monitoring/alerting/rules/viya-alert-rules.yaml

Lines changed: 11 additions & 297 deletions
@@ -94,7 +94,7 @@ groups:
         isPaused: false
         labels: {}
         noDataState: NoData
-        title: cas-restart
+        title: CAS Restart Detected
         uid: fc41d560-9a18-4168-8a6a-615e60dc70de
       - annotations:
           description:
@@ -184,53 +184,8 @@ groups:
         isPaused: false
         labels: {}
         noDataState: NoData
-        title: cas-memory
+        title: CAS Memory Usage High
         uid: ca744a08-e4e9-49b7-85a1-79e9fe05d4c1
-      - annotations:
-          description:
-            Check to see that the CAS pod existed for a short time. This implies
-            that CAS pod has restarted for whatever the reason. Will need to further investigate
-            the cause.
-          summary:
-            The current CAS (sas-cas-server-default-controller) pod < 15 minutes
-            in existence. Mostly likely it is due to restart of the CAS pod.
-        condition: A
-        data:
-          - datasourceUid: prometheus
-            model:
-              expr: cas_grid_uptime_seconds_total
-              instant: true
-              refId: A
-            relativeTimeRange:
-              from: 300
-              to: 0
-        for: 5m
-        labels:
-          severity: warning
-        title: CAS Restart Alert
-        uid: cas_restart_alert
-      - annotations:
-          description: Checks the CAS memory usage. If it is > 300GB, it will alert.
-          summary:
-            CAS memory > 300GB. This can be due to a program or pipeline taking
-            all the available memory.
-        condition: A
-        data:
-          - datasourceUid: prometheus
-            model:
-              expr:
-                (cas_node_mem_size_bytes{type="physical"} - cas_node_mem_free_bytes{type="physical"})
-                / 1073741824 > 300
-              instant: true
-              refId: A
-            relativeTimeRange:
-              from: 300
-              to: 0
-        for: 5m
-        labels:
-          severity: warning
-        title: CAS Memory Usage
-        uid: cas_memory_usage
       - annotations:
           description:
             CAS thread count is higher than 400. May indicate overloaded CAS
@@ -345,7 +300,7 @@ groups:
         isPaused: false
         labels: {}
         noDataState: NoData
-        title: viya-readiness
+        title: Viya Readiness Probe Failed
         uid: e45e6d74-e396-40ce-a061-2a294295e61b
       - annotations:
           description:
@@ -438,7 +393,7 @@ groups:
         isPaused: false
         labels: {}
         noDataState: NoData
-        title: rabbitmq-readymessages
+        title: RabbitMQ Ready Queue Backlog
         uid: efb36686-4e44-4de8-80c4-7dde9130da90
       - annotations:
           description:
@@ -527,7 +482,7 @@ groups:
         isPaused: true
         labels: {}
         noDataState: OK
-        title: compute-age
+        title: Stale Compute Pod Detected
         uid: ed69b8e4-ce60-44a0-8f51-83743df0e448
       - annotations:
           description:
@@ -618,91 +573,8 @@ groups:
         isPaused: false
         labels: {}
         noDataState: NoData
-        title: viya-pod-restarts
+        title: Viya Pod Restart Count High
         uid: e7ecb843-f1bd-48b7-8c8c-58571d1642ad
-      - annotations:
-          description:
-            Checks for the Ready state of sas-readiness pod. Will need to check
-            the status of the Viya pods since sas-readiness pod reflects the health of
-            the Viya services.
-          summary:
-            sas-readiness pod is not in Ready state. This means that one or more
-            of the Viya services are not in a good state.
-        condition: A
-        data:
-          - datasourceUid: prometheus
-            model:
-              expr: kube_pod_container_status_ready{container="sas-readiness"}
-              instant: true
-              refId: A
-            relativeTimeRange:
-              from: 300
-              to: 0
-        for: 5m
-        labels:
-          severity: warning
-        title: Viya Readiness
-        uid: viya_readiness
-      - annotations:
-          description:
-            Checks for accumulation of Rabbitmq ready messages > 10,000. It
-            could impact Model Studio pipelines.
-          summary:
-            Rabbitmq ready messages > 10,000. This means there is a large backlog
-            of messages due to high activity or something has gone wrong.
-        condition: A
-        data:
-          - datasourceUid: prometheus
-            model:
-              expr: rabbitmq_queue_messages_ready > 10000
-              instant: true
-              refId: A
-            relativeTimeRange:
-              from: 300
-              to: 0
-        for: 5m
-        labels:
-          severity: warning
-        title: RabbitMQ Ready Messages
-        uid: rabbitmq_ready_msgs
-      - annotations:
-          description: Looks for compute pods > 1 day.
-          summary: SAS compute-server pods > 1 day old.
-        condition: A
-        data:
-          - datasourceUid: prometheus
-            model:
-              expr:
-                (time() - kube_pod_created{pod=~"sas-compute-server-.*"}) / 60 / 60
-                / 24 > 1
-              instant: true
-              refId: A
-            relativeTimeRange:
-              from: 300
-              to: 0
-        for: 5m
-        labels:
-          severity: warning
-        title: Compute Pod Age
-        uid: compute_pod_age
-      - annotations:
-          description: Checks if any Viya pods have restarted > 20 times.
-          summary: The number of pod restarts > 20. Investigate for OOM or instability.
-        condition: A
-        data:
-          - datasourceUid: prometheus
-            model:
-              expr: kube_pod_container_status_restarts_total{namespace="viya"} > 20
-              instant: true
-              refId: A
-            relativeTimeRange:
-              from: 300
-              to: 0
-        for: 5m
-        labels:
-          severity: warning
-        title: Viya Pod Restarts
-        uid: viya_pod_restarts
       - annotations:
           description:
             RabbitMQ has a high number of unacknowledged messages. This may
@@ -721,7 +593,7 @@ groups:
         for: 5m
         labels:
           severity: warning
-        title: RabbitMQ Unacked Messages High
+        title: RabbitMQ Unacked Queue Backlog
         uid: rabbitmq_unacked_messages
       - annotations:
           description:
@@ -743,105 +615,13 @@ groups:
         for: 5m
         labels:
           severity: warning
-        title: Viya API Latency High
+        title: High Viya API Latency
         uid: viya_api_latency
   - folder: Other Alerts
     interval: 5m
     name: SAS Viya Alerts
     orgId: 1
     rules:
-      - annotations:
-          description:
-            Checks if the NFS share attached to CAS is > 85% full. Use command
-            "du -h -d 1" to to find the location where large files are located in the
-            NFS shares. Most likely it will be one of the home directories due to runaway
-            size of a casuser table or Viya backups.
-          summary:
-            NFS share > 85% full. Typically, it is due to users filling their own
-            home directory or backups.
-        condition: C
-        data:
-          - datasourceUid: prometheus
-            model:
-              editorMode: code
-              expr:
-                ((kubelet_volume_stats_capacity_bytes{persistentvolumeclaim="cas-default-data"}
-                - kubelet_volume_stats_available_bytes{persistentvolumeclaim="cas-default-data"})
-                / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim="cas-default-data"})
-                * 100
-              instant: true
-              intervalMs: 1000
-              legendFormat: __auto
-              maxDataPoints: 43200
-              range: false
-              refId: A
-            refId: A
-            relativeTimeRange:
-              from: 600
-              to: 0
-          - datasourceUid: __expr__
-            model:
-              conditions:
-                - evaluator:
-                    params: []
-                    type: gt
-                  operator:
-                    type: and
-                  query:
-                    params:
-                      - B
-                  reducer:
-                    params: []
-                    type: last
-                  type: query
-              datasource:
-                type: __expr__
-                uid: __expr__
-              expression: A
-              intervalMs: 1000
-              maxDataPoints: 43200
-              reducer: last
-              refId: B
-              type: reduce
-            refId: B
-            relativeTimeRange:
-              from: 600
-              to: 0
-          - datasourceUid: __expr__
-            model:
-              conditions:
-                - evaluator:
-                    params:
-                      - 85
-                    type: gt
-                  operator:
-                    type: and
-                  query:
-                    params:
-                      - C
-                  reducer:
-                    params: []
-                    type: last
-                  type: query
-              datasource:
-                type: __expr__
-                uid: __expr__
-              expression: B
-              intervalMs: 1000
-              maxDataPoints: 43200
-              refId: C
-              type: threshold
-            refId: C
-            relativeTimeRange:
-              from: 600
-              to: 0
-        execErrState: Error
-        for: 5m
-        isPaused: false
-        labels: {}
-        noDataState: NoData
-        title: NFS-share
-        uid: d52b3c24-acf4-4b5e-ae52-31ff8f167330
       - annotations:
           description:
             "Checks to see /pgbackrest/repo1 filesystem is more than 50% full.
@@ -956,32 +736,8 @@ groups:
         for: 5m
         labels:
           severity: warning
-        title: NFS Share Usage
+        title: NFS Share Usage High
         uid: nfs_share_usage
-      - annotations:
-          description: Checks if /pgbackrest/repo1 is more than 50% full.
-          summary:
-            /pgbackrest/repo1 storage > 50% full. Possibly due to unexpired WAL
-            logs.
-        condition: A
-        data:
-          - datasourceUid: prometheus
-            model:
-              expr:
-                "((kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~\"sas-crunchy-platform-postgres-repo1\"\
-                }\n - kubelet_volume_stats_available_bytes{persistentvolumeclaim=~\"sas-crunchy-platform-postgres-repo1\"\
-                })\n / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~\"sas-crunchy-platform-postgres-repo1\"\
-                }) * 100 > 50"
-              instant: true
-              refId: A
-            relativeTimeRange:
-              from: 300
-              to: 0
-        for: 5m
-        labels:
-          severity: warning
-        title: Crunchy Backrest Repo Usage
-        uid: pgbackrest_repo_usage
   - folder: Database Alerts
     interval: 5m
     name: SAS Viya Alerts
@@ -1078,7 +834,7 @@ groups:
         isPaused: false
         labels: {}
         noDataState: NoData
-        title: catalog-dbconn
+        title: Catalog DB Connections High
         uid: fc65fbaf-c196-4eb4-a130-f45cc46b775b
       - annotations:
           description: "Checks to see /pgdata filesystem is more than 50% full.
@@ -1172,50 +928,8 @@ groups:
         isPaused: false
         labels: {}
         noDataState: NoData
-        title: crunchy-pgdata
+        title: Crunchy PGData Usage High
         uid: fb411e28-b2e5-43d0-a413-e6dedbf154c4
-      - annotations:
-          description: Checks the in-use catalog database connections > 21.
-          summary: The active catalog database connections > 21. May impact RabbitMQ queues.
-        condition: A
-        data:
-          - datasourceUid: prometheus
-            model:
-              expr:
-                sas_db_pool_connections{container="sas-catalog-services", state="inUse"}
-                > 21
-              instant: true
-              refId: A
-            relativeTimeRange:
-              from: 300
-              to: 0
-        for: 5m
-        labels:
-          severity: warning
-        title: Catalog DB Connections
-        uid: catalog_db_connections
-      - annotations:
-          description: Checks if /pgdata is more than 50% full.
-          summary: /pgdata storage > 50% full. Often due to WAL logs not being cleared.
-        condition: A
-        data:
-          - datasourceUid: prometheus
-            model:
-              expr:
-                "((kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~\"sas-crunchy-platform-postgres-00-.*\"\
-                }\n - kubelet_volume_stats_available_bytes{persistentvolumeclaim=~\"sas-crunchy-platform-postgres-00-.*\"\
-                })\n / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~\"sas-crunchy-platform-postgres-00-.*\"\
-                }) * 100 > 50"
-              instant: true
-              refId: A
-            relativeTimeRange:
-              from: 300
-              to: 0
-        for: 5m
-        labels:
-          severity: warning
-        title: Crunchy PGData Usage
-        uid: pgdata_usage
       - annotations:
           description: PostgreSQL database connection usage is above 85% of max connections.
           summary: Database is nearing connection limit.
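
Note: every rule touched above follows the same Grafana file-provisioned alert-rule schema already used throughout viya-alert-rules.yaml. As a quick reference when consolidating further alerts, a minimal sketch of one rule entry is shown below; the uid, title, and expr are illustrative placeholders only, not values taken from this file.

groups:
  - folder: Other Alerts
    interval: 5m
    name: SAS Viya Alerts
    orgId: 1
    rules:
      - annotations:
          description: What the alert checks and how to follow up.
          summary: Short statement of the alerting condition.
        condition: A
        data:
          - datasourceUid: prometheus
            model:
              # placeholder PromQL; replace with the real check
              expr: up{namespace="viya"} == 0
              instant: true
              refId: A
            relativeTimeRange:
              from: 300
              to: 0
        execErrState: Error
        for: 5m
        isPaused: false
        labels:
          severity: warning
        noDataState: NoData
        title: Example Alert Title
        uid: example_alert_uid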
