diff --git a/CHANGELOG.md b/CHANGELOG.md index 9e4fa24b..850577e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ * [UPGRADE] Elasticsearch Exporer Helm chart upgraded from 6.7.2 to 7.0.0 * [UPGRADE] OpenSearch Data Source Plugin to Grafana upgraded from 2.28.0 to 2.29.1 +* **Metrics** + * [CHANGE] The Grafana alerts targeting SAS Viya that previously were provided by default have been moved to the samples directory. Given the variability of SAS Viya environments, these alerts are now optional. They can be copied to USER_DIR/monitoring/alerting and customized to fit the SAS Viya environment prior to deployment. They have also been split into separate files for easier customization. See the [Alerting Samples README](samples/alerts/README.md) for more details. ## Version 1.2.41 (19AUG2025) * **Metrics** diff --git a/monitoring/alerting/rules/cas_alerts.yaml b/monitoring/alerting/rules/cas_alerts.yaml deleted file mode 100644 index bf77a1e5..00000000 --- a/monitoring/alerting/rules/cas_alerts.yaml +++ /dev/null @@ -1,208 +0,0 @@ -apiVersion: 1 -groups: - - interval: 5m - folder: CAS Alerts - name: SAS Viya Alerts - orgId: 1 - rules: - - title: CAS Restart Detected - annotations: - description: - Check to see that the CAS pod existed for a short time. This implies - that CAS pod has restarted for whatever the reason. Will need to further investigate - the cause. - summary: - The current CAS (sas-cas-server-default-controller) pod < 15 minutes - in existence. Mostly likely it is due to restart of the CAS pod. - condition: C - data: - - datasourceUid: prometheus - model: - disableTextWrap: false - editorMode: code - expr: cas_grid_uptime_seconds_total - fullMetaSearch: false - includeNullMetadata: true - instant: true - intervalMs: 1000 - legendFormat: __auto - maxDataPoints: 43200 - range: false - refId: A - useBackend: false - refId: A - relativeTimeRange: - from: 600 - to: 0 - - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: [] - type: gt - operator: - type: and - query: - params: - - B - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: A - intervalMs: 1000 - maxDataPoints: 43200 - reducer: last - refId: B - type: reduce - refId: B - relativeTimeRange: - from: 600 - to: 0 - - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: - - 900 - type: lt - operator: - type: and - query: - params: - - C - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: B - intervalMs: 1000 - maxDataPoints: 43200 - refId: C - type: threshold - refId: C - relativeTimeRange: - from: 600 - to: 0 - execErrState: Error - for: 5m - isPaused: false - labels: {} - noDataState: NoData - uid: fc41d560-9a18-4168-8a6a-615e60dc70de - - title: CAS Memory Usage High - annotations: - description: - Checks the CAS memory usage. If it is > 300GB, it will alert. Currently, - max. memory is 512GB. The expectation is that this alert will be an early - warning sign to investigate large memory usage as typical usage is less than - the threshold. Want to prevent OOMkill of CAS. - summary: - CAS memory > 300GB. This can be due to a program or pipeline taking - all the available memory. 
- condition: C - data: - - datasourceUid: prometheus - model: - editorMode: code - exemplar: false - expr: (cas_node_mem_size_bytes{type="physical"} - cas_node_mem_free_bytes{type="physical"})/1073741824 - instant: true - interval: "" - intervalMs: 1000 - legendFormat: __auto - maxDataPoints: 43200 - range: false - refId: A - refId: A - relativeTimeRange: - from: 600 - to: 0 - - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: [] - type: gt - operator: - type: and - query: - params: - - B - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: A - intervalMs: 1000 - maxDataPoints: 43200 - reducer: last - refId: B - type: reduce - refId: B - relativeTimeRange: - from: 600 - to: 0 - - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: - - 300 - type: gt - operator: - type: and - query: - params: - - C - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: B - intervalMs: 1000 - maxDataPoints: 43200 - refId: C - type: threshold - refId: C - relativeTimeRange: - from: 600 - to: 0 - execErrState: Error - for: 5m - isPaused: false - labels: {} - noDataState: NoData - uid: ca744a08-e4e9-49b7-85a1-79e9fe05d4c1 - - title: CAS Thread Count High - annotations: - description: - CAS thread count is higher than 400. May indicate overloaded CAS - server. - summary: CAS is using more than 400 threads. - condition: A - data: - - datasourceUid: prometheus - model: - expr: cas_thread_count > 400 - instant: true - refId: A - relativeTimeRange: - from: 300 - to: 0 - for: 5m - labels: - severity: warning - uid: cas_thread_count diff --git a/monitoring/alerting/rules/database_alerts.yaml b/monitoring/alerting/rules/database_alerts.yaml deleted file mode 100644 index 64992bb3..00000000 --- a/monitoring/alerting/rules/database_alerts.yaml +++ /dev/null @@ -1,299 +0,0 @@ -apiVersion: 1 -groups: - - interval: 5m - name: SAS Viya Alerts - folder: Database Alerts - orgId: 1 - rules: - - title: Catalog DB Connections High - annotations: - description: - Checks the in-use catalog database connections > 21. The default - db connection pool is 22. If it reaches the limit, the rabbitmq queues starts - to fill up with ready messages causing issues with Model Studio pipelines. - summary: - The active catalog database connections > 21. If it reaches the max. - db connections, it will impact the rabbitmq queues. 
- condition: C - data: - - datasourceUid: prometheus - model: - disableTextWrap: false - editorMode: builder - expr: sas_db_pool_connections{container="sas-catalog-services", state="inUse"} - fullMetaSearch: false - includeNullMetadata: true - instant: true - intervalMs: 1000 - legendFormat: __auto - maxDataPoints: 43200 - range: false - refId: A - useBackend: false - refId: A - relativeTimeRange: - from: 600 - to: 0 - - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: [] - type: gt - operator: - type: and - query: - params: - - B - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: A - intervalMs: 1000 - maxDataPoints: 43200 - reducer: last - refId: B - type: reduce - refId: B - relativeTimeRange: - from: 600 - to: 0 - - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: - - 21 - type: gt - operator: - type: and - query: - params: - - C - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: B - intervalMs: 1000 - maxDataPoints: 43200 - refId: C - type: threshold - refId: C - relativeTimeRange: - from: 600 - to: 0 - execErrState: Error - for: 5m - isPaused: false - labels: {} - noDataState: NoData - uid: fc65fbaf-c196-4eb4-a130-f45cc46b775b - - title: Crunchy PGData Usage High - annotations: - description: "Checks to see /pgdata filesystem is more than 50% full. - - Go to the URL to follow the troubleshooting steps." - summary: - /pgdata storage > 50% full. This typically happens when the WAL logs - are increasing and not being cleared. - condition: C - data: - - datasourceUid: prometheus - model: - disableTextWrap: false - editorMode: code - expr: - ((kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"sas-crunchy-platform-postgres-00-.*"} - - kubelet_volume_stats_available_bytes{persistentvolumeclaim=~"sas-crunchy-platform-postgres-00-.*"}) - / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"sas-crunchy-platform-postgres-00-.*"}) - * 100 - fullMetaSearch: false - includeNullMetadata: true - instant: true - intervalMs: 1000 - legendFormat: __auto - maxDataPoints: 43200 - range: false - refId: A - useBackend: false - refId: A - relativeTimeRange: - from: 600 - to: 0 - - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: [] - type: gt - operator: - type: and - query: - params: - - B - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: A - intervalMs: 1000 - maxDataPoints: 43200 - reducer: last - refId: B - type: reduce - refId: B - relativeTimeRange: - from: 600 - to: 0 - - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: - - 50 - type: gt - operator: - type: and - query: - params: - - C - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: B - intervalMs: 1000 - maxDataPoints: 43200 - refId: C - type: threshold - refId: C - relativeTimeRange: - from: 600 - to: 0 - execErrState: Error - for: 5m - isPaused: false - labels: {} - noDataState: NoData - uid: fb411e28-b2e5-43d0-a413-e6dedbf154c4 - - title: PostgreSQL Connection Utilization High - annotations: - description: PostgreSQL database connection usage is above 85% of max connections. - summary: Database is nearing connection limit. 
- condition: A - data: - - datasourceUid: prometheus - model: - expr: (pg_stat_activity_count / pg_settings_max_connections) * 100 > 85 - instant: true - refId: A - relativeTimeRange: - from: 300 - to: 0 - for: 5m - labels: - severity: warning - uid: postgres_connection_utilization - - title: Crunchy Backrest Repo - annotations: - description: Checks to see /pgbackrest/repo1 filesystem is more than 50% full. - summary: - /pgbackrest/repo1 storage > 50% full in the pgbackrest repo. This typically - happens when the archived WAL logs are increasing and not being expired and - cleared. - condition: C - data: - - datasourceUid: prometheus - model: - editorMode: code - expr: - ((kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"sas-crunchy-platform-postgres-repo1"} - - kubelet_volume_stats_available_bytes{persistentvolumeclaim=~"sas-crunchy-platform-postgres-repo1"}) - / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"sas-crunchy-platform-postgres-repo1"}) - * 100 - instant: true - intervalMs: 1000 - legendFormat: __auto - maxDataPoints: 43200 - range: false - refId: A - refId: A - relativeTimeRange: - from: 600 - to: 0 - - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: [] - type: gt - operator: - type: and - query: - params: - - B - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: A - intervalMs: 1000 - maxDataPoints: 43200 - reducer: last - refId: B - type: reduce - refId: B - relativeTimeRange: - from: 600 - to: 0 - - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: - - 50 - type: gt - operator: - type: and - query: - params: - - C - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: B - intervalMs: 1000 - maxDataPoints: 43200 - refId: C - type: threshold - refId: C - relativeTimeRange: - from: 600 - to: 0 - execErrState: Error - for: 5m - isPaused: false - labels: {} - noDataState: NoData - uid: abe80c6a-3add-477a-b228-f8283704570f diff --git a/monitoring/alerting/rules/other_alerts.yaml b/monitoring/alerting/rules/other_alerts.yaml deleted file mode 100644 index 5004bdd4..00000000 --- a/monitoring/alerting/rules/other_alerts.yaml +++ /dev/null @@ -1,31 +0,0 @@ -apiVersion: 1 -groups: - - interval: 5m - name: SAS Viya Alerts - folder: Other Alerts - orgId: 1 - rules: - - title: NFS Share Usage High - annotations: - description: Checks if the NFS share attached to CAS is > 85% full. - summary: - NFS share > 85% full. Typically, it is due to users filling their home - directory or backups. 
- condition: A - data: - - datasourceUid: prometheus - model: - expr: - "((kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=\"cas-default-data\"\ - }\n - kubelet_volume_stats_available_bytes{persistentvolumeclaim=\"cas-default-data\"\ - })\n / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=\"cas-default-data\"\ - }) * 100 > 85" - instant: true - refId: A - relativeTimeRange: - from: 21600 - to: 0 - for: 5m - labels: - severity: warning - uid: nfs_share_usage diff --git a/monitoring/alerting/rules/sas-job-launcher-rules.yaml b/monitoring/alerting/rules/sas-job-launcher-rules.yaml deleted file mode 100644 index 528c4a79..00000000 --- a/monitoring/alerting/rules/sas-job-launcher-rules.yaml +++ /dev/null @@ -1,74 +0,0 @@ -apiVersion: 1 -groups: - - name: SAS Job Launcher - folder: Job Monitoring - interval: 1m - rules: - - uid: sas_job_launcher_ready - title: SAS Job Launcher Pod Not Running - condition: C - data: - - refId: A - datasourceUid: prometheus - relativeTimeRange: - from: 600 - to: 0 - model: - expr: sum(:sas_launcher_pod_status:{phase="Running"}) - interval: "" - intervalMs: 1000 - maxDataPoints: 43200 - datasource: - type: prometheus - uid: prometheus - instant: true - refId: A - editorMode: code - range: false - - refId: B - datasourceUid: __expr__ - model: - type: reduce - expression: A - reducer: last - refId: B - datasource: - type: __expr__ - uid: __expr__ - intervalMs: 1000 - maxDataPoints: 43200 - - refId: C - datasourceUid: __expr__ - model: - type: threshold - expression: B - refId: C - datasource: - type: __expr__ - uid: __expr__ - conditions: - - type: query - evaluator: - type: lt - params: - - 1 - operator: - type: and - query: - params: - - C - reducer: - type: last - params: [] - intervalMs: 1000 - maxDataPoints: 43200 - noDataState: NoData - execErrState: Error - for: 5m - annotations: - summary: No SAS Job Launcher pods in Running state - description: - Checks if any SAS launcher jobs are in Running state. If none are - running, this may indicate an issue with job launching or orchestration. - labels: - severity: warning diff --git a/monitoring/alerting/rules/viya_platform_alerts.yaml b/monitoring/alerting/rules/viya_platform_alerts.yaml deleted file mode 100644 index a187529c..00000000 --- a/monitoring/alerting/rules/viya_platform_alerts.yaml +++ /dev/null @@ -1,322 +0,0 @@ -apiVersion: 1 -groups: - - interval: 5m - name: SAS Viya Alerts - folder: Viya Platform Alerts - orgId: 1 - rules: - - title: Viya Readiness Probe Failed - annotations: - description: - Checks for the Ready state of sas-readiness pod. Will need to check - the status of the Viya pods since sas-readiness pod reflects the health of - the Viya services. - summary: - sas-readiness pod is not in Ready state. This means that one or more - of the Viya services are not in a good state. 
- condition: C - data: - - datasourceUid: prometheus - model: - disableTextWrap: false - editorMode: builder - expr: kube_pod_container_status_ready{container="sas-readiness"} - fullMetaSearch: false - includeNullMetadata: true - instant: true - intervalMs: 1000 - legendFormat: __auto - maxDataPoints: 43200 - range: false - refId: A - useBackend: false - refId: A - relativeTimeRange: - from: 600 - to: 0 - - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: [] - type: gt - operator: - type: and - query: - params: - - B - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: A - intervalMs: 1000 - maxDataPoints: 43200 - reducer: last - refId: B - type: reduce - refId: B - relativeTimeRange: - from: 600 - to: 0 - - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: - - 1 - type: lt - operator: - type: and - query: - params: - - C - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: B - intervalMs: 1000 - maxDataPoints: 43200 - refId: C - type: threshold - refId: C - relativeTimeRange: - from: 600 - to: 0 - execErrState: Error - for: 5m - isPaused: false - labels: {} - noDataState: NoData - uid: e45e6d74-e396-40ce-a061-2a294295e61b - - title: RabbitMQ Ready Queue Backlog - annotations: - description: - Checks for accumulation of Rabbitmq ready messages > 10,000. The covers potential orphan - queues and/or bottlenecking of queues due to catalog service. - summary: - Rabbitmq ready messages > 10,000. This means there is a large backlog - of messages due to high activity (which can be temporary) or something has - gone wrong. - condition: C - data: - - datasourceUid: prometheus - model: - disableTextWrap: false - editorMode: builder - expr: rabbitmq_queue_messages_ready - fullMetaSearch: false - includeNullMetadata: true - instant: true - intervalMs: 1000 - legendFormat: __auto - maxDataPoints: 43200 - range: false - refId: A - useBackend: false - refId: A - relativeTimeRange: - from: 600 - to: 0 - - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: [] - type: gt - operator: - type: and - query: - params: - - B - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: A - intervalMs: 1000 - maxDataPoints: 43200 - reducer: last - refId: B - type: reduce - refId: B - relativeTimeRange: - from: 600 - to: 0 - - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: - - 10000 - type: gt - operator: - type: and - query: - params: - - C - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: B - intervalMs: 1000 - maxDataPoints: 43200 - refId: C - type: threshold - refId: C - relativeTimeRange: - from: 600 - to: 0 - execErrState: Error - for: 5m - isPaused: false - labels: {} - noDataState: NoData - uid: efb36686-4e44-4de8-80c4-7dde9130da90 - - title: Viya Pod Restart Count High - annotations: - description: - Checks the restart count of the pod(s). Will need to check why - the pod(s) have restarted so many times. One possible cause is OOMkill. This - means we will need to increase the memory limit. - summary: - The number of pod restarts > 20. The service pod(s) have restarted - many times due to issues. 
- condition: C - data: - - datasourceUid: prometheus - model: - disableTextWrap: false - editorMode: builder - expr: kube_pod_container_status_restarts_total{namespace="viya"} - fullMetaSearch: false - includeNullMetadata: true - instant: true - intervalMs: 1000 - legendFormat: __auto - maxDataPoints: 43200 - range: false - refId: A - useBackend: false - refId: A - relativeTimeRange: - from: 600 - to: 0 - - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: [] - type: gt - operator: - type: and - query: - params: - - B - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: A - intervalMs: 1000 - maxDataPoints: 43200 - reducer: last - refId: B - type: reduce - refId: B - relativeTimeRange: - from: 600 - to: 0 - - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: - - 20 - type: gt - operator: - type: and - query: - params: - - C - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: B - intervalMs: 1000 - maxDataPoints: 43200 - refId: C - type: threshold - refId: C - relativeTimeRange: - from: 600 - to: 0 - execErrState: Error - for: 5m - isPaused: false - labels: {} - noDataState: NoData - uid: e7ecb843-f1bd-48b7-8c8c-58571d1642ad - - title: RabbitMQ Unacked Queue Backlog - annotations: - description: - RabbitMQ has a high number of unacknowledged messages. This may - indicate stuck consumers or performance issues. - summary: Unacked messages in RabbitMQ are greater than 5000. - condition: A - data: - - datasourceUid: prometheus - model: - expr: rabbitmq_queue_messages_unacknowledged > 5000 - instant: true - refId: A - relativeTimeRange: - from: 300 - to: 0 - for: 5m - labels: - severity: warning - uid: rabbitmq_unacked_messages - - title: High Viya API Latency - annotations: - description: - Viya service API response latency is high (95th percentile over - 1s). - summary: Degraded performance on Viya APIs. - condition: A - data: - - datasourceUid: prometheus - model: - expr: - histogram_quantile(0.95, sum(rate(http_server_requests_duration_seconds_bucket{job=~"sas-.*"}[5m])) - by (le)) > 1 - instant: true - refId: A - relativeTimeRange: - from: 300 - to: 0 - for: 5m - labels: - severity: warning - uid: viya_api_latency diff --git a/monitoring/bin/deploy_monitoring_cluster.sh b/monitoring/bin/deploy_monitoring_cluster.sh index be1e11d7..845bbf1f 100755 --- a/monitoring/bin/deploy_monitoring_cluster.sh +++ b/monitoring/bin/deploy_monitoring_cluster.sh @@ -371,21 +371,12 @@ versionstring="$(get_helm_versionstring "$KUBE_PROM_STACK_CHART_VERSION")" log_debug "Installing Helm chart from artifact [$chart2install]" # Alerts -if [ -d "monitoring/alerting/rules" ]; then - log_verbose "Creating Grafana alert rules ConfigMap" +log_verbose "Creating Grafana alert rules ConfigMap" +CUSTOM_ALERT_CONFIG_DIR="$USER_DIR/monitoring/alerting/" - # Start with required file - CM_ARGS=(--from-file="monitoring/alerting/rules") - - CUSTOM_ALERT_CONFIG_DIR="$USER_DIR/monitoring/alerting/" - - # Add optional custom directory if it exists - if [ -d "$CUSTOM_ALERT_CONFIG_DIR" ]; then - log_debug "Including notifiers and additional alert rules from '$CUSTOM_ALERT_CONFIG_DIR'" - CM_ARGS+=(--from-file="$CUSTOM_ALERT_CONFIG_DIR") - else - log_debug "No custom alert config directory found at '$CUSTOM_ALERT_CONFIG_DIR'. Skipping." 
- fi +if [ -d "$CUSTOM_ALERT_CONFIG_DIR" ] && [ "$(ls -A "$CUSTOM_ALERT_CONFIG_DIR" 2> /dev/null)" ]; then + log_debug "Creating configmap for alert rules/notifiers/contact points defined in '$CUSTOM_ALERT_CONFIG_DIR'" + CM_ARGS=(--from-file="$CUSTOM_ALERT_CONFIG_DIR") # Run the kubectl command with all arguments kubectl create configmap grafana-alert-rules \ @@ -393,7 +384,12 @@ if [ -d "monitoring/alerting/rules" ]; then -n "$MON_NS" \ --dry-run=client -o yaml | kubectl apply -f - else - log_debug "No alert rules file found at 'monitoring/alerting/rules'" + log_debug "No custom alert files found at '$CUSTOM_ALERT_CONFIG_DIR'. Creating empty ConfigMap." + # Create an empty ConfigMap to satisfy the volume mount + kubectl create configmap grafana-alert-rules \ + -n "$MON_NS" \ + --from-literal=_README.txt="Copy alert rules from samples/alerts to $USER_DIR/monitoring/alerting/ to enable them." \ + --dry-run=client -o yaml | kubectl apply -f - fi # shellcheck disable=SC2086 diff --git a/samples/alerts/README.md b/samples/alerts/README.md new file mode 100644 index 00000000..d554dcde --- /dev/null +++ b/samples/alerts/README.md @@ -0,0 +1,126 @@ +# Alert Rules Structure + +This directory contains Grafana alert rules for monitoring SAS Viya environments. The alerts are organized into subdirectories by component/category: + +- `cas/` - Alerts for CAS (Cloud Analytic Services) +- `database/` - Alerts for database services +- `platform/` - Alerts for Viya platform components +- `other/` - Miscellaneous alerts + +## Alert Files Structure + +Each alert is stored in its own YAML file with a descriptive name. This modular approach makes it easier to: + +- Manage individual alerts +- Track changes in version control +- Enable/disable specific alerts +- Customize alerts for specific environments + +## Alert File Format + +Each alert file follows this structure: + +```yaml +apiVersion: 1 +groups: + - interval: 5m # How often the alert is evaluated + folder: Category Name # The folder where the alert appears in Grafana + name: SAS Viya Alerts # The alert group name + orgId: 1 + rules: + - title: Alert Title # The name of the alert + annotations: + description: Detailed explanation of the alert condition + summary: Brief summary of the alert + condition: C # The condition reference letter + data: + # The alert query and evaluation conditions + execErrState: Error + for: 5m # Duration before alert fires + labels: + severity: warning # Alert severity + noDataState: NoData + uid: unique-alert-id # Unique identifier for the alert +``` + +## Customizing Alerts + +To customize an alert: + +1. Copy the alert file to your user directory at `$USER_DIR/monitoring/alerting/` +2. Modify the alert parameters as needed (thresholds, evaluation intervals, etc.) +3. Deploy the monitoring components to apply your custom alerts + +> **Important**: To enable any of these sample alerts, you must copy them to your user directory at `$USER_DIR/monitoring/alerting/`. Alerts in the samples directory are not automatically deployed. + +### Sample Customizations + +The following elements may need to be adjusted to match your specific environment: + +#### 1. Namespace Specifications +- Change `namespace="viya"` to match your SAS Viya namespace in: + - `platform/viya-pod-restart-count-high.yaml` +- Verify the pattern `job=~"sas-.*"` in `platform/high-viya-api-latency.yaml` matches your service naming convention + +#### 2. 
Persistent Volume Claims +- Update PVC names in: + - `other/nfs-share-high-usage.yaml`: `persistentvolumeclaim="cas-default-data"` + - `database/crunchy-backrest-repo.yaml`: `persistentvolumeclaim=~"sas-crunchy-platform-postgres-repo1"` + - `database/crunchy-pgdata-usage-high.yaml`: `persistentvolumeclaim=~"sas-crunchy-platform-postgres-00-.*"` + +#### 3. Container Names +- Verify container names in: + - `database/catalog-db-connections-high.yaml`: `container="sas-catalog-services"` + - `platform/viya-readiness-probe-failed.yaml`: `container="sas-readiness"` + +#### 4. Alert Thresholds +Adjust thresholds based on your environment size and requirements: +- `cas/cas-memory-usage-high.yaml`: > 300 GB +- `platform/rabbitmq-ready-queue-backlog.yaml`: > 10,000 messages +- `platform/rabbitmq-unacked-queue-backlog.yaml`: > 5,000 messages +- `platform/viya-pod-restart-count-high.yaml`: > 20 restarts +- `other/nfs-share-high-usage.yaml`: > 85% full +- `database/crunchy-pgdata-usage-high.yaml` and `database/crunchy-backrest-repo.yaml`: > 50% full + +#### 5. Verify Metric Availability +Ensure the following metrics are available in your Prometheus instance: +- CAS metrics: `cas_grid_uptime_seconds_total` +- Database metrics: `sas_db_pool_connections`, `pg_stat_activity_count`, `pg_settings_max_connections` +- RabbitMQ metrics: `rabbitmq_queue_messages_ready`, `rabbitmq_queue_messages_unacked` +- Kubernetes metrics: `kube_pod_container_status_restarts_total`, `kube_pod_container_status_ready` +- HTTP metrics: `http_server_requests_duration_seconds_bucket` + +### Alert Expression Format + +Alert expressions in these samples use a multi-part approach for better compatibility with newer Grafana versions: + +- **Part A**: Fetches the raw metric +- **Part B**: Reduces the result (using the "reduce" function) +- **Part C**: Applies the threshold using a dedicated threshold component + +This approach addresses issues where direct threshold comparisons (e.g., `metric > threshold`) might not work properly in recent Grafana versions. If you experience "no data" results when the underlying metric has data, ensure your alert is using this multi-part approach. + +For example, instead of: +```yaml +expr: cas_thread_count > 400 +``` + +Use: +```yaml +# Part A: Fetch the metric +expr: cas_thread_count + +# Part B: Reduce the result +type: reduce +expression: A + +# Part C: Apply threshold +type: threshold +expression: B +evaluator: + type: gt + params: + - 400 +``` + +For more detailed information on Grafana alerting, see the [Grafana documentation](https://grafana.com/docs/grafana/latest/alerting/). diff --git a/samples/alerts/cas/cas-memory-usage-high.yaml b/samples/alerts/cas/cas-memory-usage-high.yaml new file mode 100644 index 00000000..5ed3dc41 --- /dev/null +++ b/samples/alerts/cas/cas-memory-usage-high.yaml @@ -0,0 +1,97 @@ +apiVersion: 1 +groups: + - interval: 5m + folder: CAS Alerts + name: SAS Viya Alerts + orgId: 1 + rules: + - title: CAS Memory Usage High + annotations: + description: + Checks the CAS memory usage. If it is > 300GB, it will alert. Currently, + max. memory is 512GB. The expectation is that this alert will be an early + warning sign to investigate large memory usage as typical usage is less than + the threshold. Want to prevent OOMkill of CAS. + summary: + CAS memory > 300GB. This can be due to a program or pipeline taking + all the available memory. 
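+ # Threshold: the 300 (GB) value in the refId C evaluator below is sized for the ~512 GB nodes described above; adjust to your CAS node memory.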
+ condition: C + data: + - datasourceUid: prometheus + model: + editorMode: code + exemplar: false + expr: (cas_node_mem_size_bytes{type="physical"} - cas_node_mem_free_bytes{type="physical"})/1073741824 + instant: true + interval: "" + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + refId: A + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - B + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + reducer: last + refId: B + type: reduce + refId: B + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 300 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + refId: C + relativeTimeRange: + from: 600 + to: 0 + execErrState: Error + for: 5m + isPaused: false + labels: {} + noDataState: NoData + uid: ca744a08-e4e9-49b7-85a1-79e9fe05d4c1 diff --git a/samples/alerts/cas/cas-restart-detected.yaml b/samples/alerts/cas/cas-restart-detected.yaml new file mode 100644 index 00000000..cbc796c5 --- /dev/null +++ b/samples/alerts/cas/cas-restart-detected.yaml @@ -0,0 +1,98 @@ +apiVersion: 1 +groups: + - interval: 5m + folder: CAS Alerts + name: SAS Viya Alerts + orgId: 1 + rules: + - title: CAS Restart Detected + annotations: + description: + Check to see that the CAS pod existed for a short time. This implies + that CAS pod has restarted for whatever the reason. Will need to further investigate + the cause. + summary: + The current CAS (sas-cas-server-default-controller) pod < 15 minutes + in existence. Mostly likely it is due to restart of the CAS pod. 
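+ # Threshold: the 900 (seconds) value in the refId C evaluator below flags CAS uptime under 15 minutes.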
+ condition: C + data: + - datasourceUid: prometheus + model: + disableTextWrap: false + editorMode: code + expr: cas_grid_uptime_seconds_total + fullMetaSearch: false + includeNullMetadata: true + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + useBackend: false + refId: A + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - B + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + reducer: last + refId: B + type: reduce + refId: B + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 900 + type: lt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + refId: C + relativeTimeRange: + from: 600 + to: 0 + execErrState: Error + for: 5m + isPaused: false + labels: {} + noDataState: NoData + uid: fc41d560-9a18-4168-8a6a-615e60dc70de diff --git a/samples/alerts/database/catalog-db-connections-high.yaml b/samples/alerts/database/catalog-db-connections-high.yaml new file mode 100644 index 00000000..b7fd1353 --- /dev/null +++ b/samples/alerts/database/catalog-db-connections-high.yaml @@ -0,0 +1,98 @@ +apiVersion: 1 +groups: + - interval: 5m + name: SAS Viya Alerts + folder: Database Alerts + orgId: 1 + rules: + - title: Catalog DB Connections High + annotations: + description: + Checks the in-use catalog database connections > 21. The default + db connection pool is 22. If it reaches the limit, the rabbitmq queues starts + to fill up with ready messages causing issues with Model Studio pipelines. + summary: + The active catalog database connections > 21. If it reaches the max. + db connections, it will impact the rabbitmq queues. 
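+ # Threshold: 21 in-use connections, just under the default catalog pool size of 22; adjust if your pool size differs.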
+ condition: C + data: + - datasourceUid: prometheus + model: + disableTextWrap: false + editorMode: builder + expr: sas_db_pool_connections{container="sas-catalog-services", state="inUse"} + fullMetaSearch: false + includeNullMetadata: true + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + useBackend: false + refId: A + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - B + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + reducer: last + refId: B + type: reduce + refId: B + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 21 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + refId: C + relativeTimeRange: + from: 600 + to: 0 + execErrState: Error + for: 5m + isPaused: false + labels: {} + noDataState: NoData + uid: fc65fbaf-c196-4eb4-a130-f45cc46b775b diff --git a/samples/alerts/database/crunchy-backrest-repo.yaml b/samples/alerts/database/crunchy-backrest-repo.yaml new file mode 100644 index 00000000..5f5397d2 --- /dev/null +++ b/samples/alerts/database/crunchy-backrest-repo.yaml @@ -0,0 +1,97 @@ +apiVersion: 1 +groups: + - interval: 5m + name: SAS Viya Alerts + folder: Database Alerts + orgId: 1 + rules: + - title: Crunchy Backrest Repo + annotations: + description: Checks to see /pgbackrest/repo1 filesystem is more than 50% full. + summary: + /pgbackrest/repo1 storage > 50% full in the pgbackrest repo. This typically + happens when the archived WAL logs are increasing and not being expired and + cleared. 
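+ # Adjust the PVC name pattern (sas-crunchy-platform-postgres-repo1) and the 50 (%) threshold below to your environment.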
+ condition: C + data: + - datasourceUid: prometheus + model: + editorMode: code + expr: + ((kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"sas-crunchy-platform-postgres-repo1"} + - kubelet_volume_stats_available_bytes{persistentvolumeclaim=~"sas-crunchy-platform-postgres-repo1"}) + / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"sas-crunchy-platform-postgres-repo1"}) + * 100 + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + refId: A + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - B + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + reducer: last + refId: B + type: reduce + refId: B + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 50 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + refId: C + relativeTimeRange: + from: 600 + to: 0 + execErrState: Error + for: 5m + isPaused: false + labels: + severity: warning + noDataState: NoData + uid: abe80c6a-3add-477a-b228-f8283704570f diff --git a/samples/alerts/database/crunchy-pgdata-usage-high.yaml b/samples/alerts/database/crunchy-pgdata-usage-high.yaml new file mode 100644 index 00000000..c317493f --- /dev/null +++ b/samples/alerts/database/crunchy-pgdata-usage-high.yaml @@ -0,0 +1,102 @@ +apiVersion: 1 +groups: + - interval: 5m + name: SAS Viya Alerts + folder: Database Alerts + orgId: 1 + rules: + - title: Crunchy PGData Usage High + annotations: + description: "Checks to see /pgdata filesystem is more than 50% full. + + Go to the URL to follow the troubleshooting steps." + summary: + /pgdata storage > 50% full. This typically happens when the WAL logs + are increasing and not being cleared. 
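+ # Adjust the PVC name pattern (sas-crunchy-platform-postgres-00-.*) and the 50 (%) threshold below to your environment.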
+ condition: C + data: + - datasourceUid: prometheus + model: + disableTextWrap: false + editorMode: code + expr: + ((kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"sas-crunchy-platform-postgres-00-.*"} + - kubelet_volume_stats_available_bytes{persistentvolumeclaim=~"sas-crunchy-platform-postgres-00-.*"}) + / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"sas-crunchy-platform-postgres-00-.*"}) + * 100 + fullMetaSearch: false + includeNullMetadata: true + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + useBackend: false + refId: A + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - B + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + reducer: last + refId: B + type: reduce + refId: B + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 50 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + refId: C + relativeTimeRange: + from: 600 + to: 0 + execErrState: Error + for: 5m + isPaused: false + labels: + severity: warning + noDataState: NoData + uid: fb411e28-b2e5-43d0-a413-e6dedbf154c4 diff --git a/samples/alerts/other/nfs-share-high-usage.yaml b/samples/alerts/other/nfs-share-high-usage.yaml new file mode 100644 index 00000000..18c6feec --- /dev/null +++ b/samples/alerts/other/nfs-share-high-usage.yaml @@ -0,0 +1,92 @@ +apiVersion: 1 +groups: + - interval: 5m + name: SAS Viya Alerts + folder: Other Alerts + orgId: 1 + rules: + - title: NFS Share Usage High + annotations: + description: Checks if the NFS share attached to CAS is > 85% full. + summary: + NFS share > 85% full. Typically, it is due to users filling their home + directory or backups. 
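+ # Adjust the PVC name (cas-default-data) and the 85 (%) threshold below to match your CAS data volume.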
+ condition: C + data: + - datasourceUid: prometheus + model: + expr: | + (kubelet_volume_stats_capacity_bytes{persistentvolumeclaim="cas-default-data"} + - kubelet_volume_stats_available_bytes{persistentvolumeclaim="cas-default-data"}) + / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim="cas-default-data"} * 100 + instant: true + intervalMs: 1000 + maxDataPoints: 43200 + refId: A + refId: A + relativeTimeRange: + from: 21600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - B + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + reducer: last + refId: B + type: reduce + refId: B + relativeTimeRange: + from: 21600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 85 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + refId: C + relativeTimeRange: + from: 21600 + to: 0 + execErrState: Error + for: 5m + isPaused: false + labels: + severity: warning + noDataState: NoData + uid: nfs_share_usage diff --git a/samples/alerts/platform/rabbitmq-ready-queue-backlog.yaml b/samples/alerts/platform/rabbitmq-ready-queue-backlog.yaml new file mode 100644 index 00000000..3bb12aee --- /dev/null +++ b/samples/alerts/platform/rabbitmq-ready-queue-backlog.yaml @@ -0,0 +1,99 @@ +apiVersion: 1 +groups: + - interval: 5m + name: SAS Viya Alerts + folder: Viya Platform Alerts + orgId: 1 + rules: + - title: RabbitMQ Ready Queue Backlog + annotations: + description: + Checks for accumulation of Rabbitmq ready messages > 10,000. The covers potential orphan + queues and/or bottlenecking of queues due to catalog service. + summary: + Rabbitmq ready messages > 10,000. This means there is a large backlog + of messages due to high activity (which can be temporary) or something has + gone wrong. 
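+ # Threshold: 10000 ready messages in the refId C evaluator below; tune to your typical queue depth.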
+ condition: C + data: + - datasourceUid: prometheus + model: + disableTextWrap: false + editorMode: builder + expr: rabbitmq_queue_messages_ready + fullMetaSearch: false + includeNullMetadata: true + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + useBackend: false + refId: A + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - B + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + reducer: last + refId: B + type: reduce + refId: B + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 10000 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + refId: C + relativeTimeRange: + from: 600 + to: 0 + execErrState: Error + for: 5m + isPaused: false + labels: + severity: warning + noDataState: NoData + uid: efb36686-4e44-4de8-80c4-7dde9130da90 diff --git a/samples/alerts/platform/rabbitmq-unacked-queue-backlog.yaml b/samples/alerts/platform/rabbitmq-unacked-queue-backlog.yaml new file mode 100644 index 00000000..caaf82e4 --- /dev/null +++ b/samples/alerts/platform/rabbitmq-unacked-queue-backlog.yaml @@ -0,0 +1,91 @@ +apiVersion: 1 +groups: + - interval: 5m + folder: Viya Platform Alerts + name: SAS Viya Alerts + orgId: 1 + rules: + - title: RabbitMQ Unacknowledged Queue Backlog + annotations: + description: + Checks for accumulation of Rabbitmq unacknowledged messages. This means + consumers are taking a long time to process messages or have failed. + summary: + Rabbitmq unacknowledged messages > 5000. Consumer services might be having + issues processing messages. 
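+ # Threshold: 5000 unacknowledged messages in the refId C evaluator below; tune to your typical consumer throughput.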
+ condition: C + data: + - datasourceUid: prometheus + model: + expr: rabbitmq_queue_messages_unacked + instant: true + intervalMs: 1000 + maxDataPoints: 43200 + refId: A + refId: A + relativeTimeRange: + from: 300 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - B + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + reducer: last + refId: B + type: reduce + refId: B + relativeTimeRange: + from: 300 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 5000 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + refId: C + relativeTimeRange: + from: 300 + to: 0 + execErrState: Error + for: 5m + isPaused: false + labels: + severity: warning + noDataState: NoData + uid: rabbitmq_unacked_queue diff --git a/samples/alerts/platform/viya-pod-restart-count-high.yaml b/samples/alerts/platform/viya-pod-restart-count-high.yaml new file mode 100644 index 00000000..8b9bd00d --- /dev/null +++ b/samples/alerts/platform/viya-pod-restart-count-high.yaml @@ -0,0 +1,99 @@ +apiVersion: 1 +groups: + - interval: 5m + name: SAS Viya Alerts + folder: Viya Platform Alerts + orgId: 1 + rules: + - title: Viya Pod Restart Count High + annotations: + description: + Checks the restart count of the pod(s). Will need to check why + the pod(s) have restarted so many times. One possible cause is OOMkill. This + means we will need to increase the memory limit. + summary: + The number of pod restarts > 20. The service pod(s) have restarted + many times due to issues. 
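+ # Threshold: 20 restarts in the refId C evaluator below; the namespace selector in the expr must match your SAS Viya namespace.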
+ condition: C + data: + - datasourceUid: prometheus + model: + disableTextWrap: false + editorMode: builder + expr: kube_pod_container_status_restarts_total{namespace="viya"} # namespace can be adjusted + fullMetaSearch: false + includeNullMetadata: true + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + useBackend: false + refId: A + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - B + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + reducer: last + refId: B + type: reduce + refId: B + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 20 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + refId: C + relativeTimeRange: + from: 600 + to: 0 + execErrState: Error + for: 5m + isPaused: false + labels: + severity: warning + noDataState: NoData + uid: e7ecb843-f1bd-48b7-8c8c-58571d1642ad diff --git a/samples/alerts/platform/viya-readiness-probe-failed.yaml b/samples/alerts/platform/viya-readiness-probe-failed.yaml new file mode 100644 index 00000000..982e463d --- /dev/null +++ b/samples/alerts/platform/viya-readiness-probe-failed.yaml @@ -0,0 +1,99 @@ +apiVersion: 1 +groups: + - interval: 5m + name: SAS Viya Alerts + folder: Viya Platform Alerts + orgId: 1 + rules: + - title: Viya Readiness Probe Failed + annotations: + description: + Checks for the Ready state of sas-readiness pod. Will need to check + the status of the Viya pods since sas-readiness pod reflects the health of + the Viya services. + summary: + sas-readiness pod is not in Ready state. This means that one or more + of the Viya services are not in a good state. + condition: C + data: + - datasourceUid: prometheus + model: + disableTextWrap: false + editorMode: builder + expr: kube_pod_container_status_ready{container="sas-readiness"} + fullMetaSearch: false + includeNullMetadata: true + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + useBackend: false + refId: A + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - B + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + reducer: last + refId: B + type: reduce + refId: B + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 1 + type: lt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + refId: C + relativeTimeRange: + from: 600 + to: 0 + execErrState: Error + for: 5m + isPaused: false + labels: + severity: warning + noDataState: NoData + uid: e45e6d74-e396-40ce-a061-2a294295e61b
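
The deploy script change above builds the `grafana-alert-rules` ConfigMap from `$USER_DIR/monitoring/alerting/`, so the sample alerts only take effect after they are copied there, as the new README describes. A minimal sketch of that copy, customize, and deploy flow, assuming the repository root as the working directory and that the `USER_DIR` and `MON_NS` values shown are placeholders for your own user directory and monitoring namespace:

```bash
# Example values -- replace with your existing USER_DIR and monitoring namespace.
export USER_DIR=~/viya4-monitoring-user-dir
export MON_NS=monitoring

# Copy the sample alerts you want to enable (here: CAS and platform alerts).
mkdir -p "$USER_DIR/monitoring/alerting"
cp samples/alerts/cas/*.yaml samples/alerts/platform/*.yaml "$USER_DIR/monitoring/alerting/"

# Adjust thresholds, namespaces, and PVC names in the copied files, then redeploy
# so the grafana-alert-rules ConfigMap is rebuilt from the user directory.
monitoring/bin/deploy_monitoring_cluster.sh

# Verify that the copied alert files appear as keys in the ConfigMap.
kubectl describe configmap grafana-alert-rules -n "$MON_NS"
```

If `$USER_DIR/monitoring/alerting/` is missing or empty, the script instead creates a placeholder ConfigMap containing only a `_README.txt` entry, so deployments without custom alerts still succeed.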