update deploy script and update samples alert readme

ceelias · ceelias · commit 23716a8e8efc · 2025-09-11T10:14:00.000-04:00
diff --git a/monitoring/bin/deploy_monitoring_cluster.sh b/monitoring/bin/deploy_monitoring_cluster.sh
@@ -371,29 +371,21 @@ versionstring="$(get_helm_versionstring "$KUBE_PROM_STACK_CHART_VERSION")"
 log_debug "Installing Helm chart from artifact [$chart2install]"
 
 # Alerts
-if [ -d "monitoring/alerting/rules" ]; then
-    log_verbose "Creating Grafana alert rules ConfigMap"
+log_verbose "Creating Grafana alert rules ConfigMap"
+CUSTOM_ALERT_CONFIG_DIR="$USER_DIR/monitoring/alerting/"
 
-    # Start with required file
-    CM_ARGS=(--from-file="monitoring/alerting/rules")
-
-    CUSTOM_ALERT_CONFIG_DIR="$USER_DIR/monitoring/alerting/"
-
-    # Add optional custom directory if it exists
-    if [ -d "$CUSTOM_ALERT_CONFIG_DIR" ]; then
-        log_debug "Including notifiers and additional alert rules from '$CUSTOM_ALERT_CONFIG_DIR'"
-        CM_ARGS+=(--from-file="$CUSTOM_ALERT_CONFIG_DIR")
-    else
-        log_debug "No custom alert config directory found at '$CUSTOM_ALERT_CONFIG_DIR'. Skipping."
-    fi
+# Create the ConfigMap from the custom alert config directory, if it exists
+if [ -d "$CUSTOM_ALERT_CONFIG_DIR" ]; then
+    log_debug "Including notifiers and additional alert rules from '$CUSTOM_ALERT_CONFIG_DIR'"
+    CM_ARGS=(--from-file="$CUSTOM_ALERT_CONFIG_DIR")
 
     # Run the kubectl command with all arguments
     kubectl create configmap grafana-alert-rules \
         "${CM_ARGS[@]}" \
         -n "$MON_NS" \
         --dry-run=client -o yaml | kubectl apply -f -
 else
-    log_debug "No alert rules file found at 'monitoring/alerting/rules'"
+    log_debug "No custom alert config directory found at '$CUSTOM_ALERT_CONFIG_DIR'. Skipping."
 fi
 
 # shellcheck disable=SC2086
diff --git a/samples/alerts/README.md b/samples/alerts/README.md
@@ -83,6 +83,15 @@ Adjust thresholds based on your environment size and requirements:
 - `platform/high-viya-api-latency.yaml`: > 1 second (95th percentile)
 - `database/crunchy-pgdata-usage-high.yaml` and `database/crunchy-backrest-repo.yaml`: > 50% full
 
+#### 5. Verify Metric Availability
+Ensure the following metrics are available in your Prometheus instance:
+- CAS metrics: `cas_thread_count`, `cas_grid_uptime_seconds_total`
+- Database metrics: `sas_db_pool_connections`, `pg_stat_activity_count`, `pg_settings_max_connections`
+- RabbitMQ metrics: `rabbitmq_queue_messages_ready`, `rabbitmq_queue_messages_unacknowledged`
+- Kubernetes metrics: `kube_pod_container_status_restarts_total`, `kube_pod_container_status_ready`
+- HTTP metrics: `http_server_requests_duration_seconds_bucket`
+- SAS Job Launcher: `:sas_launcher_pod_status:` (recording rule)
+
 ### Alert Expression Format
 
 Alert expressions in these samples use a multi-part approach for better compatibility with newer Grafana versions: