diff --git a/CHANGELOG.md b/CHANGELOG.md index 262da43a..f72fcb66 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,8 @@ * [UPGRADE] OpenSearch Data Source Plugin to Grafana upgraded from 2.23.1 to 2.24.0 * [UPGRADE] Admission Webhook upgraded from v1.5.1 to v1.5.2 * [CHANGE] Enable Grafana feature flag: prometheusSpecialCharsInLabelValues to improve handling of special characters in metric labels (addresses #699) + * [FEATURE] A set of SAS Viya specific alerts is now deployed with Grafana. Administrators can configure notifiers (which trigger messages via e-mail, Slack, SMS, etc. based on these alerts) and additional alerts via the Grafana web application after deployment. Or, alternatively, notifiers and/or additional alerts can be defined prior to running the monitoring deployment script ( `deploy_monitoring_cluster.sh` ) by placing yaml files in `$USER_DIR/monitoring/alerting/` Note: Due to Grafana's use of a single folder namespace, the folders used to organize these new Alerts will also appear when viewing Dashboards and will appear to be empty. When working with Dashboards, these folders can be ignored. + * **Logging** * [FIX] Resolved issue causing deploy_esexporter.sh to fail when doing an upgrade-in-place and serviceMonitor CRD is not installed. diff --git a/monitoring/alerting/rules/cas_alerts.yaml b/monitoring/alerting/rules/cas_alerts.yaml new file mode 100644 index 00000000..bf77a1e5 --- /dev/null +++ b/monitoring/alerting/rules/cas_alerts.yaml @@ -0,0 +1,208 @@ +apiVersion: 1 +groups: + - interval: 5m + folder: CAS Alerts + name: SAS Viya Alerts + orgId: 1 + rules: + - title: CAS Restart Detected + annotations: + description: + Check to see that the CAS pod existed for a short time. This implies + that CAS pod has restarted for whatever the reason. Will need to further investigate + the cause. + summary: + The current CAS (sas-cas-server-default-controller) pod < 15 minutes + in existence. Mostly likely it is due to restart of the CAS pod. + condition: C + data: + - datasourceUid: prometheus + model: + disableTextWrap: false + editorMode: code + expr: cas_grid_uptime_seconds_total + fullMetaSearch: false + includeNullMetadata: true + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + useBackend: false + refId: A + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - B + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + reducer: last + refId: B + type: reduce + refId: B + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 900 + type: lt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + refId: C + relativeTimeRange: + from: 600 + to: 0 + execErrState: Error + for: 5m + isPaused: false + labels: {} + noDataState: NoData + uid: fc41d560-9a18-4168-8a6a-615e60dc70de + - title: CAS Memory Usage High + annotations: + description: + Checks the CAS memory usage. If it is > 300GB, it will alert. Currently, + max. memory is 512GB. 
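As mentioned in the CHANGELOG entry above, notifiers can be provisioned by dropping YAML files into `$USER_DIR/monitoring/alerting/` before running `deploy_monitoring_cluster.sh`. A minimal sketch of such a notifier file, assuming Grafana's alerting provisioning schema for contact points (the contact point name and e-mail address are placeholders):

```yaml
apiVersion: 1
contactPoints:
  - orgId: 1
    name: viya-ops-email            # placeholder name, referenced by notification policies
    receivers:
      - uid: viya-ops-email
        type: email
        settings:
          addresses: ops-team@example.com   # placeholder address
```

Files placed in that directory are added to the same ConfigMap as the rule files in this change and end up in Grafana's provisioning/alerting directory.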
The expectation is that this alert will be an early + warning sign to investigate large memory usage as typical usage is less than + the threshold. Want to prevent OOMkill of CAS. + summary: + CAS memory > 300GB. This can be due to a program or pipeline taking + all the available memory. + condition: C + data: + - datasourceUid: prometheus + model: + editorMode: code + exemplar: false + expr: (cas_node_mem_size_bytes{type="physical"} - cas_node_mem_free_bytes{type="physical"})/1073741824 + instant: true + interval: "" + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + refId: A + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - B + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + reducer: last + refId: B + type: reduce + refId: B + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 300 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + refId: C + relativeTimeRange: + from: 600 + to: 0 + execErrState: Error + for: 5m + isPaused: false + labels: {} + noDataState: NoData + uid: ca744a08-e4e9-49b7-85a1-79e9fe05d4c1 + - title: CAS Thread Count High + annotations: + description: + CAS thread count is higher than 400. May indicate overloaded CAS + server. + summary: CAS is using more than 400 threads. + condition: A + data: + - datasourceUid: prometheus + model: + expr: cas_thread_count > 400 + instant: true + refId: A + relativeTimeRange: + from: 300 + to: 0 + for: 5m + labels: + severity: warning + uid: cas_thread_count diff --git a/monitoring/alerting/rules/database_alerts.yaml b/monitoring/alerting/rules/database_alerts.yaml new file mode 100644 index 00000000..64992bb3 --- /dev/null +++ b/monitoring/alerting/rules/database_alerts.yaml @@ -0,0 +1,299 @@ +apiVersion: 1 +groups: + - interval: 5m + name: SAS Viya Alerts + folder: Database Alerts + orgId: 1 + rules: + - title: Catalog DB Connections High + annotations: + description: + Checks the in-use catalog database connections > 21. The default + db connection pool is 22. If it reaches the limit, the rabbitmq queues starts + to fill up with ready messages causing issues with Model Studio pipelines. + summary: + The active catalog database connections > 21. If it reaches the max. + db connections, it will impact the rabbitmq queues. 
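Most rules in these files follow the same three-step pattern: refId A runs the PromQL query, B reduces it to its last value, and C applies the threshold. The simpler rules (for example CAS Thread Count High above) instead fold the threshold into the PromQL expression itself. Purely as an illustration of the equivalence, the CAS memory check could be condensed the same way (a sketch only, not part of this change):

```yaml
- title: CAS Memory Usage High (condensed sketch)
  condition: A
  for: 5m
  data:
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange:
        from: 600
        to: 0
      model:
        # physical memory in use, converted from bytes to GiB, compared against 300
        expr: >-
          (cas_node_mem_size_bytes{type="physical"}
          - cas_node_mem_free_bytes{type="physical"}) / 1073741824 > 300
        instant: true
        refId: A
```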
+ condition: C + data: + - datasourceUid: prometheus + model: + disableTextWrap: false + editorMode: builder + expr: sas_db_pool_connections{container="sas-catalog-services", state="inUse"} + fullMetaSearch: false + includeNullMetadata: true + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + useBackend: false + refId: A + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - B + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + reducer: last + refId: B + type: reduce + refId: B + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 21 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + refId: C + relativeTimeRange: + from: 600 + to: 0 + execErrState: Error + for: 5m + isPaused: false + labels: {} + noDataState: NoData + uid: fc65fbaf-c196-4eb4-a130-f45cc46b775b + - title: Crunchy PGData Usage High + annotations: + description: "Checks to see /pgdata filesystem is more than 50% full. + + Go to the URL to follow the troubleshooting steps." + summary: + /pgdata storage > 50% full. This typically happens when the WAL logs + are increasing and not being cleared. + condition: C + data: + - datasourceUid: prometheus + model: + disableTextWrap: false + editorMode: code + expr: + ((kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"sas-crunchy-platform-postgres-00-.*"} + - kubelet_volume_stats_available_bytes{persistentvolumeclaim=~"sas-crunchy-platform-postgres-00-.*"}) + / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"sas-crunchy-platform-postgres-00-.*"}) + * 100 + fullMetaSearch: false + includeNullMetadata: true + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + useBackend: false + refId: A + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - B + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + reducer: last + refId: B + type: reduce + refId: B + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 50 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + refId: C + relativeTimeRange: + from: 600 + to: 0 + execErrState: Error + for: 5m + isPaused: false + labels: {} + noDataState: NoData + uid: fb411e28-b2e5-43d0-a413-e6dedbf154c4 + - title: PostgreSQL Connection Utilization High + annotations: + description: PostgreSQL database connection usage is above 85% of max connections. + summary: Database is nearing connection limit. 
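The capacity/available arithmetic used here for /pgdata is repeated below for the pgBackRest repo and the NFS share. If the duplication ever becomes a maintenance concern, the percentage could be computed once as a Prometheus recording rule; this is only a sketch (rule name invented) and assumes the standard kubelet volume metrics are being scraped:

```yaml
groups:
  - name: v4m-pvc-usage
    rules:
      # Percent used for every PVC that reports kubelet volume stats.
      - record: pvc:kubelet_volume_usage:percent
        expr: |
          (kubelet_volume_stats_capacity_bytes - kubelet_volume_stats_available_bytes)
            / kubelet_volume_stats_capacity_bytes * 100
```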
+ condition: A + data: + - datasourceUid: prometheus + model: + expr: (pg_stat_activity_count / pg_settings_max_connections) * 100 > 85 + instant: true + refId: A + relativeTimeRange: + from: 300 + to: 0 + for: 5m + labels: + severity: warning + uid: postgres_connection_utilization + - title: Crunchy Backrest Repo + annotations: + description: Checks to see /pgbackrest/repo1 filesystem is more than 50% full. + summary: + /pgbackrest/repo1 storage > 50% full in the pgbackrest repo. This typically + happens when the archived WAL logs are increasing and not being expired and + cleared. + condition: C + data: + - datasourceUid: prometheus + model: + editorMode: code + expr: + ((kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"sas-crunchy-platform-postgres-repo1"} + - kubelet_volume_stats_available_bytes{persistentvolumeclaim=~"sas-crunchy-platform-postgres-repo1"}) + / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"sas-crunchy-platform-postgres-repo1"}) + * 100 + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + refId: A + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - B + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + reducer: last + refId: B + type: reduce + refId: B + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 50 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + refId: C + relativeTimeRange: + from: 600 + to: 0 + execErrState: Error + for: 5m + isPaused: false + labels: {} + noDataState: NoData + uid: abe80c6a-3add-477a-b228-f8283704570f diff --git a/monitoring/alerting/rules/other_alerts.yaml b/monitoring/alerting/rules/other_alerts.yaml new file mode 100644 index 00000000..5004bdd4 --- /dev/null +++ b/monitoring/alerting/rules/other_alerts.yaml @@ -0,0 +1,31 @@ +apiVersion: 1 +groups: + - interval: 5m + name: SAS Viya Alerts + folder: Other Alerts + orgId: 1 + rules: + - title: NFS Share Usage High + annotations: + description: Checks if the NFS share attached to CAS is > 85% full. + summary: + NFS share > 85% full. Typically, it is due to users filling their home + directory or backups. 
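Several of the simpler rules carry a `severity: warning` label, while the expression-chain rules leave `labels: {}` empty. If notifications are meant to route on that label, a notification-policy file placed alongside the notifiers in `$USER_DIR/monitoring/alerting/` would be needed; a sketch, assuming Grafana's notification-policy provisioning schema and a previously provisioned contact point (the receiver name is a placeholder):

```yaml
apiVersion: 1
policies:
  - orgId: 1
    receiver: viya-ops-email           # placeholder default receiver
    routes:
      - receiver: viya-ops-email       # placeholder
        object_matchers:
          - ["severity", "=", "warning"]
```

Keep in mind that provisioning a policy file replaces the organization's notification policy tree, so it should describe the full intended routing.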
+ condition: A + data: + - datasourceUid: prometheus + model: + expr: + "((kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=\"cas-default-data\"\ + }\n - kubelet_volume_stats_available_bytes{persistentvolumeclaim=\"cas-default-data\"\ + })\n / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=\"cas-default-data\"\ + }) * 100 > 85" + instant: true + refId: A + relativeTimeRange: + from: 21600 + to: 0 + for: 5m + labels: + severity: warning + uid: nfs_share_usage diff --git a/monitoring/alerting/rules/sas-job-launcher-rules.yaml b/monitoring/alerting/rules/sas-job-launcher-rules.yaml new file mode 100644 index 00000000..528c4a79 --- /dev/null +++ b/monitoring/alerting/rules/sas-job-launcher-rules.yaml @@ -0,0 +1,74 @@ +apiVersion: 1 +groups: + - name: SAS Job Launcher + folder: Job Monitoring + interval: 1m + rules: + - uid: sas_job_launcher_ready + title: SAS Job Launcher Pod Not Running + condition: C + data: + - refId: A + datasourceUid: prometheus + relativeTimeRange: + from: 600 + to: 0 + model: + expr: sum(:sas_launcher_pod_status:{phase="Running"}) + interval: "" + intervalMs: 1000 + maxDataPoints: 43200 + datasource: + type: prometheus + uid: prometheus + instant: true + refId: A + editorMode: code + range: false + - refId: B + datasourceUid: __expr__ + model: + type: reduce + expression: A + reducer: last + refId: B + datasource: + type: __expr__ + uid: __expr__ + intervalMs: 1000 + maxDataPoints: 43200 + - refId: C + datasourceUid: __expr__ + model: + type: threshold + expression: B + refId: C + datasource: + type: __expr__ + uid: __expr__ + conditions: + - type: query + evaluator: + type: lt + params: + - 1 + operator: + type: and + query: + params: + - C + reducer: + type: last + params: [] + intervalMs: 1000 + maxDataPoints: 43200 + noDataState: NoData + execErrState: Error + for: 5m + annotations: + summary: No SAS Job Launcher pods in Running state + description: + Checks if any SAS launcher jobs are in Running state. If none are + running, this may indicate an issue with job launching or orchestration. + labels: + severity: warning diff --git a/monitoring/alerting/rules/viya_platform_alerts.yaml b/monitoring/alerting/rules/viya_platform_alerts.yaml new file mode 100644 index 00000000..a187529c --- /dev/null +++ b/monitoring/alerting/rules/viya_platform_alerts.yaml @@ -0,0 +1,322 @@ +apiVersion: 1 +groups: + - interval: 5m + name: SAS Viya Alerts + folder: Viya Platform Alerts + orgId: 1 + rules: + - title: Viya Readiness Probe Failed + annotations: + description: + Checks for the Ready state of sas-readiness pod. Will need to check + the status of the Viya pods since sas-readiness pod reflects the health of + the Viya services. + summary: + sas-readiness pod is not in Ready state. This means that one or more + of the Viya services are not in a good state. 
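The launcher rule above queries `:sas_launcher_pod_status:{phase="Running"}`, whose colon-delimited name looks like a Prometheus recording rule rather than a raw metric, and nothing in this change defines it. If it does not already exist in the target environment, a recording rule along these lines would be needed (a sketch only; it assumes kube-state-metrics pod-phase metrics and a `sas-launcher-` pod name prefix):

```yaml
groups:
  - name: sas-launcher-recording
    rules:
      # kube_pod_status_phase is 1 for a pod's current phase and 0 otherwise; restricting it
      # to launcher pods and re-recording it yields the series name the alert rule expects.
      - record: ':sas_launcher_pod_status:'
        expr: kube_pod_status_phase{pod=~"sas-launcher-.*"}
```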
+ condition: C + data: + - datasourceUid: prometheus + model: + disableTextWrap: false + editorMode: builder + expr: kube_pod_container_status_ready{container="sas-readiness"} + fullMetaSearch: false + includeNullMetadata: true + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + useBackend: false + refId: A + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - B + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + reducer: last + refId: B + type: reduce + refId: B + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 1 + type: lt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + refId: C + relativeTimeRange: + from: 600 + to: 0 + execErrState: Error + for: 5m + isPaused: false + labels: {} + noDataState: NoData + uid: e45e6d74-e396-40ce-a061-2a294295e61b + - title: RabbitMQ Ready Queue Backlog + annotations: + description: + Checks for accumulation of Rabbitmq ready messages > 10,000. The covers potential orphan + queues and/or bottlenecking of queues due to catalog service. + summary: + Rabbitmq ready messages > 10,000. This means there is a large backlog + of messages due to high activity (which can be temporary) or something has + gone wrong. + condition: C + data: + - datasourceUid: prometheus + model: + disableTextWrap: false + editorMode: builder + expr: rabbitmq_queue_messages_ready + fullMetaSearch: false + includeNullMetadata: true + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + useBackend: false + refId: A + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - B + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + reducer: last + refId: B + type: reduce + refId: B + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 10000 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + refId: C + relativeTimeRange: + from: 600 + to: 0 + execErrState: Error + for: 5m + isPaused: false + labels: {} + noDataState: NoData + uid: efb36686-4e44-4de8-80c4-7dde9130da90 + - title: Viya Pod Restart Count High + annotations: + description: + Checks the restart count of the pod(s). Will need to check why + the pod(s) have restarted so many times. One possible cause is OOMkill. This + means we will need to increase the memory limit. + summary: + The number of pod restarts > 20. The service pod(s) have restarted + many times due to issues. 
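`kube_pod_container_status_restarts_total`, used in the query that follows, is a lifetime counter, so a long-running pod can eventually cross 20 restarts without anything currently being wrong. A windowed variant is one possible alternative (sketch only, threshold illustrative; it keeps the hard-coded `namespace="viya"` selector used below, which assumes the Viya namespace is named `viya`):

```yaml
# More than 3 container restarts within the last hour.
expr: increase(kube_pod_container_status_restarts_total{namespace="viya"}[1h]) > 3
```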
+ condition: C + data: + - datasourceUid: prometheus + model: + disableTextWrap: false + editorMode: builder + expr: kube_pod_container_status_restarts_total{namespace="viya"} + fullMetaSearch: false + includeNullMetadata: true + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + useBackend: false + refId: A + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: [] + type: gt + operator: + type: and + query: + params: + - B + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + reducer: last + refId: B + type: reduce + refId: B + relativeTimeRange: + from: 600 + to: 0 + - datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 20 + type: gt + operator: + type: and + query: + params: + - C + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: B + intervalMs: 1000 + maxDataPoints: 43200 + refId: C + type: threshold + refId: C + relativeTimeRange: + from: 600 + to: 0 + execErrState: Error + for: 5m + isPaused: false + labels: {} + noDataState: NoData + uid: e7ecb843-f1bd-48b7-8c8c-58571d1642ad + - title: RabbitMQ Unacked Queue Backlog + annotations: + description: + RabbitMQ has a high number of unacknowledged messages. This may + indicate stuck consumers or performance issues. + summary: Unacked messages in RabbitMQ are greater than 5000. + condition: A + data: + - datasourceUid: prometheus + model: + expr: rabbitmq_queue_messages_unacknowledged > 5000 + instant: true + refId: A + relativeTimeRange: + from: 300 + to: 0 + for: 5m + labels: + severity: warning + uid: rabbitmq_unacked_messages + - title: High Viya API Latency + annotations: + description: + Viya service API response latency is high (95th percentile over + 1s). + summary: Degraded performance on Viya APIs. + condition: A + data: + - datasourceUid: prometheus + model: + expr: + histogram_quantile(0.95, sum(rate(http_server_requests_duration_seconds_bucket{job=~"sas-.*"}[5m])) + by (le)) > 1 + instant: true + refId: A + relativeTimeRange: + from: 300 + to: 0 + for: 5m + labels: + severity: warning + uid: viya_api_latency diff --git a/monitoring/bin/deploy_monitoring_cluster.sh b/monitoring/bin/deploy_monitoring_cluster.sh index 05b9be63..be1e11d7 100755 --- a/monitoring/bin/deploy_monitoring_cluster.sh +++ b/monitoring/bin/deploy_monitoring_cluster.sh @@ -370,6 +370,32 @@ versionstring="$(get_helm_versionstring "$KUBE_PROM_STACK_CHART_VERSION")" log_debug "Installing Helm chart from artifact [$chart2install]" +# Alerts +if [ -d "monitoring/alerting/rules" ]; then + log_verbose "Creating Grafana alert rules ConfigMap" + + # Start with required file + CM_ARGS=(--from-file="monitoring/alerting/rules") + + CUSTOM_ALERT_CONFIG_DIR="$USER_DIR/monitoring/alerting/" + + # Add optional custom directory if it exists + if [ -d "$CUSTOM_ALERT_CONFIG_DIR" ]; then + log_debug "Including notifiers and additional alert rules from '$CUSTOM_ALERT_CONFIG_DIR'" + CM_ARGS+=(--from-file="$CUSTOM_ALERT_CONFIG_DIR") + else + log_debug "No custom alert config directory found at '$CUSTOM_ALERT_CONFIG_DIR'. Skipping." 
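For context on the `kubectl create configmap ... --dry-run=client -o yaml | kubectl apply -f -` step that follows: each YAML file found under `monitoring/alerting/rules/`, plus any files in `$USER_DIR/monitoring/alerting/` when that directory exists, becomes one key of the ConfigMap that Grafana later mounts. Roughly (a sketch, with file contents truncated):

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alert-rules
  namespace: monitoring        # placeholder; the script uses the namespace in $MON_NS
data:
  cas_alerts.yaml: |
    apiVersion: 1
    groups:
      - folder: CAS Alerts
        name: SAS Viya Alerts
```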
+ fi + + # Run the kubectl command with all arguments + kubectl create configmap grafana-alert-rules \ + "${CM_ARGS[@]}" \ + -n "$MON_NS" \ + --dry-run=client -o yaml | kubectl apply -f - +else + log_debug "No alert rules file found at 'monitoring/alerting/rules'" +fi + # shellcheck disable=SC2086 helm $helmDebug upgrade --install "$promRelease" \ --namespace "$MON_NS" \ diff --git a/monitoring/tls/grafana-datasource-prom-https-path.yaml b/monitoring/tls/grafana-datasource-prom-https-path.yaml index ba5c1b40..f27eb2f3 100644 --- a/monitoring/tls/grafana-datasource-prom-https-path.yaml +++ b/monitoring/tls/grafana-datasource-prom-https-path.yaml @@ -1,10 +1,11 @@ - apiVersion: 1 - datasources: - - name: Prometheus - type: prometheus - access: proxy - isDefault: true - jsonData: - tlsSkipVerify: true - editable: true - url: https://v4m-prometheus:9090/prometheus +apiVersion: 1 +datasources: + - name: Prometheus + type: prometheus + uid: prometheus + access: proxy + isDefault: true + jsonData: + tlsSkipVerify: true + editable: true + url: https://v4m-prometheus:9090/prometheus diff --git a/monitoring/tls/grafana-datasource-prom-https.yaml b/monitoring/tls/grafana-datasource-prom-https.yaml index a747ae6b..b05af51b 100644 --- a/monitoring/tls/grafana-datasource-prom-https.yaml +++ b/monitoring/tls/grafana-datasource-prom-https.yaml @@ -1,10 +1,11 @@ - apiVersion: 1 - datasources: - - name: Prometheus - type: prometheus - access: proxy - isDefault: true - jsonData: - tlsSkipVerify: true - editable: true - url: https://v4m-prometheus:9090 +apiVersion: 1 +datasources: + - name: Prometheus + type: prometheus + uid: prometheus + access: proxy + isDefault: true + jsonData: + tlsSkipVerify: true + editable: true + url: https://v4m-prometheus:9090 diff --git a/monitoring/values-prom-operator.yaml b/monitoring/values-prom-operator.yaml index c9cf1c08..80593553 100644 --- a/monitoring/values-prom-operator.yaml +++ b/monitoring/values-prom-operator.yaml @@ -9,7 +9,6 @@ # Default Values # https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/values.yaml - ## NOTE: All image.*, imagePullSecrets and related keys now derived from environment variables and generated at run-time (DO NOT SET HERE) ## See monitoring/prom-operator_container_image.template for complete list ###global: @@ -22,7 +21,6 @@ ### tag: foo ### pullPolicy: foo - commonLabels: sas.com/monitoring-base: kube-viya-monitoring @@ -31,7 +29,7 @@ commonLabels: # =================== # https://github.com/coreos/prometheus-operator prometheusOperator: -### image: # See NOTE at top of file + ### image: # See NOTE at top of file logFormat: json logLevel: info createCustomResource: false @@ -79,9 +77,9 @@ kube-state-metrics: cpu: "25m" memory: "50Mi" # kube-state-metrics 2.0 requires explicitly defining which labels are - # collected for each resource type + # collected for each resource type extraArgs: - - --metric-labels-allowlist=nodes=[*],namespaces=[*],pods=[*],deployments=[*],statefulsets=[*],daemonsets=[*],jobs=[*] + - --metric-labels-allowlist=nodes=[*],namespaces=[*],pods=[*],deployments=[*],statefulsets=[*],daemonsets=[*],jobs=[*] # Available collectors for kube-state-metrics. # By default, all available resources are enabled, comment out to disable. 
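The `uid: prometheus` now pinned in the datasource files above is what every `datasourceUid: prometheus` reference in the new rule files resolves against, so any additional alert rules an administrator drops into `$USER_DIR/monitoring/alerting/` should point at the same UID. A minimal sketch of such a user-supplied rule file (group name, rule UID, query, and threshold are all hypothetical):

```yaml
apiVersion: 1
groups:
  - orgId: 1
    name: Custom Alerts
    folder: Custom Alerts
    interval: 5m
    rules:
      - uid: custom_example_alert            # hypothetical UID
        title: Example Custom Alert
        condition: A
        for: 5m
        data:
          - refId: A
            datasourceUid: prometheus        # matches the uid pinned above
            relativeTimeRange:
              from: 300
              to: 0
            model:
              expr: up{job="kubelet"} == 0   # placeholder query
              instant: true
              refId: A
        annotations:
          summary: Example alert supplied by the administrator.
        labels:
          severity: warning
```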
@@ -109,7 +107,7 @@ kube-state-metrics: - replicationcontrollers - resourcequotas ## Metrics on Secrets disabled to - ## eliminate need for granting + ## eliminate need for granting ## 'list' permission to ClusterRole #- secrets - services @@ -118,7 +116,6 @@ kube-state-metrics: - validatingwebhookconfigurations - volumeattachments - # ========== # Prometheus # ========== @@ -127,7 +124,7 @@ prometheus: name: sas-ops-acct service: type: ClusterIP - nodePort: null + nodePort: null prometheusSpec: #image: # See NOTE at top of file logLevel: info @@ -151,13 +148,13 @@ prometheus: spec: # storageClassName: accessModes: - - ReadWriteOnce + - ReadWriteOnce resources: requests: - storage: 25Gi + storage: 25Gi volumeMode: Filesystem alertingEndpoints: - - name: v4m-alertmanager + - name: v4m-alertmanager port: http-web # ======================= @@ -181,7 +178,7 @@ alertmanager: spec: # storageClassName: accessModes: - - ReadWriteOnce + - ReadWriteOnce resources: requests: storage: 10Gi @@ -214,16 +211,16 @@ prometheus-node-exporter: memory: "100Mi" # Be very tolerant tolerations: - - operator: "Exists" + - operator: "Exists" prometheus: monitor: relabelings: - - sourceLabels: [__meta_kubernetes_pod_node_name] - separator: ; - regex: ^(.*)$ - targetLabel: nodename - replacement: $1 - action: replace + - sourceLabels: [__meta_kubernetes_pod_node_name] + separator: ; + regex: ^(.*)$ + targetLabel: nodename + replacement: $1 + action: replace # ======= # Grafana @@ -236,12 +233,10 @@ grafana: "grafana.ini": analytics: check_for_updates: false - # dashboards: - # default_home_dashboard_path: log: mode: console "log.console": - format: json + format: json dashboards: default_home_dashboard_path: /tmp/dashboards/viya-welcome-dashboard.json feature_toggles: @@ -264,7 +259,6 @@ grafana: datasources: enabled: true label: grafana_datasource - resources: resources: requests: cpu: "50m" @@ -285,4 +279,10 @@ grafana: - kubernetes.io/pvc-protection # subPath: "" # existingClaim: - + extraVolumes: + - name: grafana-alert-rules + configMap: + name: grafana-alert-rules + extraVolumeMounts: + - name: grafana-alert-rules + mountPath: /etc/grafana/provisioning/alerting
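One possible refinement to the new `extraVolumes` entry (a sketch, not a required change, and it assumes the Grafana chart passes the `configMap` block through to the pod spec verbatim): Kubernetes configMap volumes accept `optional: true`, which lets the Grafana pod start even if the `grafana-alert-rules` ConfigMap has not been created yet, for example when the Helm values are applied outside `deploy_monitoring_cluster.sh`.

```yaml
# Under the existing grafana: section of values-prom-operator.yaml.
extraVolumes:
  - name: grafana-alert-rules
    configMap:
      name: grafana-alert-rules
      optional: true   # tolerate a missing ConfigMap instead of blocking Grafana startup
extraVolumeMounts:
  - name: grafana-alert-rules
    mountPath: /etc/grafana/provisioning/alerting
```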