diff --git a/modules/monitoring/README.md b/modules/monitoring/README.md new file mode 100644 index 0000000..e599dbf --- /dev/null +++ b/modules/monitoring/README.md @@ -0,0 +1,341 @@ +# modules/monitoring + +Terraform module that wires up a complete monitoring stack on top of the +`rancher-monitoring` (kube-prometheus-stack) add-on that ships with every +Harvester cluster. A single `module` block deploys PrometheusRules, +Alertmanager configuration, a Google Chat notification relay, and Grafana +dashboards — all configurable via input variables. + +## Prerequisites + +- `rancher-monitoring` add-on installed on the Harvester cluster +- Google Chat Space with an incoming webhook URL +- `kubectl` available in the Terraform execution environment (used by + `null_resource` provisioners to patch the Alertmanager Secret) + +## Usage + +```hcl +module "monitoring" { + source = "github.com/wso2-enterprise/open-cloud-datacenter//modules/monitoring?ref=v0.4.0" + + environment = "lk" + kubeconfig_path = "/path/to/harvester.kubeconfig" + kubeconfig_context = "local" + google_chat_webhook_url = var.google_chat_webhook_url + + # Optional — adds "View Alert" and "View in Prometheus" buttons to each card. + # Both URLs are routed through Rancher's authenticated proxy so users don't + # need a separate session directly on the Harvester IP. 
+ # rancher_url = "https://rancher.example.com" + # harvester_cluster_id = "c-xxxxx" # Rancher UI → Cluster Management → cluster row +} +``` + +--- + +## Architecture + +```text +Prometheus (rancher-monitoring) + │ evaluates PrometheusRule CRDs labelled release=rancher-monitoring + ▼ +Alertmanager (rancher-monitoring) + │ matches severity label → route → receiver "google-chat" + │ webhook_configs: http://calert.cattle-monitoring-system:6000/create + ▼ +calert (Deployment — ghcr.io/mr-karan/calert) + │ accepts Alertmanager webhook POST, renders Google Chat Cards v2 + ▼ +Google Chat Space (incoming webhook) +``` + +### Resources created + +| Resource | Kubernetes kind | Name pattern | Namespace | +|---|---|---|---| +| Alertmanager config | Secret | `alertmanager-rancher-monitoring-alertmanager` | `cattle-monitoring-system` | +| calert config + template | Secret | `calert-config` | `cattle-monitoring-system` | +| calert | Deployment + Service | `calert` | `cattle-monitoring-system` | +| Storage alerts | PrometheusRule | `{env}-harvester-storage-alerts` | `cattle-monitoring-system` | +| VM alerts | PrometheusRule | `{env}-harvester-vm-alerts` | `cattle-monitoring-system` | +| Node alerts | PrometheusRule | `{env}-harvester-node-alerts` | `cattle-monitoring-system` | +| Storage dashboard | ConfigMap | `{env}-harvester-storage-dashboard` | `cattle-dashboards` | +| VM dashboard | ConfigMap | `{env}-harvester-vm-dashboard` | `cattle-dashboards` | +| Node dashboard | ConfigMap | `{env}-harvester-node-dashboard` | `cattle-dashboards` | + +--- + +## Design decisions + +### Why direct Secret injection instead of AlertmanagerConfig CRD + +`AlertmanagerConfig` v1alpha1 silently drops any field it does not recognise — +including `googleChatConfigs`. The module therefore patches the +`alertmanager-rancher-monitoring-alertmanager` Secret directly using a +`null_resource` + `kubectl apply`. 
Prometheus Operator watches the Secret and +hot-reloads Alertmanager within ~30 s of any change. + +The `kubernetes_manifest` resource is not used for this Secret because +rancher-monitoring Helm pre-creates it; a `kubernetes_manifest` would fail +with "already exists" on the first `terraform apply`. + +### calert as a Google Chat relay + +Google Chat does not have a native Alertmanager receiver. calert +(`ghcr.io/mr-karan/calert`) is a purpose-built relay that accepts the standard +Alertmanager webhook payload and reformats it into Google Chat Cards v2 JSON. + +### Hot-reload + +The calert Deployment carries a `checksum/config` annotation computed from the +rendered config and message template. Any `terraform apply` that changes the +template or config content automatically triggers a rolling restart — no manual +pod deletion required. + +### PrometheusRule label selector + +Prometheus Operator discovers PrometheusRule CRDs via its `ruleSelector`. The +rancher-monitoring Helm chart configures this selector to match +`release=rancher-monitoring`. Every PrometheusRule created by this module uses +`local.rule_labels`, which merges `release=rancher-monitoring` with the common +`managed_by` and `environment` labels. 
**Omitting `local.rule_labels` from a +PrometheusRule will cause Prometheus to ignore the rule entirely.** + +--- + +## Alert inventory + +### Storage (`prometheus_rule_storage`) + +| Alert name | Severity | Condition | +|---|---|---| +| `LonghornVolumeFaulted` | critical | Volume state = Faulted | +| `LonghornVolumeDegradedWarning` | warning | Volume degraded for 15 m | +| `LonghornVolumeDegradedCritical` | critical | Volume degraded for 60 m | +| `LonghornVolumeReplicaCountLow` | warning | Healthy replica count < expected | +| `LonghornReplicaRebuildBacklog` | warning | Concurrent rebuilds per node > threshold | +| `LonghornEvictionWithDegradedVolumes` | critical | Disk eviction active + volumes degraded | +| `LonghornDiskUsageHigh` | warning / critical | Disk usage % above configurable threshold | + +### VM / KubeVirt (`prometheus_rule_vm`) + +| Alert name | Severity | Condition | +|---|---|---| +| `VirtLauncherPodStuck` | critical | virt-launcher pod Pending > `virt_launcher_stuck_for` | +| `VirtLauncherContainerCreating` | critical | virt-launcher stuck in ContainerCreating | +| `VirtLauncherCrashLoop` | critical | ≥ 3 restarts in 15 m | +| `HpVolumePodNotRunning` | critical | hotplug volume pod not Running > `hp_volume_stuck_for` | +| `HpVolumeMapDeviceFailed` | critical | exit status 32 (NFS/Block mode conflict) | +| `StaleVolumeAttachmentBlocking` | warning | CSI blocked by stale VolumeAttachment | + +### Node (`prometheus_rule_node`) + +| Alert name | Severity | Condition | +|---|---|---| +| `NodeCpuHigh` | warning / critical | CPU utilisation > configurable threshold | +| `NodeMemoryHigh` | warning | Memory utilisation > configurable threshold | + +--- + +## How to add a new alert + +### Option A — extend an existing rule group + +This is the fastest path when the new alert belongs to an existing category +(storage, VM, or node). + +1. 
Open [main.tf](main.tf) and locate the matching `kubernetes_manifest` block: + - `prometheus_rule_storage` — Longhorn / disk + - `prometheus_rule_vm` — KubeVirt / virt-launcher + - `prometheus_rule_node` — node CPU / memory + +2. Add a new map to the `rules` list inside the relevant group: + + ```hcl + { + alert = "MyNewAlert" # PascalCase, [A-Za-z0-9_] only + expr = "my_metric > 0" # valid PromQL — test in Grafana Explore first + for = "5m" # omit for instant (stateless) alerts + labels = { + severity = "warning" # "warning" or "critical" — Alertmanager routes on this + } + annotations = { + summary = "One-line description shown in the card" + description = "Detail with template vars: instance={{ $labels.instance }}, value={{ $value }}" + runbook_url = "${var.runbook_base_url}/MyNewAlert" + } + } + ``` + +3. Apply: + + ```bash + terraform plan # verify the rule diff looks correct + terraform apply + ``` + +### Option B — add a new PrometheusRule resource + +Use this when the alert belongs to a distinct new category that warrants its +own Kubernetes object and Grafana dashboard. + +```hcl +resource "kubernetes_manifest" "prometheus_rule_myapp" { + manifest = { + apiVersion = "monitoring.coreos.com/v1" + kind = "PrometheusRule" + metadata = { + name = "${var.environment}-harvester-myapp-alerts" + namespace = var.monitoring_namespace + labels = local.rule_labels # required — carries release=rancher-monitoring + } + spec = { + groups = [ + { + name = "harvester.myapp" + rules = [ + { + alert = "MyAppDown" + expr = "up{job=\"myapp\"} == 0" + for = "2m" + labels = { severity = "critical" } + annotations = { + summary = "MyApp is unreachable" + description = "Instance {{ $labels.instance }} has been down for 2 m." + runbook_url = "${var.runbook_base_url}/MyAppDown" + } + } + ] + } + ] + } + } +} +``` + +Add a corresponding output in [outputs.tf](outputs.tf) and expose it through +the environment layer's `outputs.tf`. 
+ +### Verifying a rule was picked up + +```bash +# List all PrometheusRule objects +kubectl get prometheusrules -n cattle-monitoring-system + +# Confirm the rule name appears in Prometheus's loaded rule set +kubectl exec -n cattle-monitoring-system \ + $(kubectl get pod -n cattle-monitoring-system -l app=rancher-monitoring-prometheus -o name | head -1) \ + -- wget -qO- localhost:9090/api/v1/rules \ + | jq '.data.groups[].rules[].name' | grep MyNewAlert +``` + +### Test-firing an alert + +```bash +# Port-forward Alertmanager +kubectl port-forward -n cattle-monitoring-system \ + svc/rancher-monitoring-alertmanager 9093:9093 + +# POST a synthetic alert +curl -s -X POST http://localhost:9093/api/v2/alerts \ + -H 'Content-Type: application/json' \ + -d '[{ + "labels": { "alertname": "MyNewAlert", "severity": "warning" }, + "annotations": { "summary": "Test fire", "description": "Synthetic test" } + }]' +``` + +A Google Chat card should appear within a few seconds. + +--- + +## Notification card anatomy + +Each alert produces one card. The card structure is defined in a Go template +stored in the `calert-config` Secret and rendered by calert at runtime. + +```text +┌─────────────────────────────────────────────────────┐ +│ (WARNING) LonghornDiskUsageHigh | Firing │ ← header +├─────────────────────────────────────────────────────┤ +│ Summary: Disk usage on node-1 is 87% │ ┐ +│ Description: Longhorn disk sdb on node-1 … │ │ one decoratedText +│ Runbook: https://wiki.internal/runbooks/… │ │ widget per annotation +├─────────────────────────────────────────────────────┤ ┘ +│ ▶ Alert Details (collapsible) │ ← all labels +├─────────────────────────────────────────────────────┤ +│ [View Alert] [View in Prometheus] │ ← buttons (optional) +└─────────────────────────────────────────────────────┘ +``` + +Both buttons are rendered only when `rancher_url` and `harvester_cluster_id` +are set. 
URLs are constructed at `terraform apply` time and routed through +Rancher's authenticated proxy — no separate Harvester session required. + +```text +View Alert → /k8s/clusters//…/rancher-monitoring-alertmanager:9093/proxy/#/alerts?filter={alertname=""} +View in Prometheus → /k8s/clusters//…/rancher-monitoring-prometheus:9090/proxy/alerts?search= +``` + +### Template evaluation: Terraform vs Go + +The card template is a Go template evaluated by calert at runtime — but it +lives inside a Terraform heredoc and is written to a Kubernetes Secret at +`terraform apply` time. This means two template engines interact: + +| Syntax | Evaluated by | When | +|---|---|---| +| `${local.alertmanager_base_url}` | Terraform | at `terraform apply` | +| `%{~ if local.rancher_proxy_base != "" ~}` | Terraform | at `terraform apply` | +| `{{.Labels.alertname}}` | calert (Go template) | at alert runtime | +| `{{.Annotations.SortedPairs}}` | calert (Go template) | at alert runtime | + +Terraform bakes the base URL as a literal string into the template file. +calert then substitutes the per-alert `alertname` at runtime. Both coexist +safely in the same heredoc because Terraform ignores `{{ }}` delimiters and +calert ignores `${ }` delimiters. + +--- + +## Variable reference + +### Required + +| Name | Type | Description | +|---|---|---| +| `environment` | string | Short environment identifier used in resource names (`lk`, `prod`, …) | +| `kubeconfig_path` | string | Path to the Harvester kubeconfig file | +| `kubeconfig_context` | string | kubectl context within the kubeconfig | +| `google_chat_webhook_url` | string (sensitive) | Google Chat incoming webhook URL | + +### Optional + +| Name | Type | Default | Description | +|---|---|---|---| +| `rancher_url` | string | `""` | Base URL of the Rancher server (e.g. `https://rancher.example.com`). Combined with `harvester_cluster_id` to build Rancher-authenticated proxy URLs for both buttons. Leave empty to omit buttons. 
| +| `harvester_cluster_id` | string | `""` | Rancher cluster ID for the Harvester cluster (e.g. `c-v7gvt`). Found in Rancher UI → Cluster Management. Required when `rancher_url` is set. | +| `monitoring_namespace` | string | `cattle-monitoring-system` | Namespace where rancher-monitoring runs | +| `dashboards_namespace` | string | `cattle-dashboards` | Namespace where Grafana picks up dashboard ConfigMaps | +| `runbook_base_url` | string | `https://wiki.internal/runbooks/harvester` | Base URL prepended to each alert's `runbook_url` annotation | +| `disk_usage_warning_pct` | number | `80` | Longhorn disk usage % — warning threshold | +| `disk_usage_critical_pct` | number | `90` | Longhorn disk usage % — critical threshold | +| `replica_rebuild_warning_count` | number | `5` | Concurrent Longhorn rebuilds per node before warning | +| `node_cpu_warning_pct` | number | `85` | Node CPU utilisation % — warning threshold | +| `node_cpu_critical_pct` | number | `95` | Node CPU utilisation % — critical threshold | +| `node_memory_warning_pct` | number | `85` | Node memory utilisation % — warning threshold | +| `virt_launcher_stuck_for` | string | `"5m"` | Duration virt-launcher must be Pending/ContainerCreating before alerting | +| `hp_volume_stuck_for` | string | `"3m"` | Duration hp-volume pod must be non-Running before alerting | + +### Outputs + +| Name | Description | +|---|---| +| `prometheus_rule_storage_name` | Name of the storage PrometheusRule | +| `prometheus_rule_vm_name` | Name of the VM PrometheusRule | +| `prometheus_rule_node_name` | Name of the node PrometheusRule | +| `alertmanager_config_name` | Name of the Alertmanager config Secret | +| `grafana_dashboard_storage_name` | Name of the storage Grafana dashboard ConfigMap | +| `grafana_dashboard_vm_name` | Name of the VM Grafana dashboard ConfigMap | +| `grafana_dashboard_node_name` | Name of the node Grafana dashboard ConfigMap | +| `monitoring_namespace` | Namespace all monitoring resources were deployed 
into | diff --git a/modules/monitoring/examples/basic/main.tf b/modules/monitoring/examples/basic/main.tf new file mode 100644 index 0000000..0dc350d --- /dev/null +++ b/modules/monitoring/examples/basic/main.tf @@ -0,0 +1,82 @@ +# Example: Deploy the monitoring stack for a Harvester + Rancher environment. +# +# Prerequisites: +# - rancher-monitoring (kube-prometheus-stack) deployed on the Harvester cluster. +# Verify: kubectl get pods -n cattle-monitoring-system +# - Harvester kubeconfig downloaded (Harvester UI → Support → Download KubeConfig). +# - Google Chat incoming webhook URL created +# (Chat Space → Apps & Integrations → Webhooks → Add webhook). +# +# Apply: +# export TF_VAR_google_chat_webhook_url="https://chat.googleapis.com/..." +# terraform init && terraform apply + +terraform { + required_version = ">= 1.5" + + required_providers { + kubernetes = { + source = "hashicorp/kubernetes" + version = "~> 3.0" + } + } +} + +provider "kubernetes" { + config_path = var.kubeconfig_path + config_context = var.kubeconfig_context +} + +module "monitoring" { + source = "../../" + + # Identifiers + environment = var.environment + kubeconfig_path = var.kubeconfig_path + kubeconfig_context = var.kubeconfig_context + + # Notification + google_chat_webhook_url = var.google_chat_webhook_url + + # Thresholds (optional — all have sensible defaults) + disk_usage_warning_pct = 80 + disk_usage_critical_pct = 90 + replica_rebuild_warning_count = 5 + node_cpu_warning_pct = 85 + node_cpu_critical_pct = 95 + node_memory_warning_pct = 85 + virt_launcher_stuck_for = "5m" + hp_volume_stuck_for = "3m" + + # Runbook base URL (optional) + runbook_base_url = "https://wiki.internal/runbooks/harvester" +} + +output "monitoring_resources" { + value = { + prometheus_rule_storage = module.monitoring.prometheus_rule_storage_name + prometheus_rule_vm = module.monitoring.prometheus_rule_vm_name + prometheus_rule_node = module.monitoring.prometheus_rule_node_name + alertmanager_config = 
module.monitoring.alertmanager_config_name + } +} + +variable "environment" { + type = string + description = "Short environment identifier used in resource names (e.g. \"lk\")." +} + +variable "kubeconfig_path" { + type = string + default = "~/.kube/harvester-lk.yaml" +} + +variable "kubeconfig_context" { + type = string + default = "local" +} + +variable "google_chat_webhook_url" { + type = string + sensitive = true +} diff --git a/modules/monitoring/main.tf b/modules/monitoring/main.tf new file mode 100644 index 0000000..6c29211 --- /dev/null +++ b/modules/monitoring/main.tf @@ -0,0 +1,1385 @@ +# ── Common locals ───────────────────────────────────────────────────────────── + +locals { + common_labels = { + managed_by = "terraform" + environment = var.environment + } + + # PrometheusRule resources must carry release=rancher-monitoring so that + # rancher-monitoring's Prometheus picks them up via its ruleSelector. + rule_labels = merge(local.common_labels, { + release = "rancher-monitoring" + }) + + # AlertmanagerConfig and template resources go in the monitoring namespace. + ns = var.monitoring_namespace + dns = var.dashboards_namespace + + # Rancher-authenticated proxy base for this Harvester cluster. + # e.g. https://rancher.example.com/k8s/clusters/c-v7gvt + # Both button URLs are derived from this so they pass through Rancher auth + # rather than hitting the Harvester IP directly (which returns 403 to users + # who are not separately authenticated to Harvester). + rancher_proxy_base = (var.rancher_url != "" && var.harvester_cluster_id != "") ? "${var.rancher_url}/k8s/clusters/${var.harvester_cluster_id}" : "" + alertmanager_base_url = local.rancher_proxy_base != "" ? "${local.rancher_proxy_base}/api/v1/namespaces/${var.monitoring_namespace}/services/http:rancher-monitoring-alertmanager:9093/proxy" : "" + prometheus_base_url = local.rancher_proxy_base != "" ? 
"${local.rancher_proxy_base}/api/v1/namespaces/${var.monitoring_namespace}/services/http:rancher-monitoring-prometheus:9090/proxy" : "" +} + +# ── Alertmanager config + calert (Google Chat webhook forwarder) ────────────── +# google_chat_configs is not a native Alertmanager receiver type. The standard +# pattern is: Alertmanager webhook_configs → calert → Google Chat API. +# calert (ghcr.io/mr-karan/calert) is a purpose-built forwarder that accepts +# the Alertmanager webhook payload and reformats it for Google Chat. +# +# Prometheus Operator watches the base alertmanager Secret and hot-reloads +# Alertmanager within ~30s of any change. The Secret is written via kubectl +# apply (idempotent create-or-update) since rancher-monitoring Helm pre-creates +# it and kubernetes_manifest would fail with "already exists" on fresh clusters. + +locals { + # calert config.toml — rendered with the webhook URL and stored in a Secret. + calert_config_toml = <<-TOML + [app] + address = "0.0.0.0:6000" + server_timeout = "30s" + log = "info" + + [providers.gchat] + type = "google_chat" + endpoint = "${var.google_chat_webhook_url}" + template = "/etc/calert/message.tmpl" + timeout = "20s" + thread_ttl = "12h" + threaded_replies = false + dry_run = false + TOML + + # calert message template — Go template that produces a Google Chat Cards v2 + # JSON array. calert detects the named "cardsV2" block and uses it to build + # a rich card instead of a plain text message. + # + # Template data (Alertmanager webhook payload): + # .Alerts[] — slice of firing/resolved alerts + # .Fingerprint — unique alert ID + # .Status — "firing" | "resolved" + # .Labels — map[string]string (alertname, severity, …) + # .Annotations — map[string]string (summary, description, runbook_url, …) + # .GeneratorURL — link to the Prometheus graph that fired + # .ExternalURL — base URL of the Alertmanager instance + # Template context: single alertmgrtmpl.Alert rendered per alert. 
+ # Output: single JSON object → chatv1.CardWithId (no array wrapper). + # Section-level headers (strings) only — card-level header objects are not + # supported by Google Chat webhooks and produce blank cards. + # Available functions: toUpper, Title, SortedPairs (calert v2.3.0) + # local.alertmanager_base_url / local.prometheus_base_url are Terraform + # interpolations — baked in at apply time as literal strings. The Go template + # vars ({{.Labels.alertname}} etc.) are resolved at runtime per alert. + # Both coexist safely in the same heredoc. + calert_message_tmpl = <<-TMPL + {{- define "cardsV2" -}} + { + "card": { + "sections": [ + { + "header": "({{.Labels.severity | toUpper}}) {{.Labels.alertname | Title}} | {{.Status | Title}}", + "widgets": [ + {{- range $i, $pair := .Annotations.SortedPairs -}} + {{- if ne $i 0 -}},{{- end -}} + {"decoratedText": {"text": "{{ $pair.Name | Title }}: {{ $pair.Value }}"}} + {{- end -}} + %{~ if local.rancher_proxy_base != "" ~} + ,{"buttonList": {"buttons": [ + {"text": "View Alert", "onClick": {"openLink": {"url": "${local.alertmanager_base_url}/#/alerts?filter=%7Balertname%3D%22{{.Labels.alertname}}%22%7D"}}}, + {"text": "View in Prometheus", "onClick": {"openLink": {"url": "${local.prometheus_base_url}/alerts?search={{.Labels.alertname}}"}}} + ]}} + %{~ endif ~} + ] + }, + { + "header": "Alert Details", + "collapsible": true, + "uncollapsibleWidgetsCount": 0, + "widgets": [ + {{- range $i, $pair := .Labels.SortedPairs -}} + {{- if ne $i 0 -}},{{- end -}} + {"decoratedText": {"text": "{{ $pair.Name }}: {{ $pair.Value }}"}} + {{- end -}} + ] + } + ] + } + } + {{- end -}} + TMPL + + # Preserved from the rancher-monitoring Helm chart — used by any Slack/other + # receivers configured via the Rancher UI. Keep it so those still work. + rancher_defaults_tmpl = <<-TMPL + {{- define "slack.rancher.text" -}} + {{ template "rancher.text_multiple" . 
}} + {{- end -}} + + {{- define "rancher.text_multiple" -}} + *[GROUP - Details]* + One or more alarms in this group have triggered a notification. + + {{- if gt (len .GroupLabels.Values) 0 }} + *Group Labels:* + {{- range .GroupLabels.SortedPairs }} + • *{{ .Name }}:* `{{ .Value }}` + {{- end }} + {{- end }} + {{- if .ExternalURL }} + *Link to AlertManager:* {{ .ExternalURL }} + {{- end }} + + {{- range .Alerts }} + {{ template "rancher.text_single" . }} + {{- end }} + {{- end -}} + + {{- define "rancher.text_single" -}} + {{- if .Labels.alertname }} + *[ALERT - {{ .Labels.alertname }}]* + {{- else }} + *[ALERT]* + {{- end }} + {{- if .Labels.severity }} + *Severity:* `{{ .Labels.severity }}` + {{- end }} + {{- if .Labels.cluster }} + *Cluster:* {{ .Labels.cluster }} + {{- end }} + {{- if .Annotations.summary }} + *Summary:* {{ .Annotations.summary }} + {{- end }} + {{- if .Annotations.message }} + *Message:* {{ .Annotations.message }} + {{- end }} + {{- if .Annotations.description }} + *Description:* {{ .Annotations.description }} + {{- end }} + {{- if .Annotations.runbook_url }} + *Runbook URL:* <{{ .Annotations.runbook_url }}|:spiral_note_pad:> + {{- end }} + {{- with .Labels }} + {{- with .Remove (stringSlice "alertname" "severity" "cluster") }} + {{- if gt (len .) 0 }} + *Additional Labels:* + {{- range .SortedPairs }} + • *{{ .Name }}:* `{{ .Value }}` + {{- end }} + {{- end }} + {{- end }} + {{- end }} + {{- with .Annotations }} + {{- with .Remove (stringSlice "summary" "message" "description" "runbook_url") }} + {{- if gt (len .) 
0 }} + *Additional Annotations:* + {{- range .SortedPairs }} + • *{{ .Name }}:* `{{ .Value }}` + {{- end }} + {{- end }} + {{- end }} + {{- end }} + {{- end -}} + TMPL + + alertmanager_config_yaml = yamlencode({ + global = { + resolve_timeout = "5m" + } + + route = { + group_by = ["alertname", "severity", "node", "volume"] + group_wait = "30s" + group_interval = "5m" + repeat_interval = "12h" + receiver = "null" + routes = [ + { + matchers = ["alertname = \"Watchdog\""] + receiver = "null" + }, + { + matchers = ["severity = \"critical\""] + receiver = "google-chat-critical" + repeat_interval = "1h" + }, + { + matchers = ["severity = \"warning\""] + receiver = "google-chat-warning" + repeat_interval = "4h" + }, + ] + } + + receivers = [ + { name = "null" }, + { + name = "google-chat-critical" + webhook_configs = [{ + url = "http://calert.${local.ns}:6000/dispatch?room_name=gchat" + send_resolved = true + }] + }, + { + name = "google-chat-warning" + webhook_configs = [{ + url = "http://calert.${local.ns}:6000/dispatch?room_name=gchat" + send_resolved = true + }] + }, + ] + + inhibit_rules = [ + # Rancher defaults — preserve existing rancher-monitoring behaviour. + { + source_matchers = ["severity = \"critical\""] + target_matchers = ["severity =~ \"warning|info\""] + equal = ["namespace", "alertname"] + }, + { + source_matchers = ["severity = \"warning\""] + target_matchers = ["severity = \"info\""] + equal = ["namespace", "alertname"] + }, + { + source_matchers = ["alertname = \"InfoInhibitor\""] + target_matchers = ["severity = \"info\""] + equal = ["namespace"] + }, + { target_matchers = ["alertname = \"InfoInhibitor\""] }, + # Suppress warning when critical fires for the same node. + { + source_matchers = ["severity = \"critical\""] + target_matchers = ["severity = \"warning\""] + equal = ["node"] + }, + # Suppress LonghornVolumeDegradedWarning when LonghornVolumeFaulted fires. 
+ { + source_matchers = ["alertname = \"LonghornVolumeFaulted\""] + target_matchers = ["alertname = \"LonghornVolumeDegradedWarning\""] + equal = ["volume"] + }, + # Suppress VirtLauncherContainerCreating when LonghornVolumeFaulted fires. + { + source_matchers = ["alertname = \"LonghornVolumeFaulted\""] + target_matchers = ["alertname = \"VirtLauncherContainerCreating\""] + equal = ["namespace"] + }, + ] + + templates = ["/etc/alertmanager/config/*.tmpl"] + }) +} + +# ── Alertmanager base config Secret ────────────────────────────────────────── +# Writes directly to the Secret that Prometheus Operator uses as the base +# Alertmanager configuration. The AlertmanagerConfig v1alpha1 CRD does not +# support googleChatConfigs (field silently dropped), so we bypass it entirely +# and write native Alertmanager YAML here. Prometheus Operator hot-reloads +# Alertmanager within ~30s of Secret changes. +# +# We use null_resource + kubectl apply instead of kubernetes_manifest because +# rancher-monitoring Helm pre-creates this Secret on every fresh cluster. +# kubernetes_manifest tries CREATE (POST) when a resource isn't in TF state, +# which fails with "already exists". kubectl apply is always create-or-update +# and is fully idempotent — no terraform import step required. 
+ +resource "null_resource" "alertmanager_base_config" { + triggers = { + config_hash = sha256(local.alertmanager_config_yaml) + tmpl_hash = sha256(local.rancher_defaults_tmpl) + } + + provisioner "local-exec" { + interpreter = ["/bin/bash", "-c"] + environment = { + KUBECONFIG = var.kubeconfig_path + AM_CONFIG_B64 = base64encode(local.alertmanager_config_yaml) + AM_TMPL_B64 = base64encode(local.rancher_defaults_tmpl) + } + command = <<-BASH + set -e + kubectl create secret generic alertmanager-rancher-monitoring-alertmanager \ + --context '${var.kubeconfig_context}' \ + -n '${local.ns}' \ + --from-file=alertmanager.yaml=<(base64 -d <<< "$AM_CONFIG_B64") \ + --from-file=rancher_defaults.tmpl=<(base64 -d <<< "$AM_TMPL_B64") \ + --dry-run=client -o yaml \ + | kubectl apply \ + --context '${var.kubeconfig_context}' \ + -f - + BASH + } +} + +# ── calert — Google Chat webhook forwarder ──────────────────────────────────── +# calert accepts Alertmanager webhook_configs payloads and forwards them to +# Google Chat. Deployed as a single-replica Deployment in the monitoring +# namespace. Config is stored in a Secret (contains the webhook URL). 
+ +resource "null_resource" "calert_config" { + triggers = { + config_hash = sha256(local.calert_config_toml) + tmpl_hash = sha256(local.calert_message_tmpl) + } + + provisioner "local-exec" { + interpreter = ["/bin/bash", "-c"] + environment = { + KUBECONFIG = var.kubeconfig_path + CONFIG_B64 = base64encode(local.calert_config_toml) + TMPL_B64 = base64encode(local.calert_message_tmpl) + } + command = <<-BASH + set -e + kubectl create secret generic calert-config \ + --context '${var.kubeconfig_context}' \ + -n '${local.ns}' \ + --from-file=config.toml=<(base64 -d <<< "$CONFIG_B64") \ + --from-file=message.tmpl=<(base64 -d <<< "$TMPL_B64") \ + --dry-run=client -o yaml \ + | kubectl apply \ + --context '${var.kubeconfig_context}' \ + -f - + BASH + } +} + +resource "kubernetes_manifest" "calert_deployment" { + manifest = { + apiVersion = "apps/v1" + kind = "Deployment" + metadata = { + name = "calert" + namespace = local.ns + labels = local.common_labels + } + spec = { + replicas = 1 + selector = { matchLabels = { app = "calert" } } + template = { + metadata = { + labels = { app = "calert" } + annotations = { + # Changing config or template triggers a rolling restart automatically. 
+ "checksum/config" = sha256("${local.calert_config_toml}${local.calert_message_tmpl}") + } + } + spec = { + containers = [{ + name = "calert" + image = "ghcr.io/mr-karan/calert:v2.3.0" + args = ["--config=/etc/calert/config.toml"] + ports = [{ containerPort = 6000 }] + volumeMounts = [{ + name = "config" + mountPath = "/etc/calert" + readOnly = true + }] + }] + volumes = [{ + name = "config" + secret = { secretName = "calert-config" } + }] + } + } + } + } + + depends_on = [null_resource.calert_config] +} + +resource "kubernetes_manifest" "calert_service" { + manifest = { + apiVersion = "v1" + kind = "Service" + metadata = { + name = "calert" + namespace = local.ns + labels = local.common_labels + } + spec = { + selector = { app = "calert" } + ports = [{ + name = "http" + port = 6000 + targetPort = 6000 + }] + } + } +} + +# ═══════════════════════════════════════════════════════════════════════════════ +# PrometheusRule 1 — Longhorn / Storage alerts +# ═══════════════════════════════════════════════════════════════════════════════ + +resource "kubernetes_manifest" "prometheus_rule_storage" { + manifest = { + apiVersion = "monitoring.coreos.com/v1" + kind = "PrometheusRule" + metadata = { + name = "${var.environment}-harvester-storage-alerts" + namespace = local.ns + labels = local.rule_labels + } + spec = { + groups = [ + { + name = "longhorn.storage" + rules = [ + { + alert = "LonghornVolumeFaulted" + expr = "longhorn_volume_robustness{robustness=\"faulted\"} == 1" + for = "2m" + labels = { severity = "critical" } + annotations = { + summary = "Longhorn volume {{ $labels.volume }} is faulted" + description = "Volume {{ $labels.volume }} is faulted (zero healthy replicas). VMs using this volume will have I/O errors. If disk eviction was in progress, immediately set evictionRequested=false on the source disk — eviction stops source replicas before destination finishes rebuilding." 
+ runbook_url = "${var.runbook_base_url}/LonghornVolumeFaulted" + } + }, + { + alert = "LonghornVolumeDegradedWarning" + expr = "longhorn_volume_robustness{robustness=\"degraded\"} == 1" + for = "15m" + labels = { severity = "warning" } + annotations = { + summary = "Longhorn volume {{ $labels.volume }} degraded >15m" + description = "Volume {{ $labels.volume }} has been degraded for 15+ min. Replica count below desired. Next disk failure risks data loss." + runbook_url = "${var.runbook_base_url}/LonghornVolumeDegraded" + } + }, + { + alert = "LonghornVolumeDegradedCritical" + expr = "longhorn_volume_robustness{robustness=\"degraded\"} == 1" + for = "60m" + labels = { severity = "critical" } + annotations = { + summary = "Longhorn volume {{ $labels.volume }} degraded >1h" + description = "Volume {{ $labels.volume }} degraded > 1h. Rebuild stalled or insufficient capacity. Check replica pod logs and disk schedulability." + runbook_url = "${var.runbook_base_url}/LonghornVolumeDegraded" + } + }, + { + alert = "LonghornVolumeReplicaCountLow" + expr = "longhorn_volume_replicas_count < longhorn_volume_spec_replicas_count AND longhorn_volume_actual_size > 0" + for = "10m" + labels = { severity = "warning" } + annotations = { + summary = "Longhorn volume {{ $labels.volume }} has fewer replicas than configured" + description = "Volume {{ $labels.volume }} has {{ $value }} running replicas, fewer than configured. Rebuild may be stalled. Check: kubectl get replicas.longhorn.io -n longhorn-system" + runbook_url = "${var.runbook_base_url}/LonghornVolumeReplicaCountLow" + } + }, + { + alert = "LonghornReplicaRebuildBacklog" + expr = "sum by (node) (longhorn_replica_rebuilding) > ${var.replica_rebuild_warning_count}" + for = "5m" + labels = { severity = "warning" } + annotations = { + summary = "Node {{ $labels.node }} has {{ $value }} replicas rebuilding simultaneously" + description = "Node {{ $labels.node }} has {{ $value }} replicas rebuilding simultaneously. 
Do NOT initiate disk evictions while this is firing — mass eviction with active rebuilds causes cascade failure (source replica stopped before destination is healthy)." + runbook_url = "${var.runbook_base_url}/LonghornReplicaRebuildBacklog" + } + }, + { + alert = "LonghornEvictionWithDegradedVolumes" + expr = "longhorn_disk_eviction_requested == 1 and on() count(longhorn_volume_robustness{robustness=\"degraded\"}) > 0" + for = "5m" + labels = { severity = "critical" } + annotations = { + summary = "Longhorn disk eviction active on {{ $labels.node }}/{{ $labels.disk }}" + description = "Disk eviction is active on {{ $labels.node }}/{{ $labels.disk }}. Eviction stops source replicas before destinations finish rebuilding. If any volumes are degraded (see LonghornVolumeDegradedWarning), pause immediately: kubectl patch nodes.longhorn.io {{ $labels.node }} -n longhorn-system --type=json -p='[{\"op\":\"replace\",\"path\":\"/spec/disks/{{ $labels.disk }}/evictionRequested\",\"value\":false}]'" + runbook_url = "${var.runbook_base_url}/LonghornEvictionWithDegradedVolumes" + } + }, + { + alert = "LonghornDiskUsageHigh" + expr = "(longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > ${var.disk_usage_warning_pct}" + for = "10m" + labels = { severity = "warning" } + annotations = { + summary = "Longhorn disk {{ $labels.disk }} on {{ $labels.node }} at {{ $value | printf \"%.1f\" }}%" + description = "Disk {{ $labels.disk }} on {{ $labels.node }} at {{ $value | printf \"%.1f\" }}% capacity. New replica scheduling will fail if disk reaches 100%." 
+ runbook_url = "${var.runbook_base_url}/LonghornDiskUsageHigh" + } + }, + { + alert = "LonghornDiskUsageCritical" + expr = "(longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > ${var.disk_usage_critical_pct}" + for = "5m" + labels = { severity = "critical" } + annotations = { + summary = "Longhorn disk {{ $labels.disk }} on {{ $labels.node }} critically full at {{ $value | printf \"%.1f\" }}%" + description = "Disk {{ $labels.disk }} on {{ $labels.node }} critically full at {{ $value | printf \"%.1f\" }}%. Immediate action required: check for ghost replicas (stopped replicas with no data dir that inflate storageScheduled)." + runbook_url = "${var.runbook_base_url}/LonghornDiskUsageCritical" + } + }, + { + alert = "LonghornDiskSchedulingDisabledLong" + expr = "longhorn_disk_schedulable == 0" + for = "60m" + labels = { severity = "warning" } + annotations = { + summary = "Longhorn disk {{ $labels.disk }} on {{ $labels.node }} has scheduling disabled >1h" + description = "Longhorn disk {{ $labels.disk }} on {{ $labels.node }} has allowScheduling=false for > 1h. Confirm this is intentional maintenance. New replicas will not be placed here." + runbook_url = "${var.runbook_base_url}/LonghornDiskSchedulingDisabled" + } + }, + { + alert = "LonghornReplicaUnhealthy" + expr = "count by (node) (longhorn_replica_state{state=~\"error|unknown\"} == 1) > 0" + for = "5m" + labels = { severity = "warning" } + annotations = { + summary = "{{ $value }} replicas in error/unknown state on {{ $labels.node }}" + description = "{{ $value }} Longhorn replicas in error or unknown state on {{ $labels.node }} for 5+ min. These are genuinely unhealthy (stopped is normal for powered-off VMs). Check Longhorn UI for affected volumes and consider deleting and rebuilding the replica." 
+ runbook_url = "${var.runbook_base_url}/LonghornReplicaUnhealthy" + } + }, + { + alert = "LonghornShareManagerNotRunning" + expr = "kube_pod_status_phase{pod=~\"share-manager-.*\", namespace=\"longhorn-system\", phase!=\"Running\"} == 1" + for = "3m" + labels = { severity = "critical" } + annotations = { + summary = "Longhorn share-manager {{ $labels.pod }} is not Running" + description = "Longhorn share-manager {{ $labels.pod }} is not Running. All RWX volumes it exports lose access. Harvester CSI translates ALL downstream PVCs to RWX+Block — this share-manager failure directly blocks hotplugged disk access for affected VMs." + runbook_url = "${var.runbook_base_url}/LonghornShareManagerNotRunning" + } + }, + ] + } + ] + } + } +} + +# ═══════════════════════════════════════════════════════════════════════════════ +# PrometheusRule 2 — KubeVirt / VM alerts +# ═══════════════════════════════════════════════════════════════════════════════ + +resource "kubernetes_manifest" "prometheus_rule_vm" { + manifest = { + apiVersion = "monitoring.coreos.com/v1" + kind = "PrometheusRule" + metadata = { + name = "${var.environment}-harvester-vm-alerts" + namespace = local.ns + labels = local.rule_labels + } + spec = { + groups = [ + { + name = "kubevirt.vm" + rules = [ + { + alert = "VirtLauncherPodStuck" + expr = "kube_pod_status_phase{pod=~\"virt-launcher-.*\", phase=\"Pending\"} == 1" + for = var.virt_launcher_stuck_for + labels = { severity = "critical" } + annotations = { + summary = "VM pod {{ $labels.pod }} in {{ $labels.namespace }} pending for >${var.virt_launcher_stuck_for}" + description = "VM pod {{ $labels.pod }} in {{ $labels.namespace }} pending for > ${var.virt_launcher_stuck_for}. Likely cause: Longhorn volume not attaching. Check: kubectl get volumeattachment | grep . A stale VolumeAttachment from a previous node (attached=true on wrong node) may be blocking CSI. Delete it ONLY after confirming the referencing pod is terminated." 
+ runbook_url = "${var.runbook_base_url}/VirtLauncherPodStuck" + } + }, + { + alert = "VirtLauncherContainerCreating" + expr = "kube_pod_container_status_waiting_reason{pod=~\"virt-launcher-.*\", reason=\"ContainerCreating\"} == 1" + for = var.virt_launcher_stuck_for + labels = { severity = "critical" } + annotations = { + summary = "VM pod {{ $labels.pod }} stuck in ContainerCreating" + description = "VM pod {{ $labels.pod }} stuck in ContainerCreating. 1. Check pod events: kubectl describe pod {{ $labels.pod }} -n {{ $labels.namespace }}. 2. Look for FailedAttachVolume → stale VolumeAttachment. 3. Look for FailedMount exit status 32 → share-manager node conflict (see HP-volume alerts)." + runbook_url = "${var.runbook_base_url}/VirtLauncherContainerCreating" + } + }, + { + alert = "VirtLauncherCrashLoop" + expr = "increase(kube_pod_container_status_restarts_total{pod=~\"virt-launcher-.*\"}[15m]) > 3" + for = "0m" + labels = { severity = "critical" } + annotations = { + summary = "VM pod {{ $labels.pod }} restarted {{ $value }} times in 15m" + description = "VM pod {{ $labels.pod }} restarted {{ $value }} times in 15m. If LonghornVolumeFaulted is also firing, root cause is VM I/O errors from faulted backing volume forcing virt-launcher termination." + runbook_url = "${var.runbook_base_url}/VirtLauncherCrashLoop" + } + }, + { + alert = "HpVolumePodNotRunning" + expr = "kube_pod_status_phase{pod=~\"hp-volume-.*\", namespace=\"cpd-dp\", phase!=\"Running\"} == 1" + for = var.hp_volume_stuck_for + labels = { severity = "critical" } + annotations = { + summary = "hp-volume pod {{ $labels.pod }} not Running for >${var.hp_volume_stuck_for}" + description = "hp-volume pod {{ $labels.pod }} not Running for >${var.hp_volume_stuck_for}. ALL hotplugged disks for the target VM are now unavailable. Key failure modes: 1. exit status 32: Longhorn engine pinned to different node than hp-volume pod. Root cause: Harvester CSI RWX+Block share-manager conflict. 
Engine node = share-manager node (controlled by share-manager controller, cannot be overridden externally). Fix: delete + recreate the PVC with node scheduling controlled. 2. FailedAttachVolume: stale VolumeAttachment on wrong node. Delete it. Check: kubectl get events -n cpd-dp --field-selector involvedObject.name={{ $labels.pod }}" + runbook_url = "${var.runbook_base_url}/HpVolumePodNotRunning" + } + }, + { + alert = "HpVolumeMapDeviceFailed" + expr = "kube_event_count{involvedObject_kind=\"Pod\", involvedObject_namespace=\"cpd-dp\", reason=\"FailedMount\"} > 0" + for = "0m" + labels = { severity = "critical" } + annotations = { + summary = "hp-volume pod reporting FailedMount (exit status 32)" + description = "hp-volume pod reporting mount failure. exit status 32 = NFS/Block mode conflict: share-manager exported volume as NFS filesystem, but NodePublishVolume expects a block device file at staging path. Only fix: PVC recreation." + runbook_url = "${var.runbook_base_url}/HpVolumeMapDeviceFailed" + } + }, + { + alert = "StaleVolumeAttachmentBlocking" + expr = "increase(kube_event_count{reason=\"FailedAttachVolume\", involvedObject_kind=\"Pod\"}[10m]) > 3" + for = "0m" + labels = { severity = "critical" } + annotations = { + summary = "Pod {{ $labels.involvedObject_name }} has >3 FailedAttachVolume events in 10m" + description = "Pod {{ $labels.involvedObject_name }} has >3 FailedAttachVolume events in 10m. A stale VolumeAttachment from a previous node is blocking CSI. Steps: (1) kubectl get volumeattachment | grep <pvc-name> (2) Confirm the pod referencing the old VA is terminated (3) kubectl delete volumeattachment <va-name>. Note: deleting a VA while referencing pod exists causes immediate recreation." 
+ runbook_url = "${var.runbook_base_url}/StaleVolumeAttachmentBlocking" + } + }, + ] + } + ] + } + } +} + +# ═══════════════════════════════════════════════════════════════════════════════ +# PrometheusRule 3 — Harvester node alerts +# ═══════════════════════════════════════════════════════════════════════════════ + +resource "kubernetes_manifest" "prometheus_rule_node" { + manifest = { + apiVersion = "monitoring.coreos.com/v1" + kind = "PrometheusRule" + metadata = { + name = "${var.environment}-harvester-node-alerts" + namespace = local.ns + labels = local.rule_labels + } + spec = { + groups = [ + { + name = "harvester.node" + rules = [ + { + alert = "HarvesterNodeNotReady" + expr = "kube_node_status_condition{condition=\"Ready\", status=\"true\"} == 0" + for = "1m" + labels = { severity = "critical" } + annotations = { + summary = "Harvester node {{ $labels.node }} NotReady" + description = "Harvester node {{ $labels.node }} NotReady for >1m. All VMs on this node are at risk. Longhorn replicas on this node will fault within 60s if it remains offline." + runbook_url = "${var.runbook_base_url}/HarvesterNodeNotReady" + } + }, + { + alert = "LonghornNodeOffline" + expr = "longhorn_node_status{condition=\"ready\"} == 0" + for = "2m" + labels = { severity = "critical" } + annotations = { + summary = "Longhorn reports {{ $labels.node }} offline" + description = "Longhorn reports {{ $labels.node }} offline. Volumes with replicas only on this node will degrade immediately." + runbook_url = "${var.runbook_base_url}/LonghornNodeOffline" + } + }, + { + alert = "NodeDiskPressure" + expr = "kube_node_status_condition{condition=\"DiskPressure\", status=\"true\"} == 1" + for = "2m" + labels = { severity = "warning" } + annotations = { + summary = "Node {{ $labels.node }} OS disk pressure" + description = "Node {{ $labels.node }} OS disk pressure (not Longhorn — this is the root filesystem). Kubelet will begin evicting pods. Check: df -h on the node." 
+ runbook_url = "${var.runbook_base_url}/NodeDiskPressure" + } + }, + { + alert = "NodeMemoryPressure" + expr = "kube_node_status_condition{condition=\"MemoryPressure\", status=\"true\"} == 1" + for = "2m" + labels = { severity = "warning" } + annotations = { + summary = "Node {{ $labels.node }} memory pressure" + description = "Node {{ $labels.node }} memory pressure. Pod evictions may begin." + runbook_url = "${var.runbook_base_url}/NodeMemoryPressure" + } + }, + { + alert = "NodeHighCPU" + expr = "100 - (avg by (node) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) > ${var.node_cpu_warning_pct}" + for = "15m" + labels = { severity = "warning" } + annotations = { + summary = "Node {{ $labels.node }} CPU at {{ $value | printf \"%.1f\" }}%" + description = "Node {{ $labels.node }} CPU at {{ $value | printf \"%.1f\" }}% for 15m. Correlate with LonghornReplicaRebuildBacklog — rebuild storms saturate CPU/IO." + runbook_url = "${var.runbook_base_url}/NodeHighCPU" + } + }, + { + alert = "NodeCPUCritical" + expr = "100 - (avg by (node) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) > ${var.node_cpu_critical_pct}" + for = "10m" + labels = { severity = "critical" } + annotations = { + summary = "Node {{ $labels.node }} CPU critical at {{ $value | printf \"%.1f\" }}%" + description = "Node {{ $labels.node }} CPU at {{ $value | printf \"%.1f\" }}% for 10m. Immediate investigation required." + runbook_url = "${var.runbook_base_url}/NodeCPUCritical" + } + }, + { + alert = "NodeHighMemory" + expr = "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > ${var.node_memory_warning_pct}" + for = "10m" + labels = { severity = "warning" } + annotations = { + summary = "Node {{ $labels.node }} memory at {{ $value | printf \"%.1f\" }}%" + description = "Node {{ $labels.node }} memory at {{ $value | printf \"%.1f\" }}%." 
+ runbook_url = "${var.runbook_base_url}/NodeHighMemory" + } + }, + { + alert = "NodeRootFSLow" + expr = "(node_filesystem_avail_bytes{mountpoint=\"/\", fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\", fstype!=\"tmpfs\"}) * 100 < 15" + for = "10m" + labels = { severity = "warning" } + annotations = { + summary = "Root filesystem on {{ $labels.instance }} < 15% free" + description = "Root filesystem on {{ $labels.instance }} < 15% free. This is NOT Longhorn disk space — affects kubelet, etcd, container images, logs." + runbook_url = "${var.runbook_base_url}/NodeRootFSLow" + } + }, + { + alert = "NodeRootFSCritical" + expr = "(node_filesystem_avail_bytes{mountpoint=\"/\", fstype!=\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\", fstype!=\"tmpfs\"}) * 100 < 5" + for = "5m" + labels = { severity = "critical" } + annotations = { + summary = "Root filesystem on {{ $labels.instance }} critically low (<5% free)" + description = "Root filesystem on {{ $labels.instance }} < 5% free. Immediate action required — kubelet, etcd, and container runtime may fail." + runbook_url = "${var.runbook_base_url}/NodeRootFSCritical" + } + }, + ] + } + ] + } + } +} + +# ═══════════════════════════════════════════════════════════════════════════════ +# AlertmanagerConfig — Google Chat routing +# +# Uses monitoring.coreos.com/v1alpha1 AlertmanagerConfig CRD. +# Requires Prometheus Operator >= 0.73 (kube-prometheus-stack >= 60.x, +# included in Rancher Monitoring 103.x / Rancher 2.13.x+). +# +# The AlertmanagerConfig is picked up by rancher-monitoring's Alertmanager via +# alertmanagerConfigSelector (matches all AlertmanagerConfigs in the monitoring +# namespace by default in Rancher Monitoring). +# ═══════════════════════════════════════════════════════════════════════════════ + +# AlertmanagerConfig CRD removed — routing and receivers are now in +# kubernetes_manifest.alertmanager_base_config (native Alertmanager YAML). 
+ +# ═══════════════════════════════════════════════════════════════════════════════ +# Grafana Dashboards — ConfigMaps in cattle-dashboards namespace +# Labels: grafana_dashboard = "1" (picked up by Grafana sidecar) +# ═══════════════════════════════════════════════════════════════════════════════ + +locals { + # ── Shared datasource template variable ───────────────────────────────────── + _ds_var = { + current = { selected = false, text = "Prometheus", value = "Prometheus" } + hide = 0 + includeAll = false + multi = false + name = "datasource" + options = [] + query = "prometheus" + refresh = 1 + type = "datasource" + label = "Datasource" + } + + # ── Dashboard 1: Storage Health ────────────────────────────────────────────── + dashboard_storage = { + title = "${var.environment} — Harvester Storage Health" + uid = "${var.environment}-harvester-storage" + schemaVersion = 38 + refresh = "30s" + tags = ["harvester", "storage", "longhorn", var.environment] + time = { from = "now-1h", to = "now" } + timezone = "browser" + templating = { list = [local._ds_var] } + annotations = { list = [] } + panels = [ + { + id = 1 + title = "Volume Robustness" + type = "table" + gridPos = { h = 8, w = 24, x = 0, y = 0 } + datasource = { type = "prometheus", uid = "$datasource" } + options = { + sortBy = [{ displayName = "volume", desc = false }] + footer = { show = false } + } + targets = [{ + datasource = { type = "prometheus", uid = "$datasource" } + expr = "longhorn_volume_robustness" + instant = true + legendFormat = "__auto" + refId = "A" + }] + transformations = [ + { + id = "labelsToFields" + options = { mode = "columns" } + }, + { + id = "organize" + options = { + renameByName = { volume = "Volume", namespace = "Namespace", robustness = "Robustness", node = "Node", state = "State" } + } + } + ] + fieldConfig = { + defaults = {} + overrides = [ + { + matcher = { id = "byName", options = "Robustness" } + properties = [{ + id = "custom.displayMode" + value = "color-background" 
+ }, { + id = "mappings" + value = [ + { type = "value", options = { healthy = { color = "green", index = 0 } } }, + { type = "value", options = { degraded = { color = "orange", index = 1 } } }, + { type = "value", options = { faulted = { color = "red", index = 2 } } }, + ] + }] + } + ] + } + }, + { + id = 2 + title = "Active Replica Rebuilds per Node" + type = "timeseries" + gridPos = { h = 8, w = 12, x = 0, y = 8 } + datasource = { type = "prometheus", uid = "$datasource" } + targets = [{ + datasource = { type = "prometheus", uid = "$datasource" } + expr = "sum by (node) (longhorn_replica_rebuilding)" + legendFormat = "{{node}}" + refId = "A" + }] + fieldConfig = { + defaults = { + color = { mode = "palette-classic" } + thresholds = { + mode = "absolute" + steps = [ + { color = "green", value = null }, + { color = "yellow", value = 3 }, + { color = "red", value = var.replica_rebuild_warning_count }, + ] + } + } + overrides = [] + } + options = { tooltip = { mode = "multi" } } + }, + { + id = 3 + title = "Disk Utilisation per Node/Disk" + type = "bargauge" + gridPos = { h = 8, w = 12, x = 12, y = 8 } + datasource = { type = "prometheus", uid = "$datasource" } + targets = [{ + datasource = { type = "prometheus", uid = "$datasource" } + expr = "(longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100" + instant = true + legendFormat = "{{node}} / {{disk}}" + refId = "A" + }] + fieldConfig = { + defaults = { + unit = "percent" + min = 0 + max = 100 + thresholds = { + mode = "absolute" + steps = [ + { color = "green", value = null }, + { color = "yellow", value = var.disk_usage_warning_pct }, + { color = "red", value = var.disk_usage_critical_pct }, + ] + } + } + overrides = [] + } + options = { orientation = "horizontal", reduceOptions = { calcs = ["lastNotNull"] } } + }, + { + id = 4 + title = "Disk Eviction State" + type = "stat" + gridPos = { h = 4, w = 8, x = 0, y = 16 } + datasource = { type = "prometheus", uid = "$datasource" } + targets = [{ + 
datasource = { type = "prometheus", uid = "$datasource" } + expr = "longhorn_disk_eviction_requested" + instant = true + legendFormat = "{{node}} / {{disk}}" + refId = "A" + }] + fieldConfig = { + defaults = { + mappings = [ + { type = "value", options = { "0" = { text = "No", color = "green" } } }, + { type = "value", options = { "1" = { text = "YES", color = "red" } } }, + ] + thresholds = { + mode = "absolute" + steps = [{ color = "green", value = null }, { color = "red", value = 1 }] + } + } + overrides = [] + } + options = { colorMode = "background", reduceOptions = { calcs = ["lastNotNull"] } } + }, + { + id = 5 + title = "Unhealthy Replica Count per Node (error/unknown)" + type = "stat" + gridPos = { h = 4, w = 8, x = 8, y = 16 } + datasource = { type = "prometheus", uid = "$datasource" } + targets = [{ + datasource = { type = "prometheus", uid = "$datasource" } + expr = "count by (node) (longhorn_replica_state{state=~\"error|unknown\"} == 1)" + instant = true + legendFormat = "{{node}}" + refId = "A" + }] + fieldConfig = { + defaults = { + thresholds = { + mode = "absolute" + steps = [ + { color = "green", value = null }, + { color = "orange", value = 5 }, + { color = "red", value = 10 }, + ] + } + } + overrides = [] + } + options = { colorMode = "background", reduceOptions = { calcs = ["lastNotNull"] } } + }, + { + id = 6 + title = "Share-Manager Pod Status" + type = "table" + gridPos = { h = 4, w = 8, x = 16, y = 16 } + datasource = { type = "prometheus", uid = "$datasource" } + targets = [{ + datasource = { type = "prometheus", uid = "$datasource" } + expr = "kube_pod_status_phase{pod=~\"share-manager-.*\", namespace=\"longhorn-system\"} == 1" + instant = true + legendFormat = "__auto" + refId = "A" + }] + transformations = [ + { id = "labelsToFields", options = { mode = "columns" } }, + { id = "organize", options = { renameByName = { pod = "Pod", node = "Node", phase = "Phase" } } } + ] + fieldConfig = { + defaults = {} + overrides = [ + { + matcher = { 
id = "byName", options = "Phase" } + properties = [{ + id = "custom.displayMode" + value = "color-background" + }, { + id = "mappings" + value = [ + { type = "value", options = { Running = { color = "green", index = 0 } } }, + { type = "value", options = { Pending = { color = "orange", index = 1 } } }, + { type = "value", options = { Failed = { color = "red", index = 2 } } }, + ] + }] + } + ] + } + options = { footer = { show = false } } + }, + ] + } + + # ── Dashboard 2: VM Health ─────────────────────────────────────────────────── + dashboard_vm = { + title = "${var.environment} — Harvester VM Health" + uid = "${var.environment}-harvester-vm" + schemaVersion = 38 + refresh = "30s" + tags = ["harvester", "kubevirt", "vm", var.environment] + time = { from = "now-1h", to = "now" } + timezone = "browser" + templating = { list = [local._ds_var] } + annotations = { list = [] } + panels = [ + { + id = 1 + title = "virt-launcher Pod Phase Breakdown" + type = "stat" + gridPos = { h = 6, w = 12, x = 0, y = 0 } + datasource = { type = "prometheus", uid = "$datasource" } + targets = [{ + datasource = { type = "prometheus", uid = "$datasource" } + expr = "count by (phase) (kube_pod_status_phase{pod=~\"virt-launcher-.*\"} == 1)" + instant = true + legendFormat = "{{phase}}" + refId = "A" + }] + fieldConfig = { + defaults = { + mappings = [] + thresholds = { mode = "absolute", steps = [{ color = "green", value = null }] } + } + overrides = [] + } + options = { colorMode = "background", reduceOptions = { calcs = ["lastNotNull"] } } + }, + { + id = 2 + title = "Pods NOT Running (virt-launcher + hp-volume)" + type = "table" + gridPos = { h = 6, w = 12, x = 12, y = 0 } + datasource = { type = "prometheus", uid = "$datasource" } + targets = [{ + datasource = { type = "prometheus", uid = "$datasource" } + expr = "kube_pod_status_phase{pod=~\"virt-launcher-.*|hp-volume-.*\", phase!~\"Running|Succeeded\"} == 1" + instant = true + legendFormat = "__auto" + refId = "A" + }] + 
transformations = [ + { id = "labelsToFields", options = { mode = "columns" } }, + { id = "organize", options = { renameByName = { pod = "Pod", namespace = "Namespace", phase = "Phase" } } } + ] + fieldConfig = { + defaults = {} + overrides = [ + { + matcher = { id = "byName", options = "Phase" } + properties = [{ id = "custom.displayMode", value = "color-background" }, { + id = "mappings", value = [ + { type = "value", options = { Pending = { color = "orange", index = 0 } } }, + { type = "value", options = { Failed = { color = "red", index = 1 } } }, + { type = "value", options = { Unknown = { color = "red", index = 2 } } }, + ] + }] + } + ] + } + options = { footer = { show = false } } + }, + { + id = 3 + title = "VolumeAttachment Count per Node (spike = stale VA)" + type = "timeseries" + gridPos = { h = 6, w = 12, x = 0, y = 6 } + datasource = { type = "prometheus", uid = "$datasource" } + targets = [{ + datasource = { type = "prometheus", uid = "$datasource" } + expr = "count by (node) (kube_volumeattachment_info)" + legendFormat = "{{node}}" + refId = "A" + }] + fieldConfig = { + defaults = { color = { mode = "palette-classic" } } + overrides = [] + } + options = { tooltip = { mode = "multi" } } + }, + { + id = 4 + title = "hp-volume Pod Status" + type = "table" + gridPos = { h = 6, w = 12, x = 12, y = 6 } + datasource = { type = "prometheus", uid = "$datasource" } + targets = [ + { + datasource = { type = "prometheus", uid = "$datasource" } + expr = "kube_pod_status_phase{pod=~\"hp-volume-.*\", namespace=\"cpd-dp\"} == 1" + instant = true + legendFormat = "__auto" + refId = "A" + }, + { + datasource = { type = "prometheus", uid = "$datasource" } + expr = "kube_pod_container_status_restarts_total{pod=~\"hp-volume-.*\", namespace=\"cpd-dp\"}" + instant = true + legendFormat = "__auto" + refId = "B" + }, + ] + transformations = [ + { id = "labelsToFields", options = { mode = "columns" } }, + { id = "organize", options = { renameByName = { pod = "Pod", node = 
"Node", phase = "Phase" } } } + ] + fieldConfig = { + defaults = {} + overrides = [] + } + options = { footer = { show = false } } + }, + { + id = 5 + title = "virt-launcher Restart Rate (15m window)" + type = "timeseries" + gridPos = { h = 6, w = 24, x = 0, y = 12 } + datasource = { type = "prometheus", uid = "$datasource" } + targets = [{ + datasource = { type = "prometheus", uid = "$datasource" } + expr = "increase(kube_pod_container_status_restarts_total{pod=~\"virt-launcher-.*\"}[15m]) > 0" + legendFormat = "{{pod}} ({{namespace}})" + refId = "A" + }] + fieldConfig = { + defaults = { + color = { mode = "palette-classic" } + thresholds = { + mode = "absolute" + steps = [{ color = "green", value = null }, { color = "red", value = 3 }] + } + } + overrides = [] + } + options = { tooltip = { mode = "multi" } } + }, + ] + } + + # ── Dashboard 3: Node Health ───────────────────────────────────────────────── + dashboard_node = { + title = "${var.environment} — Harvester Node Health" + uid = "${var.environment}-harvester-node" + schemaVersion = 38 + refresh = "30s" + tags = ["harvester", "node", var.environment] + time = { from = "now-1h", to = "now" } + timezone = "browser" + templating = { list = [local._ds_var] } + annotations = { list = [] } + panels = [ + { + id = 1 + title = "Node Ready Status" + type = "stat" + gridPos = { h = 4, w = 24, x = 0, y = 0 } + datasource = { type = "prometheus", uid = "$datasource" } + targets = [{ + datasource = { type = "prometheus", uid = "$datasource" } + expr = "kube_node_status_condition{condition=\"Ready\", status=\"true\"}" + instant = true + legendFormat = "{{node}}" + refId = "A" + }] + fieldConfig = { + defaults = { + mappings = [ + { type = "value", options = { "0" = { text = "NOT READY", color = "red" } } }, + { type = "value", options = { "1" = { text = "Ready", color = "green" } } }, + ] + thresholds = { + mode = "absolute" + steps = [{ color = "red", value = null }, { color = "green", value = 1 }] + } + } + overrides = 
[] + } + options = { colorMode = "background", reduceOptions = { calcs = ["lastNotNull"] }, orientation = "horizontal" } + }, + { + id = 2 + title = "CPU Utilisation per Node" + type = "timeseries" + gridPos = { h = 8, w = 12, x = 0, y = 4 } + datasource = { type = "prometheus", uid = "$datasource" } + targets = [{ + datasource = { type = "prometheus", uid = "$datasource" } + expr = "100 - (avg by (node) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)" + legendFormat = "{{node}}" + refId = "A" + }] + fieldConfig = { + defaults = { + unit = "percent" + min = 0 + max = 100 + thresholds = { + mode = "absolute" + steps = [ + { color = "green", value = null }, + { color = "yellow", value = var.node_cpu_warning_pct }, + { color = "red", value = var.node_cpu_critical_pct }, + ] + } + custom = { lineWidth = 2 } + } + overrides = [] + } + options = { tooltip = { mode = "multi" } } + }, + { + id = 3 + title = "Memory Utilisation per Node" + type = "timeseries" + gridPos = { h = 8, w = 12, x = 12, y = 4 } + datasource = { type = "prometheus", uid = "$datasource" } + targets = [{ + datasource = { type = "prometheus", uid = "$datasource" } + expr = "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100" + legendFormat = "{{node}}" + refId = "A" + }] + fieldConfig = { + defaults = { + unit = "percent" + min = 0 + max = 100 + thresholds = { + mode = "absolute" + steps = [ + { color = "green", value = null }, + { color = "yellow", value = var.node_memory_warning_pct }, + { color = "red", value = 95 }, + ] + } + custom = { lineWidth = 2 } + } + overrides = [] + } + options = { tooltip = { mode = "multi" } } + }, + { + id = 4 + title = "Root Filesystem Free % per Node" + type = "bargauge" + gridPos = { h = 6, w = 12, x = 0, y = 12 } + datasource = { type = "prometheus", uid = "$datasource" } + targets = [{ + datasource = { type = "prometheus", uid = "$datasource" } + expr = "(node_filesystem_avail_bytes{mountpoint=\"/\", fstype!=\"tmpfs\"} / 
node_filesystem_size_bytes{mountpoint=\"/\", fstype!=\"tmpfs\"}) * 100" + instant = true + legendFormat = "{{instance}}" + refId = "A" + }] + fieldConfig = { + defaults = { + unit = "percent" + min = 0 + max = 100 + thresholds = { + mode = "absolute" + steps = [ + { color = "red", value = null }, + { color = "yellow", value = 15 }, + { color = "green", value = 25 }, + ] + } + } + overrides = [] + } + options = { orientation = "horizontal", reduceOptions = { calcs = ["lastNotNull"] } } + }, + { + id = 5 + title = "Active Alert Count by Severity" + type = "timeseries" + gridPos = { h = 6, w = 12, x = 12, y = 12 } + datasource = { type = "prometheus", uid = "$datasource" } + targets = [{ + datasource = { type = "prometheus", uid = "$datasource" } + expr = "count by (severity) (ALERTS{alertstate=\"firing\"})" + legendFormat = "{{severity}}" + refId = "A" + }] + fieldConfig = { + defaults = { color = { mode = "palette-classic" }, custom = { lineWidth = 2 } } + overrides = [ + { matcher = { id = "byName", options = "critical" }, properties = [{ id = "color", value = { fixedColor = "red", mode = "fixed" } }] }, + { matcher = { id = "byName", options = "warning" }, properties = [{ id = "color", value = { fixedColor = "yellow", mode = "fixed" } }] }, + ] + } + options = { tooltip = { mode = "multi" } } + }, + ] + } +} + +# ── Grafana ConfigMaps ──────────────────────────────────────────────────────── + +resource "kubernetes_manifest" "grafana_dashboard_storage" { + manifest = { + apiVersion = "v1" + kind = "ConfigMap" + metadata = { + name = "${var.environment}-harvester-storage-health" + namespace = local.dns + labels = merge(local.common_labels, { + grafana_dashboard = "1" + }) + } + data = { + # Key name becomes the filename on disk in Grafana's sidecar — must be unique + # across all ConfigMaps, otherwise dashboards overwrite each other. 
+ "${var.environment}-harvester-storage-health.json" = jsonencode(local.dashboard_storage) + } + } +} + +resource "kubernetes_manifest" "grafana_dashboard_vm" { + manifest = { + apiVersion = "v1" + kind = "ConfigMap" + metadata = { + name = "${var.environment}-harvester-vm-health" + namespace = local.dns + labels = merge(local.common_labels, { + grafana_dashboard = "1" + }) + } + data = { + "${var.environment}-harvester-vm-health.json" = jsonencode(local.dashboard_vm) + } + } +} + +resource "kubernetes_manifest" "grafana_dashboard_node" { + manifest = { + apiVersion = "v1" + kind = "ConfigMap" + metadata = { + name = "${var.environment}-harvester-node-health" + namespace = local.dns + labels = merge(local.common_labels, { + grafana_dashboard = "1" + }) + } + data = { + "${var.environment}-harvester-node-health.json" = jsonencode(local.dashboard_node) + } + } +} diff --git a/modules/monitoring/outputs.tf b/modules/monitoring/outputs.tf new file mode 100644 index 0000000..0997713 --- /dev/null +++ b/modules/monitoring/outputs.tf @@ -0,0 +1,39 @@ +output "prometheus_rule_storage_name" { + description = "Name of the storage PrometheusRule CRD." + value = kubernetes_manifest.prometheus_rule_storage.manifest.metadata.name +} + +output "prometheus_rule_vm_name" { + description = "Name of the VM PrometheusRule CRD." + value = kubernetes_manifest.prometheus_rule_vm.manifest.metadata.name +} + +output "prometheus_rule_node_name" { + description = "Name of the node PrometheusRule CRD." + value = kubernetes_manifest.prometheus_rule_node.manifest.metadata.name +} + +output "alertmanager_config_name" { + description = "Name of the Alertmanager base config Secret." + value = "alertmanager-rancher-monitoring-alertmanager" +} + +output "grafana_dashboard_storage_name" { + description = "Name of the Grafana storage health dashboard ConfigMap." 
+ value = kubernetes_manifest.grafana_dashboard_storage.manifest.metadata.name +} + +output "grafana_dashboard_vm_name" { + description = "Name of the Grafana VM health dashboard ConfigMap." + value = kubernetes_manifest.grafana_dashboard_vm.manifest.metadata.name +} + +output "grafana_dashboard_node_name" { + description = "Name of the Grafana node health dashboard ConfigMap." + value = kubernetes_manifest.grafana_dashboard_node.manifest.metadata.name +} + +output "monitoring_namespace" { + description = "Namespace where all monitoring resources are deployed." + value = var.monitoring_namespace +} diff --git a/modules/monitoring/templates/google-chat.tmpl b/modules/monitoring/templates/google-chat.tmpl new file mode 100644 index 0000000..5dbcf38 --- /dev/null +++ b/modules/monitoring/templates/google-chat.tmpl @@ -0,0 +1,98 @@ +{{/* google-chat.tmpl — Alertmanager Go template for Google Chat card v2 format. + * + * This template is provided as a reference for the intended Google Chat card v2 + * payload format. The inline message templates in the AlertmanagerConfig resource + * are derived from the patterns defined here. + * + * To load this template into Alertmanager for use with a relay service: + * 1. Create a Secret in cattle-monitoring-system containing this file content. + * 2. Reference it in alertmanagerSpec.configSecret via Helm values. + * 3. Call {{ template "google_chat.card" . }} from your relay's template. 
+ *
+ * Google Chat card v2 API reference:
+ * https://developers.google.com/chat/api/guides/message-formats/cards
+ */}}
+
+{{/* ── Severity emoji helper ─────────────────────────────────────────────── */}}
+{{ define "google_chat.severity_emoji" -}}
+  {{- if eq .Labels.severity "critical" }}🔴{{ else if eq .Labels.severity "warning" }}🟡{{ else }}🔵{{ end -}}
+{{ end }}
+
+{{/* ── Single alert section widget. NOTE(review): Alertmanager's template Alert type exposes Labels/Annotations/StartsAt/EndsAt only — .Value below renders nothing unless the relay (calert) injects it; confirm against calert's payload. */}}
+{{ define "google_chat.alert_section" -}}
+{
+  "header": "{{ template "google_chat.severity_emoji" . }} {{ .Labels.alertname }}",
+  "widgets": [
+    {{- if .Labels.node }}
+    { "decoratedText": { "topLabel": "Node", "text": "{{ .Labels.node }}" } },
+    {{- end }}
+    {{- if .Labels.namespace }}
+    { "decoratedText": { "topLabel": "Namespace", "text": "{{ .Labels.namespace }}" } },
+    {{- end }}
+    {{- if .Labels.volume }}
+    { "decoratedText": { "topLabel": "Volume", "text": "{{ .Labels.volume }}" } },
+    {{- end }}
+    {{- if .Annotations.description }}
+    { "textParagraph": { "text": "{{ .Annotations.description | reReplaceAll "(.{500}).*" "${1}…" }}" } },
+    {{- end }}
+    {{- if .Value }}
+    { "decoratedText": { "topLabel": "Value", "text": "{{ .Value }}" } },
+    {{- end }}
+    { "decoratedText": { "topLabel": "Firing since", "text": "{{ .StartsAt.UTC.Format "2006-01-02 15:04:05 UTC" }}" } }
+    {{- if .Annotations.runbook_url }},
+    { "buttonList": { "buttons": [ { "text": "📖 Runbook", "onClick": { "openLink": { "url": "{{ .Annotations.runbook_url }}" } } } ] } }
+    {{- end }}
+  ]
+}
+{{- end }}
+
+{{/* ── Full card v2 payload (for grouped alerts) ────────────────────────── */}}
+{{ define "google_chat.card" -}}
+{
+  "cardsV2": [
+    {
+      "cardId": "{{ .GroupLabels.alertname }}-{{ .GroupLabels.severity }}",
+      "card": {
+        "header": {
+          "title": "{{ if eq .GroupLabels.severity "critical" }}🔴 CRITICAL{{ else if eq .GroupLabels.severity "warning" }}🟡 WARNING{{ else }}🔵 ALERT{{ end }}: {{ .GroupLabels.alertname }}",
+          "subtitle": "{{ len .Alerts.Firing }} firing{{ if gt (len .Alerts.Resolved) 0 }}, {{ len .Alerts.Resolved }} resolved{{ end }}",
+          "imageUrl": "https://prometheus.io/assets/prometheus_logo_grey.svg",
+          "imageType": "CIRCLE"
+        },
+        "sections": [
+          {{ range $i, $alert := .Alerts.Firing -}}
+          {{ if gt $i 0 }},{{ end }}
+          {{ template "google_chat.alert_section" $alert }}
+          {{- end }}
+          {{ if gt (len .Alerts.Resolved) 0 }},
+          {
+            "header": "✅ Resolved",
+            "widgets": [
+              {{ range $i, $alert := .Alerts.Resolved -}}
+              {{ if gt $i 0 }},{{ end }}
+              { "decoratedText": { "topLabel": "{{ $alert.Labels.alertname }}", "text": "Resolved at {{ $alert.EndsAt.UTC.Format "2006-01-02 15:04:05 UTC" }}" } }
+              {{- end }}
+            ]
+          }
+          {{ end }}
+        ]
+      }
+    }
+  ]
+}
+{{- end }}
+
+{{/* ── Plain-text message (used inline in AlertmanagerConfig.message field) */}}
+{{ define "google_chat.message" -}}
+{{ if eq .GroupLabels.severity "critical" }}🔴 *CRITICAL*{{ else if eq .GroupLabels.severity "warning" }}🟡 *WARNING*{{ else }}🔵 *ALERT*{{ end }} — *{{ .GroupLabels.alertname }}*
+
+{{ range .Alerts.Firing -}}
+• *{{ .Labels.alertname }}*{{ if .Labels.node }} | node: `{{ .Labels.node }}`{{ end }}{{ if .Labels.namespace }} | ns: `{{ .Labels.namespace }}`{{ end }}{{ if .Labels.volume }} | vol: `{{ .Labels.volume }}`{{ end }}
+  {{ if .Annotations.description }}{{ .Annotations.description | reReplaceAll "(.{300}).*" "${1}…" }}{{ end }}
+  Firing since: {{ .StartsAt.UTC.Format "2006-01-02 15:04 UTC" }}{{ if .Annotations.runbook_url }} | <{{ .Annotations.runbook_url }}|Runbook>{{ end }}
+
+{{ end -}}
+{{ if gt (len .Alerts.Resolved) 0 -}}
+✅ *Resolved:* {{ range .Alerts.Resolved }}{{ .Labels.alertname }} {{ end }}
+{{ end -}}
+{{- end }}
diff --git a/modules/monitoring/variables.tf b/modules/monitoring/variables.tf
new file mode 100644
index 0000000..edab966
--- /dev/null
+++ b/modules/monitoring/variables.tf
@@ -0,0 +1,104 @@
+# ── Required ──────────────────────────────────────────────────────────────────
+
+variable "environment" {
+  
type = string + description = "Environment name used for resource naming (e.g. \"lk\")." +} + +variable "kubeconfig_path" { + type = string + description = "Path to the Harvester kubeconfig file." +} + +variable "kubeconfig_context" { + type = string + description = "kubectl context name to use from the kubeconfig." +} + +variable "google_chat_webhook_url" { + type = string + sensitive = true + description = "Google Chat incoming webhook URL for alert notifications." +} + +variable "rancher_url" { + type = string + default = "" + description = "Base URL of the Rancher server (e.g. https://rancher.example.com). Combined with harvester_cluster_id to build Rancher-authenticated proxy URLs for the 'View Alert' and 'View in Prometheus' buttons. Leave empty to omit both buttons." +} + +variable "harvester_cluster_id" { + type = string + default = "" + description = "Rancher cluster ID for the Harvester cluster (e.g. c-v7gvt). Found in Rancher UI → Cluster Management → cluster row. Required when rancher_url is set." +} + +# ── Optional (monitoring namespaces) ───────────────────────────────────────── + +variable "monitoring_namespace" { + type = string + default = "cattle-monitoring-system" + description = "Namespace where rancher-monitoring (kube-prometheus-stack) runs." +} + +variable "dashboards_namespace" { + type = string + default = "cattle-dashboards" + description = "Namespace where Grafana picks up dashboard ConfigMaps (label grafana_dashboard=1)." +} + +# ── Optional (alert thresholds) ─────────────────────────────────────────────── + +variable "runbook_base_url" { + type = string + default = "https://wiki.internal/runbooks/harvester" + description = "Base URL prepended to each alert's runbook_url annotation." +} + +variable "disk_usage_warning_pct" { + type = number + default = 80 + description = "Longhorn disk usage percentage that triggers a warning alert." 
+}
+
+variable "disk_usage_critical_pct" {
+  type        = number
+  default     = 90
+  description = "Longhorn disk usage percentage that triggers a critical alert."
+}
+
+variable "replica_rebuild_warning_count" {
+  type        = number
+  default     = 5
+  description = "Number of concurrent replica rebuilds per node that triggers a warning."
+}
+
+variable "node_cpu_warning_pct" {
+  type        = number
+  default     = 85
+  description = "Node CPU utilisation percentage that triggers a warning alert."
+}
+
+variable "node_cpu_critical_pct" {
+  type        = number
+  default     = 95
+  description = "Node CPU utilisation percentage that triggers a critical alert."
+}
+
+variable "node_memory_warning_pct" {
+  type        = number
+  default     = 85
+  description = "Node memory utilisation percentage that triggers a warning alert."
+}
+
+variable "virt_launcher_stuck_for" {
+  type        = string
+  default     = "5m"
+  description = "Duration a virt-launcher pod must be Pending/ContainerCreating before alerting."
+}
+
+variable "hp_volume_stuck_for" {
+  type        = string
+  default     = "3m"
+  description = "Duration an hp-volume pod must be non-Running before alerting."
+}
diff --git a/modules/monitoring/versions.tf b/modules/monitoring/versions.tf
new file mode 100644
index 0000000..6bcdc57
--- /dev/null
+++ b/modules/monitoring/versions.tf
@@ -0,0 +1,14 @@
+terraform {
+  required_version = ">= 1.5"
+
+  required_providers {
+    kubernetes = {
+      source  = "hashicorp/kubernetes"
+      version = "~> 2.0"
+    }
+    null = {
+      source  = "hashicorp/null"
+      version = "~> 3.0"
+    }
+  }
+}