
Commit 0424517

Add Postgres(self-hosted) critical HA upstream failure detection rule (prequel-dev#51)
* add cre-2024-0077
* add test.log
* fix format issues
* update categories
* update tags for consistency with existing ones
* Update categories.yaml
* update tags
* Update postgres-self-hosted.yaml
1 parent 42b0216 commit 0424517

4 files changed: +148 −0 lines changed
Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
rules:
  - metadata:
      kind: prequel
      id: 5UD1RZxGC5LJQnVmAkV11A
      gen: 1
    cre:
      id: CRE-2025-0072
      severity: 1
      title: "Self-hosted PostgreSQL HA: WAL Streaming & HA Controller Crisis (Replication Slot Loss, Disk Full, Etcd Quorum Failure)"
      category: "postgres-ha"
      author: Prequel
      description: |
        Detects high-severity failures in self-hosted PostgreSQL high-availability clusters managed by Patroni, Zalando, or similar HA controllers.
        This rule targets catastrophic conditions that break replication or cluster consensus:
        - WAL streaming failures due to missing replication slots (usually after disk full or crash events)
        - Persistent errors resolving HA controller endpoints (etcd/consul) and loss of HA controller quorum
        - Disk saturation leading to WAL write errors and replication breakage
      cause: |
        - Replication slot(s) "patroniN" missing or cannot be created due to disk full or corruption
        - PostgreSQL unable to stream WAL (Write-Ahead Log) to replicas, causing FATAL errors
        - HA controller (etcd/consul) DNS/name resolution failures or full cluster outage (quorum lost)
        - Disk full on primary prevents WAL writes or checkpointing
      tags:
        - ha
        - patroni
        - zalando
        - etcd
        - replication
        - wal
        - storage
        - quorum
        - crash
        - data-loss
        - timeout
      mitigation: |
        PREVENTION:
        - Monitor disk usage on all PostgreSQL nodes, especially WAL and archive directories
        - Set up alerting for replication lag and missing replication slots
        - Ensure HA controllers (etcd/consul) are running on redundant, reliable nodes
        RESPONSE:
        - Restore or recreate missing replication slots
        - Free up disk space and restart affected PostgreSQL instances
        - Restore etcd/consul cluster quorum; check container/network status
        - Perform manual failover if automatic recovery fails
      references:
        - https://patroni.readthedocs.io/en/latest/
        - https://www.postgresql.org/docs/current/warm-standby.html
        - https://etcd.io/docs/latest/op-guide/clustering/
      applications:
        - name: postgresql
      impact: |
        - Replication breakage; secondary/standby nodes cannot receive WAL
        - Potential for split-brain, data loss, or full cluster outage
        - Cluster may lose HA/failover capability; clients disconnected
      impactScore: 10
      mitigationScore: 6
      reports: 2
    rule:
      set:
        event:
          source: cre.log.postgresql
        match:
          - regex: 'FATAL.*could not start WAL streaming: (replication slot|ERROR: replication slot) "patroni[0-9]+" does not exist|ERROR.*replication slot "patroni[0-9]+" does not exist|ERROR.*dd: error writing.*No space left on device|failed to resolve host etcd[0-9]: \[Errno -3\] Temporary failure in name resolution|Failed to get list of machines from http://etcd[0-9]:2379/v3beta: MaxRetryError|etcd\.EtcdConnectionFailed: No more machines in the cluster|Request to server http://[0-9.]+:2379 failed: (ReadTimeoutError|MaxRetryError)|watchprefix failed: ProtocolError.*InvalidChunkLength'
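
Note: as a quick sanity check (separate from how the Prequel rule engine actually evaluates this rule), the match.regex above can be exercised against a few lines from the test.log added in this commit. The minimal Python sketch below copies the pattern and sample log lines verbatim from the diff; the final NOTICE line is included as a control that should not match.

import re

# The rule's match.regex, copied verbatim and split across raw-string literals for readability.
PATTERN = re.compile(
    r'FATAL.*could not start WAL streaming: (replication slot|ERROR: replication slot) "patroni[0-9]+" does not exist'
    r'|ERROR.*replication slot "patroni[0-9]+" does not exist'
    r'|ERROR.*dd: error writing.*No space left on device'
    r'|failed to resolve host etcd[0-9]: \[Errno -3\] Temporary failure in name resolution'
    r'|Failed to get list of machines from http://etcd[0-9]:2379/v3beta: MaxRetryError'
    r'|etcd\.EtcdConnectionFailed: No more machines in the cluster'
    r'|Request to server http://[0-9.]+:2379 failed: (ReadTimeoutError|MaxRetryError)'
    r'|watchprefix failed: ProtocolError.*InvalidChunkLength'
)

# Sample lines taken from test.log; the last (NOTICE) line is expected not to match.
samples = [
    '2025-06-03T00:40:29Z FATAL postgres could not start WAL streaming: replication slot "patroni1" does not exist',
    '2025-06-03T00:50:00Z WARNING etcd failed to resolve host etcd2: [Errno -3] Temporary failure in name resolution',
    '2025-06-03T00:50:12Z ERROR etcd Request to server http://192.168.80.6:2379 failed: ReadTimeoutError',
    '2025-06-03T00:15:00Z NOTICE postgres relation "big_data" already exists, skipping',
]

for line in samples:
    print('match' if PATTERN.search(line) else 'no match', '|', line)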

rules/rules/cre-2025-0077/test.log

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
2025-06-03T00:40:29Z FATAL postgres could not start WAL streaming: replication slot "patroni1" does not exist
2025-06-03T00:40:29Z FATAL postgres could not start WAL streaming: replication slot "patroni1" does not exist
2025-06-03T00:40:34Z FATAL postgres could not start WAL streaming: replication slot "patroni1" does not exist
2025-06-03T00:40:29Z FATAL postgres could not start WAL streaming: replication slot "patroni3" does not exist
2025-06-03T00:40:29Z ERROR postgres dd: error writing '/var/lib/postgresql/data/bigfile': No space left on device
2025-06-03T00:40:29Z FATAL postgres could not start WAL streaming: replication slot "patroni3" does not exist
2025-06-03T00:40:34Z FATAL postgres could not start WAL streaming: replication slot "patroni3" does not exist
2025-06-03T00:40:29Z ERROR postgres replication slot "patroni3" does not exist
2025-06-03T00:40:29Z ERROR postgres replication slot "patroni1" does not exist
2025-06-03T00:40:29Z ERROR postgres replication slot "patroni1" does not exist
2025-06-03T00:40:34Z ERROR postgres replication slot "patroni3" does not exist
2025-06-03T00:40:34Z ERROR postgres replication slot "patroni1" does not exist

2025-06-03T00:50:00Z WARNING etcd failed to resolve host etcd2: [Errno -3] Temporary failure in name resolution
2025-06-03T00:50:00Z ERROR etcd Failed to get list of machines from http://etcd2:2379/v3beta: MaxRetryError("Connection refused")
2025-06-03T00:50:08Z WARNING etcd failed to resolve host etcd1: [Errno -3] Temporary failure in name resolution
2025-06-03T00:50:08Z ERROR etcd Failed to get list of machines from http://etcd1:2379/v3beta: MaxRetryError("Connection refused")
2025-06-03T00:50:12Z ERROR etcd Request to server http://192.168.80.6:2379 failed: ReadTimeoutError
2025-06-03T00:50:13Z ERROR etcd Request to server http://192.168.80.8:2379 failed: MaxRetryError("ConnectTimeout")
2025-06-03T00:50:15Z ERROR etcd Request to server http://192.168.80.7:2379 failed: MaxRetryError("ConnectTimeout")
2025-06-03T00:50:23Z WARNING etcd failed to resolve host etcd1: [Errno -3] Temporary failure in name resolution
2025-06-03T00:50:23Z ERROR etcd Failed to get list of machines from http://etcd1:2379/v3beta: MaxRetryError("Connection refused")

2025-06-02T18:28:52Z ERROR etcd watchprefix failed: <FailedPrecondition error: "grpc: the client connection is closing">
2025-06-02T18:28:52Z ERROR etcd Request to server http://172.23.0.4:2379 failed: MaxRetryError("Connection refused")
2025-06-02T18:28:52Z INFO patroni Reconnection allowed, trying another etcd node
2025-06-02T18:28:52Z INFO patroni Retrying on http://172.23.0.5:2379
2025-06-02T18:28:52Z INFO patroni Selected new etcd server http://172.23.0.5:2379
2025-06-02T18:28:52Z ERROR etcd Failed to get list of machines from http://172.23.0.4:2379/v3beta: MaxRetryError("Connection refused")
2025-06-02T18:28:52Z ERROR etcd Failed to get list of machines from http://172.23.0.3:2379/v3beta: MaxRetryError("Connection refused")
2025-06-02T18:28:52Z WARNING etcd Connected to Etcd node with term 2. Old known term 3. Switching again.
2025-06-02T18:28:52Z ERROR etcd Request to server http://172.23.0.5:2379 failed: StaleEtcdNode()
2025-06-02T18:28:52Z INFO patroni Reconnection allowed, trying yet another etcd node
2025-06-02T18:28:52Z ERROR etcd Failed to get list of machines from http://172.23.0.4:2379/v3beta: MaxRetryError("Connection refused")
2025-06-02T18:28:52Z ERROR etcd Failed to get list of machines from http://etcd2:2379/v3beta: MaxRetryError("Connection refused")
2025-06-02T18:28:52Z ERROR etcd Failed to get list of machines from http://172.23.0.3:2379/v3beta: MaxRetryError("Connection refused")
2025-06-02T18:28:54Z ERROR etcd Request to server http://172.23.0.4:2379 failed: MaxRetryError("ConnectTimeout")
2025-06-02T18:28:56Z ERROR etcd Request to server http://172.23.0.3:2379 failed: MaxRetryError("ConnectTimeout")
2025-06-02T18:28:57Z ERROR etcd Failed to get list of machines from http://172.23.0.4:2379/v3beta: MaxRetryError("ConnectTimeout")
2025-06-02T18:28:59Z ERROR etcd Failed to get list of machines from http://172.23.0.3:2379/v3beta: MaxRetryError("ConnectTimeout")
2025-06-02T18:29:02Z ERROR etcd Request to server http://172.23.0.5:2379 failed: ReadTimeoutError
2025-06-02T18:29:02Z INFO patroni Reconnection allowed, trying another etcd node
2025-06-02T18:29:02Z INFO patroni Retrying on http://172.23.0.3:2379

2025-06-02T14:47:18Z FATAL postgres could not start WAL streaming: replication slot "patroni1" does not exist
2025-06-02T14:47:18Z FATAL postgres could not start WAL streaming: replication slot "patroni1" does not exist
2025-06-02T14:47:23Z FATAL postgres could not start WAL streaming: replication slot "patroni1" does not exist
2025-06-02T14:47:19Z FATAL postgres could not start WAL streaming: replication slot "patroni2" does not exist
2025-06-02T14:47:19Z FATAL postgres could not start WAL streaming: replication slot "patroni2" does not exist
2025-06-02T14:47:24Z FATAL postgres could not start WAL streaming: replication slot "patroni2" does not exist
2025-06-02T14:47:18Z ERROR postgres replication slot "patroni1" does not exist
2025-06-02T14:47:18Z ERROR postgres replication slot "patroni1" does not exist
2025-06-02T14:47:19Z ERROR postgres replication slot "patroni2" does not exist
2025-06-02T14:47:19Z ERROR postgres replication slot "patroni2" does not exist
2025-06-02T14:47:23Z ERROR postgres replication slot "patroni1" does not exist
2025-06-02T14:47:24Z ERROR postgres replication slot "patroni2" does not exist

2025-06-03T00:15:00Z NOTICE postgres relation "big_data" already exists, skipping
2025-06-03T00:15:04Z INFO patroni Paused replica demo-patroni2
2025-06-03T00:15:14Z INFO patroni Waiting 10s for WAL to build up on leader
2025-06-03T00:15:15Z INFO patroni Resumed replica demo-patroni2

rules/tags/categories.yaml

Lines changed: 3 additions & 0 deletions
@@ -129,6 +129,9 @@ categories:
 - name: in-memory-database-problem
   displayName: In-Memory Database Problems
   description: Problems specific to in-memory data stores (e.g. Redis, Memcached)
+- name: postgres-ha
+  displayName: PostgreSQL High Availability
+  description: High-severity problems related to PostgreSQL in high-availability (HA) clusters, including replication, failover, WAL streaming, and HA controller outages.
 - name: kubernetes-storage-problems
   displayName: Kubernetes Storage Problems
   description: Problems related to container storage in Kubernetes

rules/tags/tags.yaml

Lines changed: 21 additions & 0 deletions
@@ -507,6 +507,27 @@ tags:
 - name: cluster-degradation
   displayName: Cluster Degradation
   description: Problems related to cluster availability
+- name: etcd
+  displayName: Etcd
+  description: Issues involving etcd clusters or consensus, especially in HA setups.
+- name: patroni
+  displayName: Patroni
+  description: Issues related to Patroni high-availability controller for PostgreSQL.
+- name: zalando
+  displayName: Zalando
+  description: Issues related to the Zalando Postgres Operator for HA Postgres.
+- name: ha
+  displayName: High Availability
+  description: Problems or incidents involving high-availability clusters, failover, or consensus.
+- name: replication
+  displayName: Replication
+  description: Replication failures, lag, or divergence in stateful systems.
+- name: wal
+  displayName: WAL
+  description: Issues with Write-Ahead Logging in databases.
+- name: quorum
+  displayName: Quorum
+  description: Loss or degradation of cluster quorum in distributed systems.
 - name: load-balancer-problem
   displayName: Load Balancer Problem
   description: Problems related to load balancers, such as misrouting, unhealthy backends, or configuration faults
