rules:
  - metadata:
      kind: prequel
      id: QsYzSA81AJSgnVqaQt4XGS
      version: "0.1.0"
    cre:
      id: CRE-2025-0082
      severity: 1
      title: "NATS JetStream HA failures: monitor goroutine, consumer stalls, and unsynced replicas"
      category: "message-queue-problem"
      author: Prequel
      description: |
        Detects high-availability failures in NATS JetStream clusters caused by:

        1. **Monitor goroutine failure** — after a node restarts, the Raft group fails to elect a leader
        2. **Consumer deadlock** — using DeliverPolicy=LastPerSubject + AckPolicy=Explicit with a low MaxAckPending
        3. **Unsynced replicas** — object store replication appears healthy, but data is lost or inconsistent between nodes

        These issues lead to invisible data loss, stalled consumers, or stream unavailability.
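      # Illustrative only: a minimal Go sketch of the scenario 2 consumer
      # settings using the nats.go JetStream API; the stream and durable names
      # here are hypothetical:
      #
      #   js, _ := nc.JetStream()
      #   js.AddConsumer("ORDERS", &nats.ConsumerConfig{
      #       Durable:       "workers",
      #       DeliverPolicy: nats.DeliverLastPerSubjectPolicy, // last message per subject
      #       AckPolicy:     nats.AckExplicitPolicy,           // every delivery must be ACKed
      #       MaxAckPending: 10,                               // low values like this risk the stall
      #   })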
      impact: |
        - **Scenario 1**: The stream becomes unusable (publishes/reads fail) because there is no Raft leader
        - **Scenario 2**: The consumer stalls with `context deadline exceeded`, and ACKs no longer advance the ack floor
        - **Scenario 3**: Object store data loss occurs silently across restarts despite healthy status
        All scenarios disrupt the reliability of JetStream-based systems and violate consistency expectations.
      cause: |
        - [Monitor failure]: The JetStream monitor goroutine did not start after a server restart
        - [Consumer stall]: ACK/sequence tracking becomes inconsistent under `LastPerSubject + Explicit ACK + low MaxAckPending`
        - [Replica drift]: Raft replicas fall out of sync silently (especially during a cold restart or recovery), leaving object store contents inconsistent between nodes
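      # Illustrative only: when embedding nats-server in Go, enable JetStream in
      # the server options before Start/ReadyForConnections rather than turning
      # it on after the server is running; the store dir and timeout here are
      # hypothetical:
      #
      #   opts := &server.Options{JetStream: true, StoreDir: "/data/jetstream"}
      #   ns, err := server.NewServer(opts)
      #   if err != nil { log.Fatal(err) }
      #   ns.Start()
      #   if !ns.ReadyForConnections(10 * time.Second) {
      #       log.Fatal("server not ready")
      #   }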
      mitigation: |
        - Always enable JetStream before calling ReadyForConnections
        - Use ProcessConfigString instead of enabling JetStream on the fly
        - Avoid MaxAckPending < 100 with DeliverPolicy=LastPerSubject
        - Run `nats stream-check --unsynced` checks regularly
        - To recover the object store:
          - Scale the stream down to replicas=1 and back up
          - Or remove the faulty replica via `nats stream cluster ... peer-remove`
        - Monitor for raftz and jsz inconsistencies in tooling
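      # Illustrative only: one possible recovery sequence for an unsynced object
      # store replica, following the mitigation steps above. The bucket-backing
      # stream name "OBJ_files" and the peer name are hypothetical, and flags
      # may differ between natscli versions:
      #
      #   nats stream edit OBJ_files --replicas 1    # collapse to a single replica
      #   nats stream edit OBJ_files --replicas 3    # scale back out to force a resync
      #   # or drop the faulty peer so it gets rebuilt:
      #   nats stream cluster peer-remove OBJ_files <peer-name>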
      mitigationScore: 8
      references:
        - "https://github.com/nats-io/nats-server/issues/6890"
        - "https://github.com/nats-io/nats-server/issues/6921"
        - "https://github.com/nats-io/nats-server/issues/6929"
      reports: 3
      version: "0.1.0"
      tags:
        - nats
        - jetstream
        - raft
        - ack-deadlock
        - unsynced-replica
      applications:
        - name: nats-server
          version: ">=2.11.3"
    rule:
      set:
        event:
          source: cre.log.nats
        match:
          - regex: "monitor goroutine not running|Fetch error: context deadline exceeded|UNSYNCED"
        negate:
          - "server shutdown"
          - "shutting down"
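    # Illustrative only: hypothetical log lines (not verbatim nats-server
    # output) showing what this matcher would and would not fire on:
    #
    #   fires:    JetStream cluster stream '$G > ORDERS' monitor goroutine not running
    #   fires:    Fetch error: context deadline exceeded
    #   fires:    Replica 'n2' reported UNSYNCED
    #   negated:  UNSYNCED state observed during server shutdown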