Commit 1e34924

Merge branch 'main' into k8s-exit-code

2 parents 2937d73 + c414539

File tree: 11 files changed (+329 −50 lines)

README.md

Lines changed: 9 additions & 48 deletions

@@ -60,58 +60,19 @@ New contributors are encouraged to join the problem detection community add new
 
 ## Rule Coverage
 
-### Tags
+### Tags & Categories
 
-* [Tags](rules/tags/tags.yaml)
-* [Categories](rules/tags/categories.yaml)
+CREs are organized by tags and categories.
+
+* [Tags](https://docs.prequel.dev/cres/public?view=tags)
+* [Categories](https://docs.prequel.dev/cres/public?view=categories)
 
 ### Technology Coverage
 
-The table below lists the technologies targeted by the existing CRE rules and the number of rules that describe each technology.
-
-<!-- BEGIN TECHNOLOGY TABLE -->
-| Technology | CRE Count | Documentation |
-|-----------|----------:|---------------|
-| [nginx](https://nginx.org/en/docs/) | 8 | https://nginx.org/en/docs/ |
-| [loki](https://grafana.com/docs/loki/latest/) | 6 | https://grafana.com/docs/loki/latest/ |
-| [otel-collector](https://opentelemetry.io/docs/collector/) | 4 | https://opentelemetry.io/docs/collector/ |
-| [kubernetes](https://kubernetes.io/docs/home/) | 4 | https://kubernetes.io/docs/home/ |
-| [aws](https://aws.amazon.com/) | 4 | https://aws.amazon.com/ |
-| [rabbitmq](https://www.rabbitmq.com/documentation.html) | 4 | https://www.rabbitmq.com/documentation.html |
-| [redis](https://redis.io/docs/) | 4 | https://redis.io/docs/ |
-| [grafana](https://grafana.com/docs/) | 4 | https://grafana.com/docs/ |
-| [ovn](https://www.ovn.org/docs/) | 3 | https://www.ovn.org/docs/ |
-| [datadog](https://docs.datadoghq.com/) | 3 | https://docs.datadoghq.com/ |
-| [neutron](https://docs.openstack.org/neutron/latest/) | 2 | https://docs.openstack.org/neutron/latest/ |
-| [openstack](https://docs.openstack.org/) | 2 | https://docs.openstack.org/ |
-| [keda](https://keda.sh/docs/) | 2 | https://keda.sh/docs/ |
-| [opentelemetry](https://opentelemetry.io/docs/) | 2 | https://opentelemetry.io/docs/ |
-| [postgres](https://www.postgresql.org/docs/) | 2 | https://www.postgresql.org/docs/ |
-| [dns](https://en.wikipedia.org/wiki/Domain_Name_System) | 2 | https://en.wikipedia.org/wiki/Domain_Name_System |
-| [memcached](https://memcached.org/) | 2 | https://memcached.org/ |
-| [prometheus](https://prometheus.io/docs/) | 2 | https://prometheus.io/docs/ |
-| [karpenter](https://karpenter.sh/docs/) | 2 | https://karpenter.sh/docs/ |
-| [cws](https://docs.datadoghq.com/cloud_workload_security/) | 1 | https://docs.datadoghq.com/cloud_workload_security/ |
-| [postgresql](https://www.postgresql.org/docs/) | 1 | https://www.postgresql.org/docs/ |
-| [nfs](https://wiki.linux-nfs.org/wiki/) | 1 | https://wiki.linux-nfs.org/wiki/ |
-| [nvidia](https://docs.nvidia.com/) | 1 | https://docs.nvidia.com/ |
-| [helm](https://helm.sh/docs/) | 1 | https://helm.sh/docs/ |
-| [temporal](https://docs.temporal.io/) | 1 | https://docs.temporal.io/ |
-| [slurm](https://slurm.schedmd.com/documentation.html) | 1 | https://slurm.schedmd.com/documentation.html |
-| [slurmdbd](https://slurm.schedmd.com/slurmdbd.html) | 1 | https://slurm.schedmd.com/slurmdbd.html |
-| [mysql](https://dev.mysql.com/doc/) | 1 | https://dev.mysql.com/doc/ |
-| [redis-cli](https://redis.io/docs/ui/cli/) | 1 | https://redis.io/docs/ui/cli/ |
-| [kubelet](https://kubernetes.io/docs/concepts/architecture/nodes/#kubelet) | 1 | https://kubernetes.io/docs/concepts/architecture/nodes/#kubelet |
-| [redis-py](https://redis-py.readthedocs.io/en/stable/) | 1 | https://redis-py.readthedocs.io/en/stable/ |
-| [spicedb](https://spicedb.dev/) | 1 | https://spicedb.dev/ |
-| [celery](https://docs.celeryq.dev/en/stable/) | 1 | https://docs.celeryq.dev/en/stable/ |
-| [kombu](https://docs.celeryq.dev/projects/kombu/en/stable/) | 1 | https://docs.celeryq.dev/projects/kombu/en/stable/ |
-| [vpc-cni](https://docs.aws.amazon.com/eks/latest/userguide/pod-networking.html) | 1 | https://docs.aws.amazon.com/eks/latest/userguide/pod-networking.html |
-| [csi](https://kubernetes-csi.github.io/docs/) | 1 | https://kubernetes-csi.github.io/docs/ |
-| [terraform](https://developer.hashicorp.com/terraform/docs) | 1 | https://developer.hashicorp.com/terraform/docs |
-| [ovsdb](https://docs.openvswitch.org/en/latest/ref/ovsdb/) | 1 | https://docs.openvswitch.org/en/latest/ref/ovsdb/ |
-| [eks](https://docs.aws.amazon.com/eks/) | 1 | https://docs.aws.amazon.com/eks/ |
-| [gke](https://cloud.google.com/kubernetes-engine/docs/) | 1 | https://cloud.google.com/kubernetes-engine/docs/ |
+CREs exist for both popular and obscure projects.
+
+* [CREs by Technology](https://docs.prequel.dev/cres/public?view=technologies)
 
 ## Join the community!

rules/cre-2025-0102/redpanda-test-error.yaml renamed to rules/cre-2025-0102/redpanda-quorum-error.yaml

Lines changed: 3 additions & 1 deletion

@@ -67,7 +67,9 @@ rules:
       reports: 1
     rule:
       set:
+        window: 10s
        event:
          source: cre.log.redpanda
        match:
-          - regex: 'failure|leaving all raft groups|down|CRITICAL|Multiple nodes unresponsive|Low available memory|health degraded'
+          - value: 'Marking node as down'
+          - value: 'Not enough live replicas to form quorum'
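The change above swaps one broad alternation regex for two exact phrases. A minimal sketch of why this narrows matching (assuming `value` denotes a literal substring match; the sample log lines below are invented for illustration, not taken from Redpanda):

```python
import re

# Invented sample log lines: one benign, two genuine failure signals.
lines = [
    "INFO  rpc - Shutting down connection, node stepping down",
    "WARN  cluster - Marking node as down",
    "ERROR raft - Not enough live replicas to form quorum",
]

# Old pattern: broad alternation; the bare word "down" matches almost anywhere.
old = re.compile(
    r"failure|leaving all raft groups|down|CRITICAL|"
    r"Multiple nodes unresponsive|Low available memory|health degraded"
)

# New patterns: exact phrases, assuming `value` means literal substring match.
new = ["Marking node as down", "Not enough live replicas to form quorum"]

old_hits = [l for l in lines if old.search(l)]
new_hits = [l for l in lines if any(v in l for v in new)]

# The broad regex fires on the benign "stepping down" line but misses the
# quorum error; the exact phrases hit only the two real failure signals.
print(len(old_hits), len(new_hits))
```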

rules/cre-2025-0126/mongodb-primary-election-failure.yaml

Lines changed: 1 addition & 1 deletion

@@ -4,7 +4,7 @@ rules:
       id: 5UD1RZxGC5LJQnVmAkV11B
       gen: 1
     cre:
-      id: CRE-2025-0108
+      id: CRE-2025-0126
      severity: 1
      title: "MongoDB Replica Set Primary Election Failure"
      category: "database-problem"
Lines changed: 70 additions & 0 deletions

@@ -0,0 +1,70 @@
+rules:
+  - metadata:
+      kind: prequel
+      id: SD8xK2mN9pQzYvWr3aLfJ7
+      hash: XpQ9Lm4Zk8TnVb2Ry6HwGs
+    cre:
+      id: CRE-2025-0162
+      severity: 1
+      title: "Stable Diffusion WebUI CUDA Out of Memory Crash"
+      category: "memory-problem"
+      author: Prequel Community
+      description: |
+        Detects critical CUDA out of memory errors in Stable Diffusion WebUI that cause image generation failures and application crashes. This occurs when GPU VRAM is exhausted during model loading or image generation, resulting in complete task failure and potential WebUI instability.
+      cause: |
+        - Insufficient GPU VRAM for requested image resolution or batch size
+        - Memory fragmentation preventing large contiguous allocations
+        - Model loading exceeding available VRAM capacity
+        - Concurrent GPU processes consuming memory
+        - High-resolution image generation without memory optimization flags
+      impact: |
+        - Complete image generation failure
+        - WebUI crash requiring restart
+        - Loss of in-progress generation work
+        - Potential GPU driver instability
+        - Service unavailability for users
+      tags:
+        - memory
+        - nvidia
+        - crash
+        - out-of-memory
+        - configuration
+      mitigation: |
+        IMMEDIATE ACTIONS:
+        - Restart Stable Diffusion WebUI
+        - Clear GPU memory: nvidia-smi --gpu-reset
+        - Add memory optimization flags: --medvram or --lowvram
+        CONFIGURATION FIXES:
+        - For 4-6GB VRAM: Add --medvram to webui-user.bat
+        - For 2-4GB VRAM: Add --lowvram to webui-user.bat
+        - Enable xformers: --xformers for memory efficiency
+        - Add --always-batch-cond-uncond for batch processing
+        RUNTIME ADJUSTMENTS:
+        - Reduce image resolution (512x512 instead of 1024x1024)
+        - Decrease batch size to 1
+        - Lower batch count for multiple generations
+        - Set PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.9,max_split_size_mb:512
+        PREVENTION:
+        - Monitor GPU memory usage with nvidia-smi
+        - Implement gradual resolution scaling
+        - Use cloud services for high-resolution generation
+        - Upgrade to GPU with minimum 8GB VRAM
+      references:
+        - https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/12992
+        - https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/9770
+        - https://github.com/CompVis/stable-diffusion/issues/39
+      applications:
+        - name: stable-diffusion-webui
+          version: ">=1.0.0"
+      impactScore: 8
+      mitigationScore: 7
+      reports: 15
+    rule:
+      set:
+        window: 120s
+        event:
+          source: cre.log.stable-diffusion
+        match:
+          - regex: 'OutOfMemoryError.*CUDA out of memory'
+          - regex: 'CUDA out of memory.*Tried to allocate'
+          - regex: 'model failed to load.*OutOfMemoryError'
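The rule's three regexes can be sanity-checked against lines modeled on the accompanying test.log. A minimal sketch (sample lines paraphrased from the test log; a line is considered a hit if any of the three patterns matches):

```python
import re

# The three detection patterns from the rule above.
patterns = [
    r"OutOfMemoryError.*CUDA out of memory",
    r"CUDA out of memory.*Tried to allocate",
    r"model failed to load.*OutOfMemoryError",
]

# Sample lines modeled on the accompanying test.log.
lines = [
    "torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB",
    "Stable Diffusion model failed to load: OutOfMemoryError",
    "Attempting to clear cache...",
]

# A line is a hit if any pattern matches; the benign cache line should not fire.
matched = [l for l in lines if any(re.search(p, l) for p in patterns)]
print(len(matched))  # 2
```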

rules/cre-2025-0162/test.log

Lines changed: 14 additions & 0 deletions

@@ -0,0 +1,14 @@
+2025-08-29 14:23:45.123 [ERROR] Loading model stable-diffusion-v1.5
+2025-08-29 14:23:47.456 [INFO] Model weights: 4.27 GB
+2025-08-29 14:23:48.789 [INFO] Allocating GPU memory...
+2025-08-29 14:23:49.012 [ERROR] torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB (GPU 0; 6.00 GiB total capacity; 4.50 GiB already allocated; 1.20 GiB free; 4.80 GiB reserved in total by PyTorch)
+2025-08-29 14:23:49.013 [ERROR] RuntimeError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU 0 has a total capacity of 6.00 GiB of which 1.20 GiB is free. Process 12345 has 4.50 GiB memory in use.
+2025-08-29 14:23:49.014 [CRITICAL] Stable Diffusion model failed to load: OutOfMemoryError
+2025-08-29 14:23:49.015 [ERROR] CUDA error: out of memory
+2025-08-29 14:23:49.016 [ERROR] GPU 0 has a total capacity of 6.00 GiB of which 1.20 GiB is free. Allocation failed.
+2025-08-29 14:23:49.017 [ERROR] Failed to generate image: CUDA out of memory
+2025-08-29 14:23:49.018 [INFO] Attempting to clear cache...
+2025-08-29 14:23:50.123 [INFO] Cache cleared, retrying...
+2025-08-29 14:23:51.456 [ERROR] torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.50 GiB
+2025-08-29 14:23:51.457 [CRITICAL] Image generation failed after retry
+2025-08-29 14:23:51.458 [ERROR] WebUI shutting down due to memory error
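The "Tried to allocate N GiB" figures in lines like these are machine-extractable, which is useful when tuning `--medvram`/`--lowvram` thresholds. A small sketch over two lines copied from the test.log above:

```python
import re

# Two torch OOM lines copied from the test.log above.
log = (
    "2025-08-29 14:23:49.012 [ERROR] torch.cuda.OutOfMemoryError: CUDA out of memory. "
    "Tried to allocate 2.00 GiB (GPU 0; 6.00 GiB total capacity; 4.50 GiB already allocated; "
    "1.20 GiB free; 4.80 GiB reserved in total by PyTorch)\n"
    "2025-08-29 14:23:51.456 [ERROR] torch.cuda.OutOfMemoryError: CUDA out of memory. "
    "Tried to allocate 1.50 GiB\n"
)

# Pull out each failed allocation size in GiB.
sizes = [float(m) for m in re.findall(r"Tried to allocate ([0-9.]+) GiB", log)]
print(sizes)  # [2.0, 1.5]
```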
Lines changed: 74 additions & 0 deletions

@@ -0,0 +1,74 @@
+rules:
+  - cre:
+      id: CRE-2025-0179
+      severity: 0
+      title: N8N Workflow Silent Data Loss During Execution
+      category: workflow-automation-problem
+      author: Claude Code Assistant
+      description: |
+        N8N workflow automation platform experiences critical silent data loss where items
+        disappear between workflow nodes without generating error messages. This high-severity
+        issue affects long-running workflows (60-115+ minutes) and can cause workflows to
+        randomly cancel mid-execution, leading to incomplete processing and data integrity
+        problems. Items silently vanish between nodes, with different item counts across
+        the workflow pipeline, making the issue particularly dangerous for production systems
+        that rely on complete data processing.
+      cause: |
+        * Workflow execution engine fails to properly track items between nodes in long-running workflows
+        * Memory management issues during extended workflow processing causing item references to be lost
+        * Race conditions in the worker queue system when handling multiple concurrent items
+        * Node-to-node data transfer mechanisms failing silently under certain load conditions
+        * Queue worker timeout or resource contention causing partial item processing without error reporting
+        * Database transaction issues where some items fail to persist between workflow stages
+      tags:
+        - n8n
+        - workflow-automation
+        - data-loss
+        - silent-failure
+        - production-critical
+        - data-integrity
+        - public
+      mitigation: |
+        - **Implement workflow item counting checks** - Add validation nodes between critical
+          processing steps to verify item counts match expected values
+        - **Enable comprehensive execution logging** - Set N8N_LOG_LEVEL to debug and
+          EXECUTIONS_DATA_SAVE_ON_SUCCESS to 'all' to capture detailed execution data
+        - **Add workflow timeout monitoring** - Monitor executions that cancel around the 21-23
+          minute mark and implement retry mechanisms for failed workflows
+        - **Implement data integrity validation** - Add checksum or validation steps at
+          workflow start/end to detect silent data loss
+        - **Use error handling workflows** - Configure error workflows to capture and log
+          execution failures, even when the main workflow fails silently
+        - **Monitor execution metrics** - Set up alerting on workflow completion rates and
+          item processing inconsistencies
+        - **Consider workflow segmentation** - Break long workflows into smaller, more
+          manageable chunks to reduce exposure to the data loss issue
+      references:
+        - https://github.com/n8n-io/n8n/issues/14909
+        - https://docs.n8n.io/flow-logic/error-handling/
+        - https://community.n8n.io/t/workflow-randomly-cancels-mid-execution-without-error-data-items-silently-dropped-between-nodes/51141
+      applications:
+        - name: n8n
+          version: ">= 1.90.0"
+          processName: n8n
+          containerName: n8n
+      impact: |
+        Silent data loss in workflow automation can cause critical business processes to fail
+        without detection, leading to incomplete data processing, missing business transactions,
+        failed integrations, and potential compliance violations. The silent nature makes it
+        extremely difficult to detect and troubleshoot, potentially causing weeks or months
+        of data integrity issues before discovery.
+      impactScore: 9
+      mitigationScore: 7
+    metadata:
+      kind: prequel
+      id: N8nSilentDataLossDetection919
+      gen: 1
+    rule:
+      sequence:
+        window: 120s
+        event:
+          source: cre.log.n8n
+        order:
+          - regex: "(cancelled mid-execution|execution terminated unexpectedly|workflow.*cancelled|Execution.*cancelled)"
+          - regex: "(silent data loss detected|data.*loss|itemsLost|dataIntegrityIssue.*true|Items processed inconsistently|Data integrity check failed|Expected [0-9]+ items, found [0-9]+ items)"
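Unlike the `set` rules above, this is a `sequence` rule: its two patterns must fire in order. A minimal sketch of that ordering constraint (assuming `sequence`/`order` means "second pattern matches only after the first has"; the window check is omitted, and the sample lines are taken from the accompanying test.log):

```python
import re

# The two ordered patterns from the rule above.
order = [
    re.compile(r"(cancelled mid-execution|execution terminated unexpectedly"
               r"|workflow.*cancelled|Execution.*cancelled)"),
    re.compile(r"(silent data loss detected|data.*loss|itemsLost"
               r"|dataIntegrityIssue.*true|Items processed inconsistently"
               r"|Data integrity check failed"
               r"|Expected [0-9]+ items, found [0-9]+ items)"),
]

# Sample lines from the accompanying test.log.
lines = [
    "WARN: Execution exec_384574 cancelled mid-execution after 55 minutes",
    "ERROR: Data integrity check failed - Items processed inconsistently across nodes",
    "ERROR: Expected 150 items, found 127 items at completion",
]

# Walk the log once, advancing to the next pattern only after the current
# one fires; the rule triggers when every pattern has matched in order.
idx = 0
for line in lines:
    if idx < len(order) and order[idx].search(line):
        idx += 1

print(idx == len(order))  # True: both patterns matched, in order
```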

rules/cre-2025-0179/test.log

Lines changed: 10 additions & 0 deletions

@@ -0,0 +1,10 @@
+Aug 27 18:30:29 n8n[1234]: INFO: Starting workflow execution exec_384574 for workflow workflow_9084
+Aug 27 18:35:29 n8n[1234]: DEBUG: Node processing started - HTTP Request node
+Aug 27 18:45:29 n8n[1234]: INFO: Processing 150 items through workflow pipeline
+Aug 27 18:53:29 n8n[1234]: DEBUG: Node completed with 142 items (expected 150)
+Aug 27 19:05:29 n8n[1234]: DEBUG: Transform node processing remaining items
+Aug 27 19:25:29 n8n[1234]: WARN: Execution exec_384574 cancelled mid-execution after 55 minutes
+Aug 27 19:25:44 n8n[1234]: ERROR: Data integrity check failed - Items processed inconsistently across nodes
+Aug 27 19:25:49 n8n[1234]: ERROR: Expected 150 items, found 127 items at completion
+Aug 27 19:26:15 n8n[1234]: CRITICAL: Massive data loss detected - Expected 500 items, found 75 items
+Aug 27 19:26:20 n8n[1234]: ERROR: Critical workflow failure detected - 85% data loss in processing pipeline
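The "Expected N items, found M items" lines carry enough structure to compute the loss percentage directly, which is how the log's final "85% data loss" figure checks out. A small sketch over two lines copied from the test.log above:

```python
import re

# Two item-count lines copied from the test.log above.
lines = [
    "Aug 27 19:25:49 n8n[1234]: ERROR: Expected 150 items, found 127 items at completion",
    "Aug 27 19:26:15 n8n[1234]: CRITICAL: Massive data loss detected - Expected 500 items, found 75 items",
]

# Extract expected/found counts and compute the percentage of items lost.
losses = []
for line in lines:
    m = re.search(r"Expected (\d+) items, found (\d+) items", line)
    if m:
        expected, found = int(m.group(1)), int(m.group(2))
        losses.append(round(100 * (expected - found) / expected, 1))

print(losses)  # [15.3, 85.0] — the 500 -> 75 drop matches the log's "85% data loss"
```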
Lines changed: 68 additions & 0 deletions

@@ -0,0 +1,68 @@
+rules:
+  - cre:
+      id: CRE-2025-0200
+      severity: 0
+      title: AutoGPT Recursive Self-Analysis Loop Leading to Token Exhaustion and System Crash
+      category: infinite-loop-problem
+      author: prequel
+      description: |
+        - AutoGPT enters an infinite recursive loop when attempting to analyze and fix its own execution errors
+        - The agent repeatedly tries to debug its own code, spawning new analysis tasks for each failure
+        - Each iteration consumes API tokens and memory, eventually exhausting resources
+        - The loop accelerates as error messages grow longer, consuming tokens exponentially
+        - System becomes unresponsive and crashes with out-of-memory errors or API rate limit failures
+      cause: |
+        - AutoGPT's autonomous reasoning incorrectly identifies its own execution as a problem to solve
+        - Lack of loop detection mechanisms allows unlimited recursive task spawning
+        - Error context accumulation causes exponential growth in prompt size
+        - Missing safeguards for self-referential task creation
+        - Insufficient resource monitoring and circuit breakers for runaway processes
+      tags:
+        - autogpt
+        - infinite-loop
+        - token-exhaustion
+        - autonomous-agents
+        - llm
+        - openai
+        - recursive-analysis
+        - critical-failure
+        - memory-exhaustion
+        - crash-loop
+        - rate-limiting
+      mitigation: |
+        - Implement loop detection to identify and break recursive self-analysis patterns
+        - Add resource consumption thresholds (tokens, memory, API calls) with automatic shutdown
+        - Create task depth limits to prevent unlimited recursion
+        - Implement circuit breakers that trigger after repeated similar failures
+        - Add explicit blacklist for self-referential task creation
+        - Monitor token usage rate and implement exponential backoff
+        - Use separate monitoring process to detect and kill runaway AutoGPT instances
+        - Implement task deduplication to prevent identical recursive operations
+      references:
+        - https://github.com/Significant-Gravitas/AutoGPT/issues/1994
+        - https://github.com/Significant-Gravitas/AutoGPT/issues/3766
+        - https://github.com/Significant-Gravitas/AutoGPT/issues/1543
+        - https://jina.ai/news/auto-gpt-unmasked-hype-hard-truths-production-pitfalls/
+      applications:
+        - name: autogpt
+          version: ">=0.3.0"
+        - name: openai
+          version: ">=0.27.0"
+      impact: Complete system failure with resource exhaustion, potential financial losses from API overconsumption
+      impactScore: 9
+      mitigationScore: 3
+      reports: 15
+    metadata:
+      kind: prequel
+      id: 8qy5Et9NbNGgGxhBP7umKa
+      gen: 1
+    rule:
+      set:
+        window: 30s
+        event:
+          source: cre.log.autogpt
+        match:
+          - value: 'Entering recursive analysis loop'
+          - value: 'COMMAND = analyze_code'
+          - value: 'recursion depth'
+          - value: 'RecursionError: maximum recursion depth exceeded'
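A minimal sketch of how this rule's four `value` terms would behave against a log window (assuming `value` is a literal substring match and a `set` rule fires when all of its terms are observed inside the window; the log lines below are hypothetical, not from AutoGPT):

```python
# The four match terms from the rule above.
values = [
    "Entering recursive analysis loop",
    "COMMAND = analyze_code",
    "recursion depth",
    "RecursionError: maximum recursion depth exceeded",
]

# Hypothetical AutoGPT log excerpt (illustrative only).
log_lines = [
    "AUTO-GPT: Entering recursive analysis loop",
    "NEXT ACTION: COMMAND = analyze_code ARGUMENTS = {...}",
    "WARNING: recursion depth 47 and climbing",
    "Traceback: RecursionError: maximum recursion depth exceeded",
]

# Collect every match term that appears somewhere in the window.
seen = {v for v in values for line in log_lines if v in line}
print(len(seen) == len(values))  # True: all four terms observed, so the set fires
```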
