
Commit 3c1738a

Merge branch 'main' into supabase-cre-rules
2 parents: 396d2aa + c414539

8 files changed (+273, -24 lines)

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
rules:
  - metadata:
      kind: prequel
      id: SD8xK2mN9pQzYvWr3aLfJ7
      hash: XpQ9Lm4Zk8TnVb2Ry6HwGs
    cre:
      id: CRE-2025-0162
      severity: 1
      title: "Stable Diffusion WebUI CUDA Out of Memory Crash"
      category: "memory-problem"
      author: Prequel Community
      description: |
        Detects critical CUDA out of memory errors in Stable Diffusion WebUI that cause image generation failures and application crashes. This occurs when GPU VRAM is exhausted during model loading or image generation, resulting in complete task failure and potential WebUI instability.
      cause: |
        - Insufficient GPU VRAM for requested image resolution or batch size
        - Memory fragmentation preventing large contiguous allocations
        - Model loading exceeding available VRAM capacity
        - Concurrent GPU processes consuming memory
        - High-resolution image generation without memory optimization flags
      impact: |
        - Complete image generation failure
        - WebUI crash requiring restart
        - Loss of in-progress generation work
        - Potential GPU driver instability
        - Service unavailability for users
      tags:
        - memory
        - nvidia
        - crash
        - out-of-memory
        - configuration
      mitigation: |
        IMMEDIATE ACTIONS:
        - Restart Stable Diffusion WebUI
        - Clear GPU memory: nvidia-smi --gpu-reset
        - Add memory optimization flags: --medvram or --lowvram
        CONFIGURATION FIXES:
        - For 4-6GB VRAM: Add --medvram to webui-user.bat
        - For 2-4GB VRAM: Add --lowvram to webui-user.bat
        - Enable xformers: --xformers for memory efficiency
        - Add --always-batch-cond-uncond for batch processing
        RUNTIME ADJUSTMENTS:
        - Reduce image resolution (512x512 instead of 1024x1024)
        - Decrease batch size to 1
        - Lower batch count for multiple generations
        - Set PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.9,max_split_size_mb:512
        PREVENTION:
        - Monitor GPU memory usage with nvidia-smi
        - Implement gradual resolution scaling
        - Use cloud services for high-resolution generation
        - Upgrade to a GPU with at least 8GB VRAM
      references:
        - https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/12992
        - https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/9770
        - https://github.com/CompVis/stable-diffusion/issues/39
      applications:
        - name: stable-diffusion-webui
          version: ">=1.0.0"
      impactScore: 8
      mitigationScore: 7
      reports: 15
    rule:
      set:
        window: 120s
        event:
          source: cre.log.stable-diffusion
        match:
          - regex: 'OutOfMemoryError.*CUDA out of memory'
          - regex: 'CUDA out of memory.*Tried to allocate'
          - regex: 'model failed to load.*OutOfMemoryError'
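
As a quick, hedged illustration of the detection logic (this is a standalone Python check, not the Prequel engine), the sketch below runs the rule's three match patterns against representative lines from the accompanying test log:

import re

# The three match patterns from the CRE-2025-0162 rule above.
PATTERNS = [
    r"OutOfMemoryError.*CUDA out of memory",
    r"CUDA out of memory.*Tried to allocate",
    r"model failed to load.*OutOfMemoryError",
]

# Representative lines taken from rules/cre-2025-0162/test.log below.
SAMPLE_LINES = [
    "torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB",
    "Stable Diffusion model failed to load: OutOfMemoryError",
    "Cache cleared, retrying...",
]

for line in SAMPLE_LINES:
    hits = [p for p in PATTERNS if re.search(p, line)]
    status = "MATCH" if hits else "no match"
    print(f"{status:8s} {line}")

The first two sample lines trip at least one pattern each; the third (an ordinary INFO message) does not, which is the behavior the rule relies on.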

rules/cre-2025-0162/test.log

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
2025-08-29 14:23:45.123 [ERROR] Loading model stable-diffusion-v1.5
2025-08-29 14:23:47.456 [INFO] Model weights: 4.27 GB
2025-08-29 14:23:48.789 [INFO] Allocating GPU memory...
2025-08-29 14:23:49.012 [ERROR] torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB (GPU 0; 6.00 GiB total capacity; 4.50 GiB already allocated; 1.20 GiB free; 4.80 GiB reserved in total by PyTorch)
2025-08-29 14:23:49.013 [ERROR] RuntimeError: CUDA out of memory. Tried to allocate 2.00 GiB. GPU 0 has a total capacity of 6.00 GiB of which 1.20 GiB is free. Process 12345 has 4.50 GiB memory in use.
2025-08-29 14:23:49.014 [CRITICAL] Stable Diffusion model failed to load: OutOfMemoryError
2025-08-29 14:23:49.015 [ERROR] CUDA error: out of memory
2025-08-29 14:23:49.016 [ERROR] GPU 0 has a total capacity of 6.00 GiB of which 1.20 GiB is free. Allocation failed.
2025-08-29 14:23:49.017 [ERROR] Failed to generate image: CUDA out of memory
2025-08-29 14:23:49.018 [INFO] Attempting to clear cache...
2025-08-29 14:23:50.123 [INFO] Cache cleared, retrying...
2025-08-29 14:23:51.456 [ERROR] torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.50 GiB
2025-08-29 14:23:51.457 [CRITICAL] Image generation failed after retry
2025-08-29 14:23:51.458 [ERROR] WebUI shutting down due to memory error
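
The PYTORCH_CUDA_ALLOC_CONF tuning listed under RUNTIME ADJUSTMENTS must be in the environment before PyTorch initializes its CUDA allocator. A minimal Python sketch, assuming a wrapper script where the variable can be set before torch is imported (illustrative only, not part of the WebUI):

import os

# Apply the allocator tuning from the mitigation section *before* importing torch;
# PyTorch reads PYTORCH_CUDA_ALLOC_CONF when the CUDA caching allocator initializes.
os.environ.setdefault(
    "PYTORCH_CUDA_ALLOC_CONF",
    "garbage_collection_threshold:0.9,max_split_size_mb:512",
)

import torch  # noqa: E402 -- deliberately imported after the env var is set

if torch.cuda.is_available():
    free, total = torch.cuda.mem_get_info()
    print(f"GPU 0: {free / 2**30:.2f} GiB free of {total / 2**30:.2f} GiB")
else:
    print("CUDA not available; nothing to tune")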
Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
rules:
  - cre:
      id: CRE-2025-0179
      severity: 0
      title: N8N Workflow Silent Data Loss During Execution
      category: workflow-automation-problem
      author: Claude Code Assistant
      description: |
        N8N workflow automation platform experiences critical silent data loss where items
        disappear between workflow nodes without generating error messages. This high-severity
        issue affects long-running workflows (60-115+ minutes) and can cause workflows to
        randomly cancel mid-execution, leading to incomplete processing and data integrity
        problems. Items silently vanish between nodes, with different item counts across
        the workflow pipeline, making the issue particularly dangerous for production systems
        that rely on complete data processing.
      cause: |
        * Workflow execution engine fails to properly track items between nodes in long-running workflows
        * Memory management issues during extended workflow processing causing item references to be lost
        * Race conditions in the worker queue system when handling multiple concurrent items
        * Node-to-node data transfer mechanisms failing silently under certain load conditions
        * Queue worker timeout or resource contention causing partial item processing without error reporting
        * Database transaction issues where some items fail to persist between workflow stages
      tags:
        - n8n
        - workflow-automation
        - data-loss
        - silent-failure
        - production-critical
        - data-integrity
        - public
      mitigation: |
        - **Implement workflow item counting checks** - Add validation nodes between critical
          processing steps to verify item counts match expected values
        - **Enable comprehensive execution logging** - Set N8N_LOG_LEVEL to debug and
          EXECUTIONS_DATA_SAVE_ON_SUCCESS to 'all' to capture detailed execution data
        - **Add workflow timeout monitoring** - Monitor executions that cancel around the 21-23
          minute mark and implement retry mechanisms for failed workflows
        - **Implement data integrity validation** - Add checksum or validation steps at
          workflow start/end to detect silent data loss
        - **Use error handling workflows** - Configure error workflows to capture and log
          execution failures, even when the main workflow fails silently
        - **Monitor execution metrics** - Set up alerting on workflow completion rates and
          item processing inconsistencies
        - **Consider workflow segmentation** - Break long workflows into smaller, more
          manageable chunks to reduce exposure to the data loss issue
      references:
        - https://github.com/n8n-io/n8n/issues/14909
        - https://docs.n8n.io/flow-logic/error-handling/
        - https://community.n8n.io/t/workflow-randomly-cancels-mid-execution-without-error-data-items-silently-dropped-between-nodes/51141
      applications:
        - name: n8n
          version: ">= 1.90.0"
          processName: n8n
          containerName: n8n
      impact: |
        Silent data loss in workflow automation can cause critical business processes to fail
        without detection, leading to incomplete data processing, missing business transactions,
        failed integrations, and potential compliance violations. The silent nature makes it
        extremely difficult to detect and troubleshoot, potentially causing weeks or months
        of data integrity issues before discovery.
      impactScore: 9
      mitigationScore: 7
    metadata:
      kind: prequel
      id: N8nSilentDataLossDetection919
      gen: 1
    rule:
      sequence:
        window: 120s
        event:
          source: cre.log.n8n
        order:
          - regex: "(cancelled mid-execution|execution terminated unexpectedly|workflow.*cancelled|Execution.*cancelled)"
          - regex: "(silent data loss detected|data.*loss|itemsLost|dataIntegrityIssue.*true|Items processed inconsistently|Data integrity check failed|Expected [0-9]+ items, found [0-9]+ items)"

rules/cre-2025-0179/test.log

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
Aug 27 18:30:29 n8n[1234]: INFO: Starting workflow execution exec_384574 for workflow workflow_9084
Aug 27 18:35:29 n8n[1234]: DEBUG: Node processing started - HTTP Request node
Aug 27 18:45:29 n8n[1234]: INFO: Processing 150 items through workflow pipeline
Aug 27 18:53:29 n8n[1234]: DEBUG: Node completed with 142 items (expected 150)
Aug 27 19:05:29 n8n[1234]: DEBUG: Transform node processing remaining items
Aug 27 19:25:29 n8n[1234]: WARN: Execution exec_384574 cancelled mid-execution after 55 minutes
Aug 27 19:25:44 n8n[1234]: ERROR: Data integrity check failed - Items processed inconsistently across nodes
Aug 27 19:25:49 n8n[1234]: ERROR: Expected 150 items, found 127 items at completion
Aug 27 19:26:15 n8n[1234]: CRITICAL: Massive data loss detected - Expected 500 items, found 75 items
Aug 27 19:26:20 n8n[1234]: ERROR: Critical workflow failure detected - 85% data loss in processing pipeline
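
Because this rule uses a sequence (ordered events inside a 120s window) rather than a set, one rough way to sanity-check it against the log above is to confirm the cancellation line precedes the integrity-failure line within the window. A hedged Python sketch, not the Prequel engine, with the second pattern shortened to a subset of its alternatives:

import re
from datetime import datetime

FIRST = re.compile(r"cancelled mid-execution|Execution.*cancelled")
SECOND = re.compile(r"Data integrity check failed|Expected [0-9]+ items, found [0-9]+ items")

# Two lines taken from rules/cre-2025-0179/test.log above.
LINES = [
    "Aug 27 19:25:29 n8n[1234]: WARN: Execution exec_384574 cancelled mid-execution after 55 minutes",
    "Aug 27 19:25:44 n8n[1234]: ERROR: Data integrity check failed - Items processed inconsistently across nodes",
]

def ts(line: str) -> datetime:
    # Syslog-style "Aug 27 19:25:29" prefix; the year is assumed for the sketch.
    return datetime.strptime("2025 " + line[:15], "%Y %b %d %H:%M:%S")

first = next(l for l in LINES if FIRST.search(l))
second = next(l for l in LINES if SECOND.search(l))
delta = (ts(second) - ts(first)).total_seconds()
print(f"events in order, {delta:.0f}s apart -> within 120s window: {0 <= delta <= 120}")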
Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
rules:
  - cre:
      id: CRE-2025-0200
      severity: 0
      title: AutoGPT Recursive Self-Analysis Loop Leading to Token Exhaustion and System Crash
      category: infinite-loop-problem
      author: prequel
      description: |
        - AutoGPT enters an infinite recursive loop when attempting to analyze and fix its own execution errors
        - The agent repeatedly tries to debug its own code, spawning new analysis tasks for each failure
        - Each iteration consumes API tokens and memory, eventually exhausting resources
        - The loop accelerates as error messages grow longer, consuming tokens exponentially
        - System becomes unresponsive and crashes with out-of-memory errors or API rate limit failures
      cause: |
        - AutoGPT's autonomous reasoning incorrectly identifies its own execution as a problem to solve
        - Lack of loop detection mechanisms allows unlimited recursive task spawning
        - Error context accumulation causes exponential growth in prompt size
        - Missing safeguards for self-referential task creation
        - Insufficient resource monitoring and circuit breakers for runaway processes
      tags:
        - autogpt
        - infinite-loop
        - token-exhaustion
        - autonomous-agents
        - llm
        - openai
        - recursive-analysis
        - critical-failure
        - memory-exhaustion
        - crash-loop
        - rate-limiting
      mitigation: |
        - Implement loop detection to identify and break recursive self-analysis patterns
        - Add resource consumption thresholds (tokens, memory, API calls) with automatic shutdown
        - Create task depth limits to prevent unlimited recursion
        - Implement circuit breakers that trigger after repeated similar failures
        - Add explicit blacklist for self-referential task creation
        - Monitor token usage rate and implement exponential backoff
        - Use separate monitoring process to detect and kill runaway AutoGPT instances
        - Implement task deduplication to prevent identical recursive operations
      references:
        - https://github.com/Significant-Gravitas/AutoGPT/issues/1994
        - https://github.com/Significant-Gravitas/AutoGPT/issues/3766
        - https://github.com/Significant-Gravitas/AutoGPT/issues/1543
        - https://jina.ai/news/auto-gpt-unmasked-hype-hard-truths-production-pitfalls/
      applications:
        - name: autogpt
          version: ">=0.3.0"
        - name: openai
          version: ">=0.27.0"
      impact: Complete system failure with resource exhaustion, potential financial losses from API overconsumption
      impactScore: 9
      mitigationScore: 3
      reports: 15
    metadata:
      kind: prequel
      id: 8qy5Et9NbNGgGxhBP7umKa
      gen: 1
    rule:
      set:
        window: 30s
        event:
          source: cre.log.autogpt
        match:
          - value: 'Entering recursive analysis loop'
          - value: 'COMMAND = analyze_code'
          - value: 'recursion depth'
          - value: 'RecursionError: maximum recursion depth exceeded'
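
Two of the mitigations above, task depth limits and task deduplication, can be sketched as a small guard in front of an agent's task queue. This is hypothetical Python, not AutoGPT's API; TaskGuard, allow(), and MAX_DEPTH are invented names for the sketch:

# Reject tasks that recurse too deep or duplicate an already-queued task.
MAX_DEPTH = 3

class TaskGuard:
    def __init__(self, max_depth: int = MAX_DEPTH):
        self.max_depth = max_depth
        self.seen: set[tuple[str, str]] = set()  # (command, args) pairs already queued

    def allow(self, command: str, args: str, depth: int) -> bool:
        if depth > self.max_depth:
            print(f"blocked: depth {depth} exceeds limit of {self.max_depth} for {command}")
            return False
        key = (command, args)
        if key in self.seen:
            print(f"blocked: duplicate task {command}({args!r})")
            return False
        self.seen.add(key)
        return True

guard = TaskGuard()
print(guard.allow("analyze_code", "autogpt error handling module", depth=2))  # True
print(guard.allow("analyze_code", "autogpt error handling module", depth=3))  # False: duplicate
print(guard.allow("analyze_code", "entire autogpt error stack", depth=5))     # False: too deep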

rules/cre-2025-0200/test.log

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
2025-08-31 14:23:45.234 [INFO] [autogpt.main] Starting AutoGPT v0.5.1 with goal: "Optimize my Python code for better performance"
2025-08-31 14:23:45.567 [INFO] [autogpt.llm] Initializing OpenAI API client with model gpt-4
2025-08-31 14:23:46.102 [INFO] [autogpt.agent] Agent initialized with memory backend: LocalCache
2025-08-31 14:23:47.234 [INFO] [autogpt.agent] COMMAND = analyze_code args: {"code": "def slow_function():\\n result = []\\n for i in range(1000000):\\n result.append(i**2)\\n return result"}
2025-08-31 14:23:48.567 [ERROR] [autogpt.commands] Error executing analyze_code: JSONDecodeError in response
2025-08-31 14:23:48.890 [WARN] [autogpt.agent] Entering recursive analysis loop to debug previous error
2025-08-31 14:23:49.234 [INFO] [autogpt.agent] THOUGHTS: Previous command failed, need to analyze what went wrong
2025-08-31 14:23:49.567 [INFO] [autogpt.agent] NEXT ACTION: COMMAND = analyze_code args: {"code": "analyze_code function from autogpt/commands/analyze_code.py", "recursion depth": 1}
2025-08-31 14:23:50.123 [DEBUG] [autogpt.memory] Storing error context, current size: 2.3MB
2025-08-31 14:23:50.890 [ERROR] [autogpt.commands] Error executing analyze_code: Cannot analyze own execution context
2025-08-31 14:23:51.234 [WARN] [autogpt.agent] Thinking... need to fix my own error handling
2025-08-31 14:23:51.678 [INFO] [autogpt.agent] COMMAND = analyze_code args: {"code": "autogpt error handling module", "recursion depth": 2}
2025-08-31 14:23:52.345 [DEBUG] [autogpt.memory] Memory usage increasing: 5.7MB, token count: 8234
2025-08-31 14:23:52.890 [ERROR] [autogpt.llm] API request failed: context length exceeded
2025-08-31 14:23:53.234 [INFO] [autogpt.agent] SYSTEM: Potential loop detected but continuing to resolve errors
2025-08-31 14:23:53.567 [INFO] [autogpt.agent] THOUGHTS: Error analyzing previous attempt, need deeper analysis
2025-08-31 14:23:54.012 [INFO] [autogpt.agent] NEXT ACTION: COMMAND = analyze_code args: {"code": "full autogpt execution trace", "recursion depth": 3}
2025-08-31 14:23:54.678 [WARN] [autogpt.monitor] Task queue growing: 12 pending tasks
2025-08-31 14:23:55.234 [DEBUG] [autogpt.memory] Memory usage: 12.4MB, token count: 15672
2025-08-31 14:23:55.890 [ERROR] [autogpt.commands] RecursionError: maximum recursion depth exceeded while calling analyze_code
2025-08-31 14:23:56.345 [CRITICAL] [autogpt.agent] Task queue overflow: 47 pending recursive tasks
2025-08-31 14:23:56.789 [INFO] [autogpt.agent] COMMAND = analyze_code args: {"code": "recursion error in analyze_code", "recursion depth": 4}
2025-08-31 14:23:57.234 [ERROR] [autogpt.llm] openai.error.RateLimitError: Rate limit reached for gpt-4 in organization
2025-08-31 14:23:57.567 [WARN] [autogpt.monitor] Token consumption rate: 2341 tokens/second
2025-08-31 14:23:58.012 [DEBUG] [autogpt.memory] Memory usage critical: 45.8MB, token count: 42318
2025-08-31 14:23:58.456 [ERROR] [autogpt.agent] Too many pending tasks: 89 in queue
2025-08-31 14:23:58.890 [INFO] [autogpt.agent] THOUGHTS: Still analyzing previous errors, must understand the recursion
2025-08-31 14:23:59.234 [INFO] [autogpt.agent] NEXT ACTION: COMMAND = analyze_code args: {"code": "entire autogpt error stack", "recursion depth": 5}
2025-08-31 14:23:59.678 [CRITICAL] [autogpt.monitor] JavaScript heap out of memory
2025-08-31 14:24:00.123 [ERROR] [autogpt.memory] MemoryError: Cannot allocate memory for context storage
2025-08-31 14:24:00.456 [CRITICAL] [autogpt.agent] Task buffer exceeded: 156 recursive analyze_code calls pending
2025-08-31 14:24:00.789 [ERROR] [autogpt.llm] API rate limit exceeded: 429 Too Many Requests
2025-08-31 14:24:01.123 [FATAL] [autogpt.main] AutoGPT crashed: Unrecoverable recursive loop detected
2025-08-31 14:24:01.234 [INFO] [autogpt.cleanup] Emergency shutdown initiated
2025-08-31 14:24:01.345 [ERROR] [autogpt.cleanup] Failed to save state: Out of memory

rules/tags/categories.yaml

Lines changed: 1 addition & 11 deletions
@@ -243,14 +243,4 @@ categories:
     displayName: MongoDB Startup Failure
     description: |
       Failures that prevent MongoDB from starting successfully due to corrupted metadata, invalid configurations,
-      or unrecoverable internal errors (e.g., WiredTiger metadata corruption). These failures often require manual repair or backup restoration.
-  - name: supabase-problem
-    displayName: Supabase Problems
-    description: |
-      Problems specific to Supabase self-hosted deployments including authentication failures, database connectivity issues,
-      storage misconfigurations, realtime service crashes, and infrastructure-related failures that affect the entire Supabase stack.
-  - name: realtime-problem
-    displayName: Realtime Problems
-    description: |
-      Failures in real-time communication systems including WebSocket connection issues, real-time subscription failures,
-      and problems with live data streaming that affect user experience in interactive applications.
+      or unrecoverable internal errors (e.g., WiredTiger metadata corruption). These failures often require manual repair or backup restoration.

rules/tags/tags.yaml

Lines changed: 1 addition & 13 deletions
@@ -847,16 +847,4 @@ tags:
     description: Issues with Kubernetes pod scheduling due to resource constraints or networking problems
   - name: cluster-scaling
     displayName: Cluster Scaling
-    description: Problems related to Kubernetes cluster scaling operations and capacity management
-  - name: supabase
-    displayName: Supabase
-    description: Problems related to Supabase self-hosted deployments and services
-  - name: gotrue
-    displayName: GoTrue
-    description: Problems related to Supabase's GoTrue authentication service
-  - name: realtime
-    displayName: Realtime
-    description: Problems related to Supabase's realtime service and WebSocket connections
-  - name: self-hosted
-    displayName: Self-Hosted
-    description: Problems specific to self-hosted deployments and infrastructure
+    description: Problems related to Kubernetes cluster scaling operations and capacity management
