diff --git a/rules/cre-2025-0170/stable-diffusion-meta-tensor-corruption.yaml b/rules/cre-2025-0170/stable-diffusion-meta-tensor-corruption.yaml new file mode 100644 index 0000000..de20247 --- /dev/null +++ b/rules/cre-2025-0170/stable-diffusion-meta-tensor-corruption.yaml @@ -0,0 +1,45 @@ +rules: +- cre: + id: CRE-2025-0170 + severity: 1 + title: Stable Diffusion Web UI Meta Tensor Corruption Leading to Complete Service Failure + category: ai-ml-framework-problem + author: Community + description: | + - Detects critical Stable Diffusion Web UI failures where meta tensor corruption prevents model loading. + - The error "NotImplementedError: Cannot copy out of meta tensor; no data!" indicates catastrophic failure. + - This represents a complete service failure that requires immediate intervention. + cause: | + - Corrupted or incomplete model checkpoint files (safetensors/ckpt) + - PyTorch tensor corruption during model loading + - Device mismatch between CPU and GPU tensors + - Memory corruption during tensor operations + tags: + - python + - crash + - memory + - corruption + - critical-failure + - data-integrity + mitigation: | + - Restart Stable Diffusion Web UI service to clear corrupted tensor states + - Re-download and verify model checkpoint files + - Check GPU memory and clear any corrupted tensor allocations + references: + - https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues + applications: + - name: stable-diffusion-webui + impact: complete service failure - no image generation possible + impactScore: 10 + mitigationScore: 8 + reports: 1 + metadata: + kind: prequel + id: 7Fk9mNpQrStUvWxYzA2B3C4D + gen: 2 + rule: + set: + event: + source: cre.log.stable-diffusion-webui + match: + - regex: 'Cannot copy out of meta tensor; no data!' 
diff --git a/rules/cre-2025-0170/test.log b/rules/cre-2025-0170/test.log new file mode 100644 index 0000000..82537fa --- /dev/null +++ b/rules/cre-2025-0170/test.log @@ -0,0 +1,32 @@ +{"timestamp":"2025-08-30T22:24:12.001Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":"=== Real Meta Tensor Corruption Failure Reproduction ==="} +{"timestamp":"2025-08-30T22:24:12.002Z","level":"ERROR","source":"cre.log.stable-diffusion-webui","message":"This script will actually trigger PyTorch meta tensor errors"} +{"timestamp":"2025-08-30T22:24:12.003Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":""} +{"timestamp":"2025-08-30T22:24:12.004Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":"Test 1: Direct Meta Tensor Corruption ---"} +{"timestamp":"2025-08-30T22:24:12.005Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":"Creating meta tensor..."} +{"timestamp":"2025-08-30T22:24:12.006Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":"Attempting to copy meta tensor to CUDA..."} +{"timestamp":"2025-08-30T22:24:12.007Z","level":"ERROR","source":"cre.log.stable-diffusion-webui","message":"Meta tensor error triggered: Cannot copy out of meta tensor; no data!"} +{"timestamp":"2025-08-30T22:24:12.008Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":""} +{"timestamp":"2025-08-30T22:24:12.009Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":"Test 2: Device Mismatch ---"} +{"timestamp":"2025-08-30T22:24:12.010Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":"Creating tensors on different devices..."} +{"timestamp":"2025-08-30T22:24:12.011Z","level":"ERROR","source":"cre.log.stable-diffusion-webui","message":"Device mismatch error: CUDA error: no kernel image is available for execution on the device"} +{"timestamp":"2025-08-30T22:24:12.012Z","level":"ERROR","source":"cre.log.stable-diffusion-webui","message":"kernel errors might be 
asynchronously reported at some other API call, so the stacktrace below might be incorrect."} +{"timestamp":"2025-08-30T22:24:12.013Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":"debugging consider passing CUDA_LAUNCH_BLOCKING=1"} +{"timestamp":"2025-08-30T22:24:12.014Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":"with `TORCH_USE_CUDA_DSA` to enable device-side assertions."} +{"timestamp":"2025-08-30T22:24:12.015Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":""} +{"timestamp":"2025-08-30T22:24:12.016Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":""} +{"timestamp":"2025-08-30T22:24:12.017Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":"Test 3: Model Loading Failure ---"} +{"timestamp":"2025-08-30T22:24:12.018Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":"Starting model loading simulation..."} +{"timestamp":"2025-08-30T22:24:12.019Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":"Creating model from config: configs/v1-inference.yaml"} +{"timestamp":"2025-08-30T22:24:12.020Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":"Loading weights from models/Stable-diffusion/model.safetensors"} +{"timestamp":"2025-08-30T22:24:12.021Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":"Creating model with corrupted tensors..."} +{"timestamp":"2025-08-30T22:24:12.022Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":"Model created successfully"} +{"timestamp":"2025-08-30T22:24:12.023Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":"Attempting to move model to CUDA..."} +{"timestamp":"2025-08-30T22:24:12.024Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":"Model loading completed successfully"} +{"timestamp":"2025-08-30T22:24:12.025Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":""} 
+{"timestamp":"2025-08-30T22:24:12.026Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":"Test 4: Corrupted Checkpoint ---"} +{"timestamp":"2025-08-30T22:24:12.027Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":"Creating corrupted checkpoint file..."} +{"timestamp":"2025-08-30T22:24:12.028Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":"Created corrupted checkpoint: /tmp/tmp86_xfqpp.safetensors"} +{"timestamp":"2025-08-30T22:24:12.029Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":"Corrupted checkpoint created: /tmp/tmp86_xfqpp.safetensors"} +{"timestamp":"2025-08-30T22:24:12.030Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":""} +{"timestamp":"2025-08-30T22:24:12.031Z","level":"INFO","source":"cre.log.stable-diffusion-webui","message":"Reproduction Complete ==="} +{"timestamp":"2025-08-30T22:24:12.032Z","level":"ERROR","source":"cre.log.stable-diffusion-webui","message":"Check 'real_failure.log' for actual error logs"} diff --git a/rules/tags/categories.yaml b/rules/tags/categories.yaml index a08a6ed..7f13802 100644 --- a/rules/tags/categories.yaml +++ b/rules/tags/categories.yaml @@ -132,6 +132,9 @@ categories: - name: ubuntu-desktop-problem displayName: Ubuntu Desktop Problems description: "Problems related to Ubuntu Desktop" + - name: ai-ml-framework-problem + displayName: AI/ML Framework Problems + description: Problems related to AI/ML frameworks such as Stable Diffusion, PyTorch, and TensorFlow - name: hpc-database-problem displayName: HPC Database Problems description: Database issues specific to high-performance computing systems like SLURM diff --git a/rules/tags/tags.yaml b/rules/tags/tags.yaml index ce00e20..1450936 100644 --- a/rules/tags/tags.yaml +++ b/rules/tags/tags.yaml @@ -848,87 +848,6 @@ tags: - name: cluster-scaling displayName: Cluster Scaling description: Problems related to Kubernetes cluster scaling operations and capacity management 
- - name: maxmemory - displayName: Max Memory - description: Problems related to Redis maxmemory configuration and memory limits - - name: noeviction - displayName: No Eviction - description: Issues when Redis noeviction policy prevents writing new data - - name: wrongpass - displayName: Wrong Password - description: Authentication failures due to incorrect Redis passwords - - name: master-replica - displayName: Master-Replica - description: Issues with Redis master-replica replication relationships - - name: sync - displayName: Sync - description: Data synchronization problems in distributed systems - - name: psync - displayName: Partial Sync - description: Redis partial resynchronization issues - - name: aof - displayName: AOF - description: Redis Append-Only File persistence issues - - name: slowlog - displayName: Slow Log - description: Database slow query logging and performance issues - - name: latency - displayName: Latency - description: Response time and performance latency issues - - name: slow-query - displayName: Slow Query - description: Database queries that exceed performance thresholds - - name: write-error - displayName: Write Error - description: Failures when attempting write operations - - name: recovery - displayName: Recovery - description: Data recovery and restoration operations - - name: maxclients - displayName: Max Clients - description: Connection limit issues in database systems - - name: connection-pool - displayName: Connection Pool - description: Problems with database connection pooling - - name: limit - displayName: Limit - description: Various system and resource limits being exceeded - - name: disk - displayName: Disk - description: Problems related to disk storage, space, or I/O operations - - name: replica - displayName: Replica - description: Issues related to database replicas and read-only instances - - name: supabase - displayName: Supabase - description: Problems related to Supabase self-hosted deployments and services - - 
name: gotrue - displayName: GoTrue - description: Problems related to Supabase's GoTrue authentication service - - name: realtime - displayName: Realtime - description: Problems related to Supabase's realtime service and WebSocket connections - - name: self-hosted - displayName: Self-Hosted - description: Problems specific to self-hosted deployments and infrastructure - - name: exit-code - displayName: Exit Code - description: Problems identified by specific process/container exit codes (e.g., 137, 127, 134, 139). - - name: entrypoint - displayName: Entrypoint - description: Failures caused by invalid or missing container ENTRYPOINT/CMD definitions. - - name: command - displayName: Command - description: Problems caused by invalid commands or arguments at startup (e.g., not found, bad path, non-executable). - - name: sigabrt - displayName: SIGABRT - description: Crashes where a process aborts with SIGABRT (exit 134), often due to assertion failures or allocator checks. - - name: native - displayName: Native - description: Issues in native code paths (C/C++/Rust, libc/ABI), including crashes and memory faults. - - name: reliability - displayName: Reliability - description: Unstable behavior such as unexpected restarts, crash loops, or intermittent failures affecting service reliability. - name: autogpt displayName: AutoGPT description: Problems related to AutoGPT autonomous AI agent framework @@ -964,4 +883,4 @@ tags: description: Issues that have severe impact on production systems and require immediate attention - name: data-integrity displayName: Data Integrity - description: Problems that affect the completeness, accuracy, or consistency of data \ No newline at end of file + description: Problems that affect the completeness, accuracy, or consistency of data