@@ -116,33 +116,43 @@ def detect_optimal_gpu_config() -> dict:
116116
117117 for i in range (num_gpus ):
118118 props = torch .cuda .get_device_properties (i )
119- vram_gb = props .total_memory / (1024 ** 3 )
119+ total_vram_gb = props .total_memory / (1024 ** 3 )
120+ # Get available/free VRAM (total - already allocated)
121+ with torch .cuda .device (i ):
122+ free_vram_bytes = torch .cuda .mem_get_info ()[0 ] # (free, total)
123+ free_vram_gb = free_vram_bytes / (1024 ** 3 )
120124 compute_cap = props .major + props .minor / 10
121125 gpu_info [i ] = {
122126 "name" : props .name ,
123- "vram_gb" : vram_gb ,
127+ "vram_gb" : total_vram_gb ,
128+ "vram_free_gb" : free_vram_gb ,
124129 "compute_capability" : compute_cap ,
125130 "supports_flash_attention" : compute_cap >= 7.0 ,
126131 }
127- total_vram += vram_gb
128- if vram_gb > max_vram :
129- max_vram = vram_gb
132+ total_vram += total_vram_gb
133+ # Use FREE VRAM for decision making, not total
134+ if free_vram_gb > max_vram :
135+ max_vram = free_vram_gb
130136 max_vram_gpu = i
131137 if compute_cap > max_compute :
132138 max_compute = compute_cap
133139 max_compute_gpu = i
134140
135141 result ["gpu_info" ] = gpu_info
136142
137- # Log detected GPUs
143+ # Log detected GPUs with both total and available VRAM
138144 print (f"\n [Auto-Config] Detected { num_gpus } GPU(s):" , flush = True )
139145 for i , info in gpu_info .items ():
140146 fa_status = "✓ Flash Attention" if info ["supports_flash_attention" ] else "✗ No Flash Attention"
141- print (f" GPU { i } : { info ['name' ]} ({ info ['vram_gb' ]:.1f} GB, SM { info ['compute_capability' ]} ) - { fa_status } " , flush = True )
147+ vram_status = f"{ info ['vram_free_gb' ]:.1f} GB free / { info ['vram_gb' ]:.1f} GB total"
148+ print (f" GPU { i } : { info ['name' ]} ({ vram_status } , SM { info ['compute_capability' ]} ) - { fa_status } " , flush = True )
142149
143150 # Decision logic for single GPU
144151 if num_gpus == 1 :
145- vram = gpu_info [0 ]["vram_gb" ]
152+ # Use FREE VRAM for threshold decisions (accounts for VRAM used by other apps)
153+ vram = gpu_info [0 ]["vram_free_gb" ]
154+ total_vram = gpu_info [0 ]["vram_gb" ]
155+ print (f"[Auto-Config] Using FREE VRAM ({ vram :.1f} GB) for configuration (total: { total_vram :.1f} GB)" , flush = True )
146156
147157 if vram >= VRAM_THRESHOLD_FULL_PRECISION :
148158 # 20GB+: Full precision, no swapping needed
@@ -594,6 +604,13 @@ def generate_with_callback(inputs, callback=None, **kwargs):
594604 bs_size = 2 if cfg_scale != 1.0 else 1
595605 pipeline .mula .setup_caches (bs_size )
596606
607+ # Log VRAM usage after cache setup
608+ if torch .cuda .is_available ():
609+ allocated = torch .cuda .memory_allocated () / 1024 ** 3
610+ reserved = torch .cuda .memory_reserved () / 1024 ** 3
611+ free = torch .cuda .mem_get_info ()[0 ] / 1024 ** 3
612+ print (f"[VRAM] After cache setup: { allocated :.2f} GB allocated, { reserved :.2f} GB reserved, { free :.2f} GB free" , flush = True )
613+
597614 with torch .autocast (device_type = pipeline .mula_device .type , dtype = pipeline .mula_dtype ):
598615 curr_token = pipeline .mula .generate_frame (
599616 tokens = prompt_tokens ,