Skip to content

Commit 96580e8

Browse files
author
ambsd
committed
fix: Use available VRAM instead of total for auto-config
- Detect free VRAM using torch.cuda.mem_get_info() for more accurate threshold decisions
- Account for VRAM used by other apps (desktop, browser, etc.)
- Add VRAM logging after KV cache setup for debugging
- Log both free and total VRAM during startup

Helps 8GB GPU users understand actual available memory.
1 parent 5c3e404 commit 96580e8

File tree

1 file changed

+25
-8
lines changed

1 file changed

+25
-8
lines changed

backend/app/services/music_service.py

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -116,33 +116,43 @@ def detect_optimal_gpu_config() -> dict:
116116

117117
for i in range(num_gpus):
118118
props = torch.cuda.get_device_properties(i)
119-
vram_gb = props.total_memory / (1024 ** 3)
119+
total_vram_gb = props.total_memory / (1024 ** 3)
120+
# Get available/free VRAM (total - already allocated)
121+
with torch.cuda.device(i):
122+
free_vram_bytes = torch.cuda.mem_get_info()[0] # (free, total)
123+
free_vram_gb = free_vram_bytes / (1024 ** 3)
120124
compute_cap = props.major + props.minor / 10
121125
gpu_info[i] = {
122126
"name": props.name,
123-
"vram_gb": vram_gb,
127+
"vram_gb": total_vram_gb,
128+
"vram_free_gb": free_vram_gb,
124129
"compute_capability": compute_cap,
125130
"supports_flash_attention": compute_cap >= 7.0,
126131
}
127-
total_vram += vram_gb
128-
if vram_gb > max_vram:
129-
max_vram = vram_gb
132+
total_vram += total_vram_gb
133+
# Use FREE VRAM for decision making, not total
134+
if free_vram_gb > max_vram:
135+
max_vram = free_vram_gb
130136
max_vram_gpu = i
131137
if compute_cap > max_compute:
132138
max_compute = compute_cap
133139
max_compute_gpu = i
134140

135141
result["gpu_info"] = gpu_info
136142

137-
# Log detected GPUs
143+
# Log detected GPUs with both total and available VRAM
138144
print(f"\n[Auto-Config] Detected {num_gpus} GPU(s):", flush=True)
139145
for i, info in gpu_info.items():
140146
fa_status = "✓ Flash Attention" if info["supports_flash_attention"] else "✗ No Flash Attention"
141-
print(f" GPU {i}: {info['name']} ({info['vram_gb']:.1f} GB, SM {info['compute_capability']}) - {fa_status}", flush=True)
147+
vram_status = f"{info['vram_free_gb']:.1f}GB free / {info['vram_gb']:.1f}GB total"
148+
print(f" GPU {i}: {info['name']} ({vram_status}, SM {info['compute_capability']}) - {fa_status}", flush=True)
142149

143150
# Decision logic for single GPU
144151
if num_gpus == 1:
145-
vram = gpu_info[0]["vram_gb"]
152+
# Use FREE VRAM for threshold decisions (accounts for VRAM used by other apps)
153+
vram = gpu_info[0]["vram_free_gb"]
154+
total_vram = gpu_info[0]["vram_gb"]
155+
print(f"[Auto-Config] Using FREE VRAM ({vram:.1f}GB) for configuration (total: {total_vram:.1f}GB)", flush=True)
146156

147157
if vram >= VRAM_THRESHOLD_FULL_PRECISION:
148158
# 20GB+: Full precision, no swapping needed
@@ -594,6 +604,13 @@ def generate_with_callback(inputs, callback=None, **kwargs):
594604
bs_size = 2 if cfg_scale != 1.0 else 1
595605
pipeline.mula.setup_caches(bs_size)
596606

607+
# Log VRAM usage after cache setup
608+
if torch.cuda.is_available():
609+
allocated = torch.cuda.memory_allocated() / 1024**3
610+
reserved = torch.cuda.memory_reserved() / 1024**3
611+
free = torch.cuda.mem_get_info()[0] / 1024**3
612+
print(f"[VRAM] After cache setup: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved, {free:.2f}GB free", flush=True)
613+
597614
with torch.autocast(device_type=pipeline.mula_device.type, dtype=pipeline.mula_dtype):
598615
curr_token = pipeline.mula.generate_frame(
599616
tokens=prompt_tokens,

0 commit comments

Comments (0)