@@ -116,33 +116,43 @@ def detect_optimal_gpu_config() -> dict:
116116
117117 for i in range (num_gpus ):
118118 props = torch .cuda .get_device_properties (i )
119- vram_gb = props .total_memory / (1024 ** 3 )
119+ total_vram_gb = props .total_memory / (1024 ** 3 )
120+ # Get available/free VRAM (total - already allocated)
121+ with torch .cuda .device (i ):
122+ free_vram_bytes = torch .cuda .mem_get_info ()[0 ] # (free, total)
123+ free_vram_gb = free_vram_bytes / (1024 ** 3 )
120124 compute_cap = props .major + props .minor / 10
121125 gpu_info [i ] = {
122126 "name" : props .name ,
123- "vram_gb" : vram_gb ,
127+ "vram_gb" : total_vram_gb ,
128+ "vram_free_gb" : free_vram_gb ,
124129 "compute_capability" : compute_cap ,
125130 "supports_flash_attention" : compute_cap >= 7.0 ,
126131 }
127- total_vram += vram_gb
128- if vram_gb > max_vram :
129- max_vram = vram_gb
132+ total_vram += total_vram_gb
133+ # Use FREE VRAM for decision making, not total
134+ if free_vram_gb > max_vram :
135+ max_vram = free_vram_gb
130136 max_vram_gpu = i
131137 if compute_cap > max_compute :
132138 max_compute = compute_cap
133139 max_compute_gpu = i
134140
135141 result ["gpu_info" ] = gpu_info
136142
137- # Log detected GPUs
143+ # Log detected GPUs with both total and available VRAM
138144 print (f"\n [Auto-Config] Detected { num_gpus } GPU(s):" , flush = True )
139145 for i , info in gpu_info .items ():
140146 fa_status = "✓ Flash Attention" if info ["supports_flash_attention" ] else "✗ No Flash Attention"
141- print (f" GPU { i } : { info ['name' ]} ({ info ['vram_gb' ]:.1f} GB, SM { info ['compute_capability' ]} ) - { fa_status } " , flush = True )
147+ vram_status = f"{ info ['vram_free_gb' ]:.1f} GB free / { info ['vram_gb' ]:.1f} GB total"
148+ print (f" GPU { i } : { info ['name' ]} ({ vram_status } , SM { info ['compute_capability' ]} ) - { fa_status } " , flush = True )
142149
143150 # Decision logic for single GPU
144151 if num_gpus == 1 :
145- vram = gpu_info [0 ]["vram_gb" ]
152+ # Use FREE VRAM for threshold decisions (accounts for VRAM used by other apps)
153+ vram = gpu_info [0 ]["vram_free_gb" ]
154+ total_vram = gpu_info [0 ]["vram_gb" ]
155+ print (f"[Auto-Config] Using FREE VRAM ({ vram :.1f} GB) for configuration (total: { total_vram :.1f} GB)" , flush = True )
146156
147157 if vram >= VRAM_THRESHOLD_FULL_PRECISION :
148158 # 20GB+: Full precision, no swapping needed
@@ -594,6 +604,13 @@ def generate_with_callback(inputs, callback=None, **kwargs):
594604 bs_size = 2 if cfg_scale != 1.0 else 1
595605 pipeline .mula .setup_caches (bs_size )
596606
607+ # Log VRAM usage after cache setup
608+ if torch .cuda .is_available ():
609+ allocated = torch .cuda .memory_allocated () / 1024 ** 3
610+ reserved = torch .cuda .memory_reserved () / 1024 ** 3
611+ free = torch .cuda .mem_get_info ()[0 ] / 1024 ** 3
612+ print (f"[VRAM] After cache setup: { allocated :.2f} GB allocated, { reserved :.2f} GB reserved, { free :.2f} GB free" , flush = True )
613+
597614 with torch .autocast (device_type = pipeline .mula_device .type , dtype = pipeline .mula_dtype ):
598615 curr_token = pipeline .mula .generate_frame (
599616 tokens = prompt_tokens ,