@@ -180,6 +180,17 @@ class OffloadMode(Enum):
180180 AUTO = auto () # Automatically choose based on available memory
181181
182182
@dataclass
class OffloadReason:
    """Explanation attached to a resolved offload mode.

    ``key`` is a short identifier for the decision that was taken, and
    ``detail`` optionally carries a free-form elaboration of it.
    Converting an instance to ``str`` yields ``"key: detail"`` when a
    non-empty detail is present and just ``key`` otherwise.
    """

    key: str
    detail: Optional[str] = None

    def __str__(self) -> str:
        # A missing or empty detail collapses to the bare key.
        if not self.detail:
            return self.key
        return f"{self.key}: {self.detail}"
192+
193+
183194@dataclass
184195class OffloadConfig :
185196 """Configuration for offloading behavior."""
@@ -762,11 +773,11 @@ def get_available_gpu_memory(self) -> float:
762773 Returns
763774 -------
764775 float
765- Available GPU memory in megabytes, or infinity if CUDA is not
776+ Available GPU memory in megabytes, or 0.0 if CUDA is not
766777 available or execution device is CPU.
767778 """
768779 if not torch .cuda .is_available () or self .exe_device .type != "cuda" :
769- return float ( "inf" )
780+ return 0.0
770781 torch .cuda .synchronize ()
771782 torch .cuda .empty_cache ()
772783 return torch .cuda .mem_get_info (self .exe_device )[0 ] / (1024 * 1024 )
@@ -863,62 +874,96 @@ def estimate_safe_batch_size(
863874
def _resolve_offload_mode(
    self, output_mb: float, gpu_free_mb: float, cpu_free_mb: float, disk_free_mb: float
) -> Tuple[OffloadMode, OffloadReason]:
    """Decide where an output of ``output_mb`` megabytes should be placed.

    An explicitly requested mode (GPU, CPU or DISK) is honoured when the
    output fits there. When it does not, the choice degrades one tier at a
    time: GPU -> CPU -> DISK -> CPU (swap as last resort).

    In AUTO mode the order is: GPU (only if the output stays within
    ``auto_offload_threshold`` of the free GPU memory) -> CPU -> DISK ->
    CPU (swap as last resort).

    Note: CPU mode can use either pinned or non-pinned memory.
    - Pinned memory: faster for async GPU-CPU transfers, but locks physical memory
    - Non-pinned memory: slower transfers, but can use virtual memory (swap)

    Parameters
    ----------
    output_mb : float
        Size of the output to place, in megabytes.
    gpu_free_mb : float
        Free GPU memory in megabytes; any value ``<= 0`` is treated as
        "no GPU available".
    cpu_free_mb : float
        Free CPU memory in megabytes, scaled by ``cpu_safety_factor``
        before comparison.
    disk_free_mb : float
        Free disk space in megabytes. ``disk_min_free_mb`` is subtracted
        and the remainder scaled by ``disk_safety_factor``. Ignored when
        ``disk_offload_dir`` is not set.

    Returns
    -------
    Tuple[OffloadMode, OffloadReason]
        The resolved mode and a structured reason describing the decision.

    Raises
    ------
    ValueError
        If DISK was explicitly requested but ``disk_offload_dir`` is not
        configured.
    """
    gpu_present = gpu_free_mb > 0
    disk_configured = self.disk_offload_dir is not None

    # Disk headroom left after reserving the configured minimum free space.
    if disk_configured:
        usable_disk_mb = max(0, disk_free_mb - self.disk_min_free_mb)
    else:
        usable_disk_mb = 0

    cpu_budget_mb = cpu_free_mb * self.cpu_safety_factor
    disk_budget_mb = usable_disk_mb * self.disk_safety_factor

    fits_gpu = gpu_present and output_mb <= gpu_free_mb
    fits_cpu = output_mb <= cpu_budget_mb
    fits_disk = disk_configured and output_mb <= disk_budget_mb

    mode = self.offload_mode

    # --- Explicit GPU request: GPU -> CPU -> DISK -> CPU (swap) ---
    if mode == OffloadMode.GPU:
        if fits_gpu:
            detail = f"{output_mb:.0f}MB <= {gpu_free_mb:.0f}MB gpu free"
            return OffloadMode.GPU, OffloadReason("user_gpu_fits", detail)
        if fits_cpu:
            return OffloadMode.CPU, OffloadReason("user_gpu_fails", "gpu (requested) tight -> cpu")
        if fits_disk:
            return OffloadMode.DISK, OffloadReason("user_gpu_fails", "gpu (requested) tight, cpu tight -> disk")
        return OffloadMode.CPU, OffloadReason(
            "user_gpu_fails", "gpu (requested) tight, cpu tight, disk tight -> cpu (swap)"
        )

    # --- Explicit CPU request: CPU -> DISK -> CPU (swap) ---
    if mode == OffloadMode.CPU:
        if fits_cpu:
            detail = f"{output_mb:.0f}MB <= {cpu_budget_mb:.0f}MB safe cpu free"
            return OffloadMode.CPU, OffloadReason("user_cpu_fits", detail)
        if fits_disk:
            return OffloadMode.DISK, OffloadReason("user_cpu_fails", "cpu (requested) tight -> disk")
        return OffloadMode.CPU, OffloadReason("user_cpu_fails", "cpu (requested) tight, disk tight -> cpu (swap)")

    # --- Explicit DISK request: DISK -> CPU (swap) ---
    if mode == OffloadMode.DISK:
        # Requesting disk without a target directory is a configuration error.
        if not disk_configured:
            raise ValueError(
                "Disk offload requested but disk_offload_dir is not configured. "
                "Please specify disk_offload_dir in the configuration."
            )
        if fits_disk:
            detail = f"{output_mb:.0f}MB <= {disk_budget_mb:.0f}MB safe disk free"
            return OffloadMode.DISK, OffloadReason("user_disk_fits", detail)
        return OffloadMode.CPU, OffloadReason("user_disk_fails", "disk (requested) tight -> cpu (swap)")

    # --- AUTO (and any mode not handled above) ---
    if gpu_present:
        # Fraction of the free GPU memory the output would occupy; the
        # floor on the denominator guards against a vanishingly small value.
        gpu_share = output_mb / max(gpu_free_mb, 1e-6)
        if gpu_share <= self.auto_offload_threshold:
            return OffloadMode.GPU, OffloadReason(
                "auto_gpu_fits",
                f"{output_mb:.0f}MB <= {self.auto_offload_threshold * gpu_free_mb:.0f}MB safe gpu free",
            )

    if fits_cpu:
        return OffloadMode.CPU, OffloadReason(
            "auto_cpu_fits", f"gpu tight -> cpu ({output_mb:.0f}MB <= {cpu_budget_mb:.0f}MB safe cpu free)"
        )

    if fits_disk:
        return OffloadMode.DISK, OffloadReason(
            "auto_disk_fits",
            f"gpu tight, cpu tight -> disk ({output_mb:.0f}MB <= {disk_budget_mb:.0f}MB safe disk free)",
        )

    # Nothing fits comfortably: CPU is the last resort because it can swap.
    return OffloadMode.CPU, OffloadReason("auto_cpu_swap", "gpu tight, cpu tight, disk tight -> cpu (swap)")
922967
923968 def _allocate_output_buffer (
924969 self ,
0 commit comments