@@ -143,6 +143,8 @@ def __init__(
         self._model_uid_to_addr: Dict[str, str] = {}
         self._model_uid_to_recover_count: Dict[str, Optional[int]] = {}
         self._model_uid_to_launch_args: Dict[str, Dict] = {}
+        self._gpu_memory_info: Dict[int, Dict[str, Union[int, float]]] = {}
+        self._model_memory_usage: Dict[str, int] = {}

         if XINFERENCE_DISABLE_METRICS:
             logger.info(
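
The two attributes added to `__init__` above are the only new state introduced by this diff. A minimal sketch of the shape they take on once tracking is populated, mirroring `self._gpu_memory_info` and `self._model_memory_usage` (the GPU index, model uid, and MB figures are invented):

```python
# Hypothetical contents after one model has been placed (all values in MB).
gpu_memory_info = {0: {"total": 24576, "used": 3100, "available": 21476}}
model_memory_usage = {"my-model-0": 3440}  # model_uid -> estimated memory in MB
```
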
@@ -495,22 +497,124 @@ def allocate_devices(self, model_uid: str, n_gpu: int) -> List[int]:
                     break
             if allocated_non_embedding_rerank_models:
                 user_specified_allocated_devices.add(dev)
-        allocated_devices = set(self._gpu_to_model_uid.keys()).union(
-            user_specified_allocated_devices
-        )
-        if n_gpu > len(self._total_gpu_devices) - len(allocated_devices):
-            raise RuntimeError("No available slot found for the model")

-        devices: List[int] = [
+        # Check for completely available GPUs first
+        completely_available_gpus = [
             dev
             for dev in self._total_gpu_devices
             if dev not in self._gpu_to_model_uid
             and dev not in user_specified_allocated_devices
-        ][:n_gpu]
-        for dev in devices:
+        ]
+
+        if len(completely_available_gpus) >= n_gpu:
+            # We have enough completely available GPUs
+            devices = completely_available_gpus[:n_gpu]
+            for dev in devices:
+                self._gpu_to_model_uid[int(dev)] = model_uid
+            logger.info(f"Allocated completely available GPUs: {devices}")
+            return sorted(devices)
+
+        # Not enough completely available GPUs, try memory-aware allocation
+        logger.info(
+            f"Not enough completely available GPUs, trying memory-aware allocation"
+        )
+
+        # Initialize memory tracking if not already done
+        if not self._gpu_memory_info:
+            self._initialize_gpu_memory_tracking()
+
+        # Try to allocate based on available memory
+        selected_devices = []
+
+        # First, use any completely available GPUs
+        for dev in completely_available_gpus:
+            selected_devices.append(dev)
+            self._gpu_to_model_uid[int(dev)] = model_uid
+            if len(selected_devices) == n_gpu:
+                break
+
+        # If we still need more GPUs, select those with most available memory
+        if len(selected_devices) < n_gpu:
+            remaining_needed = n_gpu - len(selected_devices)
+
+            # Get GPUs sorted by available memory (most available first)
+            candidate_gpus = [
+                dev for dev in self._total_gpu_devices if dev not in selected_devices
+            ]
+
+            gpu_memory_list = []
+            for dev in candidate_gpus:
+                self._update_gpu_memory_info(dev)
+                available_memory = self._gpu_memory_info[dev]["available"]
+                gpu_memory_list.append((dev, available_memory))
+
+            # Sort by available memory (descending)
+            gpu_memory_list.sort(key=lambda x: x[1], reverse=True)
+
+            # Select GPUs with most available memory
+            for dev, available_memory in gpu_memory_list[:remaining_needed]:
+                selected_devices.append(dev)
+                self._gpu_to_model_uid[int(dev)] = model_uid
+                logger.info(
+                    f"Selected GPU {dev} with {available_memory} MB available memory"
+                )
+
+        if len(selected_devices) != n_gpu:
+            raise RuntimeError("No available slot found for the model")
+
+        logger.info(f"Allocated GPUs using memory-aware strategy: {selected_devices}")
+        return sorted(selected_devices)
+
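
The memory-aware branch above boils down to ranking the remaining candidates by reported free memory and taking the top `remaining_needed`. A self-contained sketch of just that step, with invented free-memory readings:

```python
# Invented readings: GPU index -> available memory in MB.
free_mb = {0: 2048, 1: 20480, 2: 8192, 3: 512}
remaining_needed = 2
ranked = sorted(free_mb.items(), key=lambda kv: kv[1], reverse=True)
picked = [dev for dev, _ in ranked[:remaining_needed]]
print(picked)  # [1, 2] -- the two GPUs with the most free memory
```
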
+    def allocate_devices_for_model(
+        self,
+        model_uid: str,
+        model_name: str,
+        model_size: Union[int, str],
+        model_format: Optional[str],
+        quantization: Optional[str],
+        n_gpu: int = 1,
+    ) -> List[int]:
+        """
+        Enhanced GPU allocation that considers model memory requirements.
+        """
+        # Estimate memory usage for this model
+        estimated_memory_mb = self._estimate_model_memory_usage(
+            model_name, model_size, model_format, quantization
+        )
+
+        self._model_memory_usage[model_uid] = estimated_memory_mb
+
+        # Try to find GPUs that can accommodate the model
+        suitable_gpus = []
+
+        for gpu_idx in self._total_gpu_devices:
+            if self._can_fit_model_on_gpu(gpu_idx, estimated_memory_mb):
+                suitable_gpus.append(gpu_idx)
+
+        if len(suitable_gpus) >= n_gpu:
+            # We have enough suitable GPUs
+            selected = suitable_gpus[:n_gpu]
+        else:
+            # Not enough GPUs with sufficient memory, but try anyway
+            logger.warning(
+                f"Only found {len(suitable_gpus)} GPUs with sufficient memory, proceeding with allocation"
+            )
+            # Use the GPU with most available memory
+            best_gpu = self._get_gpu_with_most_available_memory()
+            selected = [best_gpu]
+
+        # Update tracking
+        for dev in selected:
             self._gpu_to_model_uid[int(dev)] = model_uid
+            # Update memory usage tracking
+            if dev in self._gpu_memory_info:
+                self._gpu_memory_info[dev]["used"] += estimated_memory_mb
+                self._gpu_memory_info[dev]["available"] -= estimated_memory_mb

-        return sorted(devices)
+        logger.info(
+            f"Allocated GPUs for model {model_name}: {selected}, estimated memory: {estimated_memory_mb} MB"
+        )
+        return sorted(selected)

     async def allocate_devices_with_gpu_idx(
         self, model_uid: str, model_type: str, gpu_idx: List[int]
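
A hedged sketch of how the new `allocate_devices_for_model` entry point might be called; `worker` stands in for the WorkerActor instance and the argument values are made up, since the diff itself does not show a call site:

```python
# Hypothetical call site; argument names are taken from the signature above.
devices = worker.allocate_devices_for_model(
    model_uid="my-model-0",
    model_name="my-model",
    model_size="7B",        # parsed to 7 (billion parameters) by the estimator
    model_format="ggufv2",  # matches the "gguf" discount in the estimator
    quantization="4-bit",
    n_gpu=1,
)
print(devices)  # e.g. [1], the GPU judged to have enough free memory
```
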
@@ -574,6 +678,30 @@ def release_devices(self, model_uid: str):
             for model_info in model_infos:
                 self._user_specified_gpu_to_model_uids[dev].remove(model_info)

+        # Update GPU memory tracking
+        if model_uid in self._model_memory_usage:
+            released_memory = self._model_memory_usage[model_uid]
+            logger.info(
+                f"Releasing {released_memory} MB of memory for model {model_uid}"
+            )
+
+            # Update memory info for all GPUs
+            for dev in devices:
+                if dev in self._gpu_memory_info:
+                    self._gpu_memory_info[dev]["used"] = max(
+                        0, self._gpu_memory_info[dev]["used"] - released_memory
+                    )
+                    self._gpu_memory_info[dev]["available"] = min(
+                        self._gpu_memory_info[dev]["total"],
+                        self._gpu_memory_info[dev]["available"] + released_memory,
+                    )
+                    logger.info(
+                        f"Updated GPU {dev} memory tracking: used={self._gpu_memory_info[dev]['used']} MB, available={self._gpu_memory_info[dev]['available']} MB"
+                    )
+
+            # Remove model from memory usage tracking
+            del self._model_memory_usage[model_uid]
+
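
The `max`/`min` pair above clamps the bookkeeping so that a release can never drive `used` below zero or push `available` past the card's total. A small self-contained illustration with invented numbers:

```python
# Release more than the tracker thinks is in use; the clamps keep values sane.
info = {"total": 24576, "used": 3000, "available": 21576}
released = 5000
info["used"] = max(0, info["used"] - released)
info["available"] = min(info["total"], info["available"] + released)
print(info)  # {'total': 24576, 'used': 0, 'available': 24576}
```
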
     async def _create_subpool(
         self,
         model_uid: str,
@@ -1478,6 +1606,131 @@ def update_model_status(self, model_uid: str, **kwargs):
     def get_model_status(self, model_uid: str):
         return self._model_uid_to_model_status.get(model_uid)

+    def _initialize_gpu_memory_tracking(self):
+        """Initialize GPU memory tracking for all available GPUs"""
+        try:
+            import pynvml
+
+            pynvml.nvmlInit()
+            for gpu_idx in self._total_gpu_devices:
+                handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx)
+                mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+                self._gpu_memory_info[gpu_idx] = {
+                    "total": mem_info.total // (1024**2),  # Convert to MB
+                    "used": mem_info.used // (1024**2),
+                    "available": mem_info.free // (1024**2),
+                }
+            logger.info(
+                f"Initialized GPU memory tracking for {len(self._total_gpu_devices)} GPUs"
+            )
+        except ImportError:
+            logger.warning("pynvml not available, GPU memory tracking disabled")
+            # Fallback to basic tracking without actual memory info
+            for gpu_idx in self._total_gpu_devices:
+                self._gpu_memory_info[gpu_idx] = {"total": 0, "used": 0, "available": 0}
+        except Exception as e:
+            logger.error(f"Failed to initialize GPU memory tracking: {e}")
+            for gpu_idx in self._total_gpu_devices:
+                self._gpu_memory_info[gpu_idx] = {"total": 0, "used": 0, "available": 0}
+
+    def _update_gpu_memory_info(self, gpu_idx: int):
+        """Update memory information for a specific GPU"""
+        try:
+            import pynvml
+
+            handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx)
+            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+            self._gpu_memory_info[gpu_idx] = {
+                "total": mem_info.total // (1024**2),
+                "used": mem_info.used // (1024**2),
+                "available": mem_info.free // (1024**2),
+            }
+        except Exception as e:
+            logger.debug(f"Failed to update GPU {gpu_idx} memory info: {e}")
+
+    def _get_gpu_with_most_available_memory(self) -> int:
+        """Find the GPU with the most available memory"""
+        if not self._gpu_memory_info:
+            self._initialize_gpu_memory_tracking()
+
+        max_available_gpu = -1
+        max_available_memory: Union[int, float] = -1
+
+        for gpu_idx in self._total_gpu_devices:
+            self._update_gpu_memory_info(gpu_idx)
+            available_memory = self._gpu_memory_info[gpu_idx]["available"]
+
+            if available_memory > max_available_memory:
+                max_available_memory = available_memory
+                max_available_gpu = gpu_idx
+
+        if max_available_gpu == -1:
+            raise RuntimeError("No suitable GPU found")
+
+        logger.info(
+            f"Selected GPU {max_available_gpu} with {max_available_memory} MB available memory"
+        )
+        return max_available_gpu
+
+    def _estimate_model_memory_usage(
+        self,
+        model_name: str,
+        model_size: Union[int, str],
+        model_format: Optional[str],
+        quantization: Optional[str],
+    ) -> int:
+        """Estimate memory usage for a model based on its characteristics"""
+        # Basic estimation logic - this can be enhanced with more sophisticated calculations
+        if isinstance(model_size, str):
+            # Convert string size like "7B" to integer
+            if "B" in model_size:
+                size_gb = float(model_size.replace("B", ""))
+            else:
+                size_gb = float(model_size)
+        else:
+            size_gb = float(model_size)
+
+        # Base memory estimation (rough calculation)
+        base_memory_mb = int(size_gb * 1024 * 1.5)  # 1.5GB per billion parameters
+
+        # Adjust based on quantization
+        if quantization:
+            if "4bit" in quantization.lower() or "4-bit" in quantization.lower():
+                base_memory_mb = base_memory_mb // 3
+            elif "8bit" in quantization.lower() or "8-bit" in quantization.lower():
+                base_memory_mb = base_memory_mb // 2
+
+        # Adjust based on format
+        if model_format:
+            if "gguf" in model_format.lower():
+                base_memory_mb = int(
+                    base_memory_mb * 0.8
+                )  # GGUF is generally more memory efficient
+
+        # Add some buffer for overhead
+        base_memory_mb = int(base_memory_mb * 1.2)
+
+        logger.debug(f"Estimated memory usage for {model_name}: {base_memory_mb} MB")
+        return base_memory_mb
+
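
Walking the estimator through one concrete (hypothetical) case, a 7B model in GGUF format with 4-bit quantization: 7 * 1024 * 1.5 = 10752 MB base, floor-divided by 3 for 4-bit gives 3584 MB, scaled by 0.8 for GGUF gives 2867 MB, and the final 1.2x overhead buffer lands at 3440 MB. The same arithmetic as a quick check:

```python
# Reproduces _estimate_model_memory_usage for model_size="7B",
# quantization="4-bit", model_format="gguf" (hypothetical inputs).
base = int(float("7B".replace("B", "")) * 1024 * 1.5)  # 10752
base = base // 3                                       # 3584 after 4-bit discount
base = int(base * 0.8)                                 # 2867 after GGUF discount
base = int(base * 1.2)                                 # 3440 with overhead buffer
assert base == 3440
```
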
+    def _can_fit_model_on_gpu(self, gpu_idx: int, estimated_memory_mb: int) -> bool:
+        """Check if a model can fit on a specific GPU"""
+        if gpu_idx not in self._gpu_memory_info:
+            self._update_gpu_memory_info(gpu_idx)
+
+        available_memory = self._gpu_memory_info[gpu_idx]["available"]
+        can_fit = estimated_memory_mb <= available_memory
+
+        if can_fit:
+            logger.info(
+                f"Model can fit on GPU {gpu_idx}: needs {estimated_memory_mb} MB, has {available_memory} MB available"
+            )
+        else:
+            logger.warning(
+                f"Model cannot fit on GPU {gpu_idx}: needs {estimated_memory_mb} MB, has {available_memory} MB available"
+            )
+
+        return can_fit
+
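
A hedged sketch of the fit check in isolation; `worker` again stands in for the WorkerActor, and its tracking dict is seeded by hand with invented numbers instead of live pynvml readings:

```python
# Seed the tracker manually (values in MB), then exercise the check.
worker._gpu_memory_info = {0: {"total": 24576, "used": 20000, "available": 4576}}
assert worker._can_fit_model_on_gpu(0, 3440) is True   # 3440 MB <= 4576 MB
assert worker._can_fit_model_on_gpu(0, 8000) is False  # logged as a warning
```
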
     @staticmethod
     def record_metrics(name, op, kwargs):
         record_metrics(name, op, kwargs)