@@ -143,6 +143,8 @@ def __init__(
         self._model_uid_to_addr: Dict[str, str] = {}
         self._model_uid_to_recover_count: Dict[str, Optional[int]] = {}
         self._model_uid_to_launch_args: Dict[str, Dict] = {}
+        self._gpu_memory_info: Dict[int, Dict[str, Union[int, float]]] = {}
+        self._model_memory_usage: Dict[str, int] = {}
 
         if XINFERENCE_DISABLE_METRICS:
             logger.info(
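For orientation, this is roughly the shape the two new tracking dicts take at runtime; the keys mirror what `_initialize_gpu_memory_tracking` writes further down, and the concrete numbers (and the model uid) are illustrative only, in MB:

```python
# Illustrative values only; real entries are filled from pynvml queries.
gpu_memory_info = {
    0: {"total": 24576, "used": 2048, "available": 22528},  # per-GPU memory, MB
    1: {"total": 24576, "used": 0, "available": 24576},
}
model_memory_usage = {
    "my-model-uid": 3440,  # estimated MB reserved by a launched model (hypothetical uid)
}
```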
@@ -495,22 +497,124 @@ def allocate_devices(self, model_uid: str, n_gpu: int) -> List[int]:
                     break
             if allocated_non_embedding_rerank_models:
                 user_specified_allocated_devices.add(dev)
-        allocated_devices = set(self._gpu_to_model_uid.keys()).union(
-            user_specified_allocated_devices
-        )
-        if n_gpu > len(self._total_gpu_devices) - len(allocated_devices):
-            raise RuntimeError("No available slot found for the model")
 
-        devices: List[int] = [
+        # Check for completely available GPUs first
+        completely_available_gpus = [
             dev
             for dev in self._total_gpu_devices
             if dev not in self._gpu_to_model_uid
             and dev not in user_specified_allocated_devices
-        ][:n_gpu]
-        for dev in devices:
+        ]
+
+        if len(completely_available_gpus) >= n_gpu:
+            # We have enough completely available GPUs
+            devices = completely_available_gpus[:n_gpu]
+            for dev in devices:
+                self._gpu_to_model_uid[int(dev)] = model_uid
+            logger.info(f"Allocated completely available GPUs: {devices}")
+            return sorted(devices)
+
+        # Not enough completely available GPUs, try memory-aware allocation
+        logger.info(
+            "Not enough completely available GPUs, trying memory-aware allocation"
+        )
+
+        # Initialize memory tracking if not already done
+        if not self._gpu_memory_info:
+            self._initialize_gpu_memory_tracking()
+
+        # Try to allocate based on available memory
+        selected_devices = []
+
+        # First, use any completely available GPUs
+        for dev in completely_available_gpus:
+            selected_devices.append(dev)
+            self._gpu_to_model_uid[int(dev)] = model_uid
+            if len(selected_devices) == n_gpu:
+                break
+
+        # If we still need more GPUs, select those with most available memory
+        if len(selected_devices) < n_gpu:
+            remaining_needed = n_gpu - len(selected_devices)
+
+            # Get GPUs sorted by available memory (most available first)
+            candidate_gpus = [
+                dev for dev in self._total_gpu_devices if dev not in selected_devices
+            ]
+
+            gpu_memory_list = []
+            for dev in candidate_gpus:
+                self._update_gpu_memory_info(dev)
+                available_memory = self._gpu_memory_info[dev]["available"]
+                gpu_memory_list.append((dev, available_memory))
+
+            # Sort by available memory (descending)
+            gpu_memory_list.sort(key=lambda x: x[1], reverse=True)
+
+            # Select GPUs with most available memory
+            for dev, available_memory in gpu_memory_list[:remaining_needed]:
+                selected_devices.append(dev)
+                self._gpu_to_model_uid[int(dev)] = model_uid
+                logger.info(
+                    f"Selected GPU {dev} with {available_memory} MB available memory"
+                )
+
+        if len(selected_devices) != n_gpu:
+            raise RuntimeError("No available slot found for the model")
+
+        logger.info(f"Allocated GPUs using memory-aware strategy: {selected_devices}")
+        return sorted(selected_devices)
+
+    def allocate_devices_for_model(
+        self,
+        model_uid: str,
+        model_name: str,
+        model_size: Union[int, str],
+        model_format: Optional[str],
+        quantization: Optional[str],
+        n_gpu: int = 1,
+    ) -> List[int]:
+        """
+        Enhanced GPU allocation that considers model memory requirements.
+        """
+        # Estimate memory usage for this model
+        estimated_memory_mb = self._estimate_model_memory_usage(
+            model_name, model_size, model_format, quantization
+        )
+
+        self._model_memory_usage[model_uid] = estimated_memory_mb
+
+        # Try to find GPUs that can accommodate the model
+        suitable_gpus = []
+
+        for gpu_idx in self._total_gpu_devices:
+            if self._can_fit_model_on_gpu(gpu_idx, estimated_memory_mb):
+                suitable_gpus.append(gpu_idx)
+
+        if len(suitable_gpus) >= n_gpu:
+            # We have enough suitable GPUs
+            selected = suitable_gpus[:n_gpu]
+        else:
+            # Not enough GPUs with sufficient memory, but try anyway
+            logger.warning(
+                f"Only found {len(suitable_gpus)} GPUs with sufficient memory for the model"
+            )
+            # Use the GPU with most available memory
+            best_gpu = self._get_gpu_with_most_available_memory()
+            selected = [best_gpu]
+
+        # Update tracking
+        for dev in selected:
             self._gpu_to_model_uid[int(dev)] = model_uid
+            # Update memory usage tracking
+            if dev in self._gpu_memory_info:
+                self._gpu_memory_info[dev]["used"] += estimated_memory_mb
+                self._gpu_memory_info[dev]["available"] -= estimated_memory_mb
 
-        return sorted(devices)
+        logger.info(
+            f"Allocated GPUs for model {model_name}: {selected}, estimated memory: {estimated_memory_mb} MB"
+        )
+        return sorted(selected)
 
     async def allocate_devices_with_gpu_idx(
         self, model_uid: str, model_type: str, gpu_idx: List[int]
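The allocation policy above reduces to: take fully free GPUs first, then fall back to whichever remaining GPUs report the most free memory. Below is a minimal standalone sketch of that fallback ordering, assuming a plain `free_mb` mapping in place of the worker's `_gpu_memory_info`; the function and its names are illustrative, not part of this diff.

```python
from typing import Dict, List

def pick_gpus_by_free_memory(free_mb: Dict[int, int], exclude: List[int], needed: int) -> List[int]:
    """Return up to `needed` GPU indices, preferring the ones with the most free memory (MB)."""
    candidates = [(dev, mem) for dev, mem in free_mb.items() if dev not in exclude]
    candidates.sort(key=lambda x: x[1], reverse=True)  # most free memory first
    return [dev for dev, _ in candidates[:needed]]

# Example: GPU 0 is already occupied, so the two remaining GPUs with the most
# free memory are chosen.
print(pick_gpus_by_free_memory({0: 512, 1: 20480, 2: 8192, 3: 16384}, exclude=[0], needed=2))
# -> [1, 3]
```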
@@ -574,6 +678,30 @@ def release_devices(self, model_uid: str):
             for model_info in model_infos:
                 self._user_specified_gpu_to_model_uids[dev].remove(model_info)
 
+        # Update GPU memory tracking
+        if model_uid in self._model_memory_usage:
+            released_memory = self._model_memory_usage[model_uid]
+            logger.info(
+                f"Releasing {released_memory} MB of tracked memory for model {model_uid}"
+            )
+
+            # Update memory info for the GPUs the model occupied
+            for dev in devices:
+                if dev in self._gpu_memory_info:
+                    self._gpu_memory_info[dev]["used"] = max(
+                        0, self._gpu_memory_info[dev]["used"] - released_memory
+                    )
+                    self._gpu_memory_info[dev]["available"] = min(
+                        self._gpu_memory_info[dev]["total"],
+                        self._gpu_memory_info[dev]["available"] + released_memory,
+                    )
+                    logger.info(
+                        f"Updated GPU {dev} memory tracking: used {self._gpu_memory_info[dev]['used']} MB, available {self._gpu_memory_info[dev]['available']} MB"
+                    )
+
+            # Remove model from memory usage tracking
+            del self._model_memory_usage[model_uid]
+
     async def _create_subpool(
         self,
         model_uid: str,
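The release path clamps the bookkeeping so that repeated or over-estimated releases can never drive `used` below zero or push `available` past the card's total. A tiny illustration of that clamping with made-up numbers:

```python
total, used, available = 24576, 4096, 20480  # MB, illustrative
released = 8192  # an over-estimate released by a model

used = max(0, used - released)                # clamps to 0 instead of -4096
available = min(total, available + released)  # clamps to 24576 instead of 28672
print(used, available)  # -> 0 24576
```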
@@ -1478,6 +1606,131 @@ def update_model_status(self, model_uid: str, **kwargs):
     def get_model_status(self, model_uid: str):
         return self._model_uid_to_model_status.get(model_uid)
 
+    def _initialize_gpu_memory_tracking(self):
+        """Initialize GPU memory tracking for all available GPUs"""
+        try:
+            import pynvml
+
+            pynvml.nvmlInit()
+            for gpu_idx in self._total_gpu_devices:
+                handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx)
+                mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+                self._gpu_memory_info[gpu_idx] = {
+                    "total": mem_info.total // (1024**2),  # Convert bytes to MB
+                    "used": mem_info.used // (1024**2),
+                    "available": mem_info.free // (1024**2),
+                }
+            logger.info(
+                f"Initialized GPU memory tracking for {len(self._total_gpu_devices)} GPUs"
+            )
+        except ImportError:
+            logger.warning("pynvml not available, GPU memory tracking disabled")
+            # Fallback to basic tracking without actual memory info
+            for gpu_idx in self._total_gpu_devices:
+                self._gpu_memory_info[gpu_idx] = {"total": 0, "used": 0, "available": 0}
+        except Exception as e:
+            logger.error(f"Failed to initialize GPU memory tracking: {e}")
+            for gpu_idx in self._total_gpu_devices:
+                self._gpu_memory_info[gpu_idx] = {"total": 0, "used": 0, "available": 0}
+
+    def _update_gpu_memory_info(self, gpu_idx: int):
+        """Update memory information for a specific GPU"""
+        try:
+            import pynvml
+
+            handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx)
+            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+            self._gpu_memory_info[gpu_idx] = {
+                "total": mem_info.total // (1024**2),
+                "used": mem_info.used // (1024**2),
+                "available": mem_info.free // (1024**2),
+            }
+        except Exception as e:
+            logger.debug(f"Failed to update GPU {gpu_idx} memory info: {e}")
+
+    def _get_gpu_with_most_available_memory(self) -> int:
+        """Find the GPU with the most available memory"""
+        if not self._gpu_memory_info:
+            self._initialize_gpu_memory_tracking()
+
+        max_available_gpu = -1
+        max_available_memory: Union[int, float] = -1
+
+        for gpu_idx in self._total_gpu_devices:
+            self._update_gpu_memory_info(gpu_idx)
+            available_memory = self._gpu_memory_info[gpu_idx]["available"]
+
+            if available_memory > max_available_memory:
+                max_available_memory = available_memory
+                max_available_gpu = gpu_idx
+
+        if max_available_gpu == -1:
+            raise RuntimeError("No suitable GPU found")
+
+        logger.info(
+            f"Selected GPU {max_available_gpu} with {max_available_memory} MB available memory"
+        )
+        return max_available_gpu
+
+    def _estimate_model_memory_usage(
+        self,
+        model_name: str,
+        model_size: Union[int, str],
+        model_format: Optional[str],
+        quantization: Optional[str],
+    ) -> int:
+        """Estimate memory usage (in MB) for a model based on its characteristics"""
+        # Basic estimation logic - this can be enhanced with more sophisticated calculations
+        if isinstance(model_size, str):
+            # Convert a string size like "7B" to a number of billions of parameters
+            if "B" in model_size:
+                size_in_billions = float(model_size.replace("B", ""))
+            else:
+                size_in_billions = float(model_size)
+        else:
+            size_in_billions = float(model_size)
+
+        # Base memory estimation (rough calculation)
+        base_memory_mb = int(size_in_billions * 1024 * 1.5)  # ~1.5 GB per billion parameters
+
+        # Adjust based on quantization
+        if quantization:
+            if "4bit" in quantization.lower() or "4-bit" in quantization.lower():
+                base_memory_mb = base_memory_mb // 3
+            elif "8bit" in quantization.lower() or "8-bit" in quantization.lower():
+                base_memory_mb = base_memory_mb // 2
+
+        # Adjust based on format
+        if model_format:
+            if "gguf" in model_format.lower():
+                base_memory_mb = int(
+                    base_memory_mb * 0.8
+                )  # GGUF is generally more memory efficient
+
+        # Add some buffer for overhead
+        base_memory_mb = int(base_memory_mb * 1.2)
+
+        logger.debug(f"Estimated memory usage for {model_name}: {base_memory_mb} MB")
+        return base_memory_mb
+
+    def _can_fit_model_on_gpu(self, gpu_idx: int, estimated_memory_mb: int) -> bool:
+        """Check if a model can fit on a specific GPU"""
+        if gpu_idx not in self._gpu_memory_info:
+            self._update_gpu_memory_info(gpu_idx)
+
+        # Treat a GPU we could not query as having no available memory
+        available_memory = self._gpu_memory_info.get(gpu_idx, {}).get("available", 0)
+        can_fit = estimated_memory_mb <= available_memory
+
+        if can_fit:
+            logger.info(
+                f"Model can fit on GPU {gpu_idx}: needs {estimated_memory_mb} MB, {available_memory} MB available"
+            )
+        else:
+            logger.warning(
+                f"Model cannot fit on GPU {gpu_idx}: needs {estimated_memory_mb} MB, only {available_memory} MB available"
+            )
+
+        return can_fit
+
     @staticmethod
     def record_metrics(name, op, kwargs):
         record_metrics(name, op, kwargs)
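To make the estimation heuristic concrete, here is the arithmetic `_estimate_model_memory_usage` performs for a hypothetical 7B model in 4-bit GGUF form; the constants are copied from the method above, the model itself is made up.

```python
size_in_billions = 7.0                     # parsed from "7B"
base = int(size_in_billions * 1024 * 1.5)  # 10752 MB (~1.5 GB per billion parameters)
base //= 3                                 # 3584 MB after the 4-bit adjustment
base = int(base * 0.8)                     # 2867 MB after the GGUF adjustment
base = int(base * 1.2)                     # 3440 MB after the 20% overhead buffer
print(base)  # -> 3440
```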