Commit 7cebef6
FEAT: support replicas on single GPU
1 parent 2d7d298 commit 7cebef6

File tree: 4 files changed, +311 −13 lines
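
For context, a minimal end-to-end sketch of what this commit enables: launching two replicas of one model on a single GPU. This assumes the standard xinference client, with `replica` and `gpu_idx` forwarded to the launch endpoint shown below; the model name and engine are placeholders.

from xinference.client import Client

# Assumes a locally running xinference endpoint; names are illustrative.
client = Client("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="qwen2.5-instruct",  # placeholder model
    model_engine="vllm",            # placeholder engine
    replica=2,                      # two replicas...
    gpu_idx=[0],                    # ...sharing GPU 0 - rejected with a 400 before this commit
)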

xinference/api/restful_api.py
Lines changed: 21 additions & 3 deletions

@@ -1225,11 +1225,29 @@ async def launch_model(
 
         if isinstance(gpu_idx, int):
             gpu_idx = [gpu_idx]
-        if gpu_idx:
-            if len(gpu_idx) % replica:
+
+        # Check if single-GPU multi-replica is enabled
+        from ..constants import XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA
+
+        if XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA:
+            # Enhanced replica validation with single-GPU multi-replica support
+            if gpu_idx and len(gpu_idx) > 1 and len(gpu_idx) % replica:
+                # Only keep the restriction when multiple GPUs are specified
+                raise HTTPException(
+                    status_code=400,
+                    detail="Invalid input. When using multiple GPUs, the count must be a multiple of replica.",
+                )
+            # Allow single-GPU multi-replica deployment when enabled
+            if gpu_idx and len(gpu_idx) == 1 and replica > 1:
+                logger.info(
+                    f"Single-GPU multi-replica deployment enabled: {replica} replicas on 1 GPU"
+                )
+        else:
+            # Traditional behavior - strict multiple-of-replica requirement
+            if gpu_idx and len(gpu_idx) % replica:
                 raise HTTPException(
                     status_code=400,
-                    detail="Invalid input. Allocated gpu must be a multiple of replica.",
+                    detail="Invalid input. Allocated gpu must be a multiple of replica. Set XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA=1 to enable single-GPU multi-replica deployment.",
                 )
 
         if peft_model_config is not None:
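
To make the new branching concrete, here is a standalone sketch of the resulting validation matrix; `check` is a hypothetical stand-in for the handler logic above, not server code.

# Hypothetical helper mirroring the handler's branching; not part of the commit.
def check(gpu_idx, replica, feature_enabled=True):
    if feature_enabled:
        if gpu_idx and len(gpu_idx) > 1 and len(gpu_idx) % replica:
            return "400: GPU count must be a multiple of replica"
        return "ok"  # a single GPU with replica > 1 is now accepted
    if gpu_idx and len(gpu_idx) % replica:
        return "400: allocated gpu must be a multiple of replica"
    return "ok"

print(check([0], replica=3))                         # ok: 3 replicas share GPU 0
print(check([0, 1], replica=3))                      # 400: 2 GPUs, not a multiple of 3
print(check([0], replica=3, feature_enabled=False))  # 400 under the legacy rule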

xinference/constants.py
Lines changed: 6 additions & 0 deletions

@@ -34,6 +34,9 @@
 XINFERENCE_ENV_SSE_PING_ATTEMPTS_SECONDS = "XINFERENCE_SSE_PING_ATTEMPTS_SECONDS"
 XINFERENCE_ENV_MAX_TOKENS = "XINFERENCE_MAX_TOKENS"
 XINFERENCE_ENV_ALLOWED_IPS = "XINFERENCE_ALLOWED_IPS"
+XINFERENCE_ENV_ENABLE_SINGLE_GPU_MULTI_REPLICA = (
+    "XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA"
+)
 
 
 def get_xinference_home() -> str:

@@ -112,3 +115,6 @@ def get_xinference_home() -> str:
     else None
 )
 XINFERENCE_ALLOWED_IPS = os.getenv(XINFERENCE_ENV_ALLOWED_IPS)
+XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA = bool(
+    int(os.getenv(XINFERENCE_ENV_ENABLE_SINGLE_GPU_MULTI_REPLICA, "1"))
+)  # Enable by default
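
The parse above accepts only integer strings: unset or "1" yields True (the default), "0" yields False, and a value like "true" would raise ValueError. A quick demonstration:

import os

KEY = "XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA"

os.environ.pop(KEY, None)
print(bool(int(os.getenv(KEY, "1"))))  # True - feature is on by default

os.environ[KEY] = "0"
print(bool(int(os.getenv(KEY, "1"))))  # False - explicit opt-out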

xinference/core/utils.py
Lines changed: 22 additions & 1 deletion

@@ -250,12 +250,33 @@ def parse_model_version(model_version: str, model_type: str) -> Tuple:
 def assign_replica_gpu(
     _replica_model_uid: str, replica: int, gpu_idx: Optional[Union[int, List[int]]]
 ) -> Optional[List[int]]:
+    """
+    Enhanced GPU assignment for replica models.
+    Supports single-GPU multi-replica deployment by intelligently allocating GPUs.
+    """
     model_uid, rep_id = parse_replica_model_uid(_replica_model_uid)
     rep_id, replica = int(rep_id), int(replica)
+
     if isinstance(gpu_idx, int):
         gpu_idx = [gpu_idx]
+
     if isinstance(gpu_idx, list) and gpu_idx:
-        return gpu_idx[rep_id::replica]
+        # When we have enough GPUs for round-robin allocation
+        if len(gpu_idx) >= replica:
+            return gpu_idx[rep_id::replica]
+        else:
+            # Support single-GPU multi-replica deployment: all replicas share
+            # the same GPU (or GPUs, if more than one but fewer than the
+            # replica count), relying on memory-aware scheduling to run
+            # multiple replicas on one GPU.
+            if len(gpu_idx) == 1:
+                # Single GPU case - all replicas use the same GPU
+                return gpu_idx
+            else:
+                # Multiple GPUs but fewer than replicas - distribute as evenly
+                # as possible for better resource utilization
+                assigned_gpu = gpu_idx[rep_id % len(gpu_idx)]
+                return [assigned_gpu]
+
     return gpu_idx
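
A standalone sketch of the three assignment regimes above; `rep_id` is the replica index that the real code parses out of the replica model uid.

# Mirrors assign_replica_gpu's branching, for illustration only.
def assign(rep_id, replica, gpu_idx):
    if len(gpu_idx) >= replica:
        return gpu_idx[rep_id::replica]      # round-robin slices
    if len(gpu_idx) == 1:
        return gpu_idx                       # all replicas share the one GPU
    return [gpu_idx[rep_id % len(gpu_idx)]]  # fewer GPUs than replicas: wrap around

print([assign(r, 2, [0, 1, 2, 3]) for r in range(2)])  # [[0, 2], [1, 3]]
print([assign(r, 3, [0]) for r in range(3)])           # [[0], [0], [0]]
print([assign(r, 3, [0, 1]) for r in range(3)])        # [[0], [1], [0]]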

xinference/core/worker.py
Lines changed: 262 additions & 9 deletions
@@ -143,6 +143,8 @@ def __init__(
         self._model_uid_to_addr: Dict[str, str] = {}
         self._model_uid_to_recover_count: Dict[str, Optional[int]] = {}
         self._model_uid_to_launch_args: Dict[str, Dict] = {}
+        self._gpu_memory_info: Dict[int, Dict[str, Union[int, float]]] = {}
+        self._model_memory_usage: Dict[str, int] = {}
 
         if XINFERENCE_DISABLE_METRICS:
             logger.info(
@@ -495,22 +497,124 @@ def allocate_devices(self, model_uid: str, n_gpu: int) -> List[int]:
                     break
             if allocated_non_embedding_rerank_models:
                 user_specified_allocated_devices.add(dev)
-        allocated_devices = set(self._gpu_to_model_uid.keys()).union(
-            user_specified_allocated_devices
-        )
-        if n_gpu > len(self._total_gpu_devices) - len(allocated_devices):
-            raise RuntimeError("No available slot found for the model")
 
-        devices: List[int] = [
+        # Check for completely available GPUs first
+        completely_available_gpus = [
             dev
             for dev in self._total_gpu_devices
             if dev not in self._gpu_to_model_uid
             and dev not in user_specified_allocated_devices
-        ][:n_gpu]
-        for dev in devices:
+        ]
+
+        if len(completely_available_gpus) >= n_gpu:
+            # We have enough completely available GPUs
+            devices = completely_available_gpus[:n_gpu]
+            for dev in devices:
+                self._gpu_to_model_uid[int(dev)] = model_uid
+            logger.info(f"Allocated completely available GPUs: {devices}")
+            return sorted(devices)
+
+        # Not enough completely available GPUs, try memory-aware allocation
+        logger.info(
+            "Not enough completely available GPUs, trying memory-aware allocation"
+        )
+
+        # Initialize memory tracking if not already done
+        if not self._gpu_memory_info:
+            self._initialize_gpu_memory_tracking()
+
+        # Try to allocate based on available memory
+        selected_devices = []
+
+        # First, use any completely available GPUs
+        for dev in completely_available_gpus:
+            selected_devices.append(dev)
+            self._gpu_to_model_uid[int(dev)] = model_uid
+            if len(selected_devices) == n_gpu:
+                break
+
+        # If we still need more GPUs, select those with the most available memory
+        if len(selected_devices) < n_gpu:
+            remaining_needed = n_gpu - len(selected_devices)
+
+            # Get GPUs sorted by available memory (most available first)
+            candidate_gpus = [
+                dev for dev in self._total_gpu_devices if dev not in selected_devices
+            ]
+
+            gpu_memory_list = []
+            for dev in candidate_gpus:
+                self._update_gpu_memory_info(dev)
+                available_memory = self._gpu_memory_info[dev]["available"]
+                gpu_memory_list.append((dev, available_memory))
+
+            # Sort by available memory (descending)
+            gpu_memory_list.sort(key=lambda x: x[1], reverse=True)
+
+            # Select the GPUs with the most available memory
+            for dev, available_memory in gpu_memory_list[:remaining_needed]:
+                selected_devices.append(dev)
+                self._gpu_to_model_uid[int(dev)] = model_uid
+                logger.info(
+                    f"Selected GPU {dev} with {available_memory}MB available memory"
+                )
+
+        if len(selected_devices) != n_gpu:
+            raise RuntimeError("No available slot found for the model")
+
+        logger.info(f"Allocated GPUs using memory-aware strategy: {selected_devices}")
+        return sorted(selected_devices)
+
+    def allocate_devices_for_model(
+        self,
+        model_uid: str,
+        model_name: str,
+        model_size: Union[int, str],
+        model_format: Optional[str],
+        quantization: Optional[str],
+        n_gpu: int = 1,
+    ) -> List[int]:
+        """
+        Enhanced GPU allocation that considers model memory requirements.
+        """
+        # Estimate memory usage for this model
+        estimated_memory_mb = self._estimate_model_memory_usage(
+            model_name, model_size, model_format, quantization
+        )
+
+        self._model_memory_usage[model_uid] = estimated_memory_mb
+
+        # Try to find GPUs that can accommodate the model
+        suitable_gpus = []
+
+        for gpu_idx in self._total_gpu_devices:
+            if self._can_fit_model_on_gpu(gpu_idx, estimated_memory_mb):
+                suitable_gpus.append(gpu_idx)
+
+        if len(suitable_gpus) >= n_gpu:
+            # We have enough suitable GPUs
+            selected = suitable_gpus[:n_gpu]
+        else:
+            # Not enough GPUs with sufficient memory, but try anyway:
+            # fall back to the GPU with the most available memory
+            logger.warning(
+                f"Only found {len(suitable_gpus)} GPUs with sufficient memory, proceeding with allocation"
+            )
+            best_gpu = self._get_gpu_with_most_available_memory()
+            selected = [best_gpu]
+
+        # Update tracking
+        for dev in selected:
             self._gpu_to_model_uid[int(dev)] = model_uid
+            # Update memory usage tracking
+            if dev in self._gpu_memory_info:
+                self._gpu_memory_info[dev]["used"] += estimated_memory_mb
+                self._gpu_memory_info[dev]["available"] -= estimated_memory_mb
 
-        return sorted(devices)
+        logger.info(
+            f"Allocated GPUs for model {model_name}: {selected}, estimated memory: {estimated_memory_mb}MB"
+        )
+        return sorted(selected)
 
     async def allocate_devices_with_gpu_idx(
         self, model_uid: str, model_type: str, gpu_idx: List[int]
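
The memory-aware fallback boils down to ranking candidate GPUs by free memory and taking the top N. A minimal sketch with made-up availability figures (the worker reads real ones via pynvml):

# Illustrative free-memory figures in MB, keyed by GPU index.
gpu_memory_info = {0: {"available": 1200}, 1: {"available": 9800}, 2: {"available": 4100}}

def pick_by_free_memory(candidates, n):
    # Rank by available memory, descending, then take the first n.
    ranked = sorted(candidates, key=lambda d: gpu_memory_info[d]["available"], reverse=True)
    return ranked[:n]

print(pick_by_free_memory([0, 1, 2], 2))  # [1, 2]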
@@ -574,6 +678,30 @@ def release_devices(self, model_uid: str):
                 for model_info in model_infos:
                     self._user_specified_gpu_to_model_uids[dev].remove(model_info)
 
+        # Update GPU memory tracking
+        if model_uid in self._model_memory_usage:
+            released_memory = self._model_memory_usage[model_uid]
+            logger.info(
+                f"Releasing {released_memory}MB of memory for model {model_uid}"
+            )
+
+            # Update memory info for all affected GPUs
+            for dev in devices:
+                if dev in self._gpu_memory_info:
+                    self._gpu_memory_info[dev]["used"] = max(
+                        0, self._gpu_memory_info[dev]["used"] - released_memory
+                    )
+                    self._gpu_memory_info[dev]["available"] = min(
+                        self._gpu_memory_info[dev]["total"],
+                        self._gpu_memory_info[dev]["available"] + released_memory,
+                    )
+                    logger.info(
+                        f"Updated GPU {dev} memory tracking: used={self._gpu_memory_info[dev]['used']}MB, available={self._gpu_memory_info[dev]['available']}MB"
+                    )
+
+            # Remove the model from memory usage tracking
+            del self._model_memory_usage[model_uid]
+
     async def _create_subpool(
         self,
         model_uid: str,
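
Note the clamping in the release path above: `max(0, ...)` and `min(total, ...)` keep the bookkeeping sane even if an estimate was stale or a release is processed twice. A small illustration with made-up numbers:

# Made-up tracking entry for one GPU, in MB.
info = {"total": 24000, "used": 500, "available": 23500}
released = 800  # estimate exceeds what is currently tracked as used

info["used"] = max(0, info["used"] - released)                        # clamps to 0, not -300
info["available"] = min(info["total"], info["available"] + released)  # capped at 24000
print(info)  # {'total': 24000, 'used': 0, 'available': 24000}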
@@ -1478,6 +1606,131 @@ def update_model_status(self, model_uid: str, **kwargs):
     def get_model_status(self, model_uid: str):
         return self._model_uid_to_model_status.get(model_uid)
 
+    def _initialize_gpu_memory_tracking(self):
+        """Initialize GPU memory tracking for all available GPUs"""
+        try:
+            import pynvml
+
+            pynvml.nvmlInit()
+            for gpu_idx in self._total_gpu_devices:
+                handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx)
+                mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+                self._gpu_memory_info[gpu_idx] = {
+                    "total": mem_info.total // (1024**2),  # Convert to MB
+                    "used": mem_info.used // (1024**2),
+                    "available": mem_info.free // (1024**2),
+                }
+            logger.info(
+                f"Initialized GPU memory tracking for {len(self._total_gpu_devices)} GPUs"
+            )
+        except ImportError:
+            logger.warning("pynvml not available, GPU memory tracking disabled")
+            # Fall back to basic tracking without actual memory info
+            for gpu_idx in self._total_gpu_devices:
+                self._gpu_memory_info[gpu_idx] = {"total": 0, "used": 0, "available": 0}
+        except Exception as e:
+            logger.error(f"Failed to initialize GPU memory tracking: {e}")
+            for gpu_idx in self._total_gpu_devices:
+                self._gpu_memory_info[gpu_idx] = {"total": 0, "used": 0, "available": 0}
+
+    def _update_gpu_memory_info(self, gpu_idx: int):
+        """Update memory information for a specific GPU"""
+        try:
+            import pynvml
+
+            handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx)
+            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+            self._gpu_memory_info[gpu_idx] = {
+                "total": mem_info.total // (1024**2),
+                "used": mem_info.used // (1024**2),
+                "available": mem_info.free // (1024**2),
+            }
+        except Exception as e:
+            logger.debug(f"Failed to update GPU {gpu_idx} memory info: {e}")
+
+    def _get_gpu_with_most_available_memory(self) -> int:
+        """Find the GPU with the most available memory"""
+        if not self._gpu_memory_info:
+            self._initialize_gpu_memory_tracking()
+
+        max_available_gpu = -1
+        max_available_memory: Union[int, float] = -1
+
+        for gpu_idx in self._total_gpu_devices:
+            self._update_gpu_memory_info(gpu_idx)
+            available_memory = self._gpu_memory_info[gpu_idx]["available"]
+
+            if available_memory > max_available_memory:
+                max_available_memory = available_memory
+                max_available_gpu = gpu_idx
+
+        if max_available_gpu == -1:
+            raise RuntimeError("No suitable GPU found")
+
+        logger.info(
+            f"Selected GPU {max_available_gpu} with {max_available_memory}MB available memory"
+        )
+        return max_available_gpu
+
+    def _estimate_model_memory_usage(
+        self,
+        model_name: str,
+        model_size: Union[int, str],
+        model_format: Optional[str],
+        quantization: Optional[str],
+    ) -> int:
+        """Estimate memory usage for a model based on its characteristics"""
+        # Basic estimation logic - this can be enhanced with more sophisticated calculations
+        if isinstance(model_size, str):
+            # Convert a string size like "7B" to a number of billions
+            if "B" in model_size:
+                size_gb = float(model_size.replace("B", ""))
+            else:
+                size_gb = float(model_size)
+        else:
+            size_gb = float(model_size)
+
+        # Base memory estimation (rough calculation)
+        base_memory_mb = int(size_gb * 1024 * 1.5)  # ~1.5GB per billion parameters
+
+        # Adjust based on quantization
+        if quantization:
+            if "4bit" in quantization.lower() or "4-bit" in quantization.lower():
+                base_memory_mb = base_memory_mb // 3
+            elif "8bit" in quantization.lower() or "8-bit" in quantization.lower():
+                base_memory_mb = base_memory_mb // 2
+
+        # Adjust based on format
+        if model_format:
+            if "gguf" in model_format.lower():
+                # GGUF is generally more memory efficient
+                base_memory_mb = int(base_memory_mb * 0.8)
+
+        # Add some buffer for overhead
+        base_memory_mb = int(base_memory_mb * 1.2)
+
+        logger.debug(f"Estimated memory usage for {model_name}: {base_memory_mb}MB")
+        return base_memory_mb
+
+    def _can_fit_model_on_gpu(self, gpu_idx: int, estimated_memory_mb: int) -> bool:
+        """Check whether a model can fit on a specific GPU"""
+        if gpu_idx not in self._gpu_memory_info:
+            self._update_gpu_memory_info(gpu_idx)
+
+        available_memory = self._gpu_memory_info[gpu_idx]["available"]
+        can_fit = estimated_memory_mb <= available_memory
+
+        if can_fit:
+            logger.info(
+                f"Model can fit on GPU {gpu_idx}: needs {estimated_memory_mb}MB, has {available_memory}MB available"
+            )
+        else:
+            logger.warning(
+                f"Model cannot fit on GPU {gpu_idx}: needs {estimated_memory_mb}MB, has {available_memory}MB available"
+            )
+
+        return can_fit
+
     @staticmethod
     def record_metrics(name, op, kwargs):
         record_metrics(name, op, kwargs)
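
Working through the heuristic in _estimate_model_memory_usage for a hypothetical "7B" GGUF model with 4-bit quantization, using exactly the factors hard-coded above:

size_gb = 7.0                     # parsed from "7B"
base = int(size_gb * 1024 * 1.5)  # 10752 MB: ~1.5 GB per billion parameters
base //= 3                        # 3584 MB after the 4-bit discount
base = int(base * 0.8)            # 2867 MB after the GGUF factor
base = int(base * 1.2)            # 3440 MB with the 20% overhead buffer
print(base)                       # 3440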
