From 7cebef6be46fecd2b8732e27999a7b4d16dc9cf3 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Fri, 17 Oct 2025 11:58:08 +0800 Subject: [PATCH 1/5] FEAT: support replicas on single GPU --- xinference/api/restful_api.py | 24 ++- xinference/constants.py | 6 + xinference/core/utils.py | 23 ++- xinference/core/worker.py | 271 ++++++++++++++++++++++++++++++++-- 4 files changed, 311 insertions(+), 13 deletions(-) diff --git a/xinference/api/restful_api.py b/xinference/api/restful_api.py index 84c7b18d80..ff4ecdc226 100644 --- a/xinference/api/restful_api.py +++ b/xinference/api/restful_api.py @@ -1225,11 +1225,29 @@ async def launch_model( if isinstance(gpu_idx, int): gpu_idx = [gpu_idx] - if gpu_idx: - if len(gpu_idx) % replica: + + # Check if single-GPU multi-replica is enabled + from ..constants import XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA + + if XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA: + # Enhanced replica validation with single-GPU multi-replica support + if gpu_idx and len(gpu_idx) > 1 and len(gpu_idx) % replica: + # Only keep the restriction when multiple GPUs are specified + raise HTTPException( + status_code=400, + detail="Invalid input. When using multiple GPUs, the count must be a multiple of replica.", + ) + # Allow single-GPU multi-replica deployment when enabled + if gpu_idx and len(gpu_idx) == 1 and replica > 1: + logger.info( + f"Single-GPU multi-replica deployment enabled: {replica} replicas on 1 GPU" + ) + else: + # Traditional behavior - strict multiple requirement + if gpu_idx and len(gpu_idx) % replica: raise HTTPException( status_code=400, - detail="Invalid input. Allocated gpu must be a multiple of replica.", + detail="Invalid input. Allocated gpu must be a multiple of replica. Set XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA=1 to enable single-GPU multi-replica deployment.", ) if peft_model_config is not None: diff --git a/xinference/constants.py b/xinference/constants.py index 1c12ef331a..457e548cab 100644 --- a/xinference/constants.py +++ b/xinference/constants.py @@ -34,6 +34,9 @@ XINFERENCE_ENV_SSE_PING_ATTEMPTS_SECONDS = "XINFERENCE_SSE_PING_ATTEMPTS_SECONDS" XINFERENCE_ENV_MAX_TOKENS = "XINFERENCE_MAX_TOKENS" XINFERENCE_ENV_ALLOWED_IPS = "XINFERENCE_ALLOWED_IPS" +XINFERENCE_ENV_ENABLE_SINGLE_GPU_MULTI_REPLICA = ( + "XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA" +) def get_xinference_home() -> str: @@ -112,3 +115,6 @@ def get_xinference_home() -> str: else None ) XINFERENCE_ALLOWED_IPS = os.getenv(XINFERENCE_ENV_ALLOWED_IPS) +XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA = bool( + int(os.getenv(XINFERENCE_ENV_ENABLE_SINGLE_GPU_MULTI_REPLICA, "1")) +) # Enable by default diff --git a/xinference/core/utils.py b/xinference/core/utils.py index 7037ff0226..25968cb3f1 100644 --- a/xinference/core/utils.py +++ b/xinference/core/utils.py @@ -250,12 +250,33 @@ def parse_model_version(model_version: str, model_type: str) -> Tuple: def assign_replica_gpu( _replica_model_uid: str, replica: int, gpu_idx: Optional[Union[int, List[int]]] ) -> Optional[List[int]]: + """ + Enhanced GPU assignment for replica models. + Supports single-GPU multi-replica deployment by intelligently allocating GPUs. 
+ """ model_uid, rep_id = parse_replica_model_uid(_replica_model_uid) rep_id, replica = int(rep_id), int(replica) + if isinstance(gpu_idx, int): gpu_idx = [gpu_idx] + if isinstance(gpu_idx, list) and gpu_idx: - return gpu_idx[rep_id::replica] + # When we have enough GPUs for round-robin allocation + if len(gpu_idx) >= replica: + return gpu_idx[rep_id::replica] + else: + # Support single-GPU multi-replica deployment + # All replicas will share the same GPU (or GPUs if more than 1 but less than replica count) + # This allows multiple replicas to run on the same GPU using memory-aware scheduling + if len(gpu_idx) == 1: + # Single GPU case - all replicas use the same GPU + return gpu_idx + else: + # Multiple GPUs but fewer than replicas - distribute as evenly as possible + # This enables better resource utilization + assigned_gpu = gpu_idx[rep_id % len(gpu_idx)] + return [assigned_gpu] + return gpu_idx diff --git a/xinference/core/worker.py b/xinference/core/worker.py index 3a211b19e3..eed809d5cd 100644 --- a/xinference/core/worker.py +++ b/xinference/core/worker.py @@ -143,6 +143,8 @@ def __init__( self._model_uid_to_addr: Dict[str, str] = {} self._model_uid_to_recover_count: Dict[str, Optional[int]] = {} self._model_uid_to_launch_args: Dict[str, Dict] = {} + self._gpu_memory_info: Dict[int, Dict[str, Union[int, float]]] = {} + self._model_memory_usage: Dict[str, int] = {} if XINFERENCE_DISABLE_METRICS: logger.info( @@ -495,22 +497,124 @@ def allocate_devices(self, model_uid: str, n_gpu: int) -> List[int]: break if allocated_non_embedding_rerank_models: user_specified_allocated_devices.add(dev) - allocated_devices = set(self._gpu_to_model_uid.keys()).union( - user_specified_allocated_devices - ) - if n_gpu > len(self._total_gpu_devices) - len(allocated_devices): - raise RuntimeError("No available slot found for the model") - devices: List[int] = [ + # Check for completely available GPUs first + completely_available_gpus = [ dev for dev in self._total_gpu_devices if dev not in self._gpu_to_model_uid and dev not in user_specified_allocated_devices - ][:n_gpu] - for dev in devices: + ] + + if len(completely_available_gpus) >= n_gpu: + # We have enough completely available GPUs + devices = completely_available_gpus[:n_gpu] + for dev in devices: + self._gpu_to_model_uid[int(dev)] = model_uid + logger.info(f"Allocated completely available GPUs: {devices}") + return sorted(devices) + + # Not enough completely available GPUs, try memory-aware allocation + logger.info( + f"Not enough completely available GPUs, trying memory-aware allocation" + ) + + # Initialize memory tracking if not already done + if not self._gpu_memory_info: + self._initialize_gpu_memory_tracking() + + # Try to allocate based on available memory + selected_devices = [] + + # First, use any completely available GPUs + for dev in completely_available_gpus: + selected_devices.append(dev) + self._gpu_to_model_uid[int(dev)] = model_uid + if len(selected_devices) == n_gpu: + break + + # If we still need more GPUs, select those with most available memory + if len(selected_devices) < n_gpu: + remaining_needed = n_gpu - len(selected_devices) + + # Get GPUs sorted by available memory (most available first) + candidate_gpus = [ + dev for dev in self._total_gpu_devices if dev not in selected_devices + ] + + gpu_memory_list = [] + for dev in candidate_gpus: + self._update_gpu_memory_info(dev) + available_memory = self._gpu_memory_info[dev]["available"] + gpu_memory_list.append((dev, available_memory)) + + # Sort by available memory 
(descending) + gpu_memory_list.sort(key=lambda x: x[1], reverse=True) + + # Select GPUs with most available memory + for dev, available_memory in gpu_memory_list[:remaining_needed]: + selected_devices.append(dev) + self._gpu_to_model_uid[int(dev)] = model_uid + logger.info( + f"Selected GPU {dev} with {available_memory}MB available memory" + ) + + if len(selected_devices) != n_gpu: + raise RuntimeError("No available slot found for the model") + + logger.info(f"Allocated GPUs using memory-aware strategy: {selected_devices}") + return sorted(selected_devices) + + def allocate_devices_for_model( + self, + model_uid: str, + model_name: str, + model_size: Union[int, str], + model_format: Optional[str], + quantization: Optional[str], + n_gpu: int = 1, + ) -> List[int]: + """ + Enhanced GPU allocation that considers model memory requirements. + """ + # Estimate memory usage for this model + estimated_memory_mb = self._estimate_model_memory_usage( + model_name, model_size, model_format, quantization + ) + + self._model_memory_usage[model_uid] = estimated_memory_mb + + # Try to find GPUs that can accommodate the model + suitable_gpus = [] + + for gpu_idx in self._total_gpu_devices: + if self._can_fit_model_on_gpu(gpu_idx, estimated_memory_mb): + suitable_gpus.append(gpu_idx) + + if len(suitable_gpus) >= n_gpu: + # We have enough suitable GPUs + selected = suitable_gpus[:n_gpu] + else: + # Not enough GPUs with sufficient memory, but try anyway + logger.warning( + f"Only found {len(suitable_gpus)} GPUs with sufficient memory, proceeding with allocation" + ) + # Use the GPU with most available memory + best_gpu = self._get_gpu_with_most_available_memory() + selected = [best_gpu] + + # Update tracking + for dev in selected: self._gpu_to_model_uid[int(dev)] = model_uid + # Update memory usage tracking + if dev in self._gpu_memory_info: + self._gpu_memory_info[dev]["used"] += estimated_memory_mb + self._gpu_memory_info[dev]["available"] -= estimated_memory_mb - return sorted(devices) + logger.info( + f"Allocated GPUs for model {model_name}: {selected}, estimated memory: {estimated_memory_mb}MB" + ) + return sorted(selected) async def allocate_devices_with_gpu_idx( self, model_uid: str, model_type: str, gpu_idx: List[int] @@ -574,6 +678,30 @@ def release_devices(self, model_uid: str): for model_info in model_infos: self._user_specified_gpu_to_model_uids[dev].remove(model_info) + # Update GPU memory tracking + if model_uid in self._model_memory_usage: + released_memory = self._model_memory_usage[model_uid] + logger.info( + f"Releasing {released_memory}MB of memory for model {model_uid}" + ) + + # Update memory info for all GPUs + for dev in devices: + if dev in self._gpu_memory_info: + self._gpu_memory_info[dev]["used"] = max( + 0, self._gpu_memory_info[dev]["used"] - released_memory + ) + self._gpu_memory_info[dev]["available"] = min( + self._gpu_memory_info[dev]["total"], + self._gpu_memory_info[dev]["available"] + released_memory, + ) + logger.info( + f"Updated GPU {dev} memory tracking: used={self._gpu_memory_info[dev]['used']}MB, available={self._gpu_memory_info[dev]['available']}MB" + ) + + # Remove model from memory usage tracking + del self._model_memory_usage[model_uid] + async def _create_subpool( self, model_uid: str, @@ -1478,6 +1606,131 @@ def update_model_status(self, model_uid: str, **kwargs): def get_model_status(self, model_uid: str): return self._model_uid_to_model_status.get(model_uid) + def _initialize_gpu_memory_tracking(self): + """Initialize GPU memory tracking for all available 
GPUs""" + try: + import pynvml + + pynvml.nvmlInit() + for gpu_idx in self._total_gpu_devices: + handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx) + mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) + self._gpu_memory_info[gpu_idx] = { + "total": mem_info.total // (1024**2), # Convert to MB + "used": mem_info.used // (1024**2), + "available": mem_info.free // (1024**2), + } + logger.info( + f"Initialized GPU memory tracking for {len(self._total_gpu_devices)} GPUs" + ) + except ImportError: + logger.warning("pynvml not available, GPU memory tracking disabled") + # Fallback to basic tracking without actual memory info + for gpu_idx in self._total_gpu_devices: + self._gpu_memory_info[gpu_idx] = {"total": 0, "used": 0, "available": 0} + except Exception as e: + logger.error(f"Failed to initialize GPU memory tracking: {e}") + for gpu_idx in self._total_gpu_devices: + self._gpu_memory_info[gpu_idx] = {"total": 0, "used": 0, "available": 0} + + def _update_gpu_memory_info(self, gpu_idx: int): + """Update memory information for a specific GPU""" + try: + import pynvml + + handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx) + mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) + self._gpu_memory_info[gpu_idx] = { + "total": mem_info.total // (1024**2), + "used": mem_info.used // (1024**2), + "available": mem_info.free // (1024**2), + } + except Exception as e: + logger.debug(f"Failed to update GPU {gpu_idx} memory info: {e}") + + def _get_gpu_with_most_available_memory(self) -> int: + """Find the GPU with the most available memory""" + self._initialize_gpu_memory_tracking() if not self._gpu_memory_info else None + + max_available_gpu = -1 + max_available_memory: Union[int, float] = -1 + + for gpu_idx in self._total_gpu_devices: + self._update_gpu_memory_info(gpu_idx) + available_memory = self._gpu_memory_info[gpu_idx]["available"] + + if available_memory > max_available_memory: + max_available_memory = available_memory + max_available_gpu = gpu_idx + + if max_available_gpu == -1: + raise RuntimeError("No suitable GPU found") + + logger.info( + f"Selected GPU {max_available_gpu} with {max_available_memory}MB available memory" + ) + return max_available_gpu + + def _estimate_model_memory_usage( + self, + model_name: str, + model_size: Union[int, str], + model_format: Optional[str], + quantization: Optional[str], + ) -> int: + """Estimate memory usage for a model based on its characteristics""" + # Basic estimation logic - this can be enhanced with more sophisticated calculations + if isinstance(model_size, str): + # Convert string size like "7B" to integer + if "B" in model_size: + size_gb = float(model_size.replace("B", "")) + else: + size_gb = float(model_size) + else: + size_gb = float(model_size) + + # Base memory estimation (rough calculation) + base_memory_mb = int(size_gb * 1024 * 1.5) # 1.5GB per billion parameters + + # Adjust based on quantization + if quantization: + if "4bit" in quantization.lower() or "4-bit" in quantization.lower(): + base_memory_mb = base_memory_mb // 3 + elif "8bit" in quantization.lower() or "8-bit" in quantization.lower(): + base_memory_mb = base_memory_mb // 2 + + # Adjust based on format + if model_format: + if "gguf" in model_format.lower(): + base_memory_mb = int( + base_memory_mb * 0.8 + ) # GGUF is generally more memory efficient + + # Add some buffer for overhead + base_memory_mb = int(base_memory_mb * 1.2) + + logger.debug(f"Estimated memory usage for {model_name}: {base_memory_mb}MB") + return base_memory_mb + + def _can_fit_model_on_gpu(self, gpu_idx: int, 
estimated_memory_mb: int) -> bool: + """Check if a model can fit on a specific GPU""" + if gpu_idx not in self._gpu_memory_info: + self._update_gpu_memory_info(gpu_idx) + + available_memory = self._gpu_memory_info[gpu_idx]["available"] + can_fit = estimated_memory_mb <= available_memory + + if can_fit: + logger.info( + f"Model can fit on GPU {gpu_idx}: needs {estimated_memory_mb}MB, has {available_memory}MB available" + ) + else: + logger.warning( + f"Model cannot fit on GPU {gpu_idx}: needs {estimated_memory_mb}MB, has {available_memory}MB available" + ) + + return can_fit + @staticmethod def record_metrics(name, op, kwargs): record_metrics(name, op, kwargs) From ecabe2c36539d3876f28d8495148161468c1a35e Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Fri, 17 Oct 2025 14:18:07 +0800 Subject: [PATCH 2/5] Fix CI test for test_worker.py --- xinference/core/tests/test_worker.py | 7 +++++++ xinference/core/worker.py | 7 ++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/xinference/core/tests/test_worker.py b/xinference/core/tests/test_worker.py index d1e06a7ff7..6c8a8785f1 100644 --- a/xinference/core/tests/test_worker.py +++ b/xinference/core/tests/test_worker.py @@ -29,6 +29,13 @@ def __init__( cuda_devices: List[int], ): super().__init__(supervisor_address, main_pool, cuda_devices) + self._gpu_memory_info = {} + for gpu_idx in cuda_devices: + self._gpu_memory_info[gpu_idx] = { + "total": 24000, + "used": 0, + "available": 24000 + } async def __post_create__(self): pass diff --git a/xinference/core/worker.py b/xinference/core/worker.py index eed809d5cd..b21ef8f600 100644 --- a/xinference/core/worker.py +++ b/xinference/core/worker.py @@ -538,8 +538,13 @@ def allocate_devices(self, model_uid: str, n_gpu: int) -> List[int]: remaining_needed = n_gpu - len(selected_devices) # Get GPUs sorted by available memory (most available first) + # Exclude GPUs that are already allocated by user_specified models candidate_gpus = [ - dev for dev in self._total_gpu_devices if dev not in selected_devices + dev + for dev in self._total_gpu_devices + if dev not in selected_devices + and dev not in self._gpu_to_model_uid + and dev not in user_specified_allocated_devices ] gpu_memory_list = [] From 00225f09af999b3cfcd68dd2ab978653b0d4998d Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Fri, 17 Oct 2025 14:21:33 +0800 Subject: [PATCH 3/5] Fix CI test for test_worker.py --- xinference/core/tests/test_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/core/tests/test_worker.py b/xinference/core/tests/test_worker.py index 6c8a8785f1..5bf99620ba 100644 --- a/xinference/core/tests/test_worker.py +++ b/xinference/core/tests/test_worker.py @@ -34,7 +34,7 @@ def __init__( self._gpu_memory_info[gpu_idx] = { "total": 24000, "used": 0, - "available": 24000 + "available": 24000, } async def __post_create__(self): From d4821195c12b1a27af3d3c468fc69b3e42d76e1d Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 20 Oct 2025 16:38:11 +0800 Subject: [PATCH 4/5] add launch doc --- .../zh_CN/LC_MESSAGES/user_guide/launch.po | 109 ++++++++++++++++-- doc/source/user_guide/launch.rst | 36 ++++++ 2 files changed, 134 insertions(+), 11 deletions(-) diff --git a/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po b/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po index bcb925f19c..ac5891ed35 100644 --- a/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po +++ 
b/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: Xinference \n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2025-08-02 23:15+0800\n" +"POT-Creation-Date: 2025-10-20 16:28+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language: zh_CN\n" @@ -17,7 +17,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.14.0\n" +"Generated-By: Babel 2.17.0\n" #: ../../source/user_guide/launch.rst:5 msgid "Model Launching Instructions" @@ -46,11 +46,86 @@ msgstr "" "两张 GPU 上。Xinference 会自动进行负载均衡,确保请求均匀分配到多张卡上。" "用户看到的仍是一个模型,这大大提升了整体资源利用率。" -#: ../../source/user_guide/launch.rst:18 +#: ../../source/user_guide/launch.rst:17 +msgid "Traditional Multi-Instance Deployment:" +msgstr "旧版本多实例部署:" + +#: ../../source/user_guide/launch.rst:19 +msgid "" +"When you have multiple GPU cards, each capable of hosting one model " +"instance, you can set the number of instances equal to the number of " +"GPUs. For example:" +msgstr "当您拥有多张GPU显卡时,每张显卡可承载一个模型实例,此时可将实例数量设置为等于GPU数量。例如:" + +#: ../../source/user_guide/launch.rst:21 +msgid "2 GPUs, 2 instances: Each GPU runs one model instance" +msgstr "2张GPU,2个实例:每张GPU运行一个模型实例" + +#: ../../source/user_guide/launch.rst:22 +msgid "4 GPUs, 4 instances: Each GPU runs one model instance" +msgstr "4张GPU,4个实例:每张GPU运行一个模型实例" + +#: ../../source/user_guide/launch.rst:26 +msgid "Introduce a new environment variable:" +msgstr "引入一个新的环境变量:" + +#: ../../source/user_guide/launch.rst:32 +msgid "" +"Control whether to enable the single GPU multi-copy feature Default " +"value: 1" +msgstr "控制是否启用单GPU多副本功能,默认值:1" + +#: ../../source/user_guide/launch.rst:35 +msgid "New Feature: Smart Replica Deployment" +msgstr "新功能:智能副本部署" + +#: ../../source/user_guide/launch.rst:37 +msgid "Single GPU Multi-Replica" +msgstr "单GPU多副本" + +#: ../../source/user_guide/launch.rst:39 +msgid "New Support: Run multiple model replicas even with just one GPU." +msgstr "新增支持:即使仅有一块GPU,也能运行多个模型副本。" + +#: ../../source/user_guide/launch.rst:41 +msgid "Scenario: You have 1 GPU with sufficient VRAM" +msgstr "场景:您拥有1个GPU且显存充足" + +#: ../../source/user_guide/launch.rst:42 +msgid "Configuration: Replica Count = 3, GPU Count = 1" +msgstr "配置:副本数量=3,GPU数量=1" + +#: ../../source/user_guide/launch.rst:43 +msgid "Result: 3 model instances running on the same GPU, sharing GPU resources" +msgstr "结果:3个模型实例,在同一GPU上运行,共享GPU资源" + +#: ../../source/user_guide/launch.rst:45 +msgid "Hybrid GPU Allocation" +msgstr "混合GPU分配" + +#: ../../source/user_guide/launch.rst:47 +msgid "" +"Smart Allocation: Number of replicas may differ from GPU count; system " +"intelligently distributes" +msgstr "智能分配: 副本数可以不等于GPU数量,系统会智能分配" + +#: ../../source/user_guide/launch.rst:49 +msgid "Scenario: You have 2 GPUs and need 3 replicas" +msgstr "场景: 你有2张GPU,需要3个副本" + +#: ../../source/user_guide/launch.rst:50 +msgid "Configuration: Replicas=3, GPUs=2" +msgstr "配置: 副本数=3,GPU数量=2" + +#: ../../source/user_guide/launch.rst:51 +msgid "Result: GPU0 runs 2 instances, GPU1 runs 1 instance" +msgstr "结果: GPU0运行2个实例,GPU1运行1个实例" + +#: ../../source/user_guide/launch.rst:54 msgid "Set Environment Variables" msgstr "设置环境变量" -#: ../../source/user_guide/launch.rst:22 +#: ../../source/user_guide/launch.rst:58 msgid "" "Sometimes, we want to specify environment variables for a particular " "model at runtime. 
Since v1.8.1, Xinference provides the capability to " @@ -60,21 +135,21 @@ msgstr "" "有时我们希望在运行时为特定模型指定环境变量。从 v1.8.1 开始,Xinference " "提供了单独配置环境变量的功能,无需在启动 Xinference 前设置。" -#: ../../source/user_guide/launch.rst:25 +#: ../../source/user_guide/launch.rst:61 msgid "For Web UI." msgstr "针对 Web UI。" -#: ../../source/user_guide/launch.rst:31 +#: ../../source/user_guide/launch.rst:67 msgid "" "When using the command line, use ``--env`` to specify an environment " "variable." msgstr "命令行使用时,使用 ``--env`` 指定环境变量。" -#: ../../source/user_guide/launch.rst:33 +#: ../../source/user_guide/launch.rst:69 msgid "Example usage:" msgstr "示例用法:" -#: ../../source/user_guide/launch.rst:39 +#: ../../source/user_guide/launch.rst:75 msgid "" "Take vLLM as an example: it has versions V1 and V0, and by default, it " "automatically determines which version to use. If you want to force the " @@ -85,13 +160,25 @@ msgstr "" "在加载模型时强制通过设置 ``VLLM_USE_V1=0`` 来使用 V0,可以指定该环境变量" "。" -#: ../../source/user_guide/launch.rst:43 +#: ../../source/user_guide/launch.rst:79 msgid "Configuring Model Virtual Environment" msgstr "配置模型虚拟空间" -#: ../../source/user_guide/launch.rst:47 +#: ../../source/user_guide/launch.rst:83 msgid "" "For this part, please refer to :ref:`toggling virtual environments and " "customizing dependencies `." -msgstr "对于这部分,请参考 :ref:`开关虚拟空间和定制依赖 `。" +msgstr "" +"对于这部分,请参考 :ref:`开关虚拟空间和定制依赖 `。" + +#~ msgid "" +#~ "Scenario: You have 2 GPUs and need" +#~ " 3 replicas Configuration: Replicas=3, " +#~ "GPUs=2 Result: GPU0 runs 2 instances," +#~ " GPU1 runs 1 instance" +#~ msgstr "" +#~ "场景: 你有2张GPU,需要3个副本" +#~ "配置: 副本数=3,GPU数量=2结果:" +#~ " GPU0运行2个实例,GPU1运行1个实例" diff --git a/doc/source/user_guide/launch.rst b/doc/source/user_guide/launch.rst index aac59bc321..cabf3c29cf 100644 --- a/doc/source/user_guide/launch.rst +++ b/doc/source/user_guide/launch.rst @@ -14,6 +14,42 @@ you can set the replica count to 2. This way, two identical instances of the mod Xinference automatically load-balances requests to ensure even distribution across multiple GPUs. Meanwhile, users see it as a single model, which greatly improves overall resource utilization. +Traditional Multi-Instance Deployment: + +When you have multiple GPU cards, each capable of hosting one model instance, you can set the number of instances equal to the number of GPUs. For example: + +- 2 GPUs, 2 instances: Each GPU runs one model instance +- 4 GPUs, 4 instances: Each GPU runs one model instance + +.. versionadded:: v1.11.1 + +Introduce a new environment variable: + +.. code-block:: bash + + XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA + +Control whether to enable the single GPU multi-copy feature +Default value: 1 + +New Feature: Smart Replica Deployment + +1. Single GPU Multi-Replica + +New Support: Run multiple model replicas even with just one GPU. + +- Scenario: You have 1 GPU with sufficient VRAM +- Configuration: Replica Count = 3, GPU Count = 1 +- Result: 3 model instances running on the same GPU, sharing GPU resources + +2. 
Hybrid GPU Allocation + +Smart Allocation: Number of replicas may differ from GPU count; system intelligently distributes + +- Scenario: You have 2 GPUs and need 3 replicas +- Configuration: Replicas=3, GPUs=2 +- Result: GPU0 runs 2 instances, GPU1 runs 1 instance + Set Environment Variables ========================= From 9afaf2c932260fe2034a817b73ad5c76e5c02cb3 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Wed, 22 Oct 2025 14:14:01 +0800 Subject: [PATCH 5/5] Supplementary Doc --- .../locale/zh_CN/LC_MESSAGES/user_guide/launch.po | 10 ---------- doc/source/user_guide/launch.rst | 2 +- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po b/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po index ac5891ed35..ff9199818a 100644 --- a/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po +++ b/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po @@ -172,13 +172,3 @@ msgstr "" "对于这部分,请参考 :ref:`开关虚拟空间和定制依赖 `。" -#~ msgid "" -#~ "Scenario: You have 2 GPUs and need" -#~ " 3 replicas Configuration: Replicas=3, " -#~ "GPUs=2 Result: GPU0 runs 2 instances," -#~ " GPU1 runs 1 instance" -#~ msgstr "" -#~ "场景: 你有2张GPU,需要3个副本" -#~ "配置: 副本数=3,GPU数量=2结果:" -#~ " GPU0运行2个实例,GPU1运行1个实例" - diff --git a/doc/source/user_guide/launch.rst b/doc/source/user_guide/launch.rst index cabf3c29cf..062a63c47b 100644 --- a/doc/source/user_guide/launch.rst +++ b/doc/source/user_guide/launch.rst @@ -21,7 +21,7 @@ When you have multiple GPU cards, each capable of hosting one model instance, yo - 2 GPUs, 2 instances: Each GPU runs one model instance - 4 GPUs, 4 instances: Each GPU runs one model instance -.. versionadded:: v1.11.1 +.. versionadded:: v1.12.0 Introduce a new environment variable:
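
---

Usage note for this patch series. Below is a minimal sketch of launching multiple replicas on a single GPU through the Python client. It assumes that `RESTfulClient.launch_model` forwards `replica` and `gpu_idx` to the `/v1/models` endpoint patched in `restful_api.py`; the model name and server address are placeholders, so adjust them to your deployment.

```python
# Minimal sketch: launching 3 replicas of one model on a single GPU via the
# Python client. Assumes RESTfulClient.launch_model forwards `replica` and
# `gpu_idx` to the launch endpoint patched in restful_api.py; the model name
# and server address are placeholders.
#
# XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA=1 (the default) must be set in
# the environment of the Xinference *server*, not of this client process.
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="qwen2.5-instruct",  # placeholder model
    model_type="LLM",
    replica=3,    # three replicas ...
    gpu_idx=[0],  # ... all sharing GPU 0
)
print(model_uid)
```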
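Second, a standalone sketch of the replica-to-GPU distribution implemented by `assign_replica_gpu` in `xinference/core/utils.py`, reproduced outside the actor machinery purely to illustrate the hybrid-allocation behaviour documented above (3 replicas on 2 GPUs: GPU 0 runs two instances, GPU 1 runs one).

```python
# Standalone sketch of the replica-to-GPU assignment introduced in PATCH 1/5
# (see assign_replica_gpu in xinference/core/utils.py). Illustrative only.
from typing import List, Optional, Union


def assign_replica_gpu_sketch(
    rep_id: int, replica: int, gpu_idx: Optional[Union[int, List[int]]]
) -> Optional[List[int]]:
    if isinstance(gpu_idx, int):
        gpu_idx = [gpu_idx]
    if isinstance(gpu_idx, list) and gpu_idx:
        if len(gpu_idx) >= replica:
            # Enough GPUs: classic round-robin slice, one group per replica.
            return gpu_idx[rep_id::replica]
        if len(gpu_idx) == 1:
            # Single GPU: every replica shares it.
            return gpu_idx
        # Fewer GPUs than replicas: spread replicas as evenly as possible.
        return [gpu_idx[rep_id % len(gpu_idx)]]
    return gpu_idx


# 3 replicas on 2 GPUs -> replicas 0 and 2 land on GPU 0, replica 1 on GPU 1.
print([assign_replica_gpu_sketch(r, 3, [0, 1]) for r in range(3)])
# [[0], [1], [0]]
```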