From 7cebef6be46fecd2b8732e27999a7b4d16dc9cf3 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Fri, 17 Oct 2025 11:58:08 +0800 Subject: [PATCH 1/5] FEAT: support replicas on single GPU --- xinference/api/restful_api.py | 24 ++- xinference/constants.py | 6 + xinference/core/utils.py | 23 ++- xinference/core/worker.py | 271 ++++++++++++++++++++++++++++++++-- 4 files changed, 311 insertions(+), 13 deletions(-) diff --git a/xinference/api/restful_api.py b/xinference/api/restful_api.py index 84c7b18d80..ff4ecdc226 100644 --- a/xinference/api/restful_api.py +++ b/xinference/api/restful_api.py @@ -1225,11 +1225,29 @@ async def launch_model( if isinstance(gpu_idx, int): gpu_idx = [gpu_idx] - if gpu_idx: - if len(gpu_idx) % replica: + + # Check if single-GPU multi-replica is enabled + from ..constants import XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA + + if XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA: + # Enhanced replica validation with single-GPU multi-replica support + if gpu_idx and len(gpu_idx) > 1 and len(gpu_idx) % replica: + # Only keep the restriction when multiple GPUs are specified + raise HTTPException( + status_code=400, + detail="Invalid input. When using multiple GPUs, the count must be a multiple of replica.", + ) + # Allow single-GPU multi-replica deployment when enabled + if gpu_idx and len(gpu_idx) == 1 and replica > 1: + logger.info( + f"Single-GPU multi-replica deployment enabled: {replica} replicas on 1 GPU" + ) + else: + # Traditional behavior - strict multiple requirement + if gpu_idx and len(gpu_idx) % replica: raise HTTPException( status_code=400, - detail="Invalid input. Allocated gpu must be a multiple of replica.", + detail="Invalid input. Allocated gpu must be a multiple of replica. Set XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA=1 to enable single-GPU multi-replica deployment.", ) if peft_model_config is not None: diff --git a/xinference/constants.py b/xinference/constants.py index 1c12ef331a..457e548cab 100644 --- a/xinference/constants.py +++ b/xinference/constants.py @@ -34,6 +34,9 @@ XINFERENCE_ENV_SSE_PING_ATTEMPTS_SECONDS = "XINFERENCE_SSE_PING_ATTEMPTS_SECONDS" XINFERENCE_ENV_MAX_TOKENS = "XINFERENCE_MAX_TOKENS" XINFERENCE_ENV_ALLOWED_IPS = "XINFERENCE_ALLOWED_IPS" +XINFERENCE_ENV_ENABLE_SINGLE_GPU_MULTI_REPLICA = ( + "XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA" +) def get_xinference_home() -> str: @@ -112,3 +115,6 @@ def get_xinference_home() -> str: else None ) XINFERENCE_ALLOWED_IPS = os.getenv(XINFERENCE_ENV_ALLOWED_IPS) +XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA = bool( + int(os.getenv(XINFERENCE_ENV_ENABLE_SINGLE_GPU_MULTI_REPLICA, "1")) +) # Enable by default diff --git a/xinference/core/utils.py b/xinference/core/utils.py index 7037ff0226..25968cb3f1 100644 --- a/xinference/core/utils.py +++ b/xinference/core/utils.py @@ -250,12 +250,33 @@ def parse_model_version(model_version: str, model_type: str) -> Tuple: def assign_replica_gpu( _replica_model_uid: str, replica: int, gpu_idx: Optional[Union[int, List[int]]] ) -> Optional[List[int]]: + """ + Enhanced GPU assignment for replica models. + Supports single-GPU multi-replica deployment by intelligently allocating GPUs. 
+ """ model_uid, rep_id = parse_replica_model_uid(_replica_model_uid) rep_id, replica = int(rep_id), int(replica) + if isinstance(gpu_idx, int): gpu_idx = [gpu_idx] + if isinstance(gpu_idx, list) and gpu_idx: - return gpu_idx[rep_id::replica] + # When we have enough GPUs for round-robin allocation + if len(gpu_idx) >= replica: + return gpu_idx[rep_id::replica] + else: + # Support single-GPU multi-replica deployment + # All replicas will share the same GPU (or GPUs if more than 1 but less than replica count) + # This allows multiple replicas to run on the same GPU using memory-aware scheduling + if len(gpu_idx) == 1: + # Single GPU case - all replicas use the same GPU + return gpu_idx + else: + # Multiple GPUs but fewer than replicas - distribute as evenly as possible + # This enables better resource utilization + assigned_gpu = gpu_idx[rep_id % len(gpu_idx)] + return [assigned_gpu] + return gpu_idx diff --git a/xinference/core/worker.py b/xinference/core/worker.py index 3a211b19e3..eed809d5cd 100644 --- a/xinference/core/worker.py +++ b/xinference/core/worker.py @@ -143,6 +143,8 @@ def __init__( self._model_uid_to_addr: Dict[str, str] = {} self._model_uid_to_recover_count: Dict[str, Optional[int]] = {} self._model_uid_to_launch_args: Dict[str, Dict] = {} + self._gpu_memory_info: Dict[int, Dict[str, Union[int, float]]] = {} + self._model_memory_usage: Dict[str, int] = {} if XINFERENCE_DISABLE_METRICS: logger.info( @@ -495,22 +497,124 @@ def allocate_devices(self, model_uid: str, n_gpu: int) -> List[int]: break if allocated_non_embedding_rerank_models: user_specified_allocated_devices.add(dev) - allocated_devices = set(self._gpu_to_model_uid.keys()).union( - user_specified_allocated_devices - ) - if n_gpu > len(self._total_gpu_devices) - len(allocated_devices): - raise RuntimeError("No available slot found for the model") - devices: List[int] = [ + # Check for completely available GPUs first + completely_available_gpus = [ dev for dev in self._total_gpu_devices if dev not in self._gpu_to_model_uid and dev not in user_specified_allocated_devices - ][:n_gpu] - for dev in devices: + ] + + if len(completely_available_gpus) >= n_gpu: + # We have enough completely available GPUs + devices = completely_available_gpus[:n_gpu] + for dev in devices: + self._gpu_to_model_uid[int(dev)] = model_uid + logger.info(f"Allocated completely available GPUs: {devices}") + return sorted(devices) + + # Not enough completely available GPUs, try memory-aware allocation + logger.info( + f"Not enough completely available GPUs, trying memory-aware allocation" + ) + + # Initialize memory tracking if not already done + if not self._gpu_memory_info: + self._initialize_gpu_memory_tracking() + + # Try to allocate based on available memory + selected_devices = [] + + # First, use any completely available GPUs + for dev in completely_available_gpus: + selected_devices.append(dev) + self._gpu_to_model_uid[int(dev)] = model_uid + if len(selected_devices) == n_gpu: + break + + # If we still need more GPUs, select those with most available memory + if len(selected_devices) < n_gpu: + remaining_needed = n_gpu - len(selected_devices) + + # Get GPUs sorted by available memory (most available first) + candidate_gpus = [ + dev for dev in self._total_gpu_devices if dev not in selected_devices + ] + + gpu_memory_list = [] + for dev in candidate_gpus: + self._update_gpu_memory_info(dev) + available_memory = self._gpu_memory_info[dev]["available"] + gpu_memory_list.append((dev, available_memory)) + + # Sort by available memory 
(descending) + gpu_memory_list.sort(key=lambda x: x[1], reverse=True) + + # Select GPUs with most available memory + for dev, available_memory in gpu_memory_list[:remaining_needed]: + selected_devices.append(dev) + self._gpu_to_model_uid[int(dev)] = model_uid + logger.info( + f"Selected GPU {dev} with {available_memory}MB available memory" + ) + + if len(selected_devices) != n_gpu: + raise RuntimeError("No available slot found for the model") + + logger.info(f"Allocated GPUs using memory-aware strategy: {selected_devices}") + return sorted(selected_devices) + + def allocate_devices_for_model( + self, + model_uid: str, + model_name: str, + model_size: Union[int, str], + model_format: Optional[str], + quantization: Optional[str], + n_gpu: int = 1, + ) -> List[int]: + """ + Enhanced GPU allocation that considers model memory requirements. + """ + # Estimate memory usage for this model + estimated_memory_mb = self._estimate_model_memory_usage( + model_name, model_size, model_format, quantization + ) + + self._model_memory_usage[model_uid] = estimated_memory_mb + + # Try to find GPUs that can accommodate the model + suitable_gpus = [] + + for gpu_idx in self._total_gpu_devices: + if self._can_fit_model_on_gpu(gpu_idx, estimated_memory_mb): + suitable_gpus.append(gpu_idx) + + if len(suitable_gpus) >= n_gpu: + # We have enough suitable GPUs + selected = suitable_gpus[:n_gpu] + else: + # Not enough GPUs with sufficient memory, but try anyway + logger.warning( + f"Only found {len(suitable_gpus)} GPUs with sufficient memory, proceeding with allocation" + ) + # Use the GPU with most available memory + best_gpu = self._get_gpu_with_most_available_memory() + selected = [best_gpu] + + # Update tracking + for dev in selected: self._gpu_to_model_uid[int(dev)] = model_uid + # Update memory usage tracking + if dev in self._gpu_memory_info: + self._gpu_memory_info[dev]["used"] += estimated_memory_mb + self._gpu_memory_info[dev]["available"] -= estimated_memory_mb - return sorted(devices) + logger.info( + f"Allocated GPUs for model {model_name}: {selected}, estimated memory: {estimated_memory_mb}MB" + ) + return sorted(selected) async def allocate_devices_with_gpu_idx( self, model_uid: str, model_type: str, gpu_idx: List[int] @@ -574,6 +678,30 @@ def release_devices(self, model_uid: str): for model_info in model_infos: self._user_specified_gpu_to_model_uids[dev].remove(model_info) + # Update GPU memory tracking + if model_uid in self._model_memory_usage: + released_memory = self._model_memory_usage[model_uid] + logger.info( + f"Releasing {released_memory}MB of memory for model {model_uid}" + ) + + # Update memory info for all GPUs + for dev in devices: + if dev in self._gpu_memory_info: + self._gpu_memory_info[dev]["used"] = max( + 0, self._gpu_memory_info[dev]["used"] - released_memory + ) + self._gpu_memory_info[dev]["available"] = min( + self._gpu_memory_info[dev]["total"], + self._gpu_memory_info[dev]["available"] + released_memory, + ) + logger.info( + f"Updated GPU {dev} memory tracking: used={self._gpu_memory_info[dev]['used']}MB, available={self._gpu_memory_info[dev]['available']}MB" + ) + + # Remove model from memory usage tracking + del self._model_memory_usage[model_uid] + async def _create_subpool( self, model_uid: str, @@ -1478,6 +1606,131 @@ def update_model_status(self, model_uid: str, **kwargs): def get_model_status(self, model_uid: str): return self._model_uid_to_model_status.get(model_uid) + def _initialize_gpu_memory_tracking(self): + """Initialize GPU memory tracking for all available 
GPUs""" + try: + import pynvml + + pynvml.nvmlInit() + for gpu_idx in self._total_gpu_devices: + handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx) + mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) + self._gpu_memory_info[gpu_idx] = { + "total": mem_info.total // (1024**2), # Convert to MB + "used": mem_info.used // (1024**2), + "available": mem_info.free // (1024**2), + } + logger.info( + f"Initialized GPU memory tracking for {len(self._total_gpu_devices)} GPUs" + ) + except ImportError: + logger.warning("pynvml not available, GPU memory tracking disabled") + # Fallback to basic tracking without actual memory info + for gpu_idx in self._total_gpu_devices: + self._gpu_memory_info[gpu_idx] = {"total": 0, "used": 0, "available": 0} + except Exception as e: + logger.error(f"Failed to initialize GPU memory tracking: {e}") + for gpu_idx in self._total_gpu_devices: + self._gpu_memory_info[gpu_idx] = {"total": 0, "used": 0, "available": 0} + + def _update_gpu_memory_info(self, gpu_idx: int): + """Update memory information for a specific GPU""" + try: + import pynvml + + handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_idx) + mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) + self._gpu_memory_info[gpu_idx] = { + "total": mem_info.total // (1024**2), + "used": mem_info.used // (1024**2), + "available": mem_info.free // (1024**2), + } + except Exception as e: + logger.debug(f"Failed to update GPU {gpu_idx} memory info: {e}") + + def _get_gpu_with_most_available_memory(self) -> int: + """Find the GPU with the most available memory""" + self._initialize_gpu_memory_tracking() if not self._gpu_memory_info else None + + max_available_gpu = -1 + max_available_memory: Union[int, float] = -1 + + for gpu_idx in self._total_gpu_devices: + self._update_gpu_memory_info(gpu_idx) + available_memory = self._gpu_memory_info[gpu_idx]["available"] + + if available_memory > max_available_memory: + max_available_memory = available_memory + max_available_gpu = gpu_idx + + if max_available_gpu == -1: + raise RuntimeError("No suitable GPU found") + + logger.info( + f"Selected GPU {max_available_gpu} with {max_available_memory}MB available memory" + ) + return max_available_gpu + + def _estimate_model_memory_usage( + self, + model_name: str, + model_size: Union[int, str], + model_format: Optional[str], + quantization: Optional[str], + ) -> int: + """Estimate memory usage for a model based on its characteristics""" + # Basic estimation logic - this can be enhanced with more sophisticated calculations + if isinstance(model_size, str): + # Convert string size like "7B" to integer + if "B" in model_size: + size_gb = float(model_size.replace("B", "")) + else: + size_gb = float(model_size) + else: + size_gb = float(model_size) + + # Base memory estimation (rough calculation) + base_memory_mb = int(size_gb * 1024 * 1.5) # 1.5GB per billion parameters + + # Adjust based on quantization + if quantization: + if "4bit" in quantization.lower() or "4-bit" in quantization.lower(): + base_memory_mb = base_memory_mb // 3 + elif "8bit" in quantization.lower() or "8-bit" in quantization.lower(): + base_memory_mb = base_memory_mb // 2 + + # Adjust based on format + if model_format: + if "gguf" in model_format.lower(): + base_memory_mb = int( + base_memory_mb * 0.8 + ) # GGUF is generally more memory efficient + + # Add some buffer for overhead + base_memory_mb = int(base_memory_mb * 1.2) + + logger.debug(f"Estimated memory usage for {model_name}: {base_memory_mb}MB") + return base_memory_mb + + def _can_fit_model_on_gpu(self, gpu_idx: int, 
estimated_memory_mb: int) -> bool: + """Check if a model can fit on a specific GPU""" + if gpu_idx not in self._gpu_memory_info: + self._update_gpu_memory_info(gpu_idx) + + available_memory = self._gpu_memory_info[gpu_idx]["available"] + can_fit = estimated_memory_mb <= available_memory + + if can_fit: + logger.info( + f"Model can fit on GPU {gpu_idx}: needs {estimated_memory_mb}MB, has {available_memory}MB available" + ) + else: + logger.warning( + f"Model cannot fit on GPU {gpu_idx}: needs {estimated_memory_mb}MB, has {available_memory}MB available" + ) + + return can_fit + @staticmethod def record_metrics(name, op, kwargs): record_metrics(name, op, kwargs) From ecabe2c36539d3876f28d8495148161468c1a35e Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Fri, 17 Oct 2025 14:18:07 +0800 Subject: [PATCH 2/5] Fix CI test for test_worker.py --- xinference/core/tests/test_worker.py | 7 +++++++ xinference/core/worker.py | 7 ++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/xinference/core/tests/test_worker.py b/xinference/core/tests/test_worker.py index d1e06a7ff7..6c8a8785f1 100644 --- a/xinference/core/tests/test_worker.py +++ b/xinference/core/tests/test_worker.py @@ -29,6 +29,13 @@ def __init__( cuda_devices: List[int], ): super().__init__(supervisor_address, main_pool, cuda_devices) + self._gpu_memory_info = {} + for gpu_idx in cuda_devices: + self._gpu_memory_info[gpu_idx] = { + "total": 24000, + "used": 0, + "available": 24000 + } async def __post_create__(self): pass diff --git a/xinference/core/worker.py b/xinference/core/worker.py index eed809d5cd..b21ef8f600 100644 --- a/xinference/core/worker.py +++ b/xinference/core/worker.py @@ -538,8 +538,13 @@ def allocate_devices(self, model_uid: str, n_gpu: int) -> List[int]: remaining_needed = n_gpu - len(selected_devices) # Get GPUs sorted by available memory (most available first) + # Exclude GPUs that are already allocated by user_specified models candidate_gpus = [ - dev for dev in self._total_gpu_devices if dev not in selected_devices + dev + for dev in self._total_gpu_devices + if dev not in selected_devices + and dev not in self._gpu_to_model_uid + and dev not in user_specified_allocated_devices ] gpu_memory_list = [] From 00225f09af999b3cfcd68dd2ab978653b0d4998d Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Fri, 17 Oct 2025 14:21:33 +0800 Subject: [PATCH 3/5] Fix CI test for test_worker.py --- xinference/core/tests/test_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/core/tests/test_worker.py b/xinference/core/tests/test_worker.py index 6c8a8785f1..5bf99620ba 100644 --- a/xinference/core/tests/test_worker.py +++ b/xinference/core/tests/test_worker.py @@ -34,7 +34,7 @@ def __init__( self._gpu_memory_info[gpu_idx] = { "total": 24000, "used": 0, - "available": 24000 + "available": 24000, } async def __post_create__(self): From d4821195c12b1a27af3d3c468fc69b3e42d76e1d Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Mon, 20 Oct 2025 16:38:11 +0800 Subject: [PATCH 4/5] add launch doc --- .../zh_CN/LC_MESSAGES/user_guide/launch.po | 109 ++++++++++++++++-- doc/source/user_guide/launch.rst | 36 ++++++ 2 files changed, 134 insertions(+), 11 deletions(-) diff --git a/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po b/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po index bcb925f19c..ac5891ed35 100644 --- a/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po +++ 
b/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: Xinference \n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2025-08-02 23:15+0800\n" +"POT-Creation-Date: 2025-10-20 16:28+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language: zh_CN\n" @@ -17,7 +17,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.14.0\n" +"Generated-By: Babel 2.17.0\n" #: ../../source/user_guide/launch.rst:5 msgid "Model Launching Instructions" @@ -46,11 +46,86 @@ msgstr "" "两张 GPU 上。Xinference 会自动进行负载均衡,确保请求均匀分配到多张卡上。" "用户看到的仍是一个模型,这大大提升了整体资源利用率。" -#: ../../source/user_guide/launch.rst:18 +#: ../../source/user_guide/launch.rst:17 +msgid "Traditional Multi-Instance Deployment:" +msgstr "旧版本多实例部署:" + +#: ../../source/user_guide/launch.rst:19 +msgid "" +"When you have multiple GPU cards, each capable of hosting one model " +"instance, you can set the number of instances equal to the number of " +"GPUs. For example:" +msgstr "当您拥有多张GPU显卡时,每张显卡可承载一个模型实例,此时可将实例数量设置为等于GPU数量。例如:" + +#: ../../source/user_guide/launch.rst:21 +msgid "2 GPUs, 2 instances: Each GPU runs one model instance" +msgstr "2张GPU,2个实例:每张GPU运行一个模型实例" + +#: ../../source/user_guide/launch.rst:22 +msgid "4 GPUs, 4 instances: Each GPU runs one model instance" +msgstr "4张GPU,4个实例:每张GPU运行一个模型实例" + +#: ../../source/user_guide/launch.rst:26 +msgid "Introduce a new environment variable:" +msgstr "引入一个新的环境变量:" + +#: ../../source/user_guide/launch.rst:32 +msgid "" +"Control whether to enable the single GPU multi-copy feature Default " +"value: 1" +msgstr "控制是否启用单GPU多副本功能,默认值:1" + +#: ../../source/user_guide/launch.rst:35 +msgid "New Feature: Smart Replica Deployment" +msgstr "新功能:智能副本部署" + +#: ../../source/user_guide/launch.rst:37 +msgid "Single GPU Multi-Replica" +msgstr "单GPU多副本" + +#: ../../source/user_guide/launch.rst:39 +msgid "New Support: Run multiple model replicas even with just one GPU." +msgstr "新增支持:即使仅有一块GPU,也能运行多个模型副本。" + +#: ../../source/user_guide/launch.rst:41 +msgid "Scenario: You have 1 GPU with sufficient VRAM" +msgstr "场景:您拥有1个GPU且显存充足" + +#: ../../source/user_guide/launch.rst:42 +msgid "Configuration: Replica Count = 3, GPU Count = 1" +msgstr "配置:副本数量=3,GPU数量=1" + +#: ../../source/user_guide/launch.rst:43 +msgid "Result: 3 model instances running on the same GPU, sharing GPU resources" +msgstr "结果:3个模型实例,在同一GPU上运行,共享GPU资源" + +#: ../../source/user_guide/launch.rst:45 +msgid "Hybrid GPU Allocation" +msgstr "混合GPU分配" + +#: ../../source/user_guide/launch.rst:47 +msgid "" +"Smart Allocation: Number of replicas may differ from GPU count; system " +"intelligently distributes" +msgstr "智能分配: 副本数可以不等于GPU数量,系统会智能分配" + +#: ../../source/user_guide/launch.rst:49 +msgid "Scenario: You have 2 GPUs and need 3 replicas" +msgstr "场景: 你有2张GPU,需要3个副本" + +#: ../../source/user_guide/launch.rst:50 +msgid "Configuration: Replicas=3, GPUs=2" +msgstr "配置: 副本数=3,GPU数量=2" + +#: ../../source/user_guide/launch.rst:51 +msgid "Result: GPU0 runs 2 instances, GPU1 runs 1 instance" +msgstr "结果: GPU0运行2个实例,GPU1运行1个实例" + +#: ../../source/user_guide/launch.rst:54 msgid "Set Environment Variables" msgstr "设置环境变量" -#: ../../source/user_guide/launch.rst:22 +#: ../../source/user_guide/launch.rst:58 msgid "" "Sometimes, we want to specify environment variables for a particular " "model at runtime. 
Since v1.8.1, Xinference provides the capability to " @@ -60,21 +135,21 @@ msgstr "" "有时我们希望在运行时为特定模型指定环境变量。从 v1.8.1 开始,Xinference " "提供了单独配置环境变量的功能,无需在启动 Xinference 前设置。" -#: ../../source/user_guide/launch.rst:25 +#: ../../source/user_guide/launch.rst:61 msgid "For Web UI." msgstr "针对 Web UI。" -#: ../../source/user_guide/launch.rst:31 +#: ../../source/user_guide/launch.rst:67 msgid "" "When using the command line, use ``--env`` to specify an environment " "variable." msgstr "命令行使用时,使用 ``--env`` 指定环境变量。" -#: ../../source/user_guide/launch.rst:33 +#: ../../source/user_guide/launch.rst:69 msgid "Example usage:" msgstr "示例用法:" -#: ../../source/user_guide/launch.rst:39 +#: ../../source/user_guide/launch.rst:75 msgid "" "Take vLLM as an example: it has versions V1 and V0, and by default, it " "automatically determines which version to use. If you want to force the " @@ -85,13 +160,25 @@ msgstr "" "在加载模型时强制通过设置 ``VLLM_USE_V1=0`` 来使用 V0,可以指定该环境变量" "。" -#: ../../source/user_guide/launch.rst:43 +#: ../../source/user_guide/launch.rst:79 msgid "Configuring Model Virtual Environment" msgstr "配置模型虚拟空间" -#: ../../source/user_guide/launch.rst:47 +#: ../../source/user_guide/launch.rst:83 msgid "" "For this part, please refer to :ref:`toggling virtual environments and " "customizing dependencies `." -msgstr "对于这部分,请参考 :ref:`开关虚拟空间和定制依赖 `。" +msgstr "" +"对于这部分,请参考 :ref:`开关虚拟空间和定制依赖 `。" + +#~ msgid "" +#~ "Scenario: You have 2 GPUs and need" +#~ " 3 replicas Configuration: Replicas=3, " +#~ "GPUs=2 Result: GPU0 runs 2 instances," +#~ " GPU1 runs 1 instance" +#~ msgstr "" +#~ "场景: 你有2张GPU,需要3个副本" +#~ "配置: 副本数=3,GPU数量=2结果:" +#~ " GPU0运行2个实例,GPU1运行1个实例" diff --git a/doc/source/user_guide/launch.rst b/doc/source/user_guide/launch.rst index aac59bc321..cabf3c29cf 100644 --- a/doc/source/user_guide/launch.rst +++ b/doc/source/user_guide/launch.rst @@ -14,6 +14,42 @@ you can set the replica count to 2. This way, two identical instances of the mod Xinference automatically load-balances requests to ensure even distribution across multiple GPUs. Meanwhile, users see it as a single model, which greatly improves overall resource utilization. +Traditional Multi-Instance Deployment: + +When you have multiple GPU cards, each capable of hosting one model instance, you can set the number of instances equal to the number of GPUs. For example: + +- 2 GPUs, 2 instances: Each GPU runs one model instance +- 4 GPUs, 4 instances: Each GPU runs one model instance + +.. versionadded:: v1.11.1 + +Introduce a new environment variable: + +.. code-block:: bash + + XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA + +Control whether to enable the single GPU multi-copy feature +Default value: 1 + +New Feature: Smart Replica Deployment + +1. Single GPU Multi-Replica + +New Support: Run multiple model replicas even with just one GPU. + +- Scenario: You have 1 GPU with sufficient VRAM +- Configuration: Replica Count = 3, GPU Count = 1 +- Result: 3 model instances running on the same GPU, sharing GPU resources + +2. 
Hybrid GPU Allocation + +Smart Allocation: Number of replicas may differ from GPU count; system intelligently distributes + +- Scenario: You have 2 GPUs and need 3 replicas +- Configuration: Replicas=3, GPUs=2 +- Result: GPU0 runs 2 instances, GPU1 runs 1 instance + Set Environment Variables ========================= From 9afaf2c932260fe2034a817b73ad5c76e5c02cb3 Mon Sep 17 00:00:00 2001 From: OliverBryant <2713999266@qq.com> Date: Wed, 22 Oct 2025 14:14:01 +0800 Subject: [PATCH 5/5] Supplementary Doc --- .../locale/zh_CN/LC_MESSAGES/user_guide/launch.po | 10 ---------- doc/source/user_guide/launch.rst | 2 +- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po b/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po index ac5891ed35..ff9199818a 100644 --- a/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po +++ b/doc/source/locale/zh_CN/LC_MESSAGES/user_guide/launch.po @@ -172,13 +172,3 @@ msgstr "" "对于这部分,请参考 :ref:`开关虚拟空间和定制依赖 `。" -#~ msgid "" -#~ "Scenario: You have 2 GPUs and need" -#~ " 3 replicas Configuration: Replicas=3, " -#~ "GPUs=2 Result: GPU0 runs 2 instances," -#~ " GPU1 runs 1 instance" -#~ msgstr "" -#~ "场景: 你有2张GPU,需要3个副本" -#~ "配置: 副本数=3,GPU数量=2结果:" -#~ " GPU0运行2个实例,GPU1运行1个实例" - diff --git a/doc/source/user_guide/launch.rst b/doc/source/user_guide/launch.rst index cabf3c29cf..062a63c47b 100644 --- a/doc/source/user_guide/launch.rst +++ b/doc/source/user_guide/launch.rst @@ -21,7 +21,7 @@ When you have multiple GPU cards, each capable of hosting one model instance, yo - 2 GPUs, 2 instances: Each GPU runs one model instance - 4 GPUs, 4 instances: Each GPU runs one model instance -.. versionadded:: v1.11.1 +.. versionadded:: v1.12.0 Introduce a new environment variable:
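
---

Usage note for this patch series. Below is a minimal sketch of launching multiple replicas on a single GPU through the Python client. It assumes that `RESTfulClient.launch_model` forwards `replica` and `gpu_idx` to the `/v1/models` endpoint patched in `restful_api.py`; the model name and server address are placeholders, so adjust them to your deployment.

```python
# Minimal sketch: launching 3 replicas of one model on a single GPU via the
# Python client. Assumes RESTfulClient.launch_model forwards `replica` and
# `gpu_idx` to the launch endpoint patched in restful_api.py; the model name
# and server address are placeholders.
#
# XINFERENCE_ENABLE_SINGLE_GPU_MULTI_REPLICA=1 (the default) must be set in
# the environment of the Xinference *server*, not of this client process.
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="qwen2.5-instruct",  # placeholder model
    model_type="LLM",
    replica=3,    # three replicas ...
    gpu_idx=[0],  # ... all sharing GPU 0
)
print(model_uid)
```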
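Second, a standalone sketch of the replica-to-GPU distribution implemented by `assign_replica_gpu` in `xinference/core/utils.py`, reproduced outside the actor machinery purely to illustrate the hybrid-allocation behaviour documented above (3 replicas on 2 GPUs: GPU 0 runs two instances, GPU 1 runs one).

```python
# Standalone sketch of the replica-to-GPU assignment introduced in PATCH 1/5
# (see assign_replica_gpu in xinference/core/utils.py). Illustrative only.
from typing import List, Optional, Union


def assign_replica_gpu_sketch(
    rep_id: int, replica: int, gpu_idx: Optional[Union[int, List[int]]]
) -> Optional[List[int]]:
    if isinstance(gpu_idx, int):
        gpu_idx = [gpu_idx]
    if isinstance(gpu_idx, list) and gpu_idx:
        if len(gpu_idx) >= replica:
            # Enough GPUs: classic round-robin slice, one group per replica.
            return gpu_idx[rep_id::replica]
        if len(gpu_idx) == 1:
            # Single GPU: every replica shares it.
            return gpu_idx
        # Fewer GPUs than replicas: spread replicas as evenly as possible.
        return [gpu_idx[rep_id % len(gpu_idx)]]
    return gpu_idx


# 3 replicas on 2 GPUs -> replicas 0 and 2 land on GPU 0, replica 1 on GPU 1.
print([assign_replica_gpu_sketch(r, 3, [0, 1]) for r in range(3)])
# [[0], [1], [0]]
```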