REF: Modified the batch lock logic (#4162)

OliverBryant · web-flow · commit 1a82be8ecc4d · 2025-10-22T14:03:57.000+08:00
diff --git a/xinference/core/model.py b/xinference/core/model.py
@@ -206,10 +206,6 @@ def __init__(
     ):
         super().__init__()
 
-        from ..model.llm.llama_cpp.core import XllamaCppModel
-        from ..model.llm.lmdeploy.core import LMDeployModel
-        from ..model.llm.sglang.core import SGLANGModel
-        from ..model.llm.transformers.core import PytorchModel
         from ..model.llm.vllm.core import VLLMModel
 
         self._supervisor_address = supervisor_address
@@ -223,12 +219,7 @@ def __init__(
         self._pending_requests: asyncio.Queue = asyncio.Queue()
         self._handle_pending_requests_task = None
         self._lock = (
-            None
-            if isinstance(
-                self._model,
-                (PytorchModel, VLLMModel, SGLANGModel, LMDeployModel, XllamaCppModel),
-            )
-            else asyncio.locks.Lock()
+            None if getattr(self._model, "allow_batch", False) else asyncio.locks.Lock()
         )
         self._worker_ref = None
         self._progress_tracker_ref = None
diff --git a/xinference/model/llm/core.py b/xinference/model/llm/core.py
@@ -45,6 +45,8 @@ def get_llm_version_infos():
 
 
 class LLM(abc.ABC):
+    allow_batch = False
+
     def __init__(
         self,
         replica_model_uid: str,
diff --git a/xinference/model/llm/llama_cpp/core.py b/xinference/model/llm/llama_cpp/core.py
@@ -40,6 +40,8 @@ def __init__(self, msg):
 
 
 class XllamaCppModel(LLM, ChatModelMixin):
+    allow_batch = True
+
     def __init__(
         self,
         model_uid: str,
diff --git a/xinference/model/llm/lmdeploy/core.py b/xinference/model/llm/lmdeploy/core.py
@@ -73,6 +73,8 @@ class LMDeployGenerateConfig(TypedDict, total=False):
 
 
 class LMDeployModel(LLM):
+    allow_batch = True
+
     def __init__(
         self,
         model_uid: str,
diff --git a/xinference/model/llm/sglang/core.py b/xinference/model/llm/sglang/core.py
@@ -137,6 +137,8 @@ class SGLANGGenerateConfig(TypedDict, total=False):
 
 
 class SGLANGModel(LLM):
+    allow_batch = True
+
     def __init__(
         self,
         model_uid: str,
diff --git a/xinference/model/llm/transformers/core.py b/xinference/model/llm/transformers/core.py
@@ -91,6 +91,8 @@ def decorator(cls):
 
 
 class PytorchModel(LLM):
+    allow_batch = True
+
     def __init__(
         self,
         model_uid: str,
diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py
@@ -302,6 +302,8 @@ class VLLMGenerateConfig(TypedDict, total=False):
 
 
 class VLLMModel(LLM):
+    allow_batch = True
+
     def __init__(
         self,
         model_uid: str,