Commit ec9a507

Seems finish
1 parent 1d852bf commit ec9a507

16 files changed (+110, -130 lines)

integration-test/src/test/java/org/apache/iotdb/ainode/it/AINodeConcurrentInferenceIT.java

Lines changed: 6 additions & 0 deletions
@@ -20,6 +20,8 @@
 package org.apache.iotdb.ainode.it;
 
 import org.apache.iotdb.it.env.EnvFactory;
+import org.apache.iotdb.it.framework.IoTDBTestRunner;
+import org.apache.iotdb.itbase.category.AIClusterIT;
 import org.apache.iotdb.itbase.env.BaseEnv;
 
 import com.google.common.collect.ImmutableMap;
@@ -28,6 +30,8 @@
 import org.junit.Assert;
 import org.junit.BeforeClass;
 import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.runner.RunWith;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -42,6 +46,8 @@
 
 import static org.apache.iotdb.ainode.utils.AINodeTestUtils.concurrentInference;
 
+@RunWith(IoTDBTestRunner.class)
+@Category({AIClusterIT.class})
 public class AINodeConcurrentInferenceIT {
 
   private static final Logger LOGGER = LoggerFactory.getLogger(AINodeConcurrentInferenceIT.class);
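The added annotations are standard JUnit 4 wiring: @RunWith(IoTDBTestRunner.class) runs the test through IoTDB's custom runner, and @Category({AIClusterIT.class}) tags it into the AIClusterIT category so the AI-cluster test profile can select it.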

iotdb-core/ainode/iotdb/ainode/core/inference/inference_request_pool.py

Lines changed: 14 additions & 8 deletions
@@ -36,6 +36,8 @@
 )
 from iotdb.ainode.core.log import Logger
 from iotdb.ainode.core.manager.model_manager import ModelManager
+from iotdb.ainode.core.model.model_enums import BuiltInModelType
+from iotdb.ainode.core.model.model_info import ModelInfo
 from iotdb.ainode.core.util.gpu_mapping import convert_device_id_to_torch_device


@@ -58,7 +60,7 @@ class InferenceRequestPool(mp.Process):
     def __init__(
         self,
         pool_id: int,
-        model_id: str,
+        model_info: ModelInfo,
         device: str,
         config: PretrainedConfig,
         request_queue: mp.Queue,
@@ -68,7 +70,7 @@ def __init__(
     ):
         super().__init__()
         self.pool_id = pool_id
-        self.model_id = model_id
+        self.model_info = model_info
         self.config = config
         self.pool_kwargs = pool_kwargs
         self.ready_event = ready_event
@@ -121,7 +123,7 @@ def _step(self):
 
         for requests in grouped_requests:
             batch_inputs = self._batcher.batch_request(requests).to(self.device)
-            if self.model_id == "sundial":
+            if self.model_info.model_type == BuiltInModelType.SUNDIAL.value:
                 batch_output = self._model.generate(
                     batch_inputs,
                     max_new_tokens=requests[0].max_new_tokens,
@@ -135,8 +137,7 @@ def _step(self):
                     cur_batch_size = request.batch_size
                     cur_output = batch_output[offset : offset + cur_batch_size]
                     offset += cur_batch_size
-                    # TODO Here we only considered the case where batchsize=1 in one request. If multi-variable adaptation is required in the future, modifications may be needed here, such as: `cur_output[0]` maybe not true in multi-variable scene
-                    request.write_step_output(cur_output[0].mean(dim=0))
+                    request.write_step_output(cur_output.mean(dim=1))
 
                     request.inference_pipeline.post_decode()
                     if request.is_finished():
@@ -153,7 +154,7 @@ def _step(self):
                         )
                         self._waiting_queue.put(request)
 
-            elif self.model_id == "timer_xl":
+            elif self.model_info.model_type == BuiltInModelType.TIMER_XL.value:
                 batch_output = self._model.generate(
                     batch_inputs,
                     max_new_tokens=requests[0].max_new_tokens,
@@ -194,7 +195,9 @@ def run(self):
         )
         self._model_manager = ModelManager()
         self._request_scheduler.device = self.device
-        self._model = self._model_manager.load_model(self.model_id, {}).to(self.device)
+        self._model = self._model_manager.load_model(self.model_info.model_id, {}).to(
+            self.device
+        )
         self.ready_event.set()
 
         activate_daemon = threading.Thread(
@@ -207,10 +210,13 @@ def run(self):
         )
         self._threads.append(execute_daemon)
         execute_daemon.start()
+        self._logger.info(
+            f"[Inference][Device-{self.device}][Pool-{self.pool_id}] InferenceRequestPool for model {self.model_info.model_id} is activated."
+        )
         for thread in self._threads:
             thread.join()
         self._logger.info(
-            f"[Inference][Device-{self.device}][Pool-{self.pool_id}] InferenceRequestPool for model {self.model_id} exited cleanly."
+            f"[Inference][Device-{self.device}][Pool-{self.pool_id}] InferenceRequestPool for model {self.model_info.model_id} exited cleanly."
         )
 
     def stop(self):
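The aggregation change above removes the batch-size-1 assumption flagged in the old TODO: `cur_output[0].mean(dim=0)` kept only the first series of a request's slice, while `cur_output.mean(dim=1)` averages the sample dimension for every series. A minimal sketch, assuming `batch_output` has shape (batch_size, num_samples, prediction_length) as returned by `generate()` (the shape and sizes below are illustrative, not taken from the commit):

import torch

batch_output = torch.randn(4, 10, 96)   # 4 series, 10 sampled trajectories, 96 steps
cur_output = batch_output[0:2]          # slice for one request with batch_size=2

old_style = cur_output[0].mean(dim=0)   # (96,)   -> only the first series survives
new_style = cur_output.mean(dim=1)      # (2, 96) -> one averaged forecast per series

assert torch.allclose(new_style[0], old_style)  # first series unchanged, rest now kept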

iotdb-core/ainode/iotdb/ainode/core/inference/pool_controller.py

Lines changed: 15 additions & 6 deletions
@@ -40,6 +40,8 @@
     ScaleActionType,
 )
 from iotdb.ainode.core.log import Logger
+from iotdb.ainode.core.manager.model_manager import ModelManager
+from iotdb.ainode.core.model.model_enums import BuiltInModelType
 from iotdb.ainode.core.model.sundial.configuration_sundial import SundialConfig
 from iotdb.ainode.core.model.timerxl.configuration_timer import TimerConfig
 from iotdb.ainode.core.util.atmoic_int import AtomicInt
@@ -48,6 +50,7 @@
 from iotdb.ainode.core.util.thread_name import ThreadName
 
 logger = Logger()
+MODEL_MANAGER = ModelManager()
 
 
 class PoolController:
@@ -169,7 +172,7 @@ def show_loaded_models(
         for model_id, device_map in self._request_pool_map.items():
             if device_id in device_map:
                 pool_group = device_map[device_id]
-                device_models[model_id] = pool_group.get_pool_count()
+                device_models[model_id] = pool_group.get_running_pool_count()
             result[device_id] = device_models
         return result
 
@@ -191,7 +194,7 @@ def _load_model_task(self, model_id: str, device_id_list: list[str]):
         def _load_model_on_device_task(device_id: str):
             if not self.has_request_pools(model_id, device_id):
                 actions = self._pool_scheduler.schedule_load_model_to_device(
-                    model_id, device_id
+                    MODEL_MANAGER.get_model_info(model_id), device_id
                 )
                 for action in actions:
                     if action.action == ScaleActionType.SCALE_UP:
@@ -218,7 +221,7 @@ def _unload_model_task(self, model_id: str, device_id_list: list[str]):
         def _unload_model_on_device_task(device_id: str):
             if self.has_request_pools(model_id, device_id):
                 actions = self._pool_scheduler.schedule_unload_model_from_device(
-                    model_id, device_id
+                    MODEL_MANAGER.get_model_info(model_id), device_id
                 )
                 for action in actions:
                     if action.action == ScaleActionType.SCALE_DOWN:
@@ -253,13 +256,19 @@ def _expand_pools_on_device(self, model_id: str, device_id: str, count: int):
         def _expand_pool_on_device(*_):
             result_queue = mp.Queue()
             pool_id = self._new_pool_id.get_and_increment()
-            if model_id == "sundial":
+            model_info = MODEL_MANAGER.get_model_info(model_id)
+            model_type = model_info.model_type
+            if model_type == BuiltInModelType.SUNDIAL.value:
                 config = SundialConfig()
-            elif model_id == "timer_xl":
+            elif model_type == BuiltInModelType.TIMER_XL.value:
                 config = TimerConfig()
+            else:
+                raise InferenceModelInternalError(
+                    f"Unsupported model type {model_type} for loading model {model_id}"
+                )
             pool = InferenceRequestPool(
                 pool_id=pool_id,
-                model_id=model_id,
+                model_info=model_info,
                 device=device_id,
                 config=config,
                 request_queue=result_queue,
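`_expand_pool_on_device` now resolves the pool's config from `ModelInfo.model_type` instead of the raw `model_id`, and rejects unknown types instead of leaving `config` unbound. A rough, self-contained sketch of that dispatch (the `ModelInfo` stand-in and the string values are assumptions mirroring `BuiltInModelType`, not the committed classes):

from dataclasses import dataclass

@dataclass
class ModelInfo:  # hypothetical stand-in for iotdb.ainode.core.model.model_info.ModelInfo
    model_id: str
    model_type: str

def pick_config(info: ModelInfo) -> str:
    # Mirrors the SUNDIAL/TIMER_XL branches above; the real code returns
    # SundialConfig()/TimerConfig() instances rather than names.
    if info.model_type == "sundial":    # BuiltInModelType.SUNDIAL.value (assumed)
        return "SundialConfig"
    if info.model_type == "timer_xl":   # BuiltInModelType.TIMER_XL.value (assumed)
        return "TimerConfig"
    raise ValueError(f"Unsupported model type {info.model_type} for loading model {info.model_id}")

print(pick_config(ModelInfo("sundial", "sundial")))  # -> SundialConfig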

iotdb-core/ainode/iotdb/ainode/core/inference/pool_group.py

Lines changed: 6 additions & 0 deletions
@@ -70,6 +70,12 @@ def get_pool_ids(self) -> list[int]:
     def get_pool_count(self) -> int:
         return len(self.pool_group)
 
+    def get_running_pool_count(self) -> int:
+        count = 0
+        for _, state in self.pool_states.items():
+            count += 1 if state == PoolState.RUNNING else 0
+        return count
+
     def dispatch_request(
         self, req: InferenceRequest, infer_proxy: InferenceRequestProxy
     ):
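The new counter only tallies pools in the RUNNING state, so `show_loaded_models` no longer reports pools that are still starting up or shutting down. A minimal sketch of the same tally, assuming `pool_states` maps pool ids to a `PoolState` enum (the enum values here are assumed):

from enum import Enum

class PoolState(Enum):  # hypothetical mirror of the real PoolState
    RUNNING = "running"
    STOPPED = "stopped"

pool_states = {0: PoolState.RUNNING, 1: PoolState.STOPPED, 2: PoolState.RUNNING}

# Equivalent to the loop in get_running_pool_count above:
running = sum(1 for state in pool_states.values() if state is PoolState.RUNNING)
print(running)  # -> 2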

iotdb-core/ainode/iotdb/ainode/core/inference/pool_scheduler/abstract_pool_scheduler.py

Lines changed: 5 additions & 4 deletions
@@ -22,6 +22,7 @@
 from typing import Dict, List
 
 from iotdb.ainode.core.inference.pool_group import PoolGroup
+from iotdb.ainode.core.model.model_info import ModelInfo
 
 
 class ScaleActionType(Enum):
@@ -58,12 +59,12 @@ def schedule(self, model_id: str) -> List[ScaleAction]:
 
     @abstractmethod
     def schedule_load_model_to_device(
-        self, model_id: str, device_id: str
+        self, model_info: ModelInfo, device_id: str
     ) -> List[ScaleAction]:
         """
         Schedule a series of actions to load the model to the device.
         Args:
-            model_id: The model to be loaded.
+            model_info: The model to be loaded.
             device_id: The device to load the model to.
         Returns:
             A list of ScaleAction to be performed.
@@ -72,12 +73,12 @@ def schedule_load_model_to_device(
 
     @abstractmethod
     def schedule_unload_model_from_device(
-        self, model_id: str, device_id: str
+        self, model_info: ModelInfo, device_id: str
     ) -> List[ScaleAction]:
         """
         Schedule a series of actions to unload the model from the device.
         Args:
-            model_id: The model to be unloaded.
+            model_info: The model to be unloaded.
             device_id: The device to unload the model from.
         Returns:
             A list of ScaleAction to be performed.

iotdb-core/ainode/iotdb/ainode/core/inference/pool_scheduler/basic_pool_scheduler.py

Lines changed: 32 additions & 29 deletions
@@ -28,23 +28,26 @@
     ScaleActionType,
 )
 from iotdb.ainode.core.log import Logger
+from iotdb.ainode.core.manager.model_manager import ModelManager
 from iotdb.ainode.core.manager.utils import (
     INFERENCE_EXTRA_MEMORY_RATIO,
     INFERENCE_MEMORY_USAGE_RATIO,
     MODEL_MEM_USAGE_MAP,
     estimate_pool_size,
     evaluate_system_resources,
 )
-from iotdb.ainode.core.model.model_info import BUILT_IN_LTSM_MAP
+from iotdb.ainode.core.model.model_info import BUILT_IN_LTSM_MAP, ModelInfo
 from iotdb.ainode.core.util.gpu_mapping import convert_device_id_to_torch_device
 
 logger = Logger()
 
+MODEL_MANAGER = ModelManager()
+
 
 def _estimate_shared_pool_size_by_total_mem(
     device: torch.device,
-    existing_model_ids: List[str],
-    new_model_id: Optional[str] = None,
+    existing_model_infos: List[ModelInfo],
+    new_model_info: Optional[ModelInfo] = None,
 ) -> Dict[str, int]:
     """
     Estimate pool counts for (existing_model_ids + new_model_id) by equally
@@ -54,17 +57,15 @@ def _estimate_shared_pool_size_by_total_mem(
         mapping {model_id: pool_num}
     """
     # Extract unique model IDs
-    all_models = existing_model_ids + (
-        [new_model_id] if new_model_id is not None else []
+    all_models = existing_model_infos + (
+        [new_model_info] if new_model_info is not None else []
     )
 
     # Seize memory usage for each model
     mem_usages: Dict[str, float] = {}
-    for model_id in all_models:
-        model_info = BUILT_IN_LTSM_MAP.get(model_id)
-        model_type = model_info.model_type
-        mem_usages[model_id] = (
-            MODEL_MEM_USAGE_MAP[model_type] * INFERENCE_EXTRA_MEMORY_RATIO
+    for model_info in all_models:
+        mem_usages[model_info.model_id] = (
+            MODEL_MEM_USAGE_MAP[model_info.model_type] * INFERENCE_EXTRA_MEMORY_RATIO
         )
 
     # Evaluate system resources and get TOTAL memory
@@ -84,14 +85,14 @@ def _estimate_shared_pool_size_by_total_mem(
 
     # Calculate pool allocation for each model
     allocation: Dict[str, int] = {}
-    for model_id in all_models:
-        pool_num = int(per_model_share // mem_usages[model_id])
+    for model_info in all_models:
+        pool_num = int(per_model_share // mem_usages[model_info.model_id])
         if pool_num <= 0:
             logger.warning(
-                f"[Inference][Device-{device}] Not enough TOTAL memory to guarantee at least 1 pool for model {model_id}, no pool will be scheduled for this model. "
-                f"Per-model share={per_model_share / 1024 ** 2:.2f} MB, need>={mem_usages[model_id] / 1024 ** 2:.2f} MB"
+                f"[Inference][Device-{device}] Not enough TOTAL memory to guarantee at least 1 pool for model {model_info.model_id}, no pool will be scheduled for this model. "
+                f"Per-model share={per_model_share / 1024 ** 2:.2f} MB, need>={mem_usages[model_info.model_id] / 1024 ** 2:.2f} MB"
             )
-        allocation[model_id] = pool_num
+        allocation[model_info.model_id] = pool_num
     logger.info(
         f"[Inference][Device-{device}] Shared pool allocation (by TOTAL memory): {allocation}"
     )
@@ -119,39 +120,41 @@ def schedule(self, model_id: str) -> List[ScaleAction]:
         return [ScaleAction(ScaleActionType.SCALE_UP, pool_num, model_id)]
 
     def schedule_load_model_to_device(
-        self, model_id: str, device_id: str
+        self, model_info: ModelInfo, device_id: str
     ) -> List[ScaleAction]:
-        existing_model_ids = [
-            existing_model_id
+        existing_model_infos = [
+            MODEL_MANAGER.get_model_info(existing_model_id)
             for existing_model_id, pool_group_map in self._request_pool_map.items()
-            if existing_model_id != model_id and device_id in pool_group_map
+            if existing_model_id != model_info.model_id and device_id in pool_group_map
         ]
        allocation_result = _estimate_shared_pool_size_by_total_mem(
             device=convert_device_id_to_torch_device(device_id),
-            existing_model_ids=existing_model_ids,
-            new_model_id=model_id,
+            existing_model_infos=existing_model_infos,
+            new_model_info=model_info,
         )
         return self._convert_allocation_result_to_scale_actions(
             allocation_result, device_id
         )
 
     def schedule_unload_model_from_device(
-        self, model_id: str, device_id: str
+        self, model_info: ModelInfo, device_id: str
     ) -> List[ScaleAction]:
-        existing_model_ids = [
-            existing_model_id
+        existing_model_infos = [
+            MODEL_MANAGER.get_model_info(existing_model_id)
            for existing_model_id, pool_group_map in self._request_pool_map.items()
-            if existing_model_id != model_id and device_id in pool_group_map
+            if existing_model_id != model_info.model_id and device_id in pool_group_map
         ]
         allocation_result = (
             _estimate_shared_pool_size_by_total_mem(
                 device=convert_device_id_to_torch_device(device_id),
-                existing_model_ids=existing_model_ids,
-                new_model_id=None,
+                existing_model_infos=existing_model_infos,
+                new_model_info=None,
             )
-            if len(existing_model_ids) > 0
-            else {model_id: 0}
+            if len(existing_model_infos) > 0
+            else {model_info.model_id: 0}
         )
+        if len(existing_model_infos) > 0:
+            allocation_result[model_info.model_id] = 0
         return self._convert_allocation_result_to_scale_actions(
             allocation_result, device_id
        )
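The allocation arithmetic splits the usable device memory evenly across models and floors each model's share by its per-pool footprint; on unload, the departing model's entry is now pinned to 0 so its pools scale down while the survivors are re-balanced. A worked example of that division with made-up numbers (the 8 GiB budget and the footprints below are illustrative only, not the values in MODEL_MEM_USAGE_MAP):

GiB = 1024 ** 3
usable_total = 8 * GiB                # after INFERENCE_MEMORY_USAGE_RATIO (assumed)
mem_usages = {
    "sundial": 1.5 * GiB,             # already scaled by INFERENCE_EXTRA_MEMORY_RATIO
    "timer_xl": 1.0 * GiB,
}

per_model_share = usable_total / len(mem_usages)  # 4 GiB per model

allocation = {mid: int(per_model_share // usage) for mid, usage in mem_usages.items()}
print(allocation)  # {'sundial': 2, 'timer_xl': 4}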

iotdb-core/ainode/iotdb/ainode/core/manager/inference_manager.py

Lines changed: 4 additions & 2 deletions
@@ -47,6 +47,7 @@
 from iotdb.ainode.core.inference.utils import generate_req_id
 from iotdb.ainode.core.log import Logger
 from iotdb.ainode.core.manager.model_manager import ModelManager
+from iotdb.ainode.core.model.model_enums import BuiltInModelType
 from iotdb.ainode.core.model.sundial.configuration_sundial import SundialConfig
 from iotdb.ainode.core.model.sundial.modeling_sundial import SundialForPrediction
 from iotdb.ainode.core.model.timerxl.configuration_timer import TimerConfig
@@ -297,9 +298,10 @@ def _run(
         data = np_data.view(np_data.dtype.newbyteorder())
         # the inputs should be on CPU before passing to the inference request
         inputs = torch.tensor(data).unsqueeze(0).float().to("cpu")
-        if model_id == "sundial":
+        model_type = self._model_manager.get_model_info(model_id).model_type
+        if model_type == BuiltInModelType.SUNDIAL.value:
             inference_pipeline = TimerSundialInferencePipeline(SundialConfig())
-        elif model_id == "timer_xl":
+        elif model_type == BuiltInModelType.TIMER_XL.value:
             inference_pipeline = TimerXLInferencePipeline(TimerConfig())
         else:
             raise InferenceModelInternalError(

iotdb-core/ainode/iotdb/ainode/core/manager/model_manager.py

Lines changed: 3 additions & 0 deletions
@@ -144,6 +144,9 @@ def show_models(self, req: TShowModelsReq) -> TShowModelsResp:
     def register_built_in_model(self, model_info: ModelInfo):
         self.model_storage.register_built_in_model(model_info)
 
+    def get_model_info(self, model_id: str) -> ModelInfo:
+        return self.model_storage.get_model_info(model_id)
+
     def update_model_state(self, model_id: str, state: ModelStates):
         self.model_storage.update_model_state(model_id, state)

iotdb-core/ainode/iotdb/ainode/core/model/model_storage.py

Lines changed: 7 additions & 0 deletions
@@ -423,6 +423,13 @@ def register_built_in_model(self, model_info: ModelInfo):
         with self._lock_pool.get_lock(model_info.model_id).write_lock():
             self._model_info_map[model_info.model_id] = model_info
 
+    def get_model_info(self, model_id: str) -> ModelInfo:
+        with self._lock_pool.get_lock(model_id).read_lock():
+            if model_id in self._model_info_map:
+                return self._model_info_map[model_id]
+            else:
+                raise ValueError(f"Model {model_id} does not exist.")
+
     def update_model_state(self, model_id: str, state: ModelStates):
         with self._lock_pool.get_lock(model_id).write_lock():
             if model_id in self._model_info_map:
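`get_model_info` takes the per-model read lock, so concurrent lookups can proceed in parallel while still being excluded by writers such as `register_built_in_model`. Hypothetical usage via the `ModelManager` facade added above (assumes an AINode environment where the model was registered):

from iotdb.ainode.core.manager.model_manager import ModelManager

manager = ModelManager()
try:
    info = manager.get_model_info("sundial")
    print(info.model_id, info.model_type)
except ValueError as err:
    print(err)  # "Model sundial does not exist." if it was never registered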
