 # under the License.
 #

+import gc
 import random
 import threading
 import time

 from ainode.core.config import AINodeDescriptor
 from ainode.core.inference.inference_request import InferenceRequest
+from ainode.core.inference.scheduler.basic_scheduler import BasicScheduler
 from ainode.core.log import Logger
 from ainode.core.manager.model_manager import ModelManager

@@ -61,70 +63,92 @@ def __init__(
         self._model_manager = None
         self.device = None

-        # TODO: A scheduler is necessary for better handling following queues
         self._threads = []
         self._waiting_queue = request_queue  # Requests that are waiting to be processed
         self._running_queue = mp.Queue()  # Requests that are currently being processed
         self._finished_queue = result_queue  # Requests that are finished
+        self._scheduler = BasicScheduler(
+            self._waiting_queue, self._running_queue, self._finished_queue, self.pool_id
+        )
         self._stop_event = mp.Event()

         # Fix inference seed
         random.seed(self.FIX_SEED)
         torch.manual_seed(self.FIX_SEED)
         np.random.seed(self.FIX_SEED)

-    def memory_is_available(self, request):
-        # need test with several rounds of dummy data
-        pass
+    def _warm_up_and_estimate_memory(self):
+        # TODO: Test per-token memory usage; add CPU support in the future
+        torch.cuda.empty_cache()
+        gc.collect()
+        dummy_input = torch.zeros(
+            (1, self.config.input_token_len), dtype=torch.float32
+        ).to(self.device)
+
+        # force CUDA synchronization to avoid any asynchronous memory allocation issues
+        torch.cuda.reset_peak_memory_stats(self.device)
+        torch.cuda.synchronize(self.device)
+        memory_before_warmup = torch.cuda.memory_allocated(self.device)
+        logger.info(
+            f"[Inference][Device-{self.device}][Pool-{self.pool_id}] Memory allocated before warm-up: {memory_before_warmup:.2f} bytes"
+        )

-    def _activate_requests(self):
-        if self._waiting_queue.empty():
-            return
-        request: InferenceRequest = self._waiting_queue.get()
-        # TODO: Check memory size before activating requests
-        request.inputs = request.inference_pipeline.preprocess_inputs(request.inputs)
-        request.mark_running()
-        logger.debug(
-            f"[Inference][Device-{self.device}][Pool-{self.pool_id}][ID-{request.req_id}] Request is activated with inputs shape {request.inputs.shape}"
+        # warm-up: generate a single token to measure peak memory
+        with torch.no_grad():
+            self.model.generate(dummy_input, max_new_tokens=1)
+        torch.cuda.synchronize(self.device)
+        peak_memory_1_token = torch.cuda.max_memory_allocated(self.device)
+        logger.info(
+            f"[Inference][Device-{self.device}][Pool-{self.pool_id}] Baseline memory usage for 1 token: {peak_memory_1_token:.2f} bytes"
+        )
+        logger.info(
+            f"[Inference][Device-{self.device}][Pool-{self.pool_id}] Difference: {peak_memory_1_token - memory_before_warmup:.2f} bytes"
         )
-        self._running_queue.put(request)
+
+    def _activate_requests(self):
+        requests = self._scheduler.schedule_activate()
+        for request in requests:
+            request.inputs = request.inference_pipeline.preprocess_inputs(
+                request.inputs
+            )
+            request.mark_running()
+            self._running_queue.put(request)
+            logger.debug(
+                f"[Inference][Device-{self.device}][Pool-{self.pool_id}][ID-{request.req_id}] Request is activated with inputs shape {request.inputs.shape}"
+            )

     def _requests_activate_loop(self):
         while not self._stop_event.is_set():
             time.sleep(self.WAITING_INTERVAL_IN_MS / 1000)
             self._activate_requests()

     def _step(self):
-        if self._running_queue.empty():
-            return
+        requests = self._scheduler.schedule_step()
         # TODO: We need a batcher to accelerate the concurrent inference
-        # TODO: Check memory size before executing requests
-        request: InferenceRequest = self._running_queue.get()
-        inputs = request.inputs.to(self.device)
-        output = self.model.generate(
-            inputs,
-            max_new_tokens=request.max_new_tokens,
-            num_samples=10,
-            revin=True,
-        )
-        request.output_tensor = request.output_tensor.to(
-            self.device
-        )  # Ensure output tensor is on the same device
-        request.write_step_output(output[0].mean(dim=0))
-        request.inference_pipeline.post_decode()
-        if request.is_finished():
-            request.inference_pipeline.post_inference()
-            logger.debug(
-                f"[Inference][Device-{self.device}][Pool-{self.pool_id}][ID-{request.req_id}] Request is finished"
-            )
-            # ensure the output tensor is on CPU before sending to result queue
-            request.output_tensor = request.output_tensor.cpu()
-            self._finished_queue.put(request)
-        else:
-            logger.debug(
-                f"[Inference][Device-{self.device}][Pool-{self.pool_id}][ID-{request.req_id}] Request is not finished, re-queueing"
+        for request in requests:
+            request.inputs = request.inputs.to(self.device)
+            output = self.model.generate(
+                request.inputs,
+                max_new_tokens=request.max_new_tokens,
+                num_samples=10,
+                revin=True,
             )
-            self._waiting_queue.put(request)
+            request.output_tensor = request.output_tensor.to(self.device)
+            request.write_step_output(output[0].mean(dim=0))
+            request.inference_pipeline.post_decode()
+            if request.is_finished():
+                request.inference_pipeline.post_inference()
+                logger.debug(
+                    f"[Inference][Device-{self.device}][Pool-{self.pool_id}][ID-{request.req_id}] Request is finished"
+                )
+                # ensure the output tensor is on CPU before sending to result queue
+                request.output_tensor = request.output_tensor.cpu()
+                self._finished_queue.put(request)
+            else:
+                logger.debug(
+                    f"[Inference][Device-{self.device}][Pool-{self.pool_id}][ID-{request.req_id}] Request is not finished, re-queueing"
+                )
+                self._waiting_queue.put(request)

     def _requests_execute_loop(self):
         while not self._stop_event.is_set():
@@ -134,8 +158,11 @@ def _requests_execute_loop(self):
     def run(self):
         self._model_manager = ModelManager()
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self._scheduler.device = self.device
         self.model = self._model_manager.load_model(self.model_id, {}).to(self.device)

+        # self._warm_up_and_estimate_memory()
+
         activate_daemon = threading.Thread(
             target=self._requests_activate_loop, daemon=True
         )
@@ -151,3 +178,15 @@ def run(self):

     def stop(self):
         self._stop_event.set()
+        logger.info(
+            f"[Inference][Device-{self.device}][Pool-{self.pool_id}] Stopping and releasing resources."
+        )
+        try:
+            del self.model
+            if "cuda" in str(self.device):
+                torch.cuda.empty_cache()
+            gc.collect()
+        except Exception as e:
+            logger.warning(
+                f"[Inference][Device-{self.device}][Pool-{self.pool_id}] Failed to clean up: {e}"
+            )
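
The commit only pins down the scheduler's surface area: BasicScheduler is constructed with the waiting, running, and finished queues plus the pool id, gets a device attribute assigned in run(), and exposes schedule_activate() and schedule_step(), each returning a batch of requests for the worker to process. The sketch below illustrates that assumed interface; the class name and the drain-the-queue method bodies are placeholders for illustration only, not the actual BasicScheduler in ainode.core.inference.scheduler.basic_scheduler.

    # Hypothetical sketch of the scheduler interface the worker relies on.
    # The queue-draining policy here is an assumption; the real BasicScheduler
    # may batch, reorder, or gate requests (e.g. on estimated memory) differently.
    from typing import Any, List


    class SchedulerSketch:
        def __init__(self, waiting_queue, running_queue, finished_queue, pool_id):
            self._waiting_queue = waiting_queue
            self._running_queue = running_queue
            self._finished_queue = finished_queue
            self.pool_id = pool_id
            self.device = None  # assigned by the worker in run()

        def schedule_activate(self) -> List[Any]:
            # Hand back everything currently waiting; the worker preprocesses each
            # request, marks it running, and puts it on the running queue itself.
            requests = []
            while not self._waiting_queue.empty():
                requests.append(self._waiting_queue.get())
            return requests

        def schedule_step(self) -> List[Any]:
            # Hand back the running requests to execute in this step; unfinished
            # requests are re-queued to the waiting queue by the worker afterwards.
            requests = []
            while not self._running_queue.empty():
                requests.append(self._running_queue.get())
            return requests

Hiding the activation and step policies behind these two calls is what lets the worker drop its ad-hoc empty()/get() checks and the old memory-size TODOs in _activate_requests and _step.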