impl rollout controller

chucai.dzq · chucai.dzq · commit e322d3c12286 · 2025-10-03T17:54:00.000+08:00
diff --git a/areal/api/controller_api.py b/areal/api/controller_api.py
@@ -458,7 +458,6 @@ def forward(
         """
         raise NotImplementedError()
 
-
 class RolloutController(abc.ABC):
     """A centralized controller that manages multiple distributed InferenceEngine workers for rollout generation.
 
@@ -508,21 +507,6 @@ def destroy(self):
         """Destroy the engine and release GPU memory for the local inference engine."""
         raise NotImplementedError()
 
-    async def agenerate(self, req: ModelRequest) -> ModelResponse:
-        """Asynchronously generate a response for the given request.
-
-        Parameters
-        ----------
-        req : ModelRequest
-            The model request containing input data and generation parameters
-
-        Returns
-        -------
-        ModelResponse
-            The generated response from the model
-        """
-        raise NotImplementedError()
-
     def update_weights(self, meta: WeightUpdateMeta) -> Future:
         """Update weights in the inference engine in a non-blocking manner.
 
@@ -571,7 +555,7 @@ def get_version(self) -> int:
 
     def submit(
         self,
-        data: Dict[str, Any],
+        data: DistributedBatch,
         workflow: Optional["RolloutWorkflow"] = None,
         workflow_builder: Optional[Callable] = None,
         should_accept: Callable | None = None,
@@ -623,7 +607,7 @@ def wait(self, count: int, timeout: float | None = None) -> DistributedBatch:
 
     def rollout_batch(
         self,
-        data: List[Dict[str, Any]],
+        data: DistributedBatch,
         workflow: Optional["RolloutWorkflow"] = None,
         workflow_builder: Optional[Callable] = None,
         should_accept: Callable | None = None,
@@ -652,7 +636,7 @@ def rollout_batch(
 
     def prepare_batch(
         self,
-        dataloader: StatefulDataLoader,
+        dataloader: DistributedBatch,
         workflow: Optional["RolloutWorkflow"] = None,
         workflow_builder: Optional[Callable] = None,
         should_accept: Callable | None = None,
@@ -688,31 +672,4 @@ def pause(self):
 
     def resume(self):
         """Resume request submission for async rollout."""
-        raise NotImplementedError()
-
-    def register_callback_to_all_worker(
-        self, method: str, callback: Callable, **kwargs
-    ):
-        """Register a callback function for the specified method across all workers.
-
-        Partial rollout API. After successful registration, the controller will poll
-        and call the specified method in a background thread. When the return value
-        is obtained, it will be used as a parameter to call the `callback` function.
-
-        Parameters
-        ----------
-        method : str
-            The name of the method to register the callback for
-        callback : Callable
-            The callback function to be called with the method's return value
-        **kwargs
-            Additional keyword arguments for the callback registration
-        """
-        raise NotImplementedError()
-
-    def abort_all_requests(self) -> None:
-        """Abort all ongoing requests in the inference engine.
-
-        Partial rollout API for canceling all queued and in-progress requests.
-        """
-        raise NotImplementedError()
+        raise NotImplementedError()
diff --git a/areal/api/engine_api.py b/areal/api/engine_api.py
@@ -555,3 +555,15 @@ def pause(self):
     def resume(self):
         """Resume request submission for async rollout."""
         raise NotImplementedError()
+
+    def get_scheduling_config(self) -> List[Scheduling]:
+        """Get the scheduling configuration for the engine.
+
+        This includes configuration such as container image, CPU/GPU/memory size.
+
+        Returns
+        -------
+        List[Scheduling]
+            The scheduling configurations for the engine
+        """
+        raise NotImplementedError()
diff --git a/areal/api/scheduler_api.py b/areal/api/scheduler_api.py
@@ -16,7 +16,7 @@ class Worker:
 @dataclass
 class ScheduleStrategy:
     type: Literal["colocation", "separation", ""] = ""
-    uid: str = ""
+    target: str = ""
 
 
 @dataclass
diff --git a/areal/api/workflow_api.py b/areal/api/workflow_api.py
@@ -524,7 +524,7 @@ def rollout_batch(
 
     def prepare_batch(
         self,
-        dataloader: StatefulDataLoader,
+        dataloader: StatefulDataLoader | List[Dict[str, Any]],
         workflow: "RolloutWorkflow" | None = None,
         workflow_builder: Callable | None = None,
         should_accept: Callable | None = None,
@@ -533,28 +533,62 @@ def prepare_batch(
 
         See :meth:`~areal.api.engine_api.InferenceEngine.prepare_batch` for detailed documentation.
         """
-        if not hasattr(self, "data_generator"):
-            self.data_generator = cycle_dataloader(dataloader)
-        assert dataloader.batch_size is not None
-        while True:
-            # Submit at least two batches to allow maximum overlap
-            if (
-                self.get_capacity() + dataloader.batch_size > 0
-                and self.input_queue.qsize() + dataloader.batch_size
-                < self.input_queue.maxsize
-            ):
-                data = next(self.data_generator)
-                for item in data:
+        if isinstance(dataloader, StatefulDataLoader):
+            # StatefulDataLoader input: keep the original logic unchanged
+            if not hasattr(self, "data_generator"):
+                self.data_generator = cycle_dataloader(dataloader)
+            assert dataloader.batch_size is not None
+            batch_size = dataloader.batch_size
+            
+            while True:
+                # Submit at least two batches to allow maximum overlap
+                if (
+                    self.get_capacity() + batch_size > 0
+                    and self.input_queue.qsize() + batch_size
+                    < self.input_queue.maxsize
+                ):
+                    data = next(self.data_generator)
+                    for item in data:
+                        self.submit(
+                            item,
+                            workflow=workflow,
+                            workflow_builder=workflow_builder,
+                            should_accept=should_accept,
+                        )
+                try:
+                    return self.wait(batch_size, timeout=1)
+                except TimeoutError:
+                    pass
+        else:
+            self.data_list_index = 0
+            
+            # For List input, use a fixed batch_size=1
+            batch_size = 1
+            
+            while True:
+                # Submit at least two batches to allow maximum overlap
+                if (
+                    self.get_capacity() + batch_size > 0
+                    and self.input_queue.qsize() + batch_size
+                    < self.input_queue.maxsize
+                ):
+                    # Take the next item from the list, wrapping around at the end
+                    if self.data_list_index >= len(dataloader):
+                        self.data_list_index = 0  # wrap around to the start
+                    
+                    item = dataloader[self.data_list_index]
+                    self.data_list_index += 1
+                    
                     self.submit(
                         item,
                         workflow=workflow,
                         workflow_builder=workflow_builder,
                         should_accept=should_accept,
                     )
-            try:
-                return self.wait(dataloader.batch_size, timeout=1)
-            except TimeoutError:
-                pass
+                try:
+                    return self.wait(batch_size, timeout=1)
+                except TimeoutError:
+                    pass
 
     def pause(self):
         """Pause request submission for async rollout.
diff --git a/areal/controller/rollout_controller.py b/areal/controller/rollout_controller.py
@@ -0,0 +1,133 @@
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from typing import Any, Callable, Dict, List
+
+from tensordict import TensorDict, stack
+
+from areal.api.cli_args import InferenceEngineConfig
+from areal.api.controller_api import RolloutController, DistributedBatch
+from areal.api.engine_api import InferenceEngine
+from areal.api.io_struct import AllocationMode, WeightUpdateMeta
+from areal.api.workflow_api import RolloutWorkflow
+
+from areal.api.scheduler_api import Job, Scheduler, ScheduleStrategy, Worker
+from areal.controller.utils import create_engine_with_retry, rpc_call
+from areal.utils.data import concat_padded_tensors
+from areal.utils import logging
+from areal.utils.http import wait_future_ordered
+
+logger = logging.getLogger("DistributedRolloutController")
+
+
+class DistributedRolloutController(RolloutController):
+    def __init__(
+        self,
+        inf_engine: InferenceEngine,
+        config: InferenceEngineConfig,
+        scheduler: Scheduler,
+    ):
+        super().__init__(inf_engine, config, scheduler)
+        self.role: str = "rollout"
+        self.alloc_mode: AllocationMode
+        self.enable_colocate_mode: bool
+        self.dp_world_size: int
+        self.dp_head_workers: List[Worker]
+
+    def initialize(
+        self,
+        alloc_mode_str: str,
+        target: str,
+    ):
+        self.alloc_mode = AllocationMode.from_str(alloc_mode_str)
+        self.dp_world_size = self.alloc_mode.gen.world_size // self.alloc_mode.gen.dp_size
+
+        job = Job(
+            replicas=self.alloc_mode.gen.world_size,
+            tasks=self.inf_engine.get_scheduling_config(),
+            schedule_strategy=ScheduleStrategy(type="colocation", target=target) if target else None,
+            role=self.role,
+        )
+        logger.info(f"Start to create job: {job}")
+        self.scheduler.create_workers(job)
+
+        workers = self.scheduler.get_workers(self.role, timeout=1800)
+        self.dp_head_workers = [worker for idx, worker in enumerate(workers) if idx % self.dp_world_size == 0]
+        assert len(self.dp_head_workers) == self.alloc_mode.gen.dp_size
+
+        engine_addrs = [f"{w.ip}:{w.serve_port}" for w in self.dp_head_workers]
+        with ThreadPoolExecutor(max_workers=len(self.dp_head_workers)) as executor:
+            futures = [
+                executor.submit(
+                    partial(
+                        create_engine_with_retry,
+                        self.scheduler.create_engine,
+                        worker.id,
+                        self.inf_engine,
+                        None,
+                        engine_addrs,
+                        self.dp_world_size,
+                    )
+                )
+                for worker in self.dp_head_workers
+            ]
+
+            wait_future_ordered(futures, exit_on_exception=True)
+
+    def destroy(self):
+        self.scheduler.delete_workers()
+
+    def __del__(self):
+        self.destroy()
+
+    def update_weights(self, meta: WeightUpdateMeta) -> None:
+        """Update weights in the inference engine."""
+        self.custom_function_call("update_weights", None, meta)
+        return None
+
+    def prepare_batch(self, data: DistributedBatch, workflow: RolloutWorkflow) -> None:
+        """Chunk `data` by the generation DP size and invoke `prepare_batch` on the DP head workers."""
+        batches = data.chunk(self.alloc_mode.gen.dp_size)
+        self.custom_function_call("prepare_batch", batches, workflow)
+        return None
+
+    def rollout_batch(
+        self,
+        data: DistributedBatch,
+        workflow: RolloutWorkflow
+    ) -> DistributedBatch:
+        """Submit a batch of requests to the inference engine and wait for the results."""
+        batches = data.chunk(self.alloc_mode.gen.dp_size)
+        results = self.custom_function_call("rollout_distributed_batch", batches, workflow)
+        assert len(results) > 0
+        size = int(results[0]["input_ids"].shape[0])
+        bs = size * len(results)
+        padded = concat_padded_tensors(results)
+        if isinstance(padded, dict):
+            padded = TensorDict(padded, batch_size=[bs])
+        return DistributedBatch.concat(padded.to_dict())
+
+    def set_version(self, version: int) -> None:
+        self.custom_function_call("set_version", None, version)
+        return None
+
+    def get_version(self) -> int:
+        results = self.custom_function_call("get_version", None)
+        return results[0]
+
+    def pause(self):
+        self.custom_function_call("pause", None)
+
+    def resume(self):
+        self.custom_function_call("resume", None)
+
+    def submit(self, data: DistributedBatch):
+        batches = data.chunk(self.alloc_mode.gen.dp_size)
+        self.custom_function_call("submit", batches)
+
+    def wait(self, counts: List[int], timeout: float | None = None)->DistributedBatch:
+        assert len(counts) == len(self.dp_head_workers)
+        results = self.custom_function_call("wait", counts, timeout)
+        return DistributedBatch.concat(results)
+
+    def custom_function_call(self, method: str, batches, *args, **kwargs):
+        return rpc_call(self.scheduler, self.dp_head_workers, method, batches, args, kwargs)
diff --git a/areal/engine/sglang_remote.py b/areal/engine/sglang_remote.py
diff --git a/areal/launcher/sglang_server.py b/areal/launcher/sglang_server.py