Skip to content

Commit 362dcfd

Browse files
authored
feat(rollout): add auto scaling scheduler for vLLM, aligning with SGLang (RLinf#253)
Signed-off-by: Bo Dai <daibo@infini-ai.com>
1 parent e963611 commit 362dcfd

File tree

10 files changed

+693
-214
lines changed

10 files changed

+693
-214
lines changed

.github/workflows/scheduler-tests.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,12 @@ jobs:
4747
export REPO_PATH=$(pwd)
4848
source switch_env reason
4949
bash tests/e2e_tests/dynamic_scheduler/run.sh qwen2.5-1.5b-grpo-dynamic-mg-sgl
50+
51+
- name: Megatron vLLM
52+
timeout-minutes: 20
53+
run: |
54+
export PYTHONPATH=$(pwd)/Megatron-LM-011:$(pwd)/params_resharding_release
55+
cd rlinf
56+
export REPO_PATH=$(pwd)
57+
source switch_env reason
58+
bash tests/e2e_tests/dynamic_scheduler/run.sh qwen2.5-1.5b-grpo-dynamic-mg-vllm

docs/source-en/rst_source/tutorials/scheduler/dynamic-scheduling.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ Dynamic Scheduling
44

55
Dynamic scheduling adjusts and migrates resources among components (actor / rollout / inference)
66
in real time during training to improve overall throughput and resource utilization.
7-
It relies on Megatron-LM's online scaling (second-level elasticity) and SGLang's migrate capability
7+
It relies on Megatron-LM's online scaling (second-level elasticity) and SGLang/vLLM's request-migration capability
88
to reallocate GPU resources without stopping training.
99

1010
What is Dynamic Scheduling?

docs/source-zh/rst_source/tutorials/scheduler/dynamic-scheduling.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
动态调度(Dynamic Scheduling)
66
是在训练运行期根据系统各组件(actor / rollout / inference)的实时状态,
77
对资源进行秒级动态调整与迁移,以提升整体吞吐与资源利用率的机制。
8-
它依托于 Megatron-LM 的在线扩缩容能力(秒级扩缩)与 SGLang 的请求迁移功能,
8+
它依托于 Megatron-LM 的在线扩缩容能力(秒级扩缩)与 SGLang/vLLM 的请求迁移功能,
99
在不终止训练的前提下,对集群中的 GPU 资源进行弹性重分配。
1010

1111
什么是动态调度?

rlinf/data/io_struct.py

Lines changed: 59 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,10 @@ def get_seq_length(
4949
class RolloutRequest:
5050
"""
5151
Attr
52-
input_ids: List of input token IDs for rollout
52+
input_ids: list of input token IDs for rollout
5353
n: Number of completions to generate for each input
5454
image_data: list of image data (bytes or URLs) for multimodal inputs
55-
answers: List of answers for the requests, where each answer can be either a list of strings (for typical tasks) or a dict (for VQA tasks), if available.
55+
answers: list of answers for the requests, where each answer can be either a list of strings (for typical tasks) or a dict (for VQA tasks), if available.
5656
multi_modal_inputs: list of multi-modal inputs for the requests
5757
"""
5858

@@ -66,7 +66,7 @@ def to_seq_group_infos(self) -> list["SeqGroupInfo"]:
6666
"""Convert the RolloutRequest into a list of SeqGroupInfo objects.
6767
6868
Returns:
69-
List[SeqGroupInfo]: A list of SeqGroupInfo objects.
69+
list[SeqGroupInfo]: A list of SeqGroupInfo objects.
7070
"""
7171
return [
7272
SeqGroupInfo(
@@ -102,12 +102,12 @@ class SeqGroupInfo:
102102
103103
Attributes:
104104
id (int): Unique identifier for the sequence group.
105-
input_ids (List[int]): List of input IDs of the original sequence.
106-
answer (Union[List[str], Dict]): List of answers of the original sequence.(One sequence can have multiple equivalent answers), or a dict in case of vqa task.
105+
input_ids (list[int]): list of input IDs of the original sequence.
106+
answer (Union[list[str], dict]): list of answers of the original sequence. (One sequence can have multiple equivalent answers), or a dict in case of vqa task.
107107
group_size (int): Number of sequences in the group.
108108
idx_completed (set[int]): Set of indices for sequences that have completed rollout and are ready for evaluation.
109109
idx_aborted (set[int]): Set of indices for sequences that have been aborted. These sequences need to be re-rolled out before they can be evaluated.
110-
results (List[Optional[Dict]]): List storing result dictionaries for each sequence, or None if not yet available.
110+
results (list[Optional[dict]]): list storing the result for each sequence, or None if not yet available.
111111
"""
112112

113113
id: int
@@ -116,7 +116,9 @@ class SeqGroupInfo:
116116
group_size: int
117117
idx_completed: set[int] = field(init=False, compare=False)
118118
idx_aborted: set[int] = field(init=False, compare=False)
119-
results: list[Optional[dict]] = field(init=False, compare=False)
119+
results: list[Optional[Union[dict, "VllmRequestOutput"]]] = field(
120+
init=False, compare=False
121+
)
120122
image_data: Optional[list] = None
121123
multi_modal_inputs: Optional[dict] = None
122124

@@ -126,6 +128,18 @@ def __post_init__(self):
126128
self.idx_aborted = set()
127129
self.results = [None for _ in range(self.group_size)]
128130

131+
def record_vllm_result(self, idx: int, result: "VllmRequestOutput", logger=None):
132+
finish_reason = result.outputs[0].finish_reason
133+
if finish_reason is None or finish_reason == "abort":
134+
self.idx_aborted.add(idx)
135+
else:
136+
self.idx_completed.add(idx)
137+
138+
if self.results[idx] is None:
139+
self.results[idx] = result
140+
else:
141+
self.results[idx].add(next_output=result, aggregate=True)
142+
129143
def record_sglang_result(self, idx: int, result: dict, logger=None):
130144
"""Record a single sglang execution result and update internal tracking.
131145
@@ -139,7 +153,7 @@ def record_sglang_result(self, idx: int, result: dict, logger=None):
139153
Args:
140154
idx: int
141155
The index of the sequence within the group (0 <= idx < group_size).
142-
result: Dict
156+
result: dict
143157
Result of SGLang. Expected to contain at least:
144158
- "meta_info": {"finish_reason": {"type": FinishReasonEnum}}
145159
- "output_ids": a list (or list-like) of output identifier elements
@@ -300,13 +314,14 @@ def from_vllm_results(
300314
return_logprobs: bool = False,
301315
) -> "RolloutResult":
302316
"""
303-
Create a RolloutResult from the given vLLM results.
317+
Create a RolloutResult from the given vLLM results. Every result is generated with n=1,
318+
so its outputs list has length 1.
304319
305320
Args:
306321
group_size (int): The group size used during rollout.
307322
results (list[VllmRequestOutput]): The rollout results from vLLM.
308323
answers (Optional[Union[list[str], dict]]): The answers corresponding to the inputs, notably, if task type is vqa, answers is a dict.
309-
multi_modal_inputs (Optional[list[Dict]]): The multi-modal inputs corresponding to the inputs.
324+
multi_modal_inputs (Optional[list[dict]]): The multi-modal inputs corresponding to the inputs.
310325
return_logprobs (bool): Whether to return log probabilities.
311326
312327
Returns:
@@ -325,62 +340,36 @@ def get_logprobs(
325340
logprobs.append(logprob[response_ids[i]].logprob)
326341
return logprobs
327342

328-
num_sequences = len(results) * group_size
329-
330-
if multi_modal_inputs:
331-
mm_inputs = []
332-
for mm_input in multi_modal_inputs:
333-
mm_inputs.extend([mm_input] * group_size)
334-
else:
335-
mm_inputs = None
336-
337343
# for VQA task, answers is a dict
338344
if isinstance(answers, dict):
339345
answers = [answers]
340346

341-
prompt_lengths = []
342-
prompt_ids = []
343-
response_lengths = []
344-
response_ids = []
345-
logprobs = []
346-
is_end = []
347-
response_texts = []
348-
rollout_answers = (
349-
[answer for answer in answers for _ in range(group_size)]
350-
if answers
351-
else None
352-
)
353-
for vllm_result in results:
354-
if vllm_result.prompt_token_ids is not None:
355-
prompt_ids.extend([vllm_result.prompt_token_ids] * group_size)
356-
prompt_lengths.extend([len(vllm_result.prompt_token_ids)] * group_size)
357-
else:
358-
raise NotImplementedError("vllm should return tokenized prompt.")
359-
response_ids.extend(
360-
[list(output.token_ids) for output in vllm_result.outputs]
361-
)
362-
response_texts.extend([output.text for output in vllm_result.outputs])
363-
response_lengths.extend(
364-
[len(output.token_ids) for output in vllm_result.outputs]
365-
)
366-
is_end.extend([vllm_result.finished] * group_size)
367-
if return_logprobs:
368-
logprobs.extend(
369-
[
370-
get_logprobs(list(output.token_ids), output)
371-
for output in vllm_result.outputs
372-
]
347+
# here vllm must return prompt ids because we pass input_ids as input
348+
prompt_ids = [vllm_result.prompt_token_ids for vllm_result in results]
349+
prompt_lengths = [len(vllm_result.prompt_token_ids) for vllm_result in results]
350+
response_ids = [vllm_result.outputs[0].token_ids for vllm_result in results]
351+
response_texts = [vllm_result.outputs[0].text for vllm_result in results]
352+
response_lengths = [
353+
len(vllm_result.outputs[0].token_ids) for vllm_result in results
354+
]
355+
is_end = [vllm_result.finished for vllm_result in results]
356+
if return_logprobs:
357+
logprobs = [
358+
get_logprobs(
359+
list(vllm_result.outputs[0].token_ids), vllm_result.outputs[0]
373360
)
361+
for vllm_result in results
362+
]
374363
result: RolloutResult = RolloutResult(
375364
group_size=group_size,
376-
num_sequence=num_sequences,
377-
answers=rollout_answers,
365+
num_sequence=len(results),
366+
answers=answers,
378367
prompt_ids=prompt_ids,
379368
prompt_lengths=prompt_lengths,
380369
response_ids=response_ids,
381370
response_lengths=response_lengths,
382371
response_texts=response_texts,
383-
multi_modal_inputs=mm_inputs,
372+
multi_modal_inputs=multi_modal_inputs,
384373
is_end=is_end,
385374
)
386375
if return_logprobs:
@@ -400,8 +389,8 @@ def from_sglang_results(
400389
"""Create a MathRolloutResult from the given results and input IDs.
401390
402391
Args:
403-
results (List[Dict]): The rollout results from the model.
404-
input_ids (List[List[int]]): The input IDs for the prompts.
392+
results (list[dict]): The rollout results from the model.
393+
input_ids (list[list[int]]): The input IDs for the prompts.
405394
return_logprobs (bool): Whether to return log probabilities.
406395
"""
407396
assert len(results) == len(input_ids), (
@@ -447,6 +436,16 @@ def from_sglang_seq_group(cls, seq_group: SeqGroupInfo, return_logprobs: bool):
447436
return_logprobs=return_logprobs,
448437
)
449438

439+
@classmethod
440+
def from_vllm_seq_group(cls, seq_group: SeqGroupInfo, return_logprobs: bool):
441+
return cls.from_vllm_results(
442+
seq_group.group_size,
443+
seq_group.results,
444+
answers=[seq_group.answer] * seq_group.group_size,
445+
multi_modal_inputs=[seq_group.multi_modal_inputs] * seq_group.group_size,
446+
return_logprobs=return_logprobs,
447+
)
448+
450449
@staticmethod
451450
def merge_result_list(
452451
rollout_results: list["RolloutResult"],
@@ -550,10 +549,10 @@ def split_result_list_by_group(
550549
If input has multiple RolloutResult objects, split each one and merge the results.
551550
552551
Args:
553-
rollout_results: List of input RolloutResult objects
552+
rollout_results: list of input RolloutResult objects
554553
555554
Returns:
556-
List of RolloutResult objects grouped by group_size
555+
list of RolloutResult objects grouped by group_size
557556
"""
558557
assert len(rollout_results) > 0, "No rollout results to split."
559558

@@ -576,7 +575,7 @@ def _split_single_result_by_group(
576575
rollout_result: The RolloutResult to be split
577576
578577
Returns:
579-
List of split RolloutResult objects
578+
list of split RolloutResult objects
580579
"""
581580
group_size = rollout_result.group_size
582581
num_sequence = rollout_result.num_sequence
@@ -710,7 +709,7 @@ def to_actor_batch(
710709
pad_token (int): Token used for padding, e.g., `tokenizer.pad_token_id`.
711710
712711
Returns:
713-
Dict[str, torch.Tensor]: A dictionary with keys:
712+
dict[str, torch.Tensor]: A dictionary with keys:
714713
715714
input_ids (torch.Tensor):
716715
Concatenated prompt and response token IDs,

rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ def __init__(
6262
self.actor_weight_rank = rank_map[
6363
self._rlinf_worker.get_parent_rank(), self.rank
6464
]
65+
self.is_weight_offloaded = False
6566

6667
def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
6768
"""Allocate GPU KV cache with the specified kv_cache_config."""
@@ -76,6 +77,7 @@ def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
7677

7778
def offload_model_weights(self) -> None:
7879
super().sleep(level=2)
80+
self.is_weight_offloaded = True
7981

8082
def sync_hf_weight(self) -> None:
8183
use_cudagraph = not self.rlinf_config.rollout.enforce_eager
@@ -85,20 +87,20 @@ def sync_hf_weight(self) -> None:
8587
state_dict = self._rlinf_worker.recv(
8688
src_group_name=self._actor_group_name, src_rank=self.actor_weight_rank
8789
)
88-
if self.placement_mode == PlacementMode.COLLOCATED:
89-
# in disaggregated mode, rollout backend will never offload weights
90-
# so we don't need to wake up when placement is disaggregated
90+
if self.is_weight_offloaded:
9191
super().wake_up()
92+
self.is_weight_offloaded = False
9293

9394
model = self.model_runner.model
9495
if colocate:
96+
batch_weights = []
9597
for name, handle in state_dict.items():
9698
func, args = handle
9799
list_args = list(args)
98100
list_args[6] = torch.cuda.current_device()
99101
new_weight: torch.Tensor = func(*list_args)
100-
model.load_weights([(name, new_weight)])
101-
del new_weight
102+
batch_weights.append((name, new_weight))
103+
model.load_weights(batch_weights)
102104
else:
103105
model.load_weights(state_dict.items())
104106
super().compile_or_warm_up_model()

rlinf/scheduler/dynamic_scheduler/scheduler_worker.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,8 @@ def __init__(
4343
self.components = self.component_placement._components
4444
self.workflow = workflow
4545

46-
assert self.cfg.rollout.rollout_backend == "sglang", (
47-
"only sglang is supported for dynamic scheduler"
46+
assert self.cfg.rollout.rollout_backend in ["sglang", "vllm"], (
47+
"only sglang and vllm are supported for dynamic scheduler"
4848
)
4949
assert self.cfg.actor.training_backend == "megatron", (
5050
"only megatron is supported for dynamic scheduler"

0 commit comments

Comments
 (0)