Commit 085bc51

Merge branch 'main' into release/3.6

2 parents 2f8c8a5 + b41c78d

File tree: 23 files changed (+362, -396 lines)

swift/llm/argument/rlhf_args.py

Lines changed: 4 additions & 22 deletions
@@ -5,7 +5,7 @@
 from swift.llm import MODEL_MAPPING
 from swift.trainers.arguments import GRPOArgumentsMixin, RLHFArgumentsMixin
-from swift.utils import get_logger, is_master, set_default_ddp_config
+from swift.utils import get_logger, is_master, is_mp, set_default_ddp_config
 from .train_args import TrainArguments

 logger = get_logger()
@@ -155,7 +155,6 @@ def __post_init__(self):
     def _init_grpo(self):
         if self.rlhf_type == 'grpo':
             if self.use_vllm:
-                os.environ['USE_FAST_INFERENCE'] = '1'
                 set_default_ddp_config()
             if self.async_generate or not self.use_vllm:
                 self.sleep_level = 0
@@ -255,7 +254,9 @@ def _check_grpo(self):
         trl_version = version.parse(trl.__version__)
         assert trl_version >= version.parse('0.17'), ('Your current version of `trl` is outdated. '
                                                       'Please update it by running: pip install -U trl')
-
+        if is_mp() and self.use_vllm:
+            raise ValueError('GRPO with vLLM is not compatible with `device_map`. '
+                             'Please set NPROC_PER_NODE equal to num_processes.')
         if self.use_liger_kernel:
             assert trl_version >= version.parse('0.18')
         if self.delta is not None:
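For context, the added guard fails fast when the model has been sharded across GPUs via `device_map` (model parallelism) while vLLM rollouts are requested. A standalone sketch with hypothetical names; the real check lives in `_check_grpo` and the `model_parallel` flag stands in for `swift.utils.is_mp()`:

def check_grpo_vllm(use_vllm: bool, model_parallel: bool) -> None:
    # model_parallel stands in for is_mp() (assumption: is_mp() reports
    # device_map-style model sharding across devices).
    if model_parallel and use_vllm:
        raise ValueError('GRPO with vLLM is not compatible with `device_map`. '
                         'Please set NPROC_PER_NODE equal to num_processes.')

check_grpo_vllm(use_vllm=True, model_parallel=False)  # OK: data-parallel launch
try:
    check_grpo_vllm(use_vllm=True, model_parallel=True)
except ValueError as e:
    print(e)  # raised: incompatible configuration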
@@ -308,25 +309,6 @@ def _deprecated_warning(self):
         if self.rlhf_type != 'grpo':
             return

-        if self.tensor_parallel_size is not None:
-            logger.warning(
-                "The parameter 'tensor_parallel_size' has been deprecated and will be removed in version 3.6. "
-                "It is recommended to use 'vllm_tensor_parallel_size' instead.")
-            self.vllm_tensor_parallel_size = self.tensor_parallel_size
-
-        if self.vllm_device is not None:
-            logger.warning("The parameter 'vllm_device' has been deprecated and will be removed in version 3.6. ")
-
-        if self.vllm_max_num_seqs is not None:
-            logger.warning("The parameter 'vllm_max_num_seqs' is automatically set, "
-                           'and has been deprecated and will be removed in version 3.6. ')
-
-        if self.num_infer_workers is not None:
-            logger.warning(
-                "The parameter 'num_infer_workers' has been deprecated and will be removed in version 3.6. "
-                'If you wish to use colocate mode, please use `vllm_mode colocate` instead. '
-                'If you wish to use async mode, please use `vllm_mode server` and external vLLM server instead.')
-
         if self.multi_turn_func:
             logger.warning("The parameter 'multi_turn_func' has been deprecated and will be removed in version 3.7. "
                            "Please use 'multi_turn_scheduler' instead")

swift/megatron/train/trainers/trainer.py

Lines changed: 4 additions & 2 deletions
@@ -46,8 +46,10 @@ def initialize_megatron(*_args, **kwargs):
         else:
             raise ValueError(
                 'You are using a streaming training dataset. Please explicitly specify `--train_iters`.')
-    if val_dataset is not None and args.eval_iters < 0:
-        if hasattr(val_dataset, '__len__'):
+    if args.eval_iters < 0:
+        if val_dataset is None:
+            args.eval_iters = 0
+        elif hasattr(val_dataset, '__len__'):
             dataset_sample = len(val_dataset) // step_batch_size * step_batch_size
             args.eval_iters = max(dataset_sample // args.global_batch_size, 1)
         else:
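With this change, a negative `eval_iters` now resolves three ways: 0 when there is no validation set, a size-derived value for sized datasets, and the existing fallback for datasets without `__len__`. A worked example of the sized-dataset arithmetic, with hypothetical sizes:

micro_batch_size, data_parallel_size = 2, 4
step_batch_size = micro_batch_size * data_parallel_size        # 8
global_batch_size = 64
val_len = 1000                                                 # len(val_dataset)

dataset_sample = val_len // step_batch_size * step_batch_size  # 1000
eval_iters = max(dataset_sample // global_batch_size, 1)       # 15
print(eval_iters)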

swift/megatron/train/utils.py

Lines changed: 1 addition & 2 deletions
@@ -15,8 +15,7 @@ def swift_datasets_provider(train_val_test_num_samples):
     nonlocal val_dataset
     args = get_args()
     data_parallel_size = mpu.get_data_parallel_world_size()
-    step_batch_size = \
-        args.micro_batch_size * data_parallel_size
+    step_batch_size = args.micro_batch_size * data_parallel_size
     # To avoid errors caused by the validation set being insufficient to complete a single step.
     if val_dataset is not None and len(val_dataset) < step_batch_size:
         val_dataset = None
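As the comment notes, a validation set smaller than one step's worth of samples is dropped outright. A tiny illustration with assumed sizes:

step_batch_size = 4 * 2        # micro_batch_size * data_parallel_size
val_dataset = list(range(5))   # stand-in for a 5-sample validation set
if val_dataset is not None and len(val_dataset) < step_batch_size:
    val_dataset = None         # 5 < 8, so evaluation is skipped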

swift/trainers/arguments.py

Lines changed: 0 additions & 4 deletions
@@ -155,14 +155,11 @@ class GRPOArgumentsMixin:
     top_k: int = 50
     top_p: float = 0.9
     repetition_penalty: float = 1.
-    num_infer_workers: Optional[int] = None  # deprecated
     # vllm
     vllm_mode: Literal['server', 'colocate'] = 'colocate'
     # internal vllm (colocate)
-    vllm_device: Optional[List[str]] = None  # deprecated
     vllm_gpu_memory_utilization: float = 0.9
     vllm_max_model_len: Optional[int] = None
-    vllm_max_num_seqs: Optional[int] = None  # deprecated
     vllm_enforce_eager: bool = False
     vllm_limit_mm_per_prompt: Optional[Union[dict, str]] = None  # '{"image": 5, "video": 2}'
     vllm_enable_prefix_caching: bool = True
@@ -195,7 +192,6 @@ class GRPOArgumentsMixin:
     ref_model_mixup_alpha: float = 0.6

     async_generate: bool = False
-    tensor_parallel_size: Optional[int] = None  # deprecated

     sleep_level: int = 0
     move_model_batches: Optional[int] = None
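After this cleanup, colocate-mode rollout is driven by the surviving `vllm_*` fields. A hedged sketch of those knobs as keyword arguments (hypothetical values; field names match the mixin above, and the receiving argument class is assumed):

grpo_vllm_kwargs = dict(
    vllm_mode='colocate',
    vllm_gpu_memory_utilization=0.9,
    vllm_max_model_len=8192,
    vllm_enforce_eager=False,
    vllm_limit_mm_per_prompt={'image': 5, 'video': 2},
    vllm_enable_prefix_caching=True,
)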

swift/trainers/rlhf_trainer/grpo_trainer.py

Lines changed: 46 additions & 12 deletions
@@ -1094,8 +1094,20 @@ def _apply_chat_template_to_messages_list(self, messages_list: InputsType):
             InferRequest.remove_response(messages)
             template_inputs, _ = StdTemplateInputs.from_dict({'messages': messages})
             res_context_list, _, _ = self.template._swift_encode(template_inputs)
-            prompts_text.append(''.join(elem for elem in res_context_list if isinstance(elem, str)))

+            # Check each element's type and convert it to text.
+            processed_context = []
+            for context in res_context_list:
+                if isinstance(context, str):
+                    processed_context.append(context)
+                elif isinstance(context, list) and all(isinstance(x, int) for x in context):
+                    # Decode token IDs back to text.
+                    decoded_text = self.template.tokenizer.decode(context)
+                    processed_context.append(decoded_text)
+                else:
+                    # Fall back to the string representation for any other type.
+                    processed_context.append(str(context))
+            prompts_text.append(''.join(processed_context))
         return prompts_text

     @profiling_decorator
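The new branch matters for templates whose `_swift_encode` output interleaves strings with raw token-ID lists, which the old one-liner silently dropped. A minimal illustration of the conversion, assuming a Hugging Face tokenizer and made-up token IDs (the trainer uses `self.template.tokenizer` instead):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')  # assumed model
res_context_list = ['<|im_start|>user\n', [9707], '<|im_end|>\n']        # made-up IDs

processed = []
for elem in res_context_list:
    if isinstance(elem, str):
        processed.append(elem)
    elif isinstance(elem, list) and all(isinstance(x, int) for x in elem):
        processed.append(tokenizer.decode(elem))  # token IDs -> text
    else:
        processed.append(str(elem))
print(''.join(processed))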
@@ -1421,7 +1433,7 @@ def _process_infer_requests_images(self, infer_requests: InputsType):
             return

     def old_policy(self):
-        return self.num_iterations > 1 or self.args.steps_per_generation > self.args.gradient_accumulation_steps
+        return self.num_iterations > 1 or self.args.gradient_accumulation_steps % self.args.steps_per_generation != 0

     @property
     def _queue(self):
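The fix changes when rollouts count as off-policy: with a single GRPO iteration, the old policy is needed exactly when `gradient_accumulation_steps` is not a multiple of `steps_per_generation`. A standalone sketch of the corrected predicate:

def old_policy(num_iterations: int, gradient_accumulation_steps: int,
               steps_per_generation: int) -> bool:
    return num_iterations > 1 or gradient_accumulation_steps % steps_per_generation != 0

print(old_policy(1, 8, 4))  # False: generations divide evenly into optimizer steps
print(old_policy(1, 6, 4))  # True: 6 % 4 != 0, some batches train on stale rollouts
print(old_policy(2, 8, 4))  # True: multiple iterations always reuse rollouts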
@@ -1580,18 +1592,40 @@ def is_async_generate_eval_rollout_done(self):
     def is_async_generate_train_rollout_done(self):
         return not self.train_queue.empty()

-    def inputs_to_rolloutrequest(self, inputs: InputsType) -> RolloutInferRequest:
+    def inputs_to_rolloutrequest(self, inputs: InputsType) -> List[RolloutInferRequest]:
+        """Convert a list of inputs to a list of RolloutInferRequest objects.
+
+        If an input contains a 'data_dict' key, it is used as the base for the new data_dict.
+        For other keys that overlap with keys in data_dict, the values from data_dict take
+        priority; non-overlapping keys are added to data_dict.
+
+        Args:
+            inputs: List of input dictionaries
+
+        Returns:
+            List of RolloutInferRequest objects
+        """
         request_keys = ['messages', 'images', 'audios', 'videos', 'tools', 'objects']
-        infer_requests = [
-            RolloutInferRequest(
-                **{
-                    **{k: request[k]
-                       for k in request_keys if k in request}, 'data_dict':
-                    {k: request[k]
-                     for k in request if k not in request_keys}
-                }) for request in inputs
-        ]
+        infer_requests = []
+
+        for request in inputs:
+            # Use the incoming 'data_dict' as the base if present.
+            base_data_dict = {}
+            if 'data_dict' in request:
+                if isinstance(request['data_dict'], dict):
+                    base_data_dict = request['data_dict']
+                else:
+                    raise ValueError('data_dict exists but is not a dictionary')
+
+            # Collect all non-request-key items as extra fields.
+            extra_data = {k: request[k] for k in request if k not in request_keys and k != 'data_dict'}
+
+            # Merge, keeping keys from base_data_dict as priority.
+            final_data_dict = {**extra_data, **base_data_dict}
+
+            # Create the RolloutInferRequest instance.
+            req_args = {k: request[k] for k in request_keys if k in request}
+            infer_requests.append(RolloutInferRequest(**req_args, data_dict=final_data_dict))

         return infer_requests
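A usage sketch of the merge rule with a hypothetical input: loose extra fields land in `data_dict`, but keys already present in the incoming `data_dict` win on conflict (`request_keys` mirrors the list in the method above):

request = {
    'messages': [{'role': 'user', 'content': 'hi'}],
    'solution': '42',                 # extra field, folded into data_dict
    'data_dict': {'solution': '43'},  # pre-existing data_dict takes priority
}
request_keys = ['messages', 'images', 'audios', 'videos', 'tools', 'objects']
extra = {k: v for k, v in request.items() if k not in request_keys and k != 'data_dict'}
final_data_dict = {**extra, **request['data_dict']}
print(final_data_dict)  # {'solution': '43'}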

swift/ui/llm_grpo/external_rollout.py

Lines changed: 1 addition & 2 deletions
@@ -110,7 +110,7 @@ class LLMRollout(BaseUI):

     @classmethod
     def do_build_ui(cls, base_tab: Type['BaseUI']):
-        with gr.Accordion(elem_id='llm_rollout', visible=False):
+        with gr.Accordion(elem_id='llm_rollout', open=False, visible=False):
             default_device = 'cpu'
             device_count = get_device_count()
             if device_count > 0:
@@ -119,7 +119,6 @@ def do_build_ui(cls, base_tab: Type['BaseUI']):
             with gr.Row():
                 gr.Textbox(elem_id='tensor_parallel_size', lines=1, value='1', scale=4)
                 gr.Textbox(elem_id='data_parallel_size', lines=1, value='1', scale=4)
-                gr.Textbox(elem_id='max_model_len', lines=1, value='', scale=4)
                 gr.Slider(elem_id='gpu_memory_utilization', minimum=0.0, maximum=1.0, step=0.05, value=0.9, scale=4)
             with gr.Row(equal_height=True):
                 gr.Dropdown(

swift/ui/llm_grpo/external_runtime.py

Lines changed: 3 additions & 2 deletions
@@ -56,8 +56,8 @@ class RolloutRuntime(Runtime):
                 'en': 'Logging content'
             },
             'info': {
-                'zh': '如果日志无更新请再次点击"展示日志内容"',
-                'en': 'Please press "Show log" if the log content is not updating'
+                'zh': '如果日志无更新请再次点击"展示rollout状态"',
+                'en': 'Please press "Show running status" if the log content is not updating'
             }
         },
         'rollout_running_tasks': {
@@ -90,6 +90,7 @@ def do_build_ui(cls, base_tab: Type['BaseUI']):
         with gr.Blocks():
             with gr.Row(equal_height=True):
                 gr.Dropdown(elem_id='rollout_running_tasks', scale=10, allow_custom_value=True)
+            with gr.Row(equal_height=True):
                 gr.Button(elem_id='rollout_refresh_tasks', scale=1, variant='primary')
                 gr.Button(elem_id='rollout_show_log', scale=1, variant='primary')
                 gr.Button(elem_id='rollout_stop_show_log', scale=1)
