Commit e457716

Fix glm4v rlhf (#1745)
* update
* update
* update
* lint
1 parent 98bf327 commit e457716

4 files changed: +34 -12 lines changed

swift/trainers/cpo_trainer.py

Lines changed: 12 additions & 3 deletions
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import torch
 from torch import nn
@@ -204,7 +204,8 @@ def concatenated_forward(
         )
 
         if self.is_vision_model:
-            concatenated_batch = self.concatenated_vision_inputs(batch, concatenated_batch)
+            concatenated_batch = self.concatenated_vision_inputs(
+                batch, concatenated_batch, device=self.accelerator.device)
 
         len_chosen = batch['chosen_labels'].shape[0]
 
@@ -216,8 +217,9 @@ def concatenated_forward(
         } if self.is_encoder_decoder else {})
 
         if self.is_vision_model:
+            # Here, we restore the _data, processing image information within the forward hook of the model.
             batch_size = concatenated_batch['concatenated_input_ids'].shape[0]
-            if self._data_keys is not None:
+            if self._data_keys:
                 _data = [dict() for _ in range(batch_size)]
                 for k in self._data_keys:
                     if k == 'input_ids':
@@ -231,6 +233,9 @@ def concatenated_forward(
                         _data = [{**d, k: concatenated_batch[k][i // 2]} for i, d in enumerate(_data)]
                 model_kwargs['_data'] = _data
 
+        if 'images' in concatenated_batch:
+            model_kwargs['images'] = concatenated_batch['images']
+
         if self.aux_loss_enabled:
             model_kwargs['output_router_logits'] = True
 
@@ -292,6 +297,7 @@ def cross_entropy_loss(logits, labels):
     def concatenated_vision_inputs(
             batch: Dict[str, Union[List, torch.LongTensor]],
             concatenated_batch: Dict[str, torch.LongTensor],
+            device: Optional[torch.device] = None,
     ) -> Dict[str, torch.LongTensor]:
         if 'prompt_pixel_values' in batch:
             pixel_values = [values for values in batch['prompt_pixel_values']]
@@ -308,6 +314,9 @@ def concatenated_vision_inputs(
         if 'prompt_image_sizes' in batch:
            concatenated_batch['image_sizes'] = batch['prompt_image_sizes']
 
+        if 'prompt_images' in batch:
+            # images not in _data, we manually execute data collector here
+            concatenated_batch['images'] = batch['prompt_images'].squeeze(1).repeat(2, 1, 1, 1).to(device=device)
         return concatenated_batch
 
     @staticmethod
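
Note on the change above: the new `prompt_images` handling duplicates the prompt's image tensor for the chosen and rejected halves of the concatenated batch and moves it to the training device, mirroring how the other vision keys are repeated. A minimal sketch of that tensor manipulation, with illustrative shapes that are assumptions rather than values from the commit:

import torch

# Hypothetical collated batch: 2 prompts, each with 1 image of shape (3, 224, 224),
# padded along dim 1 by the patched data collator.
prompt_images = torch.randn(2, 1, 3, 224, 224)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# squeeze(1) drops the per-prompt image axis -> (2, 3, 224, 224);
# repeat(2, 1, 1, 1) stacks copies for the chosen and rejected halves -> (4, 3, 224, 224);
# .to(device=...) places the tensor on the same device as the model.
images = prompt_images.squeeze(1).repeat(2, 1, 1, 1).to(device=device)
assert images.shape == (4, 3, 224, 224)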

swift/trainers/dpo_trainer.py

Lines changed: 9 additions & 4 deletions
@@ -294,7 +294,7 @@ def concatenated_forward(
         if self.is_vision_model:
             # Here, we restore the _data, processing image information within the forward hook of the model.
             batch_size = concatenated_batch['concatenated_input_ids'].shape[0]
-            if self._data_keys is not None:
+            if self._data_keys:
                 _data = [dict() for _ in range(batch_size)]
                 for k in self._data_keys:
                     if k == 'input_ids':
@@ -306,7 +306,10 @@ def concatenated_forward(
                         _data = [{**d, k: concatenated_batch[k][i // 2].to(model_dtype)} for i, d in enumerate(_data)]
                     else:
                         _data = [{**d, k: concatenated_batch[k][i // 2]} for i, d in enumerate(_data)]
-            model_kwargs['_data'] = _data
+                model_kwargs['_data'] = _data
+
+        if 'images' in concatenated_batch:
+            model_kwargs['images'] = concatenated_batch['images']
 
         if self.aux_loss_enabled:
             model_kwargs['output_router_logits'] = True
@@ -427,9 +430,8 @@ def concatenated_inputs(
             batch['prompt_attention_mask'].repeat(2, 1).to(device=device))
 
         # patch here
-        # leave data collector in hook
-
         if is_vision_model:
+            # for keys appear in _data, we leave data collector in hook
             if 'prompt_pixel_values' in batch:
                 pixel_values = [values for values in batch['prompt_pixel_values']]
                 concatenated_batch['pixel_values'] = pixel_values
@@ -445,6 +447,9 @@ def concatenated_inputs(
             if 'prompt_image_sizes' in batch:
                 concatenated_batch['image_sizes'] = batch['prompt_image_sizes']
 
+            if 'prompt_images' in batch:
+                # images not in _data, we manually execute data collector here
+                concatenated_batch['images'] = batch['prompt_images'].squeeze(1).repeat(2, 1, 1, 1).to(device=device)
         return concatenated_batch
 
     @staticmethod
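
Two details of the dpo change are easy to miss: the guard switches from `is not None` to plain truthiness, so an empty `_data_keys` container no longer attaches a `_data` made of empty dicts, and models such as glm4v instead receive their images directly through `model_kwargs['images']`. A small self-contained sketch of that guard logic, using hypothetical names rather than the trainer's real attributes:

from typing import Dict, List, Optional

import torch


def build_model_kwargs(data_keys: Optional[Dict[str, str]], concatenated_batch: Dict[str, torch.Tensor],
                       batch_size: int) -> Dict[str, object]:
    # Sketch: only reconstruct _data when there are keys to restore in the forward hook.
    model_kwargs: Dict[str, object] = {}
    # `if data_keys is not None` would also enter this branch for an empty dict,
    # attaching a useless list of empty dicts; truthiness skips it entirely.
    if data_keys:
        _data: List[dict] = [dict() for _ in range(batch_size)]
        for k in data_keys:
            # chosen/rejected pairs share the prompt's value, hence i // 2
            _data = [{**d, k: concatenated_batch[k][i // 2]} for i, d in enumerate(_data)]
        model_kwargs['_data'] = _data
    # Models like glm4v take their images as a plain forward argument instead.
    if 'images' in concatenated_batch:
        model_kwargs['images'] = concatenated_batch['images']
    return model_kwargs


kwargs = build_model_kwargs({}, {'images': torch.zeros(4, 3, 224, 224)}, batch_size=4)
assert '_data' not in kwargs and 'images' in kwargs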

swift/trainers/orpo_trainer.py

Lines changed: 10 additions & 2 deletions
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import torch
 from torch import nn
@@ -215,8 +215,9 @@ def concatenated_forward(
         } if self.is_encoder_decoder else {})
 
         if self.is_vision_model:
+            # Here, we restore the _data, processing image information within the forward hook of the model.
             batch_size = concatenated_batch['concatenated_input_ids'].shape[0]
-            if self._data_keys is not None:
+            if self._data_keys:
                 _data = [dict() for _ in range(batch_size)]
                 for k in self._data_keys:
                     if k == 'input_ids':
@@ -230,6 +231,9 @@ def concatenated_forward(
                         _data = [{**d, k: concatenated_batch[k][i // 2]} for i, d in enumerate(_data)]
                 model_kwargs['_data'] = _data
 
+        if 'images' in concatenated_batch:
+            model_kwargs['images'] = concatenated_batch['images']
+
         if self.aux_loss_enabled:
             model_kwargs['output_router_logits'] = True
 
@@ -293,6 +297,7 @@ def cross_entropy_loss(logits, labels):
     def concatenated_vision_inputs(
             batch: Dict[str, Union[List, torch.LongTensor]],
             concatenated_batch: Dict[str, torch.LongTensor],
+            device: Optional[torch.device] = None,
     ) -> Dict[str, torch.LongTensor]:
         if 'prompt_pixel_values' in batch:
             pixel_values = [values for values in batch['prompt_pixel_values']]
@@ -309,6 +314,9 @@ def concatenated_vision_inputs(
         if 'prompt_image_sizes' in batch:
             concatenated_batch['image_sizes'] = batch['prompt_image_sizes']
 
+        if 'prompt_images' in batch:
+            # images not in _data, we manually execute data collector here
+            concatenated_batch['images'] = batch['prompt_images'].squeeze(1).repeat(2, 1, 1, 1).to(device=device)
         return concatenated_batch
 
     @staticmethod
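
All three trainers lean on the convention spelled out in the added comment: per-sample multimodal fields are packed into `model_kwargs['_data']` and only unpacked inside a hook on the model's forward, so padding and concatenation stay in the trainer. A rough sketch of that general pattern using a plain PyTorch forward pre-hook; it illustrates the mechanism only and is not swift's actual hook implementation:

import torch
from torch import nn


class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 4)

    def forward(self, inputs_embeds: torch.Tensor, **kwargs) -> torch.Tensor:
        return self.linear(inputs_embeds)


def restore_data_hook(module, args, kwargs):
    # Pop the per-sample payload before the real forward runs; a real hook would
    # turn the stored image fields into embeddings and merge them into the inputs here.
    _data = kwargs.pop('_data', None)
    if _data is not None:
        print(f'restored {len(_data)} per-sample dicts')
    return args, kwargs


model = TinyModel()
model.register_forward_pre_hook(restore_data_hook, with_kwargs=True)
out = model(torch.randn(2, 4), _data=[{'images': None}, {'images': None}])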

swift/trainers/utils.py

Lines changed: 3 additions & 3 deletions
@@ -151,7 +151,7 @@ def patch_datacollator():
     def new_call(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
         padded_batch = {}
         for k in features[0].keys():
-            if k.endswith(('_input_ids', '_attention_mask', '_labels', '_pixel_values')):
+            if k.endswith(('_input_ids', '_attention_mask', '_labels', '_pixel_values', '_images')):
                 if self.is_encoder_decoder:
                     to_pad = [torch.LongTensor(ex[k]) for ex in features]
 
@@ -187,7 +187,7 @@ def new_call(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
                        padding_value = self.label_pad_token_id
                    elif k.endswith('_attention_mask'):
                        padding_value = 0
-                   elif k.endswith('_pixel_values'):
+                   elif k.endswith(('_pixel_values', '_images')):
                        padding_value = 0
                    else:
                        raise ValueError(f"Unexpected key in batch '{k}'")
@@ -199,7 +199,7 @@ def new_call(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
                        padding_side = 'right'
 
                    # Set the dtype
-                   if k.endswith('_pixel_values'):
+                   if k.endswith(('_pixel_values', '_images')):
                        dtype = torch.float32  # will be downcasted if necessary by the Trainer
                    else:
                        dtype = torch.int64
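
The utils.py patch extends the collator's special-casing of `_pixel_values` keys to keys ending in `_images`: those entries are zero-padded and kept as float32 instead of being treated as int64 token sequences. A minimal illustration of that padding path, simplified from the patched collator (it ignores labels, attention masks and the encoder-decoder branch):

import torch
from torch.nn.utils.rnn import pad_sequence

# Two examples whose 'prompt_images' entries contain different numbers of images.
features = [
    {'prompt_images': torch.randn(1, 3, 224, 224)},
    {'prompt_images': torch.randn(2, 3, 224, 224)},
]

k = 'prompt_images'
if k.endswith(('_pixel_values', '_images')):
    padding_value = 0        # image-like keys are zero-padded
    dtype = torch.float32    # and stay floating point
else:
    padding_value = 0
    dtype = torch.int64

to_pad = [ex[k].to(dtype) for ex in features]
padded = pad_sequence(to_pad, batch_first=True, padding_value=padding_value)
print(padded.shape)  # torch.Size([2, 2, 3, 224, 224])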
