OpenGVLab · LuminolT · Oct 13, 2025
diff --git a/internvl_chat/internvl/model/internvl_chat/configuration_internvl_chat.py b/internvl_chat/internvl/model/internvl_chat/configuration_internvl_chat.py
@@ -34,7 +34,7 @@ def __init__(
             template=None,
             dynamic_image_size=False,
             use_thumbnail=False,
-            ps_version='v1',
+            pixel_unshuffle_version='v1',
             min_dynamic_patch=1,
             max_dynamic_patch=6,
             **kwargs):
@@ -69,7 +69,7 @@ def __init__(
         self.template = template
         self.dynamic_image_size = dynamic_image_size
         self.use_thumbnail = use_thumbnail
-        self.ps_version = ps_version  # pixel shuffle version
+        self.pixel_unshuffle_version = pixel_unshuffle_version  # pixel unshuffle version
         self.min_dynamic_patch = min_dynamic_patch
         self.max_dynamic_patch = max_dynamic_patch
 
@@ -79,7 +79,7 @@ def __init__(
         self.llm_config.tie_word_embeddings = self.tie_word_embeddings
 
         logger.info(f'vision_select_layer: {self.select_layer}')
-        logger.info(f'ps_version: {self.ps_version}')
+        logger.info(f'pixel_unshuffle_version: {self.pixel_unshuffle_version}')
         logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}')
         logger.info(f'max_dynamic_patch: {self.max_dynamic_patch}')
 
@@ -102,7 +102,7 @@ def to_dict(self):
         output['template'] = self.template
         output['dynamic_image_size'] = self.dynamic_image_size
         output['use_thumbnail'] = self.use_thumbnail
-        output['ps_version'] = self.ps_version
+        output['pixel_unshuffle_version'] = self.pixel_unshuffle_version
         output['min_dynamic_patch'] = self.min_dynamic_patch
         output['max_dynamic_patch'] = self.max_dynamic_patch
 

diff --git a/internvl_chat/internvl/model/internvl_chat/modeling_internvl_chat.py b/internvl_chat/internvl/model/internvl_chat/modeling_internvl_chat.py
@@ -56,15 +56,15 @@ def __init__(self, config: InternVLChatConfig, vision_model=None, language_model
         self.template = config.template
         self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))
         self.downsample_ratio = config.downsample_ratio
-        self.ps_version = config.ps_version
+        self.pixel_unshuffle_version = config.pixel_unshuffle_version
         self.llm_arch_name = config.llm_config.architectures[0]
         # Enable Flash Attention if supported, otherwise fall back to eager attention.
         use_flash_attn = use_flash_attn if has_flash_attn else False
         config.vision_config.use_flash_attn = True if use_flash_attn else False
         config.llm_config.attn_implementation = 'flash_attention_2' if use_flash_attn else 'eager'
 
         logger.info(f'num_image_token: {self.num_image_token}')
-        logger.info(f'ps_version: {self.ps_version}')
+        logger.info(f'pixel_unshuffle_version: {self.pixel_unshuffle_version}')
         if vision_model is not None:
             self.vision_model = vision_model
         else:
@@ -254,7 +254,7 @@ def forward(
             attentions=outputs.attentions,
         )
 
-    def pixel_shuffle(self, x, scale_factor=0.5):
+    def pixel_unshuffle(self, x, scale_factor=0.5):
         n, w, h, c = x.size()
         # N, W, H, C --> N, W, H * scale, C // scale
         x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
@@ -263,8 +263,8 @@ def pixel_shuffle(self, x, scale_factor=0.5):
         # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
         x = x.view(n, int(h * scale_factor), int(w * scale_factor),
                    int(c / (scale_factor * scale_factor)))
-        if self.ps_version == 'v1':
-            warnings.warn("In ps_version 'v1', the height and width have not been swapped back, "
+        if self.pixel_unshuffle_version == 'v1':
+            warnings.warn("In pixel_unshuffle_version 'v1', the height and width have not been swapped back, "
                           'which results in a transposed image.')
         else:
             x = x.permute(0, 2, 1, 3).contiguous()
@@ -285,7 +285,7 @@ def extract_feature(self, pixel_values):
 
         h = w = int(vit_embeds.shape[1] ** 0.5)
         vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
-        vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
+        vit_embeds = self.pixel_unshuffle(vit_embeds, scale_factor=self.downsample_ratio)
         vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
         vit_embeds = self.mlp1(vit_embeds)
         return vit_embeds

diff --git a/internvl_chat/internvl/train/internvl_chat_finetune.py b/internvl_chat/internvl/train/internvl_chat_finetune.py
@@ -145,10 +145,15 @@ class ModelArguments:
         default=0.0,
         metadata={'help': 'Set the drop path rate for the ViT. Default is 0.'},
     )
-    ps_version: Literal['v1', 'v2'] = field(
+    pixel_unshuffle_version: Literal['v1', 'v2'] = field(
         default='v2',
         metadata={'help': 'Specify the version of pixel shuffle implementation. Default is v2.'}
     )
+    # Deprecated alias for pixel_unshuffle_version; keep temporarily for backward compatibility
+    ps_version: Optional[Literal['v1', 'v2']] = field(
+        default=None,
+        metadata={'help': 'DEPRECATED: use --pixel_unshuffle_version instead.'}
+    )
     use_fast_tokenizer: bool = field(
         default=False,
         metadata={'help': 'Set to True to use the fast mode of the tokenizer.'}
@@ -827,6 +832,34 @@ def main():
         handlers=[logging.StreamHandler(sys.stdout)],
     )
 
+    # -------------------- begin: migrate deprecated args (ps_version -> pixel_unshuffle_version) --------------------
+
+    # !! WHEN `ps_version` IS FINALLY REMOVED, DELETE THIS WHOLE BLOCK AND THE `ps_version` FIELD OF ModelArguments !!
+
+    # Heuristic for "explicitly set": compare with the dataclass default (an explicit default value looks unset)
+    _default_new = ModelArguments.__dataclass_fields__['pixel_unshuffle_version'].default
+    new_is_default = (model_args.pixel_unshuffle_version == _default_new)
+
+    # If the old argument is provided, issue a deprecation warning and map it
+    if getattr(model_args, 'ps_version', None) is not None:
+        warnings.warn(
+            '`--ps_version` is deprecated and will be removed; use `--pixel_unshuffle_version` instead.',
+            DeprecationWarning
+        )
+        # If the new argument appears to have been explicitly set, and conflicts with the old value → take the new argument
+        if not new_is_default and model_args.ps_version != model_args.pixel_unshuffle_version:
+            logger.warning(
+                f'Both ps_version={model_args.ps_version} (deprecated) and '
+                f'pixel_unshuffle_version={model_args.pixel_unshuffle_version} are provided; '
+                f'using pixel_unshuffle_version.'
+            )
+        else:
+            # Else, map the old value to the new argument
+            model_args.pixel_unshuffle_version = model_args.ps_version
+
+    logger.info(f'Pixel unshuffle version = {model_args.pixel_unshuffle_version}')
+    # --------------------  end: migrate deprecated args  ------------------------------------------------------------
+
     if training_args.should_log:
         # The default of training_args.log_level is passive, so we set log level at info here to have that default.
         transformers.utils.logging.set_verbosity_info()
@@ -903,7 +936,7 @@ def main():
         config.select_layer = model_args.vision_select_layer
         config.dynamic_image_size = data_args.dynamic_image_size
         config.use_thumbnail = data_args.use_thumbnail
-        config.ps_version = model_args.ps_version
+        config.pixel_unshuffle_version = model_args.pixel_unshuffle_version
         config.min_dynamic_patch = data_args.min_dynamic_patch
         config.max_dynamic_patch = data_args.max_dynamic_patch
         model = InternVLChatModel.from_pretrained(
@@ -932,7 +965,7 @@ def main():
             vision_config.to_dict(), llm_config.to_dict(), downsample_ratio=data_args.down_sample_ratio,
             pad2square=data_args.pad2square, template=data_args.conv_style,
             select_layer=model_args.vision_select_layer, dynamic_image_size=data_args.dynamic_image_size,
-            use_thumbnail=data_args.use_thumbnail, ps_version=model_args.ps_version,
+            use_thumbnail=data_args.use_thumbnail, pixel_unshuffle_version=model_args.pixel_unshuffle_version,
             min_dynamic_patch=data_args.min_dynamic_patch, max_dynamic_patch=data_args.max_dynamic_patch)
         internvl_chat_config.force_image_size = data_args.force_image_size
         logger.info('Building InternVLChatModel...')

diff --git a/internvl_chat/internvl/train/internvl_chat_mpo.py b/internvl_chat/internvl/train/internvl_chat_mpo.py
@@ -146,10 +146,15 @@ class ModelArguments:
         default=0.0,
         metadata={'help': 'Set the drop path rate for the ViT. Default is 0.'},
     )
-    ps_version: Literal['v1', 'v2'] = field(
+    pixel_unshuffle_version: Literal['v1', 'v2'] = field(
         default='v2',
         metadata={'help': 'Specify the version of pixel shuffle implementation. Default is v2.'}
     )
+    # Deprecated alias for pixel_unshuffle_version; keep temporarily for backward compatibility
+    ps_version: Optional[Literal['v1', 'v2']] = field(
+        default=None,
+        metadata={'help': 'DEPRECATED: use --pixel_unshuffle_version instead.'}
+    )
     use_fast_tokenizer: bool = field(
         default=False,
         metadata={'help': 'Set to True to use the fast mode of the tokenizer.'}
@@ -803,6 +808,35 @@ def main():
         handlers=[logging.StreamHandler(sys.stdout)],
     )
 
+    # -------------------- begin: migrate deprecated args (ps_version -> pixel_unshuffle_version) --------------------
+
+    # !! WHEN `ps_version` IS FINALLY REMOVED, DELETE THIS WHOLE BLOCK AND THE `ps_version` FIELD OF ModelArguments !!
+
+    # Heuristic for "explicitly set": compare with the dataclass default (an explicit default value looks unset)
+    _default_new = ModelArguments.__dataclass_fields__['pixel_unshuffle_version'].default
+    new_is_default = (model_args.pixel_unshuffle_version == _default_new)
+
+    # If the old argument is provided, issue a deprecation warning and map it
+    if getattr(model_args, 'ps_version', None) is not None:
+        warnings.warn(
+            '`--ps_version` is deprecated and will be removed; use `--pixel_unshuffle_version` instead.',
+            DeprecationWarning
+        )
+        # If the new argument appears to have been explicitly set, and conflicts with the old value → take the new argument
+        if not new_is_default and model_args.ps_version != model_args.pixel_unshuffle_version:
+            logger.warning(
+                f'Both ps_version={model_args.ps_version} (deprecated) and '
+                f'pixel_unshuffle_version={model_args.pixel_unshuffle_version} are provided; '
+                f'using pixel_unshuffle_version.'
+            )
+        else:
+            # Else, map the old value to the new argument
+            model_args.pixel_unshuffle_version = model_args.ps_version
+
+    logger.info(f'Pixel unshuffle version = {model_args.pixel_unshuffle_version}')
+    # --------------------  end: migrate deprecated args  ------------------------------------------------------------
+
+
     if training_args.should_log:
         # The default of training_args.log_level is passive, so we set log level at info here to have that default.
         transformers.utils.logging.set_verbosity_info()
@@ -873,7 +907,7 @@ def main():
         config.select_layer = model_args.vision_select_layer
         config.dynamic_image_size = data_args.dynamic_image_size
         config.use_thumbnail = data_args.use_thumbnail
-        config.ps_version = model_args.ps_version
+        config.pixel_unshuffle_version = model_args.pixel_unshuffle_version
         config.min_dynamic_patch = data_args.min_dynamic_patch
         config.max_dynamic_patch = data_args.max_dynamic_patch
         model = InternVLChatModel.from_pretrained(
@@ -904,7 +938,7 @@ def main():
             vision_config.to_dict(), llm_config.to_dict(), downsample_ratio=data_args.down_sample_ratio,
             pad2square=data_args.pad2square, template=data_args.conv_style,
             select_layer=model_args.vision_select_layer, dynamic_image_size=data_args.dynamic_image_size,
-            use_thumbnail=data_args.use_thumbnail, ps_version=model_args.ps_version,
+            use_thumbnail=data_args.use_thumbnail, pixel_unshuffle_version=model_args.pixel_unshuffle_version,
             min_dynamic_patch=data_args.min_dynamic_patch, max_dynamic_patch=data_args.max_dynamic_patch)
         internvl_chat_config.force_image_size = data_args.force_image_size
         logger.info('Building InternVLChatModel...')

diff --git a/internvl_chat/internvl/train/internvl_chat_pretrain.py b/internvl_chat/internvl/train/internvl_chat_pretrain.py
@@ -145,10 +145,15 @@ class ModelArguments:
         default=0.0,
         metadata={'help': 'Set the drop path rate for the ViT. Default is 0.'},
     )
-    ps_version: Literal['v1', 'v2'] = field(
+    pixel_unshuffle_version: Literal['v1', 'v2'] = field(
         default='v2',
         metadata={'help': 'Specify the version of pixel shuffle implementation. Default is v2.'}
     )
+    # Deprecated alias for pixel_unshuffle_version; keep temporarily for backward compatibility
+    ps_version: Optional[Literal['v1', 'v2']] = field(
+        default=None,
+        metadata={'help': 'DEPRECATED: use --pixel_unshuffle_version instead.'}
+    )
     use_fast_tokenizer: bool = field(
         default=False,
         metadata={'help': 'Set to True to use the fast mode of the tokenizer.'}
@@ -870,6 +875,36 @@ def main():
         datefmt='%m/%d/%Y %H:%M:%S',
         handlers=[logging.StreamHandler(sys.stdout)],
     )
+
+
+    # -------------------- begin: migrate deprecated args (ps_version -> pixel_unshuffle_version) --------------------
+
+    # !! WHEN `ps_version` IS FINALLY REMOVED, DELETE THIS WHOLE BLOCK AND THE `ps_version` FIELD OF ModelArguments !!
+
+    # Heuristic for "explicitly set": compare with the dataclass default (an explicit default value looks unset)
+    _default_new = ModelArguments.__dataclass_fields__['pixel_unshuffle_version'].default
+    new_is_default = (model_args.pixel_unshuffle_version == _default_new)
+
+    # If the old argument is provided, issue a deprecation warning and map it
+    if getattr(model_args, 'ps_version', None) is not None:
+        warnings.warn(
+            '`--ps_version` is deprecated and will be removed; use `--pixel_unshuffle_version` instead.',
+            DeprecationWarning
+        )
+        # If the new argument appears to have been explicitly set, and conflicts with the old value → take the new argument
+        if not new_is_default and model_args.ps_version != model_args.pixel_unshuffle_version:
+            logger.warning(
+                f'Both ps_version={model_args.ps_version} (deprecated) and '
+                f'pixel_unshuffle_version={model_args.pixel_unshuffle_version} are provided; '
+                f'using pixel_unshuffle_version.'
+            )
+        else:
+            # Else, map the old value to the new argument
+            model_args.pixel_unshuffle_version = model_args.ps_version
+
+    logger.info(f'Pixel unshuffle version = {model_args.pixel_unshuffle_version}')
+    # --------------------  end: migrate deprecated args  ------------------------------------------------------------
+
 
     if training_args.should_log:
         # The default of training_args.log_level is passive, so we set log level at info here to have that default.
@@ -947,7 +982,7 @@ def main():
         config.select_layer = model_args.vision_select_layer
         config.dynamic_image_size = data_args.dynamic_image_size
         config.use_thumbnail = data_args.use_thumbnail
-        config.ps_version = model_args.ps_version
+        config.pixel_unshuffle_version = model_args.pixel_unshuffle_version
         config.min_dynamic_patch = data_args.min_dynamic_patch
         config.max_dynamic_patch = data_args.max_dynamic_patch
         model = InternVLChatModel.from_pretrained(
@@ -976,7 +1011,7 @@ def main():
             vision_config.to_dict(), llm_config.to_dict(), downsample_ratio=data_args.down_sample_ratio,
             pad2square=data_args.pad2square, template=data_args.conv_style,
             select_layer=model_args.vision_select_layer, dynamic_image_size=data_args.dynamic_image_size,
-            use_thumbnail=data_args.use_thumbnail, ps_version=model_args.ps_version,
+            use_thumbnail=data_args.use_thumbnail, pixel_unshuffle_version=model_args.pixel_unshuffle_version,
             min_dynamic_patch=data_args.min_dynamic_patch, max_dynamic_patch=data_args.max_dynamic_patch)
         internvl_chat_config.force_image_size = data_args.force_image_size
         logger.info('Building InternVLChatModel...')

diff --git a/...ll/internvl1.2/2nd_finetune/internvl_chat_v1_2_hermes2_yi34b_448_res_2nd_finetune_full.sh b/...ll/internvl1.2/2nd_finetune/internvl_chat_v1_2_hermes2_yi34b_448_res_2nd_finetune_full.sh
@@ -70,7 +70,7 @@ srun -p ${PARTITION} \
   --group_by_length True \
   --dynamic_image_size False \
   --use_thumbnail False \
-  --ps_version 'v1' \
+  --pixel_unshuffle_version 'v1' \
   --deepspeed "zero_stage3_config_34b.json" \
   --report_to "tensorboard" \
   2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
diff --git a/...ll/internvl1.2/2nd_finetune/internvl_chat_v1_2_hermes2_yi34b_448_res_2nd_finetune_lora.sh b/...ll/internvl1.2/2nd_finetune/internvl_chat_v1_2_hermes2_yi34b_448_res_2nd_finetune_lora.sh
@@ -63,7 +63,7 @@ torchrun \
   --group_by_length True \
   --dynamic_image_size False \
   --use_thumbnail False \
-  --ps_version 'v1' \
+  --pixel_unshuffle_version 'v1' \
   --deepspeed "zero_stage3_config_34b.json" \
   --report_to "tensorboard" \
   2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
diff --git a/...ternvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_full.sh b/...ternvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_full.sh
@@ -62,7 +62,7 @@ torchrun \
   --group_by_length True \
   --dynamic_image_size True \
   --use_thumbnail True \
-  --ps_version 'v2' \
+  --pixel_unshuffle_version 'v2' \
   --deepspeed "zero_stage1_config.json" \
   --report_to "tensorboard" \
   2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
diff --git a/...ternvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_lora.sh b/...ternvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_lora.sh
@@ -63,7 +63,7 @@ torchrun \
   --group_by_length True \
   --dynamic_image_size True \
   --use_thumbnail True \
-  --ps_version 'v2' \
+  --pixel_unshuffle_version 'v2' \
   --deepspeed "zero_stage1_config.json" \
   --report_to "tensorboard" \
   2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
diff --git a/...nternvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_full.sh b/...nternvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_full.sh
@@ -62,7 +62,7 @@ torchrun \
   --group_by_length True \
   --dynamic_image_size True \
   --use_thumbnail True \
-  --ps_version 'v2' \
+  --pixel_unshuffle_version 'v2' \
   --deepspeed "zero_stage3_config.json" \
   --report_to "tensorboard" \
   2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
diff --git a/...nternvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_lora.sh b/...nternvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_lora.sh
@@ -63,7 +63,7 @@ torchrun \
   --group_by_length True \
   --dynamic_image_size True \
   --use_thumbnail True \
-  --ps_version 'v2' \
+  --pixel_unshuffle_version 'v2' \
   --deepspeed "zero_stage3_config.json" \
   --report_to "tensorboard" \
   2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
diff --git a/...ll/internvl1.5/2nd_finetune/internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_full.sh b/...ll/internvl1.5/2nd_finetune/internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_full.sh
@@ -62,7 +62,7 @@ torchrun \
   --group_by_length True \
   --dynamic_image_size True \
   --use_thumbnail True \
-  --ps_version 'v2' \
+  --pixel_unshuffle_version 'v2' \
   --deepspeed "zero_stage1_config.json" \
   --report_to "tensorboard" \
   2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
diff --git a/...ll/internvl1.5/2nd_finetune/internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_lora.sh b/...ll/internvl1.5/2nd_finetune/internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_lora.sh
@@ -63,7 +63,7 @@ torchrun \
   --group_by_length True \
   --dynamic_image_size True \
   --use_thumbnail True \
-  --ps_version 'v2' \
+  --pixel_unshuffle_version 'v2' \
   --deepspeed "zero_stage1_config.json" \
   --report_to "tensorboard" \
   2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
diff --git a/.../shell/internvl1.5/hermes2_yi34b/internvl_chat_v1_5_hermes2_yi34b_dynamic_res_finetune.sh b/.../shell/internvl1.5/hermes2_yi34b/internvl_chat_v1_5_hermes2_yi34b_dynamic_res_finetune.sh
@@ -70,7 +70,7 @@ srun -p ${PARTITION} \
   --group_by_length True \
   --dynamic_image_size True \
   --use_thumbnail True \
-  --ps_version 'v2' \
+  --pixel_unshuffle_version 'v2' \
   --deepspeed "zero_stage3_config_34b.json" \
   --report_to "tensorboard" \
   2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
diff --git a/.../shell/internvl1.5/hermes2_yi34b/internvl_chat_v1_5_hermes2_yi34b_dynamic_res_pretrain.sh b/.../shell/internvl1.5/hermes2_yi34b/internvl_chat_v1_5_hermes2_yi34b_dynamic_res_pretrain.sh
@@ -72,7 +72,7 @@ srun -p ${PARTITION} \
   --group_by_length False \
   --dynamic_image_size True \
   --use_thumbnail True \
-  --ps_version 'v2' \
+  --pixel_unshuffle_version 'v2' \
   --deepspeed "zero_stage3_config_34b.json" \
   --report_to "tensorboard" \
   2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
diff --git a/...hell/internvl1.5/internlm2_1_8b/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_finetune.sh b/...hell/internvl1.5/internlm2_1_8b/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_finetune.sh
@@ -70,7 +70,7 @@ srun -p ${PARTITION} \
   --group_by_length True \
   --dynamic_image_size True \
   --use_thumbnail True \
-  --ps_version 'v2' \
+  --pixel_unshuffle_version 'v2' \
   --deepspeed "zero_stage1_config.json" \
   --report_to "tensorboard" \
   2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"