Fix internvl2.5/3 deepspeed packing (#3855)

Jintao-Huang · web-flow · commit 89432b5093fb · 2025-04-12T21:51:17.000+08:00
diff --git a/docs/source/Instruction/命令行参数.md b/docs/source/Instruction/命令行参数.md
@@ -584,7 +584,7 @@ qwen2_5_omni除了包含qwen2_5_vl和qwen2_audio的模型特定参数外，还
 - MAX_NUM: 默认为12
 - INPUT_SIZE: 默认为448
 
-### internvl2, internvl2_phi3, internvl2_5
+### internvl2, internvl2_phi3, internvl2_5, internvl3
 参数含义可以查看[这里](https://modelscope.cn/models/OpenGVLab/InternVL2_5-2B)
 - MAX_NUM: 默认为12
 - INPUT_SIZE: 默认为448
diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
@@ -596,7 +596,7 @@ For the meaning of the arguments, please refer to [here](https://modelscope.cn/m
 - MAX_NUM: Default is 12
 - INPUT_SIZE: Default is 448
 
-### internvl2, internvl2_phi3, internvl2_5
+### internvl2, internvl2_phi3, internvl2_5, internvl3
 For the meaning of the arguments, please refer to [here](https://modelscope.cn/models/OpenGVLab/InternVL2_5-2B)
 - MAX_NUM: Default is 12
 - INPUT_SIZE: Default is 448
diff --git a/examples/train/packing/qwen2_5_omni.sh b/examples/train/packing/qwen2_5_omni.sh
@@ -1,5 +1,5 @@
 # 4 * 32GB
-# Multimodal packing currently only supports qwen2_vl, qwen2_5_vl, qwen2_5_omni, internvl2_5
+# Multimodal packing currently only supports qwen2_vl, qwen2_5_vl, qwen2_5_omni, internvl2_5/3
 # A demo for four modalities that can be run directly
 # For local datasets, it is recommended to use streaming: `--streaming true` (save memory)
 pip uninstall transformers
diff --git a/examples/train/packing/qwen2_5_vl.sh b/examples/train/packing/qwen2_5_vl.sh
@@ -1,5 +1,5 @@
 # 4 * 36GB
-# Multimodal packing currently only supports qwen2_vl, qwen2_5_vl, qwen2_5_omni, internvl2_5
+# Multimodal packing currently only supports qwen2_vl, qwen2_5_vl, qwen2_5_omni, internvl2_5/3
 # Efficiency: With packing: 10 minutes; Without packing: >=1 hour
 # For local datasets, it is recommended to use streaming: `--streaming true` (save memory)
 NPROC_PER_NODE=4 \
diff --git a/swift/hub/hub.py b/swift/hub/hub.py
@@ -295,7 +295,7 @@ def load_dataset(cls,
             version=revision,
             download_mode=download_mode,
             use_streaming=streaming,
-            trust_remote_code=True)
+        )
 
     @classmethod
     def download_model(cls,
diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py
@@ -201,7 +201,7 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N
         if loss_scale is not None:
             loss_kwargs['loss_scale'] = loss_scale
 
-        with self.template.compute_loss_context(model, inputs):
+        with self.template.compute_loss_context(self.model, inputs):
             outputs = model(**inputs)
         # Save past state if it exists
         # TODO: this needs to be fixed and made cleaner later.