[megatron] use batched mrope (#6281)

Jintao-Huang · Jintao-Huang · commit a3543294dda2 · 2025-10-24T13:47:29.000+08:00
diff --git a/docs/source/Customization/自定义数据集.md b/docs/source/Customization/自定义数据集.md
@@ -215,8 +215,8 @@ alpaca格式:
 {"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "<image>帮我打开谷歌浏览器"}, {"role": "assistant", "content": "Action: click(start_box='<bbox>')"}], "images": ["/xxx/x.jpg"], "objects": {"ref": [], "bbox": [[615, 226]]}}
 ```
 该格式将自动转换数据集格式为对应模型的grounding任务格式，且选择对应模型的bbox归一化方式。该格式比通用格式多了objects字段，该字段包含的字段有：
-- ref: 用于替换`<ref-object>`。ref的长度需要与`<ref-object>`的数量一致。
-- bbox: 用于替换`<bbox>`。若bbox中每个box长度为2，则代表x和y坐标，若box长度为4，则代表2个点的x和y坐标。bbox的长度需要与`<bbox>`的数量一致。
+- ref: 用于替换messages中的`<ref-object>`。ref的长度需要与`<ref-object>`的数量一致。
+- bbox: 用于替换messages中的`<bbox>`。若bbox中每个box长度为2，则代表x和y坐标，若box长度为4，则代表2个点的x和y坐标。bbox的长度需要与`<bbox>`的数量一致。
   - 注意：`<ref-object>`和`<bbox>`并没有对应关系，ref和bbox各自替换各自的占位符。
 - bbox_type: 可选项为'real'，'norm1'。默认为'real'，即bbox为真实bbox值。若是'norm1'，则bbox已经归一化为0~1。
 - image_id: 通常用于多图grounding任务。该参数只有当bbox_type为'real'时生效，代表bbox对应的图片是第几张，用于缩放bbox。索引从0开始，默认全为第0张。image_id的数量需要和bbox的数量一致。例如：若bbox的长度为10，images的长度为2，那么image_id的长度需要是10，其值需要在`{0, 1}`集合内。
diff --git a/docs/source_en/Customization/Custom-dataset.md b/docs/source_en/Customization/Custom-dataset.md
@@ -230,8 +230,8 @@ When using this type of data, please note:
 
 The format will automatically convert the dataset format to the corresponding model's grounding task format and select the appropriate model's bbox normalization method. Compared to the general format, this format includes an additional "objects" field, which contains the following subfields:
 
-- ref: Used to replace `<ref-object>`. The length of `ref` should match the number of `<ref-object>` instances.
-- bbox: Used to replace `<bbox>`. If the length of each box in the bbox is 2, it represents the x and y coordinates. If the box length is 4, it represents the x and y coordinates of two points. The length of `bbox` should match the number of `<bbox>` instances.
+- ref: Used to replace the `<ref-object>` placeholder in messages. The length of `ref` should match the number of `<ref-object>` instances.
+- bbox: Used to replace the `<bbox>` placeholder in messages. If a box in `bbox` has length 2, it holds the x and y coordinates of a single point; if it has length 4, it holds the x and y coordinates of two points. The length of `bbox` should match the number of `<bbox>` instances.
   - Note: `<ref-object>` and `<bbox>` do not have a corresponding relationship; references and bounding boxes replace their own placeholders separately.
 - bbox_type: Optional values are 'real' and 'norm1'. The default is 'real', meaning the bbox represents the actual bounding box value. If set to 'norm1', the bbox is normalized to the range 0~1.
 - image_id: Typically used for multi-image grounding tasks. This parameter only takes effect when bbox_type is 'real', representing which image the bbox corresponds to, used for scaling the bbox. The index starts from 0, and defaults to all being the 0th image. The length of image_id needs to be consistent with the length of bbox. For example: if the length of bbox is 10 and the length of images is 2, then the length of image_id needs to be 10, with values within the set `{0, 1}`.
diff --git a/swift/megatron/init.py b/swift/megatron/init.py
@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import concurrent.futures
+import logging
 import os
 import subprocess
 import sys
@@ -619,10 +620,17 @@ def _apply_rotary_pos_emb_thd(
             Tensor: Shape [t, h, d]. The input tensor after applying RoPE.
         """
         args = get_args()
-        if args.position_embedding_type != 'mrope':
+        cu_seqlens_for_batched = cu_seqlens
+        use_batched_mrope = False
+        if cp_group is not None:
+            cp_size = cp_group.size()
+            cu_seqlens_for_batched = cu_seqlens // cp_size
+            use_batched_mrope = (freqs.dim() >= 1 and freqs.shape[0] == cu_seqlens_for_batched[-1]).item()
+        if args.position_embedding_type != 'mrope' and not use_batched_mrope:
+            logger.warning_once('Using non-batched RoPE, which may affect performance.')
             return _origin_apply_rotary_pos_emb_thd(
                 t,
-                cu_seqlens,
+                cu_seqlens_for_batched,
                 freqs,
                 rotary_interleaved=rotary_interleaved,
                 multi_latent_attention=multi_latent_attention,
@@ -632,24 +640,20 @@ def _apply_rotary_pos_emb_thd(
 
         if cp_group is None:
             raise ValueError('cp_group must be provided for THD format RoPE')
-        cp_size = cp_group.size()
-        cu_seqlens = cu_seqlens // cp_size
-        seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
-
-        return torch.cat([
-            _apply_rotary_pos_emb_bshd(
-                x.unsqueeze(1),
-                f,
-                rotary_interleaved=rotary_interleaved,
-                multi_latent_attention=multi_latent_attention,
-                mscale=mscale,
-            ) for x, f in zip(torch.split(t, seqlens), torch.split(freqs, seqlens))
-        ]).squeeze(1)
+
+        return _apply_rotary_pos_emb_bshd(
+            t.unsqueeze(1),
+            freqs,
+            rotary_interleaved=rotary_interleaved,
+            multi_latent_attention=multi_latent_attention,
+            mscale=mscale,
+        ).squeeze(1)
 
     rope_utils._apply_rotary_pos_emb_thd = _apply_rotary_pos_emb_thd
 
 
 def _patch_megatron():
+    logging_level = logging.root.level
     _patch_flash_attn()
     _patch_transformer_engine()
     _patch_TELinear()
@@ -660,6 +664,7 @@ def _patch_megatron():
     _patch_compile_helpers()
     _patch_build_train_valid_test_datasets()
     _patch_mrope()
+    logging.root.setLevel(logging_level)  # revert logger level
     from swift.megatron import tuners  # patch lora
     try:
         _patch_torch_FileSystemReader()