2 changes: 1 addition & 1 deletion xtuner/v1/data_proto/sequence_context.py
@@ -199,7 +199,7 @@ def split(self, sequence_parallel_mesh: DeviceMesh | None = None) -> Self:
return self

@classmethod
- def pack(cls, sequence_context_list: list["SequenceContext"]):
+ def cat(cls, sequence_context_list: list["SequenceContext"]):
packed_input_ids: list[torch.Tensor] = []
cu_seq_lens_q: list[torch.IntTensor] = []
cu_seq_lens_k: list[torch.IntTensor] = []
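The collapsed body of the renamed `cat` builds `packed_input_ids`, `cu_seq_lens_q`, and `cu_seq_lens_k` from the individual contexts. As a rough illustration of what concatenating packed (varlen) sequences involves, the sketch below shows how per-context cumulative sequence lengths can be merged by offsetting each context by the running token count. This is a simplified assumption about the layout, not the actual xtuner implementation.

```python
import torch

# Hypothetical helper: merge cu_seq_lens of several packed contexts.
# Each cu_seq_lens starts at 0 and ends at that context's total token count.
def merge_cu_seq_lens(cu_seq_lens_list: list[torch.IntTensor]) -> torch.IntTensor:
    merged = [torch.zeros(1, dtype=torch.int32)]
    offset = 0
    for cu in cu_seq_lens_list:
        # Drop the leading 0 of every context and apply the running offset.
        merged.append(cu[1:].to(torch.int32) + offset)
        offset += int(cu[-1])
    return torch.cat(merged)

# Example: context A packs sequences of length 3 and 5, context B packs one of length 4.
cu_a = torch.tensor([0, 3, 8], dtype=torch.int32)
cu_b = torch.tensor([0, 4], dtype=torch.int32)
print(merge_cu_seq_lens([cu_a, cu_b]))  # tensor([ 0,  3,  8, 12], dtype=torch.int32)
```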
32 changes: 32 additions & 0 deletions xtuner/v1/loss/base_loss_ctx.py
@@ -65,6 +65,27 @@ def chunk(self, chunk_size) -> list["BaseLossKwargs"]:
chunks.append(type(self)(**chunk_dict))
return chunks

@classmethod
def cat(cls, chunks: list["BaseLossKwargs"]) -> "BaseLossKwargs":
assert len(chunks) > 0, "chunks must not be empty."

# Collect all tensor field names (taking the fields of chunks[0] as the reference; pydantic extra=forbid also requires the fields to match)
first = chunks[0]
tensor_field_names: list[str] = []
for field_name, field_value in first.__dict__.items():
if isinstance(field_value, torch.Tensor):
tensor_field_names.append(field_name)

assert len(tensor_field_names) > 0, "At least one field should be a tensor to cat."

cat_dict: dict[str, torch.Tensor] = {}
for field_name in tensor_field_names:
tensors = [getattr(c, field_name) for c in chunks]
# Mirrors chunk(): concatenate back along dim=1
cat_dict[field_name] = torch.cat(tensors, dim=1)

return cls(**cat_dict)


class BaseLossConfig(BaseModel):
model_config = ConfigDict(title="BaseLossConfig", extra="forbid", arbitrary_types_allowed=True)
@@ -156,3 +177,14 @@ def forward(
loss = all_reduce(loss, op=dist.ReduceOp.SUM, group=dist.group.WORLD)

return loss, (logits, extra_info)

@classmethod
def cat(cls, chunks: list["BaseLossContext"]) -> "BaseLossContext":
assert len(chunks) > 0, "chunks must not be empty."

first = chunks[0]
loss_cfg = first.loss_cfg
loss_kwargs_chunks = [c.loss_kwargs for c in chunks]
loss_kwargs = type(first.loss_kwargs).cat(loss_kwargs_chunks)

return cls(loss_cfg, loss_kwargs)
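The new `cat()` is intended as the inverse of the existing `chunk()`: every tensor field is concatenated back along dim=1, with the field set taken from chunks[0]. A minimal round-trip sketch, assuming `BaseLossKwargs` is a pydantic model that accepts tensor fields (as the extra=forbid note in the diff suggests); the subclass and the field name `shifted_labels` are illustrative, not taken from this diff.

```python
import torch

from xtuner.v1.loss.base_loss_ctx import BaseLossKwargs

# Hypothetical subclass standing in for a concrete BaseLossKwargs,
# e.g. one carrying labels of shape (1, seq_len).
class DummyLossKwargs(BaseLossKwargs):
    shifted_labels: torch.Tensor

a = DummyLossKwargs(shifted_labels=torch.tensor([[0, 1, 2, 3]]))
b = DummyLossKwargs(shifted_labels=torch.tensor([[4, 5, 6, 7]]))

merged = DummyLossKwargs.cat([a, b])
# Tensor fields are concatenated along dim=1, mirroring how chunk() splits them.
print(merged.shifted_labels)  # tensor([[0, 1, 2, 3, 4, 5, 6, 7]])
```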
72 changes: 27 additions & 45 deletions xtuner/v1/model/moe/moe.py
@@ -326,24 +326,19 @@ def _micro_batch_forward(
assert len(seq_ctx_list) == len(loss_ctx_list), "seq_ctx and loss_ctx must have same length"

# Prepare input embeddings for all micro-batches
- hidden_states_list: list[torch.Tensor] = []
- position_embeddings_list = []
-
- for ctx in seq_ctx_list:
- input_ids = ctx.input_ids
- position_ids = ctx.position_ids
-
- if input_ids is not None:
- hidden_states = self.embed_tokens(input_ids)
- else:
- hidden_states = ctx.inputs_embeds
-
- # create position embeddings to be shared across the decoder layers
- assert position_ids is not None
- position_embeddings = self.rotary_emb(hidden_states, position_ids)
-
- hidden_states_list.append(hidden_states)
- position_embeddings_list.append(position_embeddings)
+ if seq_ctx_list[0].input_ids is None:
+ cat_hidden_states = torch.cat([ctx.inputs_embeds for ctx in seq_ctx_list], dim=1)
+ else:
+ cat_input_ids = torch.cat([ctx.input_ids for ctx in seq_ctx_list], dim=1)
+ cat_hidden_states = self.embed_tokens(cat_input_ids)
+ cat_position_ids = torch.cat([ctx.position_ids for ctx in seq_ctx_list], dim=1)
+ cat_position_embeddings = self.rotary_emb(cat_hidden_states, cat_position_ids)
+ position_embeddings_list = list(
+ zip(
+ cat_position_embeddings[0].chunk(len(seq_ctx_list), dim=1),
+ cat_position_embeddings[1].chunk(len(seq_ctx_list), dim=1),
+ )
+ )

# Initialize output containers
output: dict = {}
@@ -353,28 +348,22 @@

# Process through layers
cat_seq_ctx: SequenceContext | None = None
- cat_position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None
- cat_hidden_states: torch.Tensor | None = None

moe_forawrd = False
for idx, decoder_layer in self.layers.items():
layer_idx = int(idx)

if layer_idx < self.config.first_k_dense_replace:
if cat_seq_ctx is None:
- cat_seq_ctx = SequenceContext.pack(seq_ctx_list)
- cos = torch.cat([pe[0] for pe in position_embeddings_list], dim=1)
- sin = torch.cat([pe[1] for pe in position_embeddings_list], dim=1)
- cat_position_embeddings = (cos, sin)
- cat_hidden_states = torch.cat(hidden_states_list, dim=1)
+ cat_seq_ctx = SequenceContext.cat(seq_ctx_list)
# Dense decoder layer - process concatenated hidden states
cat_hidden_states = decoder_layer(
cat_hidden_states,
position_embeddings=cat_position_embeddings,
seq_ctx=cat_seq_ctx,
)
else:
- if cat_hidden_states is not None and not moe_forawrd:
+ if not moe_forawrd:
# TODO: `i.clone()` here is weird. However, the current Implementation of
# `async_save_on_cpu` is not friendly with `chunk` op (maybe caused by shared storage? not sure),
# resulting in nan grad norm. So we have to clone the chunked tensors here to make sure each
@@ -415,25 +404,24 @@ def _micro_batch_forward(
router_weights_list[i][f"layer{idx}"] = router_weights[i]

# Apply final norm to all micro-batches
- for i, hidden_states in enumerate(hidden_states_list):
- hidden_states_list[i] = self.norm(hidden_states)
+ cat_hidden_states = torch.cat(hidden_states_list, dim=1)
+ cat_hidden_states = self.norm(cat_hidden_states)

# Process final outputs for each micro-batch
- loss_list: list[torch.Tensor] = []
- logits_list: list[torch.Tensor] = []
- moe_extra_info = ModelForwardExtraLogInfo()
- for hidden_states, loss_ctx_single in zip(hidden_states_list, loss_ctx_list):
- loss, (logits, extra_info) = self.lm_head(hidden_states, loss_ctx_single) # type: ignore
- loss_list.append(loss)
- if logits is not None:
- logits_list.append(logits)
- if extra_info:
- moe_extra_info.append(extra_info)
+ cat_loss_ctx = CELossContext.cat(loss_ctx_list)
+ loss, (logits, extra_info) = self.lm_head(cat_hidden_states, cat_loss_ctx) # type: ignore

# Aggregate losses (mean across micro-batches)
Review comment (Copilot AI, Jan 4, 2026):
The comment says "Aggregate losses (mean across micro-batches)" but the code calls loss.sum(). If the loss returned by lm_head after concatenating all micro-batches is already a scalar (which is typical), calling .sum() on a scalar tensor is redundant and may be misleading. The comment also says "mean" but the code uses "sum". Consider clarifying whether the loss should be summed or averaged, and update either the code or the comment accordingly.

Suggested change:
- # Aggregate losses (mean across micro-batches)
+ # Aggregate loss value (using sum across micro-batches or scalar loss as returned)
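For context on this point: whether summing per-micro-batch losses matches the loss of the concatenated batch depends on the reduction. With a token-level sum reduction the two agree exactly; with a per-chunk mean they generally do not. The reduction used by xtuner's loss context is not shown in this diff, so the sketch below uses plain PyTorch cross-entropy only to illustrate the distinction.

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
logits = torch.randn(8, 10)           # 8 tokens, 10-class vocab
labels = torch.randint(0, 10, (8,))
chunks = [(logits[:4], labels[:4]), (logits[4:], labels[4:])]

# Sum reduction: per-chunk sums add up to the loss over the full batch.
sum_full = F.cross_entropy(logits, labels, reduction="sum")
sum_chunks = sum(F.cross_entropy(lg, lb, reduction="sum") for lg, lb in chunks)
assert torch.allclose(sum_full, sum_chunks)

# Mean reduction: summing per-chunk means does not reproduce the full-batch mean
# (and averaging them only matches when all chunks have the same token count).
mean_full = F.cross_entropy(logits, labels, reduction="mean")
mean_chunks = sum(F.cross_entropy(lg, lb, reduction="mean") for lg, lb in chunks)
assert not torch.allclose(mean_full, mean_chunks)
```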
output["loss"] = torch.stack(loss_list).sum() if loss_list else None
loss: torch.Tensor
Review comment (Copilot AI, Jan 4, 2026):
The type annotation loss: torch.Tensor on line 415 is redundant since loss was already assigned on line 412. This type annotation doesn't provide any new information and could be confusing as it appears between the assignment and usage of the variable. Consider removing this redundant annotation.

Suggested change:
- loss: torch.Tensor
Review comment (Collaborator):
Please modify the type annotation of LMHead.__call__ so that the type of loss can be inferred as torch.Tensor automatically.
Review comment (Collaborator, author):
If loss_ctx is not None, the call method will return a torch.Tensor; otherwise, it will return None. How should I modify the typehint? @HAOCHENYE
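One conventional way to express "Tensor if loss_ctx is not None, otherwise None" is typing.overload. The sketch below is illustrative only: the actual LMHead signature is not shown in this diff, so the parameter names, the return tuple shape, and the CELossContext reference are assumptions.

```python
from typing import overload

import torch

class LMHeadSketch(torch.nn.Module):
    @overload
    def __call__(self, hidden_states: torch.Tensor, loss_ctx: "CELossContext") -> tuple[torch.Tensor, tuple]: ...

    @overload
    def __call__(self, hidden_states: torch.Tensor, loss_ctx: None) -> tuple[None, tuple]: ...

    def __call__(self, hidden_states, loss_ctx=None):
        # nn.Module.__call__ dispatches to forward(); the overloads above only
        # narrow the static type seen by callers.
        return super().__call__(hidden_states, loss_ctx)
```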
output["loss"] = loss.sum()
moe_extra_info = ModelForwardExtraLogInfo()
if extra_info:
moe_extra_info.append(extra_info)
output["extra_info"] = moe_extra_info

+ # Return logits for all micro-batches
+ final_logits = logits
Review comment (Collaborator):
remove the variable final_logits

Review comment (Collaborator, author):
When constructing MoEModelOutputs, final_logits will be utilized.

# Handle router results for all micro-batches
all_router_logits = []
all_router_weights = []
@@ -476,12 +464,6 @@

del combined_router_logits

- # Return logits for all micro-batches
- if all(logits is not None for logits in logits_list):
- final_logits = torch.cat(logits_list, dim=0) if logits_list else None
- else:
- final_logits = None

if self.config.return_router_results or return_router_logits:
# raise NotImplementedError

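The core idea of the moe.py change above is that token embedding and rotary position embeddings are applied position-wise, so computing them once on the concatenated micro-batches and chunking the result back is equivalent to computing them per micro-batch. A small self-contained check of that equivalence, using a plain nn.Embedding as a stand-in (the real embed_tokens and rotary_emb modules are not reproduced here):

```python
import torch
import torch.nn as nn

torch.manual_seed(0)
embed = nn.Embedding(100, 16)

# Two "micro-batches" of token ids, each packed to the same length (1, 4),
# mirroring how the new code chunks the concatenated result with .chunk(n, dim=1).
ids_a = torch.randint(0, 100, (1, 4))
ids_b = torch.randint(0, 100, (1, 4))

# Old path: embed each micro-batch separately.
separate = [embed(ids_a), embed(ids_b)]

# New path: concatenate along dim=1, embed once, chunk back.
cat_hidden = embed(torch.cat([ids_a, ids_b], dim=1))
chunked = cat_hidden.chunk(2, dim=1)

for s, c in zip(separate, chunked):
    assert torch.allclose(s, c)
```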
2 changes: 1 addition & 1 deletion xtuner/v1/rl/base/controller.py
@@ -137,7 +137,7 @@ def _packing(self, data_batches, pack_max_length, language_cfg):
)
rollout_logprobs_list.append(pad_rollout_logprobs)

- seq_ctx = SequenceContext.pack(seq_ctx_list)
+ seq_ctx = SequenceContext.cat(seq_ctx_list)
shifted_labels = torch.cat(label_list, dim=1) # (1, max_len)
advantages = torch.tensor(advantage_list).float().unsqueeze(0) # (1, num_samples)
cu_seq_lens_q = seq_ctx.cu_seq_lens_q