@@ -813,14 +813,14 @@ def forward(
         sin = sin[None, :, None, :]
         q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids, self.fuse_rope)

-        query_states = paddle.concat([q_nope, q_pe], axis=-1)
-        key_states = paddle.concat([k_nope, k_pe], axis=-1)
+        query_states = paddle.cat([q_nope, q_pe], axis=-1)
+        key_states = paddle.cat([k_nope, k_pe], axis=-1)

         # [bs, seq_len, num_head, head_dim]
         if past_key_value is not None:
             # reuse k, v, self_attention
-            key_states = paddle.concat([past_key_value[0], key_states], axis=1)
-            value_states = paddle.concat([past_key_value[1], value_states], axis=1)
+            key_states = paddle.cat([past_key_value[0], key_states], axis=1)
+            value_states = paddle.cat([past_key_value[1], value_states], axis=1)
         past_key_value = (key_states, value_states) if use_cache else None

         has_gradient = not (query_states.stop_gradient and key_states.stop_gradient and value_states.stop_gradient)
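
The cached branch in the hunk above appends the freshly projected keys and values to the running cache along the sequence axis (axis=1 in the [bs, seq_len, num_head, head_dim] layout). A minimal standalone sketch of that append pattern, with made-up shapes and written against the long-standing paddle.concat spelling:

import paddle

bs, num_head, head_dim = 2, 4, 64
cache_len, step_len = 16, 1

# running KV cache and the new decoding step, both [bs, seq_len, num_head, head_dim]
k_cache = paddle.randn([bs, cache_len, num_head, head_dim])
v_cache = paddle.randn([bs, cache_len, num_head, head_dim])
k_new = paddle.randn([bs, step_len, num_head, head_dim])
v_new = paddle.randn([bs, step_len, num_head, head_dim])

# append along the sequence axis, as in the past_key_value branch
k_cache = paddle.concat([k_cache, k_new], axis=1)
v_cache = paddle.concat([v_cache, v_new], axis=1)
assert k_cache.shape[1] == cache_len + step_len
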
@@ -1141,7 +1141,7 @@ def forward(
         hidden_states = self.hnorm(hidden_states)
         nextn_hidden_state = self.enorm(nextn_hidden_state)

-        concat_h = paddle.concat([nextn_hidden_state, hidden_states], axis=-1)
+        concat_h = paddle.cat([nextn_hidden_state, hidden_states], axis=-1)
         hidden_states = FP8LinearFunction.apply(concat_h, self.eh_proj)

         layer_outputs = super(DeepseekV2MTPLayer, self).forward(
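
The MTP layer above concatenates the normalized next-token embedding with the normalized hidden state and projects the doubled width back down through eh_proj (driven here by FP8LinearFunction). A rough shape-level sketch of that combine step, with plain LayerNorm and Linear modules standing in for the model's hnorm/enorm and the FP8 projection (all dimensions invented):

import paddle
import paddle.nn as nn

bs, seq_len, hidden = 2, 8, 128
hnorm = nn.LayerNorm(hidden)   # stand-in for self.hnorm
enorm = nn.LayerNorm(hidden)   # stand-in for self.enorm
eh_proj = nn.Linear(2 * hidden, hidden, bias_attr=False)  # stand-in for the FP8 eh_proj

hidden_states = paddle.randn([bs, seq_len, hidden])
nextn_hidden_state = paddle.randn([bs, seq_len, hidden])

concat_h = paddle.concat([enorm(nextn_hidden_state), hnorm(hidden_states)], axis=-1)
out = eh_proj(concat_h)        # back to [bs, seq_len, hidden]
assert out.shape == [bs, seq_len, hidden]
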
@@ -1686,7 +1686,7 @@ def forward(
             hidden_states = GatherOp.apply(hidden_states)
             hidden_states = hidden_states.reshape([-1, seq_length, hidden_states.shape[-1]])

-            inputs_embeds_cur_depth = paddle.concat(
+            inputs_embeds_cur_depth = paddle.cat(
                 [inputs_embeds_ori[:, (nextn + 1) :, :], inputs_embeds_extra[:, : (nextn + 1), :]], axis=1
             )

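The concatenation changed at line 1689 appears to build the input stream for MTP depth nextn + 1 by dropping the first nextn + 1 positions of the original embeddings and appending the corresponding look-ahead embeddings, i.e. the layer sees a sequence shifted left by that many tokens. A small indexing sketch with toy tensors (names and values invented) that makes the resulting layout visible:

import paddle

bs, seq_len, hidden, num_nextn = 1, 6, 4, 2
# original token embeddings, encoded so position i carries the value i
inputs_embeds_ori = paddle.arange(seq_len, dtype="float32").reshape([1, seq_len, 1]).tile([bs, 1, hidden])
# embeddings of the look-ahead tokens, marked with 100.0
inputs_embeds_extra = paddle.full([bs, num_nextn, hidden], 100.0)

nextn = 0  # first MTP depth
inputs_embeds_cur_depth = paddle.concat(
    [inputs_embeds_ori[:, (nextn + 1) :, :], inputs_embeds_extra[:, : (nextn + 1), :]], axis=1
)
print(inputs_embeds_cur_depth[0, :, 0])  # positions 1..5 of the original stream, then one extra token
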
@@ -1848,7 +1848,7 @@ def _set_cos_sin_cache(self, seq_len):
             / yarn_get_mscale(self.scaling_factor, self.mscale_all_dim)
         )

-        emb = paddle.concat((freqs, freqs), axis=-1)
+        emb = paddle.cat((freqs, freqs), axis=-1)
         self.cos_cached = emb.cos() * _mscale
         self.sin_cached = emb.sin() * _mscale

@@ -1919,7 +1919,7 @@ def _set_cos_sin_cache(self, seq_len):
         freqs = paddle.einsum("i,j->ij", t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         # [seq_len, axis]
-        emb = paddle.concat([freqs, freqs], axis=-1)
+        emb = paddle.cat([freqs, freqs], axis=-1)
         # [1, seqlen, 1, axis]
         self.cos_cached = emb.cos()[None, :, None, :]
         self.sin_cached = emb.sin()[None, :, None, :]
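
Both _set_cos_sin_cache variants touched above follow the standard rotary-embedding recipe: take the outer product of positions and inverse frequencies, duplicate the half-width frequency table along the last axis, then cache its cosine and sine. A condensed, self-contained version of that recipe (without the YaRN mscale correction or the extra broadcast axes):

import paddle

dim, max_len, base = 64, 32, 10000.0

# inverse frequencies for the even channels, as in standard RoPE
inv_freq = 1.0 / (base ** (paddle.arange(0, dim, 2, dtype="float32") / dim))
t = paddle.arange(max_len, dtype="float32")

freqs = paddle.einsum("i,j->ij", t, inv_freq)  # [seq_len, dim // 2]
emb = paddle.concat([freqs, freqs], axis=-1)   # [seq_len, dim]
cos_cached, sin_cached = emb.cos(), emb.sin()
assert cos_cached.shape == [max_len, dim]
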
@@ -2137,8 +2137,8 @@ def qkv_pre_process_no_fuse(
     sin = sin[None, :, None, :]
     q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids, False)

-    query_states = paddle.concat([q_nope, q_pe], axis=-1)
-    key_states = paddle.concat([k_nope, k_pe], axis=-1)
+    query_states = paddle.cat([q_nope, q_pe], axis=-1)
+    key_states = paddle.cat([k_nope, k_pe], axis=-1)

     return query_states, key_states, value_states

@@ -2149,7 +2149,7 @@ def rearrange_kv(kv, k_pe, qk_nope_head_dim, num_heads):
     value_states = kv[..., qk_nope_head_dim:]

     k_pe = k_pe.expand([k_pe.shape[0], k_pe.shape[1], num_heads, k_pe.shape[3]])
-    key_states = paddle.concat([k_nope, k_pe], axis=-1)
+    key_states = paddle.cat([k_nope, k_pe], axis=-1)

     return key_states, value_states

@@ -2315,7 +2315,7 @@ def forward(
             [bsz, kv_seq_len, v_num_heads, q_head_dim - v_head_dim],
             dtype=value_states.dtype,
         )
-        value_states_pad = paddle.concat([value_states, value_padding], axis=-1)
+        value_states_pad = paddle.cat([value_states, value_padding], axis=-1)

         attn_out, _, softmax_lse, seed_offset = _C_ops.flash_attn(
             query_states,
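
In both the forward and backward paths, value_states (last dim v_head_dim) is zero-padded up to q_head_dim before being handed to the fused flash-attention kernel, presumably because the kernel expects query, key and value to share a head dimension; the extra channels carry no information. A shape-only sketch of the padding step (shapes invented, kernel call omitted):

import paddle

bsz, kv_seq_len, v_num_heads = 2, 16, 4
q_head_dim, v_head_dim = 192, 128

value_states = paddle.randn([bsz, kv_seq_len, v_num_heads, v_head_dim])
value_padding = paddle.zeros(
    [bsz, kv_seq_len, v_num_heads, q_head_dim - v_head_dim],
    dtype=value_states.dtype,
)
value_states_pad = paddle.concat([value_states, value_padding], axis=-1)
assert value_states_pad.shape[-1] == q_head_dim

# slicing the pad back off recovers the original v_head_dim channels
value_states_unpadded = value_states_pad[..., :v_head_dim]
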
@@ -2541,7 +2541,7 @@ def backward(ctx, dout):
             [bsz, kv_seq_len, v_num_heads, q_head_dim - v_head_dim],
             dtype=value_states.dtype,
         )
-        value_states_pad = paddle.concat([value_states, value_padding], axis=-1)
+        value_states_pad = paddle.cat([value_states, value_padding], axis=-1)

         with paddle.no_grad():
@@ -2655,7 +2655,7 @@ def kv_up_weight_grad(kv_ln_trans_fp8, kv_ln_trans_scale, d_kv_t_fp8, d_kv_t_sca
         compressed_kv, kv_ln_weight, kv_ln_invar, d_kv_ln_t, eps
     )

-    d_kv_init = paddle.concat([d_compressed_kv, d_k_pe], axis=-1)
+    d_kv_init = paddle.cat([d_compressed_kv, d_k_pe], axis=-1)

     if hasattr(q_up_weight, "main_grad"):
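
Every change in this commit is the same one-token substitution: paddle.concat becomes paddle.cat with identical arguments. Assuming a Paddle build that ships paddle.cat as a PyTorch-style alias of paddle.concat (that is the premise of the commit; older releases do not have it), the two spellings can be sanity-checked against each other directly:

import paddle

x = paddle.randn([2, 3])
y = paddle.randn([2, 3])

out_concat = paddle.concat([x, y], axis=-1)

# guard the alias so the sketch still runs on Paddle versions without paddle.cat
if hasattr(paddle, "cat"):
    out_cat = paddle.cat([x, y], axis=-1)
    assert bool(paddle.all(out_concat == out_cat))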