
Commit 7444cbe

refine many code
1 parent 5bb3018 commit 7444cbe

File tree

16 files changed: +509 additions, -2293 deletions

examples/config/deepseek_v3/sft_4k_argument_dsv3.json

Lines changed: 3 additions & 1 deletion

@@ -8,6 +8,7 @@
     "eval_dataset_path": "/root/paddlejob/tmpspace/chenzhichao/PaddleNLP-SFT/llm/en_data/dev.json",
     "eval_dataset_prob": "1.0",
     "eval_dataset_type": "erniekit",
+    "packing": true,
     "per_device_train_batch_size": 1,
     "gradient_accumulation_steps": 16,
     "per_device_eval_batch_size": 1,
@@ -55,6 +56,7 @@
     "sharding_parallel_config": "split_param",
     "tensor_parallel_output": true,
     "num_nextn_predict_layers": 1,
-    "convert_from_hf": true
+    "convert_from_hf": true,
+    "use_attn_mask_startend_row_indices": true
 }

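With "packing" enabled, several samples are concatenated into one training sequence, and "use_attn_mask_startend_row_indices" tells the trainer to describe sample boundaries as per-position row indices instead of materializing a dense [seq_len, seq_len] attention mask. Below is a minimal sketch of how such indices could be derived from the packed segment lengths; the helper name and the exact index semantics (the row at which masking starts, taken here as the end offset of each segment) are assumptions for illustration, not the library's API.

import numpy as np

def startend_row_indices_from_lengths(seq_lens, total_len):
    # Hypothetical helper: for every key position, store the end offset of the
    # packed segment it belongs to. Rows at or beyond that offset are masked,
    # so causal attention never crosses a document boundary.
    indices = np.zeros(total_len, dtype="int32")
    offset = 0
    for n in seq_lens:
        indices[offset : offset + n] = offset + n
        offset += n
    indices[offset:] = offset  # trailing padding attends to nothing beyond itself
    return indices

# Three documents of lengths 3, 2 and 4 packed into one row of length 10.
print(startend_row_indices_from_lengths([3, 2, 4], 10))  # [3 3 3 5 5 9 9 9 9 9]

In this encoding later documents never attend back into earlier ones, and the mask metadata stays O(seq_len) rather than O(seq_len^2).
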
examples/run_finetune.py

Lines changed: 1 addition & 1 deletion

@@ -312,7 +312,7 @@ def neft_post_hook(module, input, output):
     if training_args.use_expert_parallel:
         callbacks += [MoeExpertsGradScaleCallback(training_args)]

-    logger.info("callbacks:", callbacks, flush=True)
+    logger.info(f"callbacks: {callbacks}")

     trainer = SFTTrainer(
         model=model,

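The replaced call passed print-style positional arguments plus a flush keyword; with a stdlib-style logger, info() does not accept flush and extra positional arguments are consumed by %-formatting, so the callbacks list never reached the log. The f-string form renders the value before the record is emitted. A small sketch, using the standard library logger as a stand-in for paddleformers' logger:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
callbacks = ["MoeExpertsGradScaleCallback"]

# Old style (do not use): `flush` is rejected and `callbacks` is lost to %-formatting.
# logger.info("callbacks:", callbacks, flush=True)

logger.info(f"callbacks: {callbacks}")  # callbacks: ['MoeExpertsGradScaleCallback']
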
paddleformers/nn/lm_head.py

Lines changed: 2 additions & 2 deletions

@@ -68,7 +68,7 @@ def _set_distributed_attr(self, param):
         if param.is_distributed:
             param.split_axis = 0

-    def forward(self, hidden_states, tensor_parallel_output=None):
+    def forward(self, hidden_states, tensor_parallel_output=None, gather_hidden_states=True):
         """Project hidden states to vocabulary logits.

         Args:
@@ -103,7 +103,7 @@ def forward(self, hidden_states, tensor_parallel_output=None):
             self.bias,
             tensor_parallel_output,
             training=self.training,
-            gather_hidden_states=True,
+            gather_hidden_states=gather_hidden_states,
         )

     def extra_repr(self):

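The new gather_hidden_states argument keeps the previous behavior as its default (True) but lets a caller that already holds fully gathered hidden states, or that deliberately wants to skip the tensor-parallel gather, turn it off per call. A hypothetical call site, assuming lm_head is an instance of this LMHead; the argument values are illustrative only:

# Hypothetical usage sketch; argument names follow the signature in this diff.
logits = lm_head(
    hidden_states,
    tensor_parallel_output=None,   # keep the module's configured default
    gather_hidden_states=False,    # hidden_states were already gathered upstream
)
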
paddleformers/nn/mlp.py

Lines changed: 68 additions & 49 deletions

@@ -12,10 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import paddle
+
+import contextlib
+
 import paddle.nn as nn
 from paddle.incubate.nn.functional import swiglu as fused_swiglu

 from ..generation.configuration_utils import PretrainedConfig
+from ..transformers.model_utils import dtype_guard
 from .activation import ACT2FN
 from .linear import Linear

@@ -45,66 +49,81 @@ def __init__(
         self.act_type = config.get("hidden_act", "silu")
         self.act_fn = ACT2FN[self.act_type]
         self.fuse_up_gate = fuse_up_gate
+        self.is_moe = kwargs.get("is_moe", False)
+        linear_type = None
+        if self.is_moe:
+            linear_type = "default"

-        if self.fuse_up_gate:
-            setattr(
-                self,
-                gate_up_proj_name,
-                Linear.create(
-                    self.hidden_size,
-                    self.intermediate_size * 2,
-                    has_bias=self.has_bias,
-                    config=config,
-                    fuse_matmul_bias=config.fuse_linear,
-                    tp_plan="colwise",
-                ),
-            )
-            self.up_gate_proj = getattr(self, gate_up_proj_name)
-        else:
-            # set attr for gate_proj
-            setattr(
-                self,
-                gate_proj_name,
-                Linear.create(
-                    self.hidden_size,
-                    self.intermediate_size,
-                    has_bias=self.has_bias,
-                    config=config,
-                    fuse_matmul_bias=config.fuse_linear,
-                    tp_plan="colwise",
-                ),
-            )
-            self.gate_proj = getattr(self, gate_proj_name)
+        def linear_type_gaurd():
+            if config.use_fp8:
+                return dtype_guard("float8_e4m3fn")
+            else:
+                return contextlib.nullcontext()
+
+        with linear_type_gaurd():
+            if self.fuse_up_gate:
+                setattr(
+                    self,
+                    gate_up_proj_name,
+                    Linear.create(
+                        self.hidden_size,
+                        self.intermediate_size * 2,
+                        has_bias=self.has_bias,
+                        config=config,
+                        fuse_matmul_bias=config.fuse_linear,
+                        tp_plan="colwise",
+                        linear_type=linear_type,
+                    ),
+                )
+                self.up_gate_proj = getattr(self, gate_up_proj_name)
+            else:
+                # set attr for gate_proj
+                setattr(
+                    self,
+                    gate_proj_name,
+                    Linear.create(
+                        self.hidden_size,
+                        self.intermediate_size,
+                        has_bias=self.has_bias,
+                        config=config,
+                        fuse_matmul_bias=config.fuse_linear,
+                        tp_plan="colwise",
+                        linear_type=linear_type,
+                    ),
+                )
+                self.gate_proj = getattr(self, gate_proj_name)
+
+                # set attr for up_proj
+                setattr(
+                    self,
+                    up_proj_name,
+                    Linear.create(
+                        self.hidden_size,
+                        self.intermediate_size,
+                        has_bias=self.has_bias,
+                        config=config,
+                        fuse_matmul_bias=config.fuse_linear,
+                        tp_plan="colwise",
+                        linear_type=linear_type,
+                    ),
+                )
+                self.up_proj = getattr(self, up_proj_name)

-        # set attr for up_proj
+        # set attr for down_proj
         setattr(
             self,
-            up_proj_name,
+            down_proj_name,
             Linear.create(
-                self.hidden_size,
                 self.intermediate_size,
+                self.hidden_size,
                 has_bias=self.has_bias,
                 config=config,
                 fuse_matmul_bias=config.fuse_linear,
-                tp_plan="colwise",
+                tp_plan="rowwise",
+                linear_type=linear_type,
             ),
         )
-        self.up_proj = getattr(self, up_proj_name)
-
-        # set attr for down_proj
-        setattr(
-            self,
-            down_proj_name,
-            Linear.create(
-                self.intermediate_size,
-                self.hidden_size,
-                has_bias=self.has_bias,
-                config=config,
-                fuse_matmul_bias=config.fuse_linear,
-                tp_plan="rowwise",
-            ),
-        )
-        self.down_proj = getattr(self, down_proj_name)
+        self.down_proj = getattr(self, down_proj_name)

     def forward(self, x):
         if self.fuse_up_gate:

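The new linear_type_gaurd helper (spelling as committed) selects a weight-creation context: when config.use_fp8 is set, the projection layers are built inside a float8_e4m3fn dtype guard, otherwise a no-op context is returned so the surrounding with block can stay unconditional. A self-contained sketch of that flag-selected context-manager pattern, with a stand-in class in place of paddleformers' dtype_guard:

import contextlib

class dtype_scope(contextlib.AbstractContextManager):
    # Stand-in for paddleformers' dtype_guard, kept framework-free for the sketch.
    # The real guard switches Paddle's default dtype while parameters are created,
    # then restores the previous default on exit.
    def __init__(self, dtype):
        self.dtype = dtype
    def __enter__(self):
        print(f"creating parameters as {self.dtype}")
        return self
    def __exit__(self, *exc):
        print("restoring previous default dtype")
        return False

def weight_creation_guard(use_fp8: bool):
    # Same selection logic as the commit: fp8 guard when enabled, no-op otherwise.
    return dtype_scope("float8_e4m3fn") if use_fp8 else contextlib.nullcontext()

with weight_creation_guard(use_fp8=True):
    pass  # the Linear.create(...) calls would run here under the fp8 dtype
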
paddleformers/nn/norm.py

Lines changed: 2 additions & 0 deletions

@@ -17,7 +17,9 @@
 from paddle.incubate.nn.functional import fused_rms_norm_ext

 from ..generation.configuration_utils import PretrainedConfig
+from ..transformers.llama import fusion_ops
 from ..utils.log import logger
+from ..utils.tools import get_env_device
 from .general import GeneralInterface

 try:

paddleformers/nn/pp_model.py

Lines changed: 45 additions & 8 deletions

@@ -343,8 +343,19 @@ def __init__(self, *args, **kwargs):

     def forward(self, args):
         hidden_states, _, _, _, _ = parse_args(args)
-        hidden_states = super().forward(hidden_states)
-        return hidden_states
+
+        if self.config.num_nextn_predict_layers > 0:
+            hidden_states_list = paddle.split(hidden_states, self.config.num_nextn_predict_layers + 1, axis=-1)
+            hidden_states = hidden_states_list[0]
+            hidden_states_mtp = hidden_states_list[-self.config.num_nextn_predict_layers :]
+
+            output_list = [super().forward(hidden_states)]
+            for hidden_states in hidden_states_mtp:
+                output_list.append(super().forward(hidden_states))
+            return output_list
+        else:
+            hidden_states = super().forward(hidden_states)
+            return hidden_states


 class LayerNormPipe(LayerNorm):
@@ -389,6 +400,12 @@ def forward(self, args):
             [batch_size, sequence_length, vocab_size]
             representing unnormalized log probabilities for each token
         """
+        if self.config.num_nextn_predict_layers > 0:
+            logits = []
+            for _hidden_states in args:
+                logits.append(super().forward(_hidden_states))
+            return logits
+
         hidden_states, _, _, _, _ = parse_args(args)
         logits = super().forward(hidden_states)
         return logits
@@ -507,12 +524,25 @@ class GeneralModelForCausalLMPipe(PipelinePretrainedModel, PipelineLayer):
     transpose_weight_keys = None
     _embed_cls = None
     _rotary_emb_cls = None
+    _mtp_layer_pipe_cls = None
+    _embedding_pipe_cls = None
+    _decoder_layer_pipe_cls = None
+    _criterion_pipe_cls = None
+    _lmhead_pipe_cls = None

     def __init__(self, config: PretrainedConfig, **kwargs):
         # dynamic inherit DecoderLayer
         if self._decoder_layer_cls is None:
             raise ValueError("_decoder_layer_cls must be set before init.")
-        DecoderLayerPipe = make_decoder_layer_pipe(self._decoder_layer_cls)
+
+        EmbeddingPipeCls = self._embedding_pipe_cls if self._embedding_pipe_cls is not None else Embedding
+
+        if self._decoder_layer_pipe_cls is None:
+            DecoderLayerPipe = make_decoder_layer_pipe(self._decoder_layer_cls)
+        else:
+            DecoderLayerPipe = self._decoder_layer_pipe_cls
+
+        LMHeadPipeCls = self._lmhead_pipe_cls if self._lmhead_pipe_cls is not None else LMHeadPipe

         new_initializer_range = math.sqrt(0.3333 / config.hidden_size)
         logger.info(f"change initializer-range from {config.initializer_range} to {new_initializer_range}")
@@ -559,7 +589,7 @@ def __init__(self, config: PretrainedConfig, **kwargs):
         else:
             self.add_sequential_layer(
                 LayerDesc(
-                    EmbeddingPipe, config=config, embed_cls=self._embed_cls, rotary_emb_cls=self._rotary_emb_cls
+                    EmbeddingPipeCls, config=config, embed_cls=self._embed_cls, rotary_emb_cls=self._rotary_emb_cls
                 ),
                 "model",
             )
@@ -573,6 +603,11 @@ def __init__(self, config: PretrainedConfig, **kwargs):
                 ),
                 f"model.layers.{i}",
             )
+        for i in range(config.num_nextn_predict_layers):
+            self.add_sequential_layer(
+                LayerDesc(self._mtp_layer_pipe_cls, config=config, layer_idx=config.num_hidden_layers + i),
+                f"model.layers.{config.num_hidden_layers + i}",
+            )
         for i in range(config.add_tail_layers):
             self.add_sequential_layer(
                 LayerDesc(
@@ -590,14 +625,14 @@ def __init__(self, config: PretrainedConfig, **kwargs):
             self.add_sequential_layer(
                 SharedLayerDesc(
                     "model_shared_weight",
-                    LMHeadPipe,
+                    LMHeadPipeCls,
                     shared_weight_attr="embedding_weight",
                     config=config,
                 ),
                 "lm_head",
             )
         else:
-            self.add_sequential_layer(LayerDesc(LMHeadPipe, config=config), "lm_head")
+            self.add_sequential_layer(LayerDesc(LMHeadPipeCls, config=config), "lm_head")
         recompute_interval = 0

         seg_method = config.pp_seg_method if hasattr(config, "pp_seg_method") else "layer:DecoderLayer|EmptyLayer"
@@ -630,10 +665,12 @@ def __init__(self, config: PretrainedConfig, **kwargs):
         )

     def get_loss_fn(self, config):
+        CriterionPipeCls = self._criterion_pipe_cls if self._criterion_pipe_cls is not None else CriterionLayerPipe
+
         if config.get("dpo_config", None) is not None:
-            loss_fn = CriterionLayerPipe(config, use_infohub=True)
+            loss_fn = CriterionPipeCls(config, use_infohub=True)
         else:
-            loss_fn = CriterionLayerPipe(config)
+            loss_fn = CriterionPipeCls(config)

         return loss_fn

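When config.num_nextn_predict_layers > 0, the upstream pipeline stage hands this stage the main-path hidden states and each multi-token-prediction (MTP) branch concatenated along the last axis; the norm stage splits the tensor into num_nextn_predict_layers + 1 equal chunks and processes each one, and the LM-head stage then maps the resulting list to per-branch logits. A shape-level sketch of that split, using numpy in place of paddle.split purely for illustration:

import numpy as np

# Toy shapes only: batch=2, seq=4, hidden=8, one extra MTP branch.
num_nextn_predict_layers = 1
hidden = 8
packed = np.random.rand(2, 4, hidden * (num_nextn_predict_layers + 1)).astype("float32")

# Mirror of the pipeline-stage logic above: main path and MTP branches are
# concatenated on the last axis, so an equal split recovers them.
chunks = np.split(packed, num_nextn_predict_layers + 1, axis=-1)
main, mtp_branches = chunks[0], chunks[1:]

outputs = [main] + list(mtp_branches)  # norm / lm_head are applied per branch in the real code
print([o.shape for o in outputs])      # [(2, 4, 8), (2, 4, 8)]
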
paddleformers/transformers/__init__.py

Lines changed: 2 additions & 4 deletions

@@ -123,20 +123,18 @@
         "DeepseekV2DynamicNTKScalingRotaryEmbedding",
         "DeepseekV2MLP",
         "yarn_get_mscale",
-        "DeepseekV2LMHead",
         "DeepseekV2DecoderLayer",
-        "DeepseekV2PretrainingCriterion",
         "yarn_find_correction_range",
         "get_triangle_upper_mask",
         "DeepseekV2LinearScalingRotaryEmbedding",
+        "DeepseekV2ForCausalLMPipe",
     ],
     "deepseek_v2.modeling_auto": [
         "DeepseekV2LMHeadAuto",
         "DeepseekV2ForCausalLMAuto",
         "DeepseekV2ModelAuto",
         "DeepseekV2PretrainedModelAuto",
     ],
-    "deepseek_v2.modeling_pp": ["DeepseekV2ForCausalLMPipe"],
     "deepseek_v2.mfu_utils": ["DeepSeekProjection"],
     "deepseek_v2.kernel": [
         "act_quant",
@@ -160,14 +158,14 @@
         "DeepseekV3ForSequenceClassification",
         "DeepseekV3Model",
         "DeepseekV3PretrainedModel",
+        "DeepseekV3ForCausalLMPipe",
     ],
     "deepseek_v3.modeling_auto": [
         "DeepseekV3LMHeadAuto",
         "DeepseekV3ForCausalLMAuto",
         "DeepseekV3ModelAuto",
         "DeepseekV3PretrainedModelAuto",
     ],
-    "deepseek_v3.modeling_pp": ["DeepseekV3ForCausalLMPipe"],
     "ernie4_5.configuration": ["Ernie4_5Config"],
     "ernie4_5.modeling": ["Ernie4_5Model", "Ernie4_5ForCausalLM", "Ernie4_5ForCausalLMPipe"],
     "ernie4_5.tokenizer": ["Ernie4_5Tokenizer"],

paddleformers/transformers/deepseek_v2/__init__.py

Lines changed: 1 addition & 3 deletions

@@ -50,20 +50,18 @@
         "DeepseekV2DynamicNTKScalingRotaryEmbedding",
         "DeepseekV2MLP",
         "yarn_get_mscale",
-        "DeepseekV2LMHead",
         "DeepseekV2DecoderLayer",
-        "DeepseekV2PretrainingCriterion",
         "yarn_find_correction_range",
         "get_triangle_upper_mask",
         "DeepseekV2LinearScalingRotaryEmbedding",
+        "DeepseekV2ForCausalLMPipe",
     ],
     "modeling_auto": [
         "DeepseekV2LMHeadAuto",
         "DeepseekV2ForCausalLMAuto",
         "DeepseekV2ModelAuto",
         "DeepseekV2PretrainedModelAuto",
     ],
-    "modeling_pp": ["DeepseekV2ForCausalLMPipe"],
     "mfu_utils": ["DeepSeekProjection"],
     "kernel": [
         "act_quant",

paddleformers/transformers/deepseek_v2/configuration.py

Lines changed: 3 additions & 1 deletion

@@ -160,12 +160,13 @@ def __init__(
         first_k_dense_replace=0,
         norm_topk_prob=False,
         scoring_func="softmax",
-        aux_loss_alpha=0.001,
+        aux_loss_alpha=0.0001,
         seq_aux=True,
         hidden_act="silu",
         max_position_embeddings=2048,
         seq_length=32768,
         initializer_range=0.02,
+        use_rmsnorm=True,
         rms_norm_eps=1e-6,
         use_cache=True,
         pad_token_id=None,
@@ -217,6 +218,7 @@ def __init__(
         self.num_key_value_heads = num_key_value_heads
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
+        self.use_rmsnorm = use_rmsnorm
         self.rms_norm_eps = rms_norm_eps
         self.pretraining_tp = pretraining_tp
         self.use_cache = use_cache

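The configuration gains a use_rmsnorm switch and lowers the default auxiliary-loss weight from 0.001 to 0.0001. A hedged usage example, assuming DeepseekV2Config is exported from paddleformers.transformers as the import table above suggests:

from paddleformers.transformers import DeepseekV2Config

config = DeepseekV2Config(
    use_rmsnorm=True,       # new flag; presumably selects the RMSNorm path when True
    aux_loss_alpha=0.0001,  # new default weight for the MoE load-balancing auxiliary loss
)
print(config.use_rmsnorm, config.aux_loss_alpha)
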