
Commit a733ccb

(1) recover bos download (2) move dsv3_fast_pretrain from env to arg (3) move load_hf_ckpt
1 parent 788e712 commit a733ccb

File tree

13 files changed: +46 additions, -428 deletions
File renamed without changes.

paddleformers/examples/deepseek_v3/modeling.py

Lines changed: 10 additions & 43 deletions
@@ -21,23 +21,13 @@
 
 from __future__ import annotations
 
-import contextlib
-import math
-import os
-import warnings
 from functools import partial
-from typing import List, Optional, Tuple, Union
 
 import paddle
-import paddle.distributed as dist
-import paddle.distributed.fleet.meta_parallel as mpu
 import paddle.nn.functional as F
 from paddle import Tensor, nn
 from paddle.distributed import fleet
-from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker
-from paddle.distributed.fleet.recompute.recompute import recompute
 from paddle.jit import to_static
-from paddle.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from paddle.utils import try_import
 
 try:
@@ -48,7 +38,6 @@
 try:
     from paddle.distributed.fleet.utils.sequence_parallel_utils import (
         GatherOp,
-        ScatterOp,
         mark_as_sequence_parallel_parameter,
     )
 except:
@@ -62,43 +51,20 @@
     flash_attention = None
 
 from config.configuration import DeepseekV2FastConfig
-from moe_gate import PretrainedMoEGate
-from moe_layer import MoEFlexTokenLayer, MoELayer
 from paddle.distributed.fleet.meta_parallel.zero_bubble_utils import WeightGradStore
 
-from paddleformers.transformers.activations import ACT2FN
-from paddleformers.transformers.conversion_utils import (
-    StateDictNameMapping,
-    init_name_mappings,
-)
-from paddleformers.transformers.deepseek_v2 import DeepseekV2RotaryEmbedding, Linear
-from paddleformers.transformers.deepseek_v2 import fp8_linear as linear_utils
 from paddleformers.transformers.deepseek_v2 import (
+    DeepseekV2RotaryEmbedding,
     yarn_find_correction_range,
     yarn_get_mscale,
     yarn_linear_ramp_mask,
 )
 from paddleformers.transformers.fp8_utils import (
-    FP8Linear,
     FP8LinearFunctionBase,
     cache_fp8_weight,
     set_parameter_color,
 )
-from paddleformers.transformers.llama import fusion_ops
-from paddleformers.transformers.llama.modeling import get_use_casual_mask
-from paddleformers.transformers.model_outputs import (
-    BaseModelOutputWithPastAndMTP,
-    CausalLMOutputWithPast,
-    SequenceClassifierOutputWithPast,
-)
-from paddleformers.transformers.model_utils import (
-    PretrainedModel,
-    dtype_guard,
-    register_base_model,
-)
-from paddleformers.transformers.utils import cast_if_needed, device_guard
-from paddleformers.utils.initializer import kaiming_uniform_
-from paddleformers.utils.log import logger
+from paddleformers.transformers.utils import device_guard
 from paddleformers.utils.tools import get_env_device
 
 try:
@@ -117,13 +83,7 @@ def swiglu(x, y=None):
 except ImportError:
     fused_partial_rope = None
 
-from paddleformers.transformers.deepseek_v2 import (
-    DeepseekV2ForCausalLM,
-    DeepseekV2ForSequenceClassification,
-    DeepseekV2Model,
-    DeepseekV2PretrainedModel,
-    DeepseekV2PretrainingCriterion,
-)
+from paddleformers.transformers.deepseek_v2 import rotate_half
 
 __all__ = [
     "DeepseekV2LMHead",
@@ -153,6 +113,13 @@ def rms_norm_fused(x_in, w, eps, use_fast_ln=False):
         return fused_ln.fused_rms_norm(x_in, w, eps)[0]
 
 
+def cast_if_needed(x, dtype):
+    """
+    cast_if_needed
+    """
+    return x.cast(dtype) if x.dtype != dtype else x
+
+
 def fusion_rms_norm(hidden_states, weight, variance_epsilon, use_fast_ln=False):
     if get_env_device() == "npu":
         return paddle.base.core.eager._run_custom_op("rms_norm_npu", hidden_states, weight, variance_epsilon)[0]
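
Note: `cast_if_needed` was previously imported from `paddleformers.transformers.utils` (that import is dropped in the hunk above) and is now defined locally. A quick sketch of its contract, assuming only a working Paddle install — the cast is skipped entirely when the dtype already matches, so the original tensor is returned untouched:

    import paddle

    def cast_if_needed(x, dtype):
        # Same logic as the helper added above: cast only on dtype mismatch.
        return x.cast(dtype) if x.dtype != dtype else x

    x = paddle.ones([2, 2], dtype="float32")
    print(cast_if_needed(x, paddle.float32) is x)    # True: no copy made
    print(cast_if_needed(x, paddle.bfloat16).dtype)  # paddle.bfloat16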

paddleformers/examples/deepseek_v3/modeling_fast.py

Lines changed: 5 additions & 6 deletions
@@ -23,7 +23,6 @@
 
 import contextlib
 import math
-import os
 import warnings
 from functools import partial
 from typing import List, Optional, Tuple, Union
@@ -65,7 +64,7 @@
     init_name_mappings,
 )
 from paddleformers.transformers.deepseek_v2 import fp8_linear as linear_utils
-from paddleformers.transformers.deepseek_v2.fp8_linear import Linear
+from paddleformers.transformers.deepseek_v2.fp8_linear import Linear as Linear_
 from paddleformers.transformers.fp8_utils import (
     FP8KeepXLinear,
     FP8Linear,
@@ -100,7 +99,6 @@ def swiglu(x, y=None):
     fused_partial_rope = None
 from modeling import (
     AddAuxiliaryLoss,
-    DeepseekV2PretrainingCriterion,
     DeepseekV2RMSNorm,
     DeepseekV2RotaryEmbedding,
     DeepseekV2YarnRotaryEmbedding,
@@ -117,6 +115,7 @@ def swiglu(x, y=None):
 from paddleformers.transformers.deepseek_v2 import (
     DeepseekV2DynamicNTKScalingRotaryEmbedding,
     DeepseekV2LinearScalingRotaryEmbedding,
+    DeepseekV2PretrainingCriterion,
     _expand_2d_mask,
     _make_causal_mask,
     is_casual_mask,
@@ -168,7 +167,7 @@ def __init__(self, config: DeepseekV2FastConfig, hidden_size=None, intermediate_
         self.hidden_size = config.hidden_size if hidden_size is None else hidden_size
         self.intermediate_size = config.intermediate_size if intermediate_size is None else intermediate_size
         self.fuse_attention_ffn = config.fuse_attention_ffn
-        Linear = FP8Linear if self.config.dsv3_use_fp8_gemm else Linear
+        Linear = FP8Linear if self.config.dsv3_use_fp8_gemm else Linear_
 
         def linear_dtype_gaurd():
             if config.use_fp8:
@@ -534,7 +533,7 @@ def linear_dtype_gaurd():
 
         self._init_rope()
         self.softmax_scale = self.q_head_dim ** (-0.5)
-        Linear = FP8Linear if self.config.dsv3_use_fp8_gemm else Linear
+        Linear = FP8Linear if self.config.dsv3_use_fp8_gemm else Linear_
 
         # fmt: off
         if self.config.tensor_parallel_degree > 1:
@@ -1260,7 +1259,7 @@ def get_tensor_parallel_split_mappings(num_layers):
     def _init_weights(self, layer):
         if self.config.tensor_parallel_degree > 1:
             rng_tracker = get_rng_state_tracker().rng_state
-        Linear = FP8Linear if self.config.dsv3_use_fp8_gemm else Linear
+        Linear = FP8Linear if self.config.dsv3_use_fp8_gemm else Linear_

paddleformers/examples/deepseek_v3/modeling_pp.py

Lines changed: 1 addition & 3 deletions
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import math
-import os
 from typing import OrderedDict, Tuple, Union
 
 import paddle
@@ -37,7 +36,6 @@
 from config.configuration import DeepseekV2FastConfig
 from modeling import (
     DeepseekV2LMHead,
-    DeepseekV2PretrainingCriterion,
     DeepseekV2RMSNorm,
     TemporaryVarContext,
     set_global_step,
@@ -49,7 +47,7 @@
 from paddle.distributed.fleet.recompute.recompute import recompute
 from paddle.distributed.fleet.utils.sequence_parallel_utils import ScatterOp
 
-from paddleformers.transformers.deepseek_v2 import DeepseekV2RotaryEmbedding
+from paddleformers.transformers.deepseek_v2 import DeepseekV2PretrainingCriterion
 from paddleformers.transformers.model_utils import PipelinePretrainedModel
 from paddleformers.utils.log import logger
 from paddleformers.utils.tools import get_env_device

paddleformers/examples/deepseek_v3/run.sh

Lines changed: 1 addition & 1 deletion
@@ -19,5 +19,5 @@
 
 # mpirun sh script/kill_process.sh
 # mpirun rm -rf output
-nohup bash script/train_gpu.sh ./config/pretrain_argument.json > run.log 2>&1 &
+nohup bash script/train_gpu.sh ./config/pretrain_argument.json --dsv3_fast_pretrain=True > run.log 2>&1 &
 
paddleformers/examples/deepseek_v3/run_pretrain.py

Lines changed: 10 additions & 6 deletions
@@ -21,7 +21,7 @@
 
 import paddle
 from config.configuration import DeepseekV2FastConfig
-from modeling import DeepseekV2ForCausalLM
+from load_hf_ckpt import load_huggingface_ckpt
 from modeling_pp import DeepseekV2ForCausalLMPipe
 
 from paddleformers.data.causal_dataset import (
@@ -40,14 +40,12 @@
     speed_metrics,
 )
 from paddleformers.transformers import (
-    AutoConfig,
-    AutoModelForCausalLM,
-    AutoModelForCausalLMPipe,
     AutoTokenizer,
     CosineAnnealingWithWarmupDecay,
     LinearAnnealingWithWarmupDecay,
 )
 from paddleformers.transformers.configuration_utils import LlmMetaConfig, llmmetaclass
+from paddleformers.transformers.deepseek_v2 import DeepseekV2ForCausalLM
 from paddleformers.utils.batch_sampler import DistributedBatchSampler
 from paddleformers.utils.log import logger
 from paddleformers.utils.tools import get_env_device
@@ -413,8 +411,7 @@ def main():
             "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
         )
 
-    tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name_or_path, **{"download_hub": "bos"})
-    # config = AutoConfig.from_pretrained("./")
+    tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name_or_path, download_hub="huggingface")
     config = DeepseekV2FastConfig.from_pretrained("./config/config.json")
 
     # set all llm config
@@ -583,6 +580,12 @@ def main():
 
     callbacks = [StepFlexToken(), FP8QuantWeightCallback()]
 
+    def resume_from_custom_func(model):
+        if training_args.resume_from_huggingface_ckpt:
+            load_huggingface_ckpt(model, training_args.resume_from_huggingface_ckpt)
+        else:
+            logger.info("No resume from checkpoint since training args 'resume_from_huggingface_ckpt' is None.")
+
     trainer = PretrainingTrainer(
         model=model,
         args=training_args,
@@ -592,6 +595,7 @@ def main():
         optimizers=(None, lr_scheduler),
         tokenizer=tokenizer,
         callbacks=callbacks,
+        resume_from_custom_func=resume_from_custom_func,
     )
 
     checkpoint = None
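
Together with the trainer.py change below, this moves HuggingFace-checkpoint loading out of the generic trainer: run_pretrain.py now owns `load_huggingface_ckpt` and hands the trainer an opaque callback. A minimal sketch of that hook pattern with the Paddle specifics stripped out (the `ToyTrainer` class and names here are illustrative, not the real paddleformers API):

    from typing import Callable, Optional

    class ToyTrainer:
        def __init__(self, model, resume_from_custom_func: Optional[Callable] = None):
            self.model = model
            # The trainer stores the callback without knowing how weights load.
            self.resume_from_custom_func = resume_from_custom_func

        def train(self):
            # Mirrors _inner_training_loop: invoked once before the epoch loop.
            if self.resume_from_custom_func is not None:
                self.resume_from_custom_func(self.model)
            # ... epoch loop would run here ...

    def resume_fn(model):
        # Application-specific loading logic lives with the caller.
        print(f"custom loading into {model!r}")

    ToyTrainer("dsv3-model", resume_from_custom_func=resume_fn).train()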

paddleformers/trainer/trainer.py

Lines changed: 9 additions & 11 deletions
@@ -86,15 +86,12 @@
 )
 from ..peft import LoKrModel, LoRAModel, PrefixModelForCausalLM, ReFTModel, VeRAModel
 from ..peft.lora import QuantizationLoRABaseLinear
+from ..quantization.quantization_linear import (
+    ColumnParallelQuantizationLinear,
+    QuantizationLinear,
+    RowParallelQuantizationLinear,
+)
 
-try:
-    from ..quantization.quantization_linear import (
-        ColumnParallelQuantizationLinear,
-        QuantizationLinear,
-        RowParallelQuantizationLinear,
-    )
-except:
-    QuantizationLinear = None
 try:
     from paddle.distributed.fleet.utils.sequence_parallel_utils import (
         register_sequence_parallel_allreduce_hooks,
@@ -199,7 +196,6 @@
     nested_numpify,
     nested_truncate,
 )
-from .utils.load_hf_ckpt import load_huggingface_ckpt
 from .utils.sharding_io import ShardingIO
 
 DEFAULT_CALLBACKS = [DefaultFlowCallback]
@@ -307,6 +303,7 @@ def __init__(
         optimizers: Tuple[paddle.optimizer.Optimizer, paddle.optimizer.lr.LRScheduler] = (None, None),
         preprocess_logits_for_metrics: Callable[[paddle.Tensor, paddle.Tensor], paddle.Tensor] = None,
         processing_class: Optional[ImageProcessingMixin] = None,
+        resume_from_custom_func: Optional[Callable] = None,
     ):
 
         if args is None:
@@ -361,6 +358,7 @@ def __init__(
         self.train_dataset = train_dataset
         self.eval_dataset = eval_dataset
         self.tokenizer = tokenizer
+        self.resume_from_custom_func = resume_from_custom_func
         if not args.skip_profile_timer:
             set_timers()
         self.timers = get_timers()
@@ -1138,8 +1136,8 @@ def _inner_training_loop(
         if self.args.ignore_data_skip:
             self.timers and self.timers("read-data").start()
 
-        if self.args.resume_from_huggingface_ckpt is not None:
-            load_huggingface_ckpt(model, self.args.resume_from_huggingface_ckpt)
+        if self.resume_from_custom_func is not None:
+            self.resume_from_custom_func(self.model)
 
         for epoch in range(epochs_trained, num_train_epochs):
             if isinstance(train_dataloader, paddle.io.DataLoader) and isinstance(
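
One side effect of the first hunk is worth spelling out: the old `try/except` only assigned a fallback to `QuantizationLinear`, so when the import failed, `ColumnParallelQuantizationLinear` and `RowParallelQuantizationLinear` were never bound at all and any later use raised `NameError` instead of a clear `ImportError`. A small demonstration of the pitfall (`missing_mod` is a deliberately nonexistent module):

    try:
        from missing_mod import (  # nonexistent: the import fails
            ColumnParallelQuantizationLinear,
            QuantizationLinear,
            RowParallelQuantizationLinear,
        )
    except ImportError:
        QuantizationLinear = None  # only one of the three names gets a fallback

    print(QuantizationLinear)  # None, as intended
    try:
        ColumnParallelQuantizationLinear
    except NameError as e:
        print("latent bug:", e)  # this name was never bound

The unguarded import now fails loudly at import time instead of deferring the error to first use.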

paddleformers/trainer/training_args.py

Lines changed: 7 additions & 2 deletions
@@ -1092,6 +1092,10 @@ class TrainingArguments:
         default=False,
         metadata={"help": "Save model to HuggingFace safetensors."},
     )
+    dsv3_fast_pretrain: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Use fast pretrain version of DeepSeekV3."},
+    )
 
     def __post_init__(self):
         world_size = paddle.distributed.get_world_size()
@@ -1409,7 +1413,7 @@ def is_segment_parallel_supported():
             else:
                 order = ["dp", "sharding", "pp", "mp"]
             if self.use_expert_parallel:
-                if not os.getenv("DSV3_FAST_PRETRAIN", "False"):
+                if not self.dsv3_fast_pretrain:
                     if self.moe_sharding_parallel_degree >= 1 and self.expert_parallel_degree > 1:
                         order.insert(-1, "ep")
                         sd_idx = order.index("sharding")
@@ -1571,7 +1575,8 @@ def is_segment_parallel_supported():
             fleet.init(is_collective=True, strategy=strategy)
             logger.info(strategy)
 
-            if os.getenv("DSV3_FAST_PRETRAIN", "False"):
+            # if os.getenv("DSV3_FAST_PRETRAIN", "False"):
+            if self.dsv3_fast_pretrain:
                 if self.expert_parallel_degree > 1:
                     self.add_moe_comm_group()
 
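Moving the switch from an environment variable to a typed field also fixes a latent truthiness bug: `os.getenv("DSV3_FAST_PRETRAIN", "False")` returns the *string* `"False"` when the variable is unset, and any non-empty string is truthy, so the old `if os.getenv(...)` branch always ran and `if not os.getenv(...)` never did. A quick demonstration:

    import os

    os.environ.pop("DSV3_FAST_PRETRAIN", None)       # variable unset
    flag = os.getenv("DSV3_FAST_PRETRAIN", "False")  # -> the string "False"
    print(bool(flag))  # True: non-empty strings are truthy
    print(not flag)    # False: the guarded branch could never fire

    os.environ["DSV3_FAST_PRETRAIN"] = "False"       # even an explicit "False"...
    print(bool(os.getenv("DSV3_FAST_PRETRAIN")))     # ...still reads as True

A real boolean field (`dsv3_fast_pretrain: Optional[bool] = False`) cannot be misread this way, and it shows up in `--help`.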
paddleformers/transformers/auto/configuration.py

Lines changed: 0 additions & 1 deletion
@@ -35,7 +35,6 @@
     [
         ("bert", "BertConfig"),
         ("deepseek_v2", "DeepseekV2Config"),
-        ("deepseek_v2_fast", "DeepseekV2FastConfig"),
         ("deepseek_v3", "DeepseekV3Config"),
         ("ernie4_5", "Ernie4_5Config"),
         ("llama", "LlamaConfig"),

paddleformers/transformers/deepseek_v2/__init__.py

Lines changed: 0 additions & 2 deletions
@@ -56,8 +56,6 @@
         "yarn_find_correction_range",
         "get_triangle_upper_mask",
         "DeepseekV2LinearScalingRotaryEmbedding",
-        "set_global_step",
-        "get_global_step",
     ],
     "modeling_auto": [
         "DeepseekV2LMHeadAuto",
