Commit 788e712 (1 parent: 65bb06d)

(1) Move all updates into the example folder; (2) move DSV3_USE_FP8_GEMM, DSV3_USE_ATTEN_RECOMPUTE, DSV3_USE_FP8_DISPATCH, and USE_DS_GEMM into config.json.

26 files changed: +6,880 -4,624 lines

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import sys
from typing import TYPE_CHECKING

from paddleformers.utils.lazy_import import _LazyModule

import_structure = {
    "configuration": ["DeepseekV2FastConfig"],
    "modeling": [
        "masked_fill",
        "DeepseekV2Attention",
        "MoEGate",
        "FakeGate",
        "DeepseekV2ForCausalLM",
        "_make_causal_mask",
        "is_casual_mask",
        "DeepseekV2MoE",
        "DeepseekV2MoEFlexToken",
        "scaled_dot_product_attention",
        "DeepseekV2RotaryEmbedding",
        "rotate_half",
        "DeepseekV2MTPLayer",
        "DeepseekV2RMSNorm",
        "DeepseekV2YarnRotaryEmbedding",
        "parallel_matmul",
        "DeepseekV2PretrainedModel",
        "AddAuxiliaryLoss",
        "apply_rotary_pos_emb",
        "assign_kv_heads",
        "DeepseekV2ForSequenceClassification",
        "_expand_2d_mask",
        "DeepseekV2Model",
        "repeat_kv",
        "yarn_find_correction_dim",
        "yarn_linear_ramp_mask",
        "DeepseekV2DynamicNTKScalingRotaryEmbedding",
        "DeepseekV2MLP",
        "yarn_get_mscale",
        "DeepseekV2LMHead",
        "DeepseekV2DecoderLayer",
        "DeepseekV2PretrainingCriterion",
        "yarn_find_correction_range",
        "get_triangle_upper_mask",
        "DeepseekV2LinearScalingRotaryEmbedding",
        "set_global_step",
        "get_global_step",
    ],
    "modeling_auto": [
        "DeepseekV2LMHeadAuto",
        "DeepseekV2ForCausalLMAuto",
        "DeepseekV2ModelAuto",
        "DeepseekV2PretrainedModelAuto",
    ],
    "modeling_pp": ["DeepseekV2ForCausalLMPipe"],
    "mfu_utils": ["DeepSeekProjection"],
    "kernel": [
        "act_quant",
        "weight_dequant",
        "fp8_gemm",
        "weight_dequant_kernel",
        "act_quant_kernel",
        "fp8_gemm_kernel",
    ],
    "tokenizer_fast": ["DeepseekTokenizerFast"],
    "fp8_linear": [
        "Linear",
        "ColumnParallelLinear",
        "RowParallelLinear",
        "ColumnSequenceParallelLinear",
        "RowSequenceParallelLinear",
    ],
}

if TYPE_CHECKING:
    from .configuration import *
    from .modeling import *
    from .modeling_auto import *
    from .modeling_pp import *
    from .tokenizer_fast import *
else:
    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        import_structure,
        module_spec=__spec__,
    )
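
The file above defers every import: import_structure only records which submodule defines each public symbol, and _LazyModule resolves them on first attribute access. Below is a minimal sketch of that lazy-import pattern, assuming Hugging Face-style lazy-module semantics; _LazyModuleSketch is a hypothetical name, and the real paddleformers.utils.lazy_import._LazyModule differs in detail.

import importlib
import types


class _LazyModuleSketch(types.ModuleType):
    """Sketch of the lazy-import pattern; not the actual _LazyModule."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        # Invert the structure: map each exported symbol to its submodule.
        self._symbol_to_module = {
            symbol: submodule
            for submodule, symbols in import_structure.items()
            for symbol in symbols
        }

    def __getattr__(self, name):
        # Only called when normal lookup fails, i.e. on first access.
        submodule = self._symbol_to_module.get(name)
        if submodule is None:
            raise AttributeError(f"module {self.__name__!r} has no attribute {name!r}")
        module = importlib.import_module(f".{submodule}", self.__name__)
        value = getattr(module, name)
        setattr(self, name, value)  # cache so later lookups skip __getattr__
        return value

Replacing sys.modules[__name__] with such an object keeps importing the package cheap: heavy submodules like modeling and kernel load only when a symbol such as DeepseekV2ForCausalLM is first referenced, while the TYPE_CHECKING branch still gives static analyzers the full namespace.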

paddleformers/examples/deepseek_v3/config/config.json

Lines changed: 11 additions & 6 deletions
@@ -1,13 +1,13 @@
 {
   "architectures": [
-    "DeepseekV3ForCausalLM"
+    "DeepseekV2ForCausalLM"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
   "auto_map": {
-    "AutoConfig": "configuration_deepseek.DeepseekV3Config",
-    "AutoModel": "modeling_deepseek.DeepseekV3Model",
-    "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
+    "AutoConfig": "DeepseekV2FastConfig",
+    "AutoModel": "DeepseekV2ModelFast",
+    "AutoModelForCausalLM": "DeepseekV2ForCausalLM"
   },
   "aux_loss_alpha": 0.001,
   "bos_token_id": 0,
@@ -20,7 +20,7 @@
   "intermediate_size": 18432,
   "kv_lora_rank": 512,
   "max_position_embeddings": 163840,
-  "model_type": "deepseek_v3",
+  "model_type": "deepseek_v2_fast",
   "moe_intermediate_size": 2048,
   "moe_layer_freq": 1,
   "n_group": 8,
@@ -71,5 +71,10 @@
   "use_dualpipev": true,
   "send_mtp_embed": true,
   "offline_quant_expert_weight": false,
-  "clear_origin_weight_when_offline_quant": false
+  "clear_origin_weight_when_offline_quant": false,
+  "dsv3_use_fp8_gemm": true,
+  "dsv3_use_atten_recompute": true,
+  "use_ds_gemm": false,
+  "dsv3_use_fp8_dispatch": true,
+  "fa_version": 3
 }
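
Per the commit message, the four DSV3_* switches that were previously read from environment variables are now keys in this config. A minimal sketch of consuming them, assuming a script loads the JSON directly; the actual wiring inside paddleformers may differ, and the variable names below are illustrative:

import json

# Load the example config; path taken from the diff header above.
with open("paddleformers/examples/deepseek_v3/config/config.json") as f:
    cfg = json.load(f)

# Each key replaces the environment variable noted alongside it.
use_fp8_gemm = cfg.get("dsv3_use_fp8_gemm", False)                # was DSV3_USE_FP8_GEMM
use_atten_recompute = cfg.get("dsv3_use_atten_recompute", False)  # was DSV3_USE_ATTEN_RECOMPUTE
use_ds_gemm = cfg.get("use_ds_gemm", False)                       # was USE_DS_GEMM
use_fp8_dispatch = cfg.get("dsv3_use_fp8_dispatch", False)        # was DSV3_USE_FP8_DISPATCH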
