Commit 788e712 (1 parent: 65bb06d)

(1) Move all updates into the example folder; (2) move DSV3_USE_FP8_GEMM, DSV3_USE_ATTEN_RECOMPUTE, DSV3_USE_FP8_DISPATCH, and USE_DS_GEMM into config.json.

26 files changed: +6,880 -4,624 lines

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import sys
from typing import TYPE_CHECKING

from paddleformers.utils.lazy_import import _LazyModule

import_structure = {
    "configuration": ["DeepseekV2FastConfig"],
    "modeling": [
        "masked_fill",
        "DeepseekV2Attention",
        "MoEGate",
        "FakeGate",
        "DeepseekV2ForCausalLM",
        "_make_causal_mask",
        "is_casual_mask",
        "DeepseekV2MoE",
        "DeepseekV2MoEFlexToken",
        "scaled_dot_product_attention",
        "DeepseekV2RotaryEmbedding",
        "rotate_half",
        "DeepseekV2MTPLayer",
        "DeepseekV2RMSNorm",
        "DeepseekV2YarnRotaryEmbedding",
        "parallel_matmul",
        "DeepseekV2PretrainedModel",
        "AddAuxiliaryLoss",
        "apply_rotary_pos_emb",
        "assign_kv_heads",
        "DeepseekV2ForSequenceClassification",
        "_expand_2d_mask",
        "DeepseekV2Model",
        "repeat_kv",
        "yarn_find_correction_dim",
        "yarn_linear_ramp_mask",
        "DeepseekV2DynamicNTKScalingRotaryEmbedding",
        "DeepseekV2MLP",
        "yarn_get_mscale",
        "DeepseekV2LMHead",
        "DeepseekV2DecoderLayer",
        "DeepseekV2PretrainingCriterion",
        "yarn_find_correction_range",
        "get_triangle_upper_mask",
        "DeepseekV2LinearScalingRotaryEmbedding",
        "set_global_step",
        "get_global_step",
    ],
    "modeling_auto": [
        "DeepseekV2LMHeadAuto",
        "DeepseekV2ForCausalLMAuto",
        "DeepseekV2ModelAuto",
        "DeepseekV2PretrainedModelAuto",
    ],
    "modeling_pp": ["DeepseekV2ForCausalLMPipe"],
    "mfu_utils": ["DeepSeekProjection"],
    "kernel": [
        "act_quant",
        "weight_dequant",
        "fp8_gemm",
        "weight_dequant_kernel",
        "act_quant_kernel",
        "fp8_gemm_kernel",
    ],
    "tokenizer_fast": ["DeepseekTokenizerFast"],
    "fp8_linear": [
        "Linear",
        "ColumnParallelLinear",
        "RowParallelLinear",
        "ColumnSequenceParallelLinear",
        "RowSequenceParallelLinear",
    ],
}

if TYPE_CHECKING:
    from .configuration import *
    from .modeling import *
    from .modeling_auto import *
    from .modeling_pp import *
    from .tokenizer_fast import *
else:
    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        import_structure,
        module_spec=__spec__,
    )
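
The file above defers every import: import_structure only records which submodule defines each public symbol, and _LazyModule resolves them on first attribute access. Below is a minimal sketch of that lazy-import pattern, assuming Hugging Face-style lazy-module semantics; _LazyModuleSketch is a hypothetical name, and the real paddleformers.utils.lazy_import._LazyModule differs in detail.

import importlib
import types


class _LazyModuleSketch(types.ModuleType):
    """Sketch of the lazy-import pattern; not the actual _LazyModule."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        # Invert the structure: map each exported symbol to its submodule.
        self._symbol_to_module = {
            symbol: submodule
            for submodule, symbols in import_structure.items()
            for symbol in symbols
        }

    def __getattr__(self, name):
        # Only called when normal lookup fails, i.e. on first access.
        submodule = self._symbol_to_module.get(name)
        if submodule is None:
            raise AttributeError(f"module {self.__name__!r} has no attribute {name!r}")
        module = importlib.import_module(f".{submodule}", self.__name__)
        value = getattr(module, name)
        setattr(self, name, value)  # cache so later lookups skip __getattr__
        return value

Replacing sys.modules[__name__] with such an object keeps importing the package cheap: heavy submodules like modeling and kernel load only when a symbol such as DeepseekV2ForCausalLM is first referenced, while the TYPE_CHECKING branch still gives static analyzers the full namespace.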

paddleformers/examples/deepseek_v3/config/config.json

Lines changed: 11 additions & 6 deletions
@@ -1,13 +1,13 @@
 {
   "architectures": [
-    "DeepseekV3ForCausalLM"
+    "DeepseekV2ForCausalLM"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
   "auto_map": {
-    "AutoConfig": "configuration_deepseek.DeepseekV3Config",
-    "AutoModel": "modeling_deepseek.DeepseekV3Model",
-    "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
+    "AutoConfig": "DeepseekV2FastConfig",
+    "AutoModel": "DeepseekV2ModelFast",
+    "AutoModelForCausalLM": "DeepseekV2ForCausalLM"
   },
   "aux_loss_alpha": 0.001,
   "bos_token_id": 0,
@@ -20,7 +20,7 @@
   "intermediate_size": 18432,
   "kv_lora_rank": 512,
   "max_position_embeddings": 163840,
-  "model_type": "deepseek_v3",
+  "model_type": "deepseek_v2_fast",
   "moe_intermediate_size": 2048,
   "moe_layer_freq": 1,
   "n_group": 8,
@@ -71,5 +71,10 @@
   "use_dualpipev": true,
   "send_mtp_embed": true,
   "offline_quant_expert_weight": false,
-  "clear_origin_weight_when_offline_quant": false
+  "clear_origin_weight_when_offline_quant": false,
+  "dsv3_use_fp8_gemm": true,
+  "dsv3_use_atten_recompute": true,
+  "use_ds_gemm": false,
+  "dsv3_use_fp8_dispatch": true,
+  "fa_version": 3
 }
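
Per the commit message, the four DSV3_* switches that were previously read from environment variables are now keys in this config. A minimal sketch of consuming them, assuming a script loads the JSON directly; the actual wiring inside paddleformers may differ, and the variable names below are illustrative:

import json

# Load the example config; path taken from the diff header above.
with open("paddleformers/examples/deepseek_v3/config/config.json") as f:
    cfg = json.load(f)

# Each key replaces the environment variable noted alongside it.
use_fp8_gemm = cfg.get("dsv3_use_fp8_gemm", False)                # was DSV3_USE_FP8_GEMM
use_atten_recompute = cfg.get("dsv3_use_atten_recompute", False)  # was DSV3_USE_ATTEN_RECOMPUTE
use_ds_gemm = cfg.get("use_ds_gemm", False)                       # was USE_DS_GEMM
use_fp8_dispatch = cfg.get("dsv3_use_fp8_dispatch", False)        # was DSV3_USE_FP8_DISPATCH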
