@@ -539,7 +539,7 @@ def __init__(
             prefix="deepseek_v3.embed_tokens",
         )
 
-        self.decoder_layers = nn.LayerList(
+        self.layers = nn.LayerList(
             [
                 DeepSeekV3DecoderLayer(
                     fd_config,
@@ -564,7 +564,7 @@ def load_state_dict(self, state_dict):
         self.norm.load_state_dict(state_dict)
         for i in range(self.num_layers):
             logger.info(f"Start load layer {i}")
-            self.decoder_layers[i].load_state_dict(state_dict)
+            self.layers[i].load_state_dict(state_dict)
 
     def forward(
         self,
@@ -578,7 +578,7 @@ def forward(
 
         residual = None
         for i in range(self.num_layers):
-            hidden_states, residual = self.decoder_layers[i](
+            hidden_states, residual = self.layers[i](
                 forward_meta,
                 hidden_states,
                 residual,
@@ -658,12 +658,11 @@ def load_weights(self, weights_iterator) -> None:
 
         for loaded_weight_name, loaded_weight in weights_iterator:
             loaded_weight_name = loaded_weight_name.replace("deepseek_v3", "model")
-            loaded_weight_name = loaded_weight_name.replace("layers", "decoder_layers")
 
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in loaded_weight_name:
                     continue
-                if "mlp.experts." in loaded_weight_name and loaded_weight_name not in params_dict:
+                if "mlp.experts." in loaded_weight_name:
                     continue
                 model_param_name = loaded_weight_name.replace(weight_name, param_name)
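With the module attribute renamed from `decoder_layers` to `layers`, checkpoint keys such as `deepseek_v3.layers.0.*` already line up with the model's parameter names after the root-prefix rewrite alone, so the extra `replace("layers", "decoder_layers")` hop is dropped. Below is a minimal sketch of the resulting name-mapping flow, assuming a hypothetical `stacked_params_mapping`; the mapping entries and the sample key are illustrative, not taken from the actual file:

```python
# Sketch of the post-change name mapping; stacked_params_mapping entries
# and the sample key below are illustrative assumptions.
stacked_params_mapping = [
    # (param_name, weight_name, shard_id)
    ("qkv_proj", "q_proj", "q"),
    ("qkv_proj", "k_proj", "k"),
    ("qkv_proj", "v_proj", "v"),
]

def map_weight_name(loaded_weight_name):
    # Checkpoint keys are rooted at "deepseek_v3"; the model now exposes
    # `layers`, so only the root prefix needs rewriting -- the old
    # replace("layers", "decoder_layers") step is no longer needed.
    loaded_weight_name = loaded_weight_name.replace("deepseek_v3", "model")
    for param_name, weight_name, shard_id in stacked_params_mapping:
        if weight_name not in loaded_weight_name:
            continue
        # Expert weights are now skipped in this loop unconditionally;
        # presumably they are handled by a separate expert mapping.
        if "mlp.experts." in loaded_weight_name:
            continue
        return loaded_weight_name.replace(weight_name, param_name)
    return None

# "deepseek_v3.layers.0.self_attn.q_proj.weight"
#   -> "model.layers.0.self_attn.qkv_proj.weight"
print(map_weight_name("deepseek_v3.layers.0.self_attn.q_proj.weight"))
```

Note the second change in `load_weights`: the stacked-params loop previously skipped expert weights only when they were absent from `params_dict`, whereas it now skips every `mlp.experts.` key, leaving them to whatever expert-specific loading path follows.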