
Commit 70865ce

remove unused settings and temporarily remove other model_implementations

1 parent: 73c3ce1


52 files changed: +2 additions, −5300 deletions

ci_scripts/train/ci_7B_sft.py

Lines changed: 0 additions & 2 deletions

@@ -101,14 +101,12 @@
 model = dict(
     checkpoint=False,
     num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
     vocab_size=VOCAB_SIZE,
     embed_grad_scale=1,
     parallel_output=True,
     hidden_size=HIDDEN_SIZE,
     num_layers=NUM_LAYER,
     mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
     dtype="torch.bfloat16",
     norm_type="rmsnorm",
     layer_norm_epsilon=1e-5,
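
For reference, the trimmed model block in these configs now reads roughly as in the sketch below. This is a minimal, hypothetical sketch, not taken from the diff: the constant values are placeholders (each real config defines its own VOCAB_SIZE, HIDDEN_SIZE, and so on earlier in the file), and it assumes the two removed flags, embed_split_hidden and apply_post_layer_norm, are now fixed inside the remaining model implementation rather than exposed as user-facing settings.

# Minimal sketch of the trimmed model config after this commit.
# All constant values below are hypothetical placeholders.
NUM_ATTENTION_HEAD = 32  # hypothetical placeholder
VOCAB_SIZE = 103168      # hypothetical placeholder
HIDDEN_SIZE = 4096       # hypothetical placeholder
NUM_LAYER = 32           # hypothetical placeholder
MLP_RATIO = 8 / 3        # hypothetical placeholder

model = dict(
    checkpoint=False,  # activation checkpointing: True/False/[0-1]
    num_attention_heads=NUM_ATTENTION_HEAD,
    vocab_size=VOCAB_SIZE,
    embed_grad_scale=1,
    parallel_output=True,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYER,
    mlp_ratio=MLP_RATIO,
    dtype="torch.bfloat16",
    norm_type="rmsnorm",
    layer_norm_epsilon=1e-5,
)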

configs/1.8B_MoE16_sft.py

Lines changed: 0 additions & 2 deletions

@@ -136,14 +136,12 @@
 model = dict(
     checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
     num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
     vocab_size=VOCAB_SIZE,
     embed_grad_scale=1,
     parallel_output=False,
     hidden_size=HIDDEN_SIZE,
     num_layers=NUM_LAYER,
     mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
     dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
     norm_type="rmsnorm",
     layer_norm_epsilon=1e-5,

configs/57B_qwen2_MoE.py

Lines changed: 0 additions & 226 deletions
This file was deleted.

configs/7B_MoE4_sft.py

Lines changed: 0 additions & 2 deletions

@@ -149,14 +149,12 @@
 model = dict(
     checkpoint=False,  # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
     num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
     vocab_size=VOCAB_SIZE,
     embed_grad_scale=1,
     parallel_output=True,
     hidden_size=HIDDEN_SIZE,
     num_layers=NUM_LAYER,
     mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
     dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
     norm_type="rmsnorm",
     layer_norm_epsilon=1e-5,

0 commit comments