Skip to content

Commit 65bb06d

Browse files
committed
doc(comment): fix code comment
1 parent 0905f8c commit 65bb06d

File tree

3 files changed

+35
-37
lines changed

3 files changed

+35
-37
lines changed

paddleformers/examples/deepseek_v3/run_pretrain.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -505,13 +505,13 @@ def main():
505505
dtype=dtype,
506506
)
507507
else:
508-
# 修改这里降低模型层数,deepseek前3层为dense层,之后才有稀疏层
509-
# config.num_hidden_layers = 4 # v3是61
510-
# config.first_k_dense_replace = 0 # v3是3
511-
# # 修改这里降低模型专家数量,如果希望进行EP并行,专家数量要能够被并行度整除
512-
# config.n_routed_experts = 64 # v3是256
513-
# config.num_experts_per_tok = 8 # v3是8
514-
# config.topk_group = 4 # v3是4
508+
# Modify here to reduce the number of model layers. The first 3 layers of DeepSeek are dense layers, and sparse layers appear after that.
509+
# config.num_hidden_layers = 4 # v3 uses 61
510+
# config.first_k_dense_replace = 0 # v3 uses 3
511+
# Modify here to reduce the number of experts in the model. If EP (Expert Parallelism) is desired, the number of experts should be divisible by the parallelism degree.
512+
# config.n_routed_experts = 64 # v3 uses 256
513+
# config.num_experts_per_tok = 8 # v3 uses 8
514+
# config.topk_group = 4 # v3 uses 4
515515

516516
# config.using_flex_token = True
517517
# config.num_nextn_predict_layers = 1

paddleformers/trainer/utils/load_hf_ckpt.py

Lines changed: 28 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121

2222
import paddle
2323

24+
from paddleformers.utils.log import logger
25+
2426
try:
2527
from safetensors import safe_open
2628
except:
@@ -50,13 +52,13 @@
5052

5153
def paddle_name_to_hf_names_ds_v2(paddle_name: str) -> List[str]:
5254
"""
53-
将Paddle模型参数名称转换为Hugging Face格式的名称列表
55+
Convert a Paddle model parameter name into a list of Hugging Face format parameter names
5456
55-
参数:
56-
paddle_name: Paddle格式的参数名称
57+
Args:
58+
paddle_name: Parameter name in Paddle format
5759
58-
返回:
59-
Hugging Face格式的参数名称列表(可能拆分多个参数)
60+
Returns:
61+
List of parameter names in Hugging Face format (may be split into multiple parameters)
6062
"""
6163
if paddle_name == "_layers.deepseek_v2.embed_tokens.weight":
6264
return ["model.embed_tokens.weight"]
@@ -69,7 +71,6 @@ def paddle_name_to_hf_names_ds_v2(paddle_name: str) -> List[str]:
6971

7072
m = _LAYER_RE_v2.match(paddle_name)
7173
if not m:
72-
print("not match here !!", paddle_name)
7374
return []
7475

7576
rest = m.group(2) or ""
@@ -125,13 +126,13 @@ def paddle_name_to_hf_names_ds_v2(paddle_name: str) -> List[str]:
125126

126127
def paddle_name_to_hf_names(paddle_name: str) -> List[str]:
127128
"""
128-
将Paddle模型参数名称转换为Hugging Face格式的名称列表
129+
Convert a Paddle model parameter name into a list of Hugging Face format parameter names
129130
130-
参数:
131-
paddle_name: Paddle格式的参数名称
131+
Args:
132+
paddle_name: Parameter name in Paddle format
132133
133-
返回:
134-
Hugging Face格式的参数名称列表(可能拆分多个参数)
134+
Returns:
135+
List of parameter names in Hugging Face format (may be split into multiple parameters)
135136
"""
136137
if paddle_name == "_layers.local_shared_layers.DeepseekV2_shared_weight.embed_tokens.weight":
137138
return ["model.embed_tokens.weight"]
@@ -142,7 +143,6 @@ def paddle_name_to_hf_names(paddle_name: str) -> List[str]:
142143
m = _LAYER_RE.match(paddle_name)
143144

144145
if not m:
145-
print("not match here !!", paddle_name)
146146
return []
147147
else:
148148
rest = m.group(3) or ""
@@ -201,8 +201,8 @@ def paddle_name_to_hf_names(paddle_name: str) -> List[str]:
201201

202202

203203
def _get_hf_prefix(segment_id: int, id_in_segment: int) -> str:
204-
"""生成Hugging Face格式的层级前缀"""
205-
# 特殊层级映射
204+
"""Generate the layer-level name prefix in Hugging Face format"""
205+
# Special layer mappings
206206
# special_cases = {(0, 0): "model", (60, 2): "model.layers.61", (60, 3): "model"}
207207
# special_cases = {(0, 0): "model", (28, 2): "model.layers.61", (28, 3): "model"}
208208
# special_cases = {(0, 0): "model", (28, 2): "model.layers.61", (4, 1): "model"}
@@ -212,7 +212,7 @@ def _get_hf_prefix(segment_id: int, id_in_segment: int) -> str:
212212
if (segment_id, id_in_segment) in special_cases:
213213
return special_cases[(segment_id, id_in_segment)]
214214

215-
# 通用层级计算
215+
# General layer calculation
216216
layer_idx = segment_id + id_in_segment - 1
217217
return f"model.layers.{layer_idx}"
218218

@@ -265,39 +265,38 @@ def prepare_tensor(tensor, dst_shape, *, force_transpose=False):
265265
axis=-1,
266266
)
267267
if t.shape != dst_shape:
268-
print("base shape", tensor[0].shape, tensor[1].shape)
269-
print("shape not match ", t.shape, dst_shape)
268+
logger.warning(
269+
f"Prepare_tensor: shape not match. base tensor shape: {tensor[0].shape}, {tensor[1].shape}, t.shape: {t.shape}, dst_shape: {dst_shape}"
270+
)
270271
sys.exit()
271272
return t
272273

273274
if force_transpose:
274275
return tensor.T.contiguous()
275276

276277
if tensor.shape == dst_shape:
277-
if len(tensor.shape) != 1:
278-
print("attention same shape not transpose !!!!!!!!!!!!!!!!!!!!!!")
279278
return tensor
280279
if len(tensor.shape) == 2 and paddle.transpose(tensor, perm=[1, 0]).contiguous().shape == dst_shape:
281280
return paddle.transpose(tensor, perm=[1, 0]).contiguous()
282281

283-
print("shape not match here")
282+
logger.warning("Prepare_tensor: shape not match.")
284283
sys.exit()
285284

286285

287286
def load_huggingface_ckpt(model, huggingface_ckpt_path):
288287
ckpt_pre = huggingface_ckpt_path
289288

290-
# 1. 加载参数-文件映射表
289+
# 1. Load the parameter-to-file mapping table
291290
weight_map_path = ckpt_pre + "/model.safetensors.index.json"
292291
with open(weight_map_path, "r") as f:
293292
weight_map = json.load(f)["weight_map"]
294293

295-
# 2. 创建反向索引:文件 -> 参数列表
294+
# 2. Create inverse index: file -> parameter list
296295
file_to_params = defaultdict(list)
297296
for param_name, filename in weight_map.items():
298297
file_to_params[filename].append(param_name)
299298

300-
# 2. 收集模型需要的文件列表
299+
# 3. Collect the list of files the model needs
301300
required_files = set()
302301
file_to_pd_param_name = defaultdict(list)
303302
pd_param_name_to_file = defaultdict(list)
@@ -309,7 +308,7 @@ def load_huggingface_ckpt(model, huggingface_ckpt_path):
309308
file_to_pd_param_name[filename].append(pd_name)
310309
pd_param_name_to_file[pd_name].append(filename)
311310
else:
312-
print(f"Warning: {pd_name} -> {hf_name[0]} not found in weight map")
311+
logger.warning(f"Warning: {pd_name} -> {hf_name[0]} not found in weight map")
313312
import sys
314313

315314
sys.exit()
@@ -322,15 +321,15 @@ def load_huggingface_ckpt(model, huggingface_ckpt_path):
322321
if filename != pd_param_name_to_file[pd_name][0]:
323322
pd_param_name_to_file[pd_name].append(filename)
324323
else:
325-
print(f"Warning: {pd_name} -> {hf_name[1]} not found in weight map")
324+
logger.warning(f"Warning: {pd_name} -> {hf_name[1]} not found in weight map")
326325

327-
# 3. 按文件分组加载
326+
# 4. Load parameters grouped by file
328327
check_list = []
329-
print("Start load huggingface ckpt")
328+
logger.info("Start load huggingface ckpt")
330329
for i, filename in enumerate(required_files):
331330
try:
332331
with safe_open(ckpt_pre + filename, framework="paddle", device="cpu") as f:
333-
# 加载该文件包含的所有参数
332+
# Load all parameters contained in this file
334333
pd_params = file_to_pd_param_name[filename]
335334
for pd_param in pd_params:
336335
if pd_param in check_list:
@@ -374,5 +373,5 @@ def load_huggingface_ckpt(model, huggingface_ckpt_path):
374373
check_list.append(pd_param)
375374

376375
except Exception as e:
377-
print(f"Error loading {filename}: {str(e)}")
376+
logger.warning(f"Error loading {filename}: {str(e)}")
378377
raise

paddleformers/transformers/moe_gate.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -579,7 +579,6 @@ def topkgating_nodrop(self, gates: paddle.Tensor):
579579
mask = paddle.zeros_like(gates).put_along_axis(top_idx, paddle.ones([], dtype="float32"), axis=1)
580580

581581
gates_masked = gates * mask
582-
# if self.training:
583582
gates_s = paddle.sum(gates_masked, axis=-1, keepdim=True)
584583
denom_s = paddle.clip(gates_s, min=paddle.finfo(gates_masked.dtype).eps)
585584

0 commit comments

Comments
 (0)