Commit 1b6c1f4

fix:(moe config): default using_flex_token
1 parent 4e79040 commit 1b6c1f4

File tree

2 files changed: +8 -1 lines changed


paddleformers/transformers/moe_gate.py

Lines changed: 3 additions & 0 deletions

@@ -210,6 +210,9 @@ def __init__(self, config, num_experts, expert_hidden_size, **kwargs):
         self.norm_topk_prob = kwargs.pop("norm_topk_prob", False)
         self.routed_scaling_factor = kwargs.pop("routed_scaling_factor", 1.0)
 
+        # for flex token moe layer
+        self.using_flex_token = kwargs.pop("using_flex_token", False)
+
     def _priority(self, topk_idx: paddle.Tensor, capacity: int) -> paddle.Tensor:
         """_summary_
         The priority is the cumulative sum of the expert indices.
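On the gate side, the new attribute is read with kwargs.pop and a False default, so gates constructed by code that never passes using_flex_token keep the previous token-drop behaviour. Below is a minimal standalone sketch of that pattern; the class name _GateKwargsSketch is hypothetical and only illustrates the kwargs.pop default, it is not part of the PaddleFormers API.

class _GateKwargsSketch:
    def __init__(self, **kwargs):
        # Same pattern as the diff: pop each option with a default so missing
        # keys fall back silently instead of raising KeyError.
        self.norm_topk_prob = kwargs.pop("norm_topk_prob", False)
        self.routed_scaling_factor = kwargs.pop("routed_scaling_factor", 1.0)
        self.using_flex_token = kwargs.pop("using_flex_token", False)

print(_GateKwargsSketch().using_flex_token)                       # False (default)
print(_GateKwargsSketch(using_flex_token=True).using_flex_token)  # True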

paddleformers/transformers/moe_layer.py

Lines changed: 5 additions & 1 deletion

@@ -277,7 +277,11 @@ def __init__(
     def update_flex_token(self):
         from paddleformers.transformers.deepseek_v2 import get_global_step
 
-        if (not self.config.using_flex_token) or (get_global_step() < self.token_drop_steps):
+        if (
+            (not hasattr(self.config, "using_flex_token"))
+            or (not self.config.using_flex_token)
+            or (get_global_step() < self.token_drop_steps)
+        ):
             self.using_flex_token = False
             self.router.using_flex_token = False
         else:
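On the layer side, the guard in update_flex_token now treats a config that simply lacks the using_flex_token attribute the same as one that sets it to False, instead of raising AttributeError. A small standalone sketch of the resulting behaviour follows; the config classes and the resolve_flex_token helper are hypothetical, and only the three-way condition mirrors the patch.

class _LegacyConfig:
    pass  # predates the flag: no using_flex_token attribute at all

class _FlexConfig:
    using_flex_token = True

def resolve_flex_token(config, global_step, token_drop_steps):
    # Missing attribute, an explicit False, or still being inside the
    # token-drop warm-up window all disable flex-token routing.
    if (
        (not hasattr(config, "using_flex_token"))
        or (not config.using_flex_token)
        or (global_step < token_drop_steps)
    ):
        return False
    return True

print(resolve_flex_token(_LegacyConfig(), global_step=100, token_drop_steps=10))  # False
print(resolve_flex_token(_FlexConfig(), global_step=100, token_drop_steps=10))    # True
print(resolve_flex_token(_FlexConfig(), global_step=5, token_drop_steps=10))      # False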
