Commit 919e373 (parent c4f8e1c)

perf: Update moe_token_dispatcher_type default to alltoall (#2004)

Signed-off-by: Parth Mannan <pmannan@nvidia.com>
Co-authored-by: Terry Kong <terrycurtiskong@gmail.com>

14 files changed: +15 -15 lines
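
This commit flips the default MoE token dispatcher in the example configs from "allgather" to "alltoall". In Megatron-Core, the allgather dispatcher replicates every token across the expert-parallel group, while the alltoall dispatcher sends each token only to the ranks hosting its routed experts, which generally costs less communication as expert parallelism grows. A minimal sketch of the affected block is below; the megatron_cfg nesting is an assumption, since the hunks show only "policy:" as the enclosing section:

    policy:
      megatron_cfg:                             # assumed nesting; not shown in the hunks
        moe_enable_deepep: false
        moe_token_dispatcher_type: "alltoall"   # new default (previously "allgather")
        moe_shared_expert_overlap: false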

examples/configs/distillation_math.yaml (1 addition, 1 deletion)

@@ -109,7 +109,7 @@ policy: &POLICY_BASE
   defer_fp32_logits: False
   moe_per_layer_logging: False
   moe_enable_deepep: false
-  moe_token_dispatcher_type: "allgather"
+  moe_token_dispatcher_type: "alltoall"
   moe_shared_expert_overlap: false
 
   optimizer:

examples/configs/distillation_math_megatron.yaml (1 addition, 1 deletion)

@@ -60,7 +60,7 @@ policy: &POLICY_BASE
   moe_per_layer_logging: False
   defer_fp32_logits: False
   moe_enable_deepep: false
-  moe_token_dispatcher_type: "allgather"
+  moe_token_dispatcher_type: "alltoall"
   moe_shared_expert_overlap: false
   peft:
     enabled: false

examples/configs/dpo.yaml (1 addition, 1 deletion)

@@ -133,7 +133,7 @@ policy:
   defer_fp32_logits: False
   moe_per_layer_logging: False
   moe_enable_deepep: false
-  moe_token_dispatcher_type: "allgather"
+  moe_token_dispatcher_type: "alltoall"
   moe_shared_expert_overlap: false
 
   optimizer:

examples/configs/grpo_math_1B.yaml (1 addition, 1 deletion)

@@ -144,7 +144,7 @@ policy:
   defer_fp32_logits: False
   moe_per_layer_logging: False
   moe_enable_deepep: false
-  moe_token_dispatcher_type: "allgather"
+  moe_token_dispatcher_type: "alltoall"
   moe_shared_expert_overlap: false
 
   peft:

examples/configs/grpo_math_1B_megatron.yaml (1 addition, 1 deletion)

@@ -96,7 +96,7 @@ policy:
   moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
   moe_permute_fusion: false
   moe_enable_deepep: false
-  moe_token_dispatcher_type: "allgather"
+  moe_token_dispatcher_type: "alltoall"
   moe_shared_expert_overlap: false
   #gives ~20% training perf speedup with sequence packing
   apply_rope_fusion: True

examples/configs/sft.yaml (1 addition, 1 deletion)

@@ -115,7 +115,7 @@ policy:
   defer_fp32_logits: False
   moe_per_layer_logging: False
   moe_enable_deepep: false
-  moe_token_dispatcher_type: "allgather"
+  moe_token_dispatcher_type: "alltoall"
   moe_shared_expert_overlap: false
 
   peft:

examples/configs/sft_openmathinstruct2_megatron.yaml (1 addition, 1 deletion)

@@ -94,7 +94,7 @@ policy:
   bias_activation_fusion: True
   moe_per_layer_logging: False
   moe_enable_deepep: false
-  moe_token_dispatcher_type: "allgather"
+  moe_token_dispatcher_type: "alltoall"
   moe_shared_expert_overlap: false
   peft:
     enabled: false

examples/configs/vlm_grpo_3B.yaml (1 addition, 1 deletion)

@@ -114,7 +114,7 @@ policy:
   defer_fp32_logits: False
   moe_per_layer_logging: False
   moe_enable_deepep: false
-  moe_token_dispatcher_type: "allgather"
+  moe_token_dispatcher_type: "alltoall"
   moe_shared_expert_overlap: false
 
   optimizer:

examples/configs/vlm_grpo_3B_megatron.yaml (1 addition, 1 deletion)

@@ -156,7 +156,7 @@ policy:
   defer_fp32_logits: False
   moe_per_layer_logging: False
   moe_enable_deepep: false
-  moe_token_dispatcher_type: "allgather"
+  moe_token_dispatcher_type: "alltoall"
   moe_shared_expert_overlap: false
   peft:
     enabled: false

examples/nemo_gym/grpo_workplace_assistant_nemotron_nano_v2_9b.yaml (1 addition, 1 deletion)

@@ -118,7 +118,7 @@ policy:
   defer_fp32_logits: false
   moe_permute_fusion: false
   moe_enable_deepep: false
-  moe_token_dispatcher_type: "allgather"
+  moe_token_dispatcher_type: "alltoall"
   moe_shared_expert_overlap: false
 
   optimizer:
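
To restore the previous behavior for a specific run, the same key can be set back to "allgather" in a user config merged on top of these examples. A hypothetical override snippet, reusing the assumed nesting from the sketch above:

    # hypothetical override file, merged over an example config
    policy:
      megatron_cfg:                              # assumed nesting, as above
        moe_token_dispatcher_type: "allgather"   # revert to the pre-#2004 default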
