Commit 43b3f58

Fix grammatical error in MoE variable name: expert_hitted → expert_hit, hitted_experts → hit_experts (#39959)
* Fix grammatical error: expert_hitted -> expert_hit in MoE implementations
* Fix grammatical error: hitted_experts -> hit_experts in MoE implementation

Parent: dff6185 · Commit: 43b3f58

File tree: 11 files changed, +22 −22 lines changed

src/transformers/integrations/mxfp4.py

Lines changed: 2 additions & 2 deletions
@@ -264,8 +264,8 @@ def topk(vals, k):

     expt_data = compute_expt_data_torch(hist, n_local_experts, n_gates_pad)

-    hitted_experts = n_expts_act
-    return RoutingData(gate_scal, hist, n_local_experts, hitted_experts, expt_data), gather_indx, scatter_indx
+    hit_experts = n_expts_act
+    return RoutingData(gate_scal, hist, n_local_experts, hit_experts, expt_data), gather_indx, scatter_indx


 def mlp_forward(self, hidden_states):
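For context on why hit_experts is simply assigned n_expts_act here: under top-k routing, every token activates exactly n_expts_act experts, so the per-token count of hit experts is the top-k constant itself. Below is a minimal, hypothetical sketch of that routing setup in plain PyTorch. The tensor names mirror the diff, but the sizes and the torch.bincount histogram are illustrative assumptions, not the actual mxfp4/triton-kernels implementation.

import torch

# Hypothetical sizes, for illustration only.
n_tokens, n_local_experts, n_expts_act = 5, 4, 2

logits = torch.randn(n_tokens, n_local_experts)

# Each token keeps its n_expts_act highest-scoring experts.
gate_scal, expt_indx = torch.topk(logits.softmax(dim=-1), n_expts_act, dim=-1)

# Assumed equivalent of `hist` above: how many tokens landed on each expert.
hist = torch.bincount(expt_indx.flatten(), minlength=n_local_experts)

# By construction of top-k routing, every token hits exactly n_expts_act experts.
hit_experts = n_expts_act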

src/transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py

Lines changed: 2 additions & 2 deletions
@@ -356,8 +356,8 @@ def forward(
         expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)

         # Loop over all available experts in the model and perform the computation on each expert
-        expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
-        for expert_idx in expert_hitted:
+        expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+        for expert_idx in expert_hit:
             expert_layer = self.experts[expert_idx]
             idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))
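The renamed variable sits in a dispatch pattern shared by every model in this commit: build a one-hot mask over each token's selected experts, reduce it to find which experts received at least one token, and loop only over those. A self-contained sketch of that pattern, with toy dimensions and stand-in Linear experts (both assumptions for illustration; the real models use gated MLP experts):

import torch

# Toy dimensions and stand-in experts, for illustration only.
num_tokens, hidden_dim, num_experts, top_k = 6, 4, 8, 2

hidden_states = torch.randn(num_tokens, hidden_dim)
router_logits = torch.randn(num_tokens, num_experts)
routing_weights, selected_experts = torch.topk(router_logits.softmax(dim=-1), top_k, dim=-1)

experts = torch.nn.ModuleList(torch.nn.Linear(hidden_dim, hidden_dim) for _ in range(num_experts))
final_hidden_states = torch.zeros_like(hidden_states)

# (num_tokens, top_k, num_experts) -> (num_experts, top_k, num_tokens)
expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=num_experts).permute(2, 1, 0)

# An expert is "hit" if at least one token routed to it in this batch.
expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()

for expert_idx in expert_hit:
    # idx: which top-k slot chose this expert; top_x: which tokens chose it.
    idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))
    current_state = hidden_states[top_x]
    current_hidden = experts[expert_idx](current_state) * routing_weights[top_x, idx, None]
    final_hidden_states.index_add_(0, top_x, current_hidden)

Skipping unhit experts avoids running every expert's weights when only a few receive tokens, which is the point of the nonzero() reduction whose result this commit renames.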

src/transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py

Lines changed: 2 additions & 2 deletions
@@ -167,8 +167,8 @@ def forward(
         expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)

         # Loop over all available experts in the model and perform the computation on each expert
-        expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
-        for expert_idx in expert_hitted:
+        expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+        for expert_idx in expert_hit:
             expert_layer = self.experts[expert_idx]
             idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))

src/transformers/models/gpt_oss/modeling_gpt_oss.py

Lines changed: 2 additions & 2 deletions
@@ -97,8 +97,8 @@ def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weig
         expert_mask = expert_mask.permute(2, 1, 0)
         # we sum on the top_k and on the sequence lenght to get which experts
         # are hit this time around
-        expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
-        for expert_idx in expert_hitted[:]:
+        expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+        for expert_idx in expert_hit[:]:
             with torch.no_grad():
                 _, token_idx = torch.where(expert_mask[expert_idx[0]])
             current_state = hidden_states[token_idx]
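gpt_oss iterates the same expert_hit tensor but resolves token indices up front under torch.no_grad(), since the index lookup needs no gradients. A minimal continuation, reusing the tensors from the sketch above (the loop body is a placeholder; the real model applies fused per-expert gate/up projections):

for expert_idx in expert_hit[:]:
    with torch.no_grad():
        # expert_idx has shape (1,), so expert_idx[0] picks the scalar expert id.
        _, token_idx = torch.where(expert_mask[expert_idx[0]])
    current_state = hidden_states[token_idx]
    # ... apply this expert's fused projections to current_state here ...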

src/transformers/models/gpt_oss/modular_gpt_oss.py

Lines changed: 2 additions & 2 deletions
@@ -95,8 +95,8 @@ def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weig
         expert_mask = expert_mask.permute(2, 1, 0)
         # we sum on the top_k and on the sequence lenght to get which experts
         # are hit this time around
-        expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
-        for expert_idx in expert_hitted[:]:
+        expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+        for expert_idx in expert_hit[:]:
             with torch.no_grad():
                 _, token_idx = torch.where(expert_mask[expert_idx[0]])
             current_state = hidden_states[token_idx]

src/transformers/models/minimax/modeling_minimax.py

Lines changed: 2 additions & 2 deletions
@@ -465,8 +465,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # this will be used to easily index which expert is going to be sollicitated
         expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)

-        expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
-        for expert_idx in expert_hitted:
+        expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+        for expert_idx in expert_hit:
             expert_layer = self.experts[expert_idx]
             idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))
             # Index the correct hidden states and compute the expert hidden state for

src/transformers/models/mixtral/modeling_mixtral.py

Lines changed: 2 additions & 2 deletions
@@ -121,8 +121,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # this will be used to easily index which expert is going to be sollicitated
         expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)

-        expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
-        for expert_idx in expert_hitted:
+        expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+        for expert_idx in expert_hit:
             expert_layer = self.experts[expert_idx]
             idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))
             # Index the correct hidden states and compute the expert hidden state for

src/transformers/models/mixtral/modular_mixtral.py

Lines changed: 2 additions & 2 deletions
@@ -201,8 +201,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # this will be used to easily index which expert is going to be sollicitated
         expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)

-        expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
-        for expert_idx in expert_hitted:
+        expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+        for expert_idx in expert_hit:
             expert_layer = self.experts[expert_idx]
             idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))
             # Index the correct hidden states and compute the expert hidden state for

src/transformers/models/qwen2_moe/modeling_qwen2_moe.py

Lines changed: 2 additions & 2 deletions
@@ -621,8 +621,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)

         # Loop over all available experts in the model and perform the computation on each expert
-        expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
-        for expert_idx in expert_hitted:
+        expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+        for expert_idx in expert_hit:
             expert_layer = self.experts[expert_idx]
             idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))

src/transformers/models/qwen3_moe/modeling_qwen3_moe.py

Lines changed: 2 additions & 2 deletions
@@ -244,8 +244,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)

         # Loop over all available experts in the model and perform the computation on each expert
-        expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
-        for expert_idx in expert_hitted:
+        expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+        for expert_idx in expert_hit:
             expert_layer = self.experts[expert_idx]
             idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))
