@@ -134,23 +134,49 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
             selected_experts, num_classes=self.num_experts
         ).permute(2, 1, 0)
 
-        # Loop over all available experts in the model and perform the computation on each expert
-        for expert_idx in range(self.num_experts):
-            expert_layer = self.experts[expert_idx]
-            idx, top_x = torch.where(expert_mask[expert_idx])
-            # Index the correct hidden states and compute the expert hidden state for
-            # the current expert. We need to make sure to multiply the output hidden
-            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
-            current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
-            current_hidden_states = (
-                expert_layer(current_state) * routing_weights[top_x, idx, None]
-            )
-
-            # However `index_add_` only supports torch tensors for indexing so we'll use
-            # the `top_x` tensor here.
-            final_hidden_states.index_add_(
-                0, top_x, current_hidden_states.to(hidden_states.dtype)
-            )
+        # Separate paths for training (with .nonzero()) and inference (without .nonzero())
+        if self.training:
+            # Training path: use .nonzero() for efficiency (skip non-selected experts)
+            expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+            for expert_idx in expert_hit:
+                expert_layer = self.experts[expert_idx]
+                idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))
+                # Index the correct hidden states and compute the expert hidden state for
+                # the current expert. We need to make sure to multiply the output hidden
+                # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
+                current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
+                current_hidden_states = (
+                    expert_layer(current_state) * routing_weights[top_x, idx, None]
+                )
+
+                # However `index_add_` only supports torch tensors for indexing so we'll use
+                # the `top_x` tensor here.
+                final_hidden_states.index_add_(
+                    0, top_x, current_hidden_states.to(hidden_states.dtype)
+                )
+        else:
+            # Inference path: loop over all experts for torch.export compatibility
+            for expert_idx in range(self.num_experts):
+                expert_layer = self.experts[expert_idx]
+                idx, top_x = torch.where(expert_mask[expert_idx])
+
+                # Skip if no tokens are assigned to this expert
+                if top_x.shape[0] == 0:
+                    continue
+
+                # Index the correct hidden states and compute the expert hidden state for
+                # the current expert. We need to make sure to multiply the output hidden
+                # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
+                current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
+                current_hidden_states = (
+                    expert_layer(current_state) * routing_weights[top_x, idx, None]
+                )
+
+                # However `index_add_` only supports torch tensors for indexing so we'll use
+                # the `top_x` tensor here.
+                final_hidden_states.index_add_(
+                    0, top_x, current_hidden_states.to(hidden_states.dtype)
+                )
 
         final_hidden_states = final_hidden_states.reshape(
             batch_size, sequence_length, hidden_dim
         )
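For context, here is a minimal, self-contained sketch of the routing bookkeeping this hunk manipulates. The toy sizes (`num_tokens`, `top_k`, etc.) and the `nn.Linear` experts are illustrative assumptions, not taken from the model; only the `one_hot`/`permute`/`torch.where`/`index_add_` pattern mirrors the code above.

```python
import torch

torch.manual_seed(0)
num_tokens, hidden_dim, num_experts, top_k = 4, 8, 3, 2

hidden_states = torch.randn(num_tokens, hidden_dim)
router_logits = torch.randn(num_tokens, num_experts)

# Top-k routing: both tensors are (num_tokens, top_k)
routing_weights, selected_experts = torch.topk(
    router_logits.softmax(dim=-1), top_k, dim=-1
)

# one_hot -> (num_tokens, top_k, num_experts); permute -> (num_experts, top_k, num_tokens)
expert_mask = torch.nn.functional.one_hot(
    selected_experts, num_classes=num_experts
).permute(2, 1, 0)

experts = [torch.nn.Linear(hidden_dim, hidden_dim) for _ in range(num_experts)]
final_hidden_states = torch.zeros_like(hidden_states)

for expert_idx in range(num_experts):
    # idx: which top-k slot picked this expert; top_x: which token rows it picked
    idx, top_x = torch.where(expert_mask[expert_idx])
    if top_x.shape[0] == 0:
        continue
    current_state = hidden_states[top_x]
    # Weight each expert output by the routing weight of its (token, slot) pair
    current = experts[expert_idx](current_state) * routing_weights[top_x, idx, None]
    # index_add_ scatters the weighted rows back to their original token positions
    final_hidden_states.index_add_(0, top_x, current)

print(final_hidden_states.shape)  # torch.Size([4, 8])
```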
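One non-obvious detail in the training path: `expert_hit` is the output of `.nonzero()`, so iterating over it yields shape-`(1,)` index tensors, and indexing `expert_mask` with one of those leaves a leading singleton dimension; that is why the new code calls `.squeeze(0)` before `torch.where`. A small sketch with made-up mask values (shapes chosen for illustration only):

```python
import torch

# Hypothetical mask for num_experts=3, top_k=2, num_tokens=2
expert_mask = torch.tensor([
    [[1, 0], [0, 0]],  # expert 0: selected by token 0 in slot 0
    [[0, 0], [0, 0]],  # expert 1: never selected
    [[0, 1], [1, 0]],  # expert 2: selected by token 1 (slot 0) and token 0 (slot 1)
])

# Count (slot, token) hits per expert, keep only experts with at least one
expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
print(expert_hit)  # tensor([[0], [2]]) -- expert 1 is skipped entirely

for expert_idx in expert_hit:
    # expert_idx has shape (1,), so indexing adds a leading dim of size 1
    print(expert_mask[expert_idx].shape)             # torch.Size([1, 2, 2])
    print(expert_mask[expert_idx].squeeze(0).shape)  # torch.Size([2, 2])
```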