@@ -695,6 +695,26 @@ def determine_expert_map(
     return (local_num_experts, expert_map)
 
 
+def get_compressed_expert_map(expert_map: torch.Tensor) -> str:
+    """
+    Compresses the expert map by removing any -1 entries.
+
+    Args:
+        expert_map (torch.Tensor): A tensor of shape (global_num_experts,)
+            mapping from global to local index. Contains -1 for experts not
+            assigned to the current rank.
+
+    Returns:
+        str: A string mapping from local to global index.
+            Using str to support hashing for logging once only.
+    """
+    global_indices = torch.where(expert_map != -1)[0]
+    local_indices = expert_map[global_indices]
+    return ", ".join(
+        f"{local_index.item()}->{global_index.item()}"
+        for local_index, global_index in zip(local_indices, global_indices))
+
+
 @CustomOp.register("fused_moe")
 class FusedMoE(CustomOp):
     """FusedMoE layer for MoE models.
@@ -795,6 +815,12 @@ def __init__(
                 ep_size=self.ep_size,
                 ep_rank=self.ep_rank,
                 global_num_experts=self.global_num_experts)
+            logger.info_once(
+                "[EP Rank %s/%s] Expert parallelism is enabled. Local/global"
+                " number of experts: %s/%s. Experts local to global index map:"
+                " %s.", self.ep_rank, self.ep_size, self.local_num_experts,
+                self.global_num_experts,
+                get_compressed_expert_map(self.expert_map))
         else:
             self.local_num_experts, self.expert_map = (self.global_num_experts,
                                                        None)
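
As a reviewer aid, here is a minimal, self-contained sketch of what the new helper produces. The `expert_map` values below are hypothetical (8 global experts split across 2 EP ranks, viewed from rank 1); the helper body is copied from the diff above and only needs `torch`.

```python
import torch


def get_compressed_expert_map(expert_map: torch.Tensor) -> str:
    # Keep only the entries that are not -1 (the experts owned by this rank)
    # and render them as "local->global" pairs.
    global_indices = torch.where(expert_map != -1)[0]
    local_indices = expert_map[global_indices]
    return ", ".join(
        f"{local_index.item()}->{global_index.item()}"
        for local_index, global_index in zip(local_indices, global_indices))


# Hypothetical map for EP rank 1 of 2 with 8 global experts:
# global experts 4..7 live on this rank as local experts 0..3.
expert_map = torch.tensor([-1, -1, -1, -1, 0, 1, 2, 3])
print(get_compressed_expert_map(expert_map))
# Output: 0->4, 1->5, 2->6, 3->7
```

Returning a `str` rather than the tensor keeps the arguments passed to `logger.info_once` hashable, which, per the docstring, is what lets the message be deduplicated and logged only once.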