
Commit 7204685

sunfish2010 authored and facebook-github-bot committed

Routed mp=1 fix (#4841)

Summary:
Pull Request resolved: #4841
X-link: facebookresearch/FBGEMM#1866

ep_first_order (the input offset) is not the same for the padded and non-padded cases. Fix it by computing the correct offset, and add the corresponding test cases to the unit test to capture the various scenarios.

Reviewed By: jasonjk-park

Differential Revision: D81801293

fbshipit-source-id: 5c6067a4a058752667f4f1c5a531c95ff5c53eb6
1 parent 984c98b commit 7204685

File tree

2 files changed: +168 −132 lines changed


fbgemm_gpu/experimental/gen_ai/gen_ai/moe/shuffling.py

Lines changed: 59 additions & 59 deletions
@@ -19,15 +19,15 @@ def combine_shuffling(
     token_counts: torch.Tensor,
     expert_start: Optional[int] = None,
     expert_end: Optional[int] = None,
-    is_balanced: bool = False,
+    is_padded: bool = False,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     # pyre-ignore
     return _combine_or_split_shuffling(
         tokens=tokens,
         token_counts=token_counts,
         expert_start=expert_start,
         expert_end=expert_end,
-        is_balanced=is_balanced,
+        is_padded=is_padded,
         is_combine=True,
     )
 
@@ -37,7 +37,7 @@ def split_shuffling(
     token_counts: torch.Tensor,
     expert_start: Optional[int] = None,
     expert_end: Optional[int] = None,
-    is_balanced: bool = False,
+    is_padded: bool = False,
     init_with_zeros: bool = False,
 ) -> torch.Tensor:
     # pyre-ignore
@@ -46,7 +46,7 @@ def split_shuffling(
         token_counts=token_counts,
         expert_start=expert_start,
         expert_end=expert_end,
-        is_balanced=is_balanced,
+        is_padded=is_padded,
         is_combine=False,
         init_with_zeros=init_with_zeros,
     )
@@ -57,7 +57,7 @@ def _combine_or_split_shuffling(
     token_counts: torch.Tensor,
     expert_start: Optional[int],
     expert_end: Optional[int],
-    is_balanced: bool,
+    is_padded: bool,
     is_combine: bool,
     init_with_zeros: bool = False,
 ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
@@ -67,6 +67,10 @@ def _combine_or_split_shuffling(
 
     T, D = tokens.shape
     EP, E = token_counts.shape
+    B_T = -1
+    if is_padded:
+        assert T % EP == 0
+        B_T = T // EP
 
     if expert_start is None:
         expert_start = 0
@@ -95,8 +99,6 @@ def _combine_or_split_shuffling(
         )
     else:
         output_token_counts = None
-    T_BUCKET_CAP = 16384
-    T_BUCKET = min(triton.next_power_of_2(T), T_BUCKET_CAP)
 
     BLOCK_E = max(triton.next_power_of_2(E), 8)
     BLOCK_EG = max(triton.next_power_of_2(EG), 8)
@@ -108,9 +110,9 @@ def _combine_or_split_shuffling(
         output_tokens,
         output_token_counts,
         is_combine,
-        is_balanced,
-        T_BUCKET,
         expert_start,
+        is_padded,
+        B_T,
         EG,
         EP,
         E,
@@ -133,7 +135,7 @@ def _combine_or_split_shuffling(
 
 torch.library.define(
     "fbgemm::combine_shuffling",
-    "(Tensor tokens, Tensor token_counts, int? expert_start = None, int? expert_end = None, bool? is_balanced = False) -> (Tensor, Tensor)",
+    "(Tensor tokens, Tensor token_counts, int? expert_start = None, int? expert_end = None, bool? is_padded = False) -> (Tensor, Tensor)",
 )
 
 
@@ -143,7 +145,7 @@ def combine_shuffling_meta(
     token_counts,
     expert_start,
     expert_end,
-    is_balanced,
+    is_padded,
 ):
     _, E = token_counts.shape
     if expert_start is None:
@@ -165,22 +167,22 @@ def combine_shuffling_cuda(
     token_counts,
     expert_start=None,
     expert_end=None,
-    is_balanced=False,
+    is_padded=False,
 ):
     return combine_shuffling(
         tokens,
         token_counts,
         expert_start,
         expert_end,
-        is_balanced,
+        is_padded,
     )
 
 
 _SPLIT_SHUFFLING_OP_NAME = "fbgemm::split_shuffling"
 
 torch.library.define(
     "fbgemm::split_shuffling",
-    "(Tensor tokens, Tensor token_counts, int? expert_start = None, int? expert_end = None, bool? is_balanced = False, bool? init_with_zeros = False) -> Tensor",
+    "(Tensor tokens, Tensor token_counts, int? expert_start = None, int? expert_end = None, bool? is_padded = False, bool? init_with_zeros = False) -> Tensor",
 )
 
 
@@ -190,7 +192,7 @@ def split_shuffling_meta(
     token_counts,
     expert_start,
     expert_end,
-    is_balanced,
+    is_padded,
 ):
     output_tokens = torch.empty_like(tokens)
     return output_tokens
@@ -202,14 +204,14 @@ def split_shuffling_cuda(
     token_counts,
     expert_start=None,
     expert_end=None,
-    is_balanced=False,
+    is_padded=False,
 ):
     return split_shuffling(
         tokens,
         token_counts,
         expert_start,
         expert_end,
-        is_balanced,
+        is_padded,
     )
 
 
@@ -252,8 +254,6 @@ def split_shuffling_cuda(
     configs=_AMD_CONFIGS if torch.version.hip else _NV_CONFIGS,
     key=[
         "COMBINE",
-        "BALANCED",
-        "T_BUCKET",
         "EG",
         "EP",
         "E",
@@ -267,9 +267,9 @@ def _fbgemm_combine_or_split_shuffling(
     output_tokens_ptr,
     output_token_counts_ptr,
     COMBINE: tl.constexpr,
-    BALANCED,
-    T_BUCKET,
     EG_START,
+    PADDED,
+    B_T: tl.constexpr,
     EG: tl.constexpr,
     EP: tl.constexpr,
     E: tl.constexpr,
@@ -294,8 +294,11 @@ def _fbgemm_combine_or_split_shuffling(
     rank = tidx // (EG * SPLIT_D)
     local_expert = (tidx % (EG * SPLIT_D)) // SPLIT_D
     didx = tidx % SPLIT_D
+    # All experts in communication group
     offs_e = tl.arange(0, BLOCK_E)
+    # Local experts
     offs_eg = tl.arange(0, BLOCK_EG)
+    # Ranks
     offs_ep = tl.arange(0, BLOCK_EP)
 
     global_expert = local_expert + EG_START
@@ -307,59 +310,56 @@ def _fbgemm_combine_or_split_shuffling(
         other=0,
     )  # [EP, E]
 
-    input_token_counts_eg = tl.load(
-        input_token_counts_ptr + offs_ep[:, None] * E + EG_START + offs_eg[None, :],
-        eviction_policy="evict_last",
-        mask=((offs_ep[:, None] < EP) & (offs_eg[None, :] < EG)),
-        other=0,
-    )  # [EP, EG]
+    if E == EG:
+        input_token_counts_eg = input_token_counts
+    else:
+        input_token_counts_eg = tl.load(
+            input_token_counts_ptr + offs_ep[:, None] * E + EG_START + offs_eg[None, :],
+            eviction_policy="evict_last",
+            mask=((offs_ep[:, None] < EP) & (offs_eg[None, :] < EG)),
+            other=0,
+        )  # [EP, EG]
 
     if COMBINE:
         LAST_TILE: tl.constexpr = EP * EG * SPLIT_D
 
         if tidx == LAST_TILE:
-            if EG == E:
-                output_token_counts = tl.sum(input_token_counts, axis=0)
-                tl.store(
-                    output_token_counts_ptr + offs_e,
-                    output_token_counts,
-                    mask=(offs_e < E),
-                )
-                output_token_counts = tl.sum(output_token_counts)
-                tl.store(output_token_counts_ptr + E, output_token_counts)
-            else:
-                output_token_counts_eg = tl.sum(input_token_counts_eg, axis=0)
-                tl.store(
-                    output_token_counts_ptr + offs_eg,
-                    output_token_counts_eg,
-                    mask=(offs_eg < EG),
-                )
-                output_token_counts_eg = tl.sum(output_token_counts_eg)
-                tl.store(output_token_counts_ptr + EG, output_token_counts_eg)
+            output_token_counts_eg = tl.sum(input_token_counts_eg, axis=0)
+            tl.store(
+                output_token_counts_ptr + offs_eg,
+                output_token_counts_eg,
+                mask=(offs_eg < EG),
+            )
+            output_token_counts_eg = tl.sum(output_token_counts_eg)
+            tl.store(output_token_counts_ptr + EG, output_token_counts_eg)
         return
 
     cond0 = offs_ep[:, None] < rank
     cond1 = offs_ep[:, None] == rank
 
     cond2 = offs_e[None, :] < global_expert
-    cond3 = offs_e[None, :] == global_expert
-
-    # r < rank || (r == rank && e < expert)
-    ep_first_order = tl.sum(tl.where(cond0 or (cond1 and cond2), input_token_counts, 0))
-    if EG == E:
-        # e < expert || (e == expert && r < rank)
-        expert_first_order = tl.sum(
-            tl.where(cond2 or (cond3 and cond0), input_token_counts, 0)
+
+    if PADDED:
+        tl.device_assert(B_T >= 0)
+        # Only need information from previous experts in the same rank.
+        ep_first_order = (
+            tl.sum(tl.where(cond1 and cond2, input_token_counts, 0)) + B_T * rank
         )
     else:
-        # e < expert || (e == expert && r < rank)
-        cond4 = offs_eg[None, :] < local_expert
-        cond5 = offs_eg[None, :] == local_expert
-
-        expert_first_order = tl.sum(
-            tl.where(cond4 or (cond5 and cond0), input_token_counts_eg, 0)
+        # r < rank || (r == rank && e < expert)
+        ep_first_order = tl.sum(
+            tl.where(cond0 or (cond1 and cond2), input_token_counts, 0)
        )
 
+    cond4 = offs_eg[None, :] < local_expert
+    cond5 = offs_eg[None, :] == local_expert
+
+    # Expert first only need information from local experts across ranks.
+    # e < expert || (e == expert && r < rank)
+    expert_first_order = tl.sum(
+        tl.where(cond4 or (cond5 and cond0), input_token_counts_eg, 0)
+    )
+
     if COMBINE:
         input_offset = ep_first_order
         output_offset = expert_first_order

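For completeness, a minimal usage sketch of the updated Python entry point. This is hedged: the import path is inferred from the file location fbgemm_gpu/experimental/gen_ai/gen_ai/moe/shuffling.py and may differ in your build, and the shapes, dtypes, and counts are made up for illustration. With is_padded=True, T must be divisible by EP (per the new assert), and each rank's counts may sum to less than B_T = T // EP, the rest of its slot being padding.

```python
import torch
# Import path inferred from the file location; adjust to however the
# package is exposed in your FBGEMM build (requires a GPU build).
from fbgemm_gpu.experimental.gen_ai.moe import combine_shuffling

EP, E, B_T, D = 2, 4, 8, 16  # illustrative sizes
# Padded layout: each rank owns a fixed B_T-row slot, possibly partially filled.
tokens = torch.randn(EP * B_T, D, device="cuda", dtype=torch.bfloat16)
# token_counts[r, e]: rows of rank r's slot routed to expert e (each row sums <= B_T).
token_counts = torch.tensor(
    [[2, 3, 1, 1], [4, 0, 2, 1]], device="cuda", dtype=torch.int32
)

out_tokens, out_token_counts = combine_shuffling(
    tokens, token_counts, is_padded=True
)
```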