
Commit 4013381

kausv authored and facebook-github-bot committed
Fix a2a type (#3311)
Summary: Pull Request resolved: #3311

VBE initializes the dist module, but the KJT sets the ctx flag at runtime. So if the batch sizes happen to match for all features, we assume a fixed batch size, resulting in a runtime error. In this diff, I fix the dispatch to follow the dist once it is initialized. We should follow up with driving this from config.

https://fb.workplace.com/groups/1699838000485189/permalink/2222934654842185/

Differential Revision: D80742183

fbshipit-source-id: 1898040fd436a54742f78594f996a7f4e5e0225c
1 parent 1b1e2b3 commit 4013381
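
To make the failure mode concrete, here is a minimal, self-contained toy sketch of the dispatch problem described in the summary. The classes and names below (FixedBatchAllToAll, VariableBatchAllToAll, Context, forward_old, forward_new) are hypothetical stand-ins, not the torchrec API; the point is only that choosing the call path from a runtime flag can disagree with the module that was constructed at init time, whereas dispatching on the module's type cannot.

from typing import List, Optional


class FixedBatchAllToAll:
    """Stand-in for a fixed-batch pooled all-to-all module."""

    def __call__(self, embs: str, batch_size_per_rank: Optional[List[int]] = None) -> str:
        return f"fixed_a2a({embs}, batch_size_per_rank={batch_size_per_rank})"


class VariableBatchAllToAll:
    """Stand-in for a variable-batch (VBE) pooled all-to-all module."""

    def __call__(self, embs: str, batch_size_per_rank_per_feature: List[List[int]]) -> str:
        return f"vbe_a2a({embs}, per_feature={batch_size_per_rank_per_feature})"


class Context:
    """Stand-in for a sharding context built from the incoming KJT at runtime."""

    def __init__(self, per_feature: List[List[int]], per_rank: List[int]) -> None:
        self.batch_size_per_rank_per_feature = per_feature
        self.batch_size_per_rank = per_rank
        # The runtime flag is only True when feature batch sizes actually differ,
        # so equal batch sizes look like a fixed batch even when VBE is enabled.
        self.variable_batch_per_feature = len({tuple(b) for b in per_feature}) > 1


def forward_old(dist, ctx: Context, embs: str) -> str:
    # Buggy: dispatch follows the runtime flag, not the initialized module.
    if ctx.variable_batch_per_feature:
        return dist(embs, batch_size_per_rank_per_feature=ctx.batch_size_per_rank_per_feature)
    return dist(embs, batch_size_per_rank=ctx.batch_size_per_rank)


def forward_new(dist, ctx: Context, embs: str) -> str:
    # Fixed: dispatch follows the type of the dist module chosen at init.
    if isinstance(dist, VariableBatchAllToAll):
        return dist(embs, batch_size_per_rank_per_feature=ctx.batch_size_per_rank_per_feature)
    return dist(embs, batch_size_per_rank=ctx.batch_size_per_rank)


if __name__ == "__main__":
    dist = VariableBatchAllToAll()  # VBE enabled at initialization
    ctx = Context(per_feature=[[4, 4], [4, 4]], per_rank=[8, 8])  # sizes happen to match
    print(forward_new(dist, ctx, "embs"))  # takes the VBE path, as the module expects
    try:
        forward_old(dist, ctx, "embs")  # flag says fixed batch -> wrong call signature
    except TypeError as err:
        print(f"old dispatch fails at runtime: {err}")

The commit applies the same idea inside the forward paths below: an isinstance check on self._dist (and on self._intra_dist / self._cross_dist in twrw) replaces the variable_batch_per_feature flag.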

2 files changed: +23 −14 lines

torchrec/distributed/sharding/tw_sharding.py

Lines changed: 12 additions & 11 deletions
@@ -367,20 +367,21 @@ def forward(
         """
         if self._dist is None:
             self._create_output_dist_module(sharding_ctx)
-
-        if sharding_ctx is None:
-            return cast(PooledEmbeddingsAllToAll, self._dist)(local_embs)
-        elif sharding_ctx.variable_batch_per_feature:
+        if isinstance(self._dist, VariableBatchPooledEmbeddingsAllToAll):
+            sharding_ctx = none_throws(sharding_ctx)
             return cast(VariableBatchPooledEmbeddingsAllToAll, self._dist)(
                 local_embs,
-                batch_size_per_rank_per_feature=sharding_ctx.batch_size_per_rank_per_feature,
-                batch_size_per_feature_pre_a2a=sharding_ctx.batch_size_per_feature_pre_a2a,
-            )
-        else:
-            return cast(PooledEmbeddingsAllToAll, self._dist)(
-                local_embs,
-                batch_size_per_rank=sharding_ctx.batch_size_per_rank,
+                batch_size_per_rank_per_feature=sharding_ctx.batch_size_per_rank_per_feature
+                or sharding_ctx.batch_size_per_rank,
+                batch_size_per_feature_pre_a2a=sharding_ctx.batch_size_per_feature_pre_a2a
+                or sharding_ctx.batch_size_per_rank,
             )
+        return cast(PooledEmbeddingsAllToAll, self._dist)(
+            local_embs,
+            batch_size_per_rank=(
+                sharding_ctx.batch_size_per_rank if sharding_ctx else None
+            ),
+        )
 
     def _create_output_dist_module(
         self, sharding_ctx: Optional[EmbeddingShardingContext] = None
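
A small illustration of the fallback expressions in the new arguments above. The values are made up and sharding_ctx here is a plain stand-in, not EmbeddingShardingContext, but the pattern is the one in the diff: Python's "or" picks the right-hand operand when the left one is None or empty, so a context that carries no per-feature batch sizes falls back to the per-rank sizes, and a missing context yields None on the fixed-batch path.

from types import SimpleNamespace
from typing import Optional

ctx = SimpleNamespace(
    batch_size_per_rank_per_feature=[],  # KJT carried no per-feature sizes
    batch_size_per_rank=[8, 8],
)

# VBE path: an empty per-feature list falls back to the per-rank sizes.
per_feature_arg = ctx.batch_size_per_rank_per_feature or ctx.batch_size_per_rank
print(per_feature_arg)  # [8, 8]

# Fixed-batch path: guard against the context being absent entirely.
sharding_ctx: Optional[SimpleNamespace] = None
per_rank_arg = sharding_ctx.batch_size_per_rank if sharding_ctx else None
print(per_rank_arg)  # None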

torchrec/distributed/sharding/twrw_sharding.py

Lines changed: 11 additions & 3 deletions
@@ -504,14 +504,21 @@ def forward(
             self._create_output_dist_modules(sharding_ctx)
         local_rank = self._rank % self._intra_pg.size()
         current_node = self._rank // self._intra_pg.size()
-        if sharding_ctx is not None and sharding_ctx.variable_batch_per_feature:
+        if isinstance(
+            self._intra_dist, VariableBatchPooledEmbeddingsReduceScatter
+        ) and isinstance(self._cross_dist, VariableBatchPooledEmbeddingsAllToAll):
+            assert sharding_ctx is not None and (
+                sharding_ctx.batch_size_per_rank_per_feature
+                or sharding_ctx.batch_size_per_rank
+            ), "Batch size not found in KJT input for VBE"
             (
                 batch_size_per_rank_per_feature_by_cross_group,
                 batch_size_per_feature_sum_by_cross_group,
             ) = self._preprocess_batch_size_per_rank_per_feature(
                 self._intra_pg.size(),
                 self._cross_pg.size(),
-                sharding_ctx.batch_size_per_rank_per_feature,
+                sharding_ctx.batch_size_per_rank_per_feature
+                or [sharding_ctx.batch_size_per_rank],
             )
             rs_result = cast(
                 VariableBatchPooledEmbeddingsReduceScatter, self._intra_dist
@@ -525,7 +532,8 @@ def forward(
                 batch_size_per_rank_per_feature=batch_size_per_rank_per_feature_by_cross_group[
                     local_rank
                 ],
-                batch_size_per_feature_pre_a2a=sharding_ctx.batch_size_per_feature_pre_a2a,
+                batch_size_per_feature_pre_a2a=sharding_ctx.batch_size_per_feature_pre_a2a
+                or sharding_ctx.batch_size_per_rank,
             )
         elif (
             sharding_ctx is not None and len(set(sharding_ctx.batch_size_per_rank)) > 1
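
The twrw path adds an explicit guard on top of the same fallback idea: if both dist modules were built for variable batch sizes, the context must carry some batch size information, and the per-rank sizes are wrapped in a single-element list so they can stand in for the per-feature structure. A hedged sketch with made-up values (ctx is a stand-in, not EmbeddingShardingContext):

from types import SimpleNamespace

ctx = SimpleNamespace(
    batch_size_per_rank_per_feature=[],  # no per-feature sizes in the KJT
    batch_size_per_rank=[8, 8],
)

# Guard: fail early with a clear message instead of a confusing collective error.
assert ctx is not None and (
    ctx.batch_size_per_rank_per_feature or ctx.batch_size_per_rank
), "Batch size not found in KJT input for VBE"

# Fallback mirrors "or [sharding_ctx.batch_size_per_rank]" in the diff:
# the per-rank sizes become a single per-feature group.
per_feature = ctx.batch_size_per_rank_per_feature or [ctx.batch_size_per_rank]
print(per_feature)  # [[8, 8]]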

0 commit comments
