Commit 2682bb7

Squashed commit of nm/lwilkinson/dbo-alt-schedules changes relative to origin/main
fixes
Signed-off-by: Lucas Wilkinson <[email protected]>

fix
Signed-off-by: Lucas Wilkinson <[email protected]>

fix
Signed-off-by: Lucas Wilkinson <[email protected]>

fixes and formatting
Signed-off-by: Lucas Wilkinson <[email protected]>
1 parent a2e6fa7 commit 2682bb7

File tree

14 files changed (+654, -485 lines)

examples/offline_inference/data_parallel.py

Lines changed: 1 addition & 1 deletion
@@ -259,4 +259,4 @@ def start(rank):
         elif proc.exitcode:
             exit_code = proc.exitcode
 
-    exit(exit_code)
+    exit(exit_code)

vllm/config/parallel.py

Lines changed: 10 additions & 0 deletions
@@ -151,6 +151,16 @@ class ParallelConfig:
     prefills. If the number of tokens in the request is greater than this
     threshold, microbatching will be used. Otherwise, the request will be
     processed in a single batch."""
+    microbatch_schedule: Literal["mlp_shared_overlap", "attn_shared_overlap"] = \
+        "mlp_shared_overlap"
+    """Schedule policy for microbatch overlap coordination.
+
+    Options:
+    - "mlp_shared_overlap": overlap MLP and communication across ubatches
+    - "attn_shared_overlap": overlap MLA attention and communication across
+      ubatches
+    See vllm/v1/worker/ubatching.py for diagrams of the schedules.
+    """
 
     ray_workers_use_nsight: bool = False
     """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""

vllm/engine/arg_utils.py

Lines changed: 6 additions & 0 deletions
@@ -334,6 +334,7 @@ class EngineArgs:
         ParallelConfig.dbo_decode_token_threshold
     dbo_prefill_token_threshold: int = \
         ParallelConfig.dbo_prefill_token_threshold
+    microbatch_schedule: str = ParallelConfig.microbatch_schedule
     eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config")
     enable_eplb: bool = ParallelConfig.enable_eplb
     expert_placement_strategy: ExpertPlacementStrategy = \
@@ -705,6 +706,10 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         parallel_group.add_argument(
             "--dbo-prefill-token-threshold",
             **parallel_kwargs["dbo_prefill_token_threshold"])
+        parallel_group.add_argument(
+            "--microbatch-schedule",
+            dest="microbatch_schedule",
+            **parallel_kwargs["microbatch_schedule"])
         parallel_group.add_argument("--enable-eplb",
                                     **parallel_kwargs["enable_eplb"])
         parallel_group.add_argument("--eplb-config",
@@ -1329,6 +1334,7 @@ def create_engine_config(
             enable_dbo=self.enable_dbo,
             dbo_decode_token_threshold=self.dbo_decode_token_threshold,
             dbo_prefill_token_threshold=self.dbo_prefill_token_threshold,
+            microbatch_schedule=self.microbatch_schedule,
             enable_eplb=self.enable_eplb,
             eplb_config=self.eplb_config,
             expert_placement_strategy=self.expert_placement_strategy,
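
A hedged usage sketch of the flag end to end: EngineArgs.microbatch_schedule is forwarded to the parallel config in create_engine_config(), mirroring --microbatch-schedule on the CLI. The model name and the exact shape of the returned config object are assumptions made for illustration.

from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(
    model="deepseek-ai/DeepSeek-V2-Lite",       # assumption: any MLA-capable model
    enable_dbo=True,
    microbatch_schedule="attn_shared_overlap",  # same value as the CLI flag
)
config = engine_args.create_engine_config()
# Assumed: the value lands on the parallel config, as in the hunk above.
print(config.parallel_config.microbatch_schedule)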

vllm/forward_context.py

Lines changed: 29 additions & 0 deletions
@@ -171,6 +171,34 @@ def should_ubatch_across_dp(
             return False, None
         return result, padded_num_tokens_tensor.cpu()
 
+    @staticmethod
+    def should_ubatch_across_dp(
+            should_ubatch: bool, orig_num_tokens_per_ubatch: int,
+            padded_num_tokens_per_ubatch: int, dp_size: int,
+            dp_rank: int) -> tuple[bool, Optional[torch.Tensor]]:
+
+        tensor = torch.zeros(3, dp_size, device="cuda", dtype=torch.int32)
+        tensor[0][dp_rank] = orig_num_tokens_per_ubatch
+        tensor[1][dp_rank] = padded_num_tokens_per_ubatch
+        tensor[2][dp_rank] = 1 if should_ubatch else 0
+
+        from vllm.distributed.parallel_state import get_dp_group
+        dist.all_reduce(tensor, group=get_dp_group().device_group)
+
+        result: bool = bool(torch.all(tensor[2] == 1).item())
+        if not result:
+            return result, None
+
+        orig_num_tokens_tensor = tensor[0, :]
+        padded_num_tokens_tensor = tensor[1, :]
+
+        orig_min_num_tokens = orig_num_tokens_tensor.min().item()
+        padded_max_num_tokens = padded_num_tokens_tensor.max().item()
+        if padded_max_num_tokens >= 2 * orig_min_num_tokens:
+            logger.debug(f"Aborting ubatching {orig_min_num_tokens} {padded_max_num_tokens}")
+            return False, None
+        return result, padded_num_tokens_tensor
+
     @staticmethod
     def make(
         parallel_config: ParallelConfig,
@@ -199,6 +227,7 @@ def make(
         if num_tokens_across_dp_cpu is None:
             num_tokens_across_dp_cpu = DPMetadata.num_tokens_across_dp(
                 batchsize, dp_size, dp_rank)
+
         max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp_cpu)
         return DPMetadata(max_tokens_across_dp_cpu, num_tokens_across_dp_cpu)
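
To make the consensus rule concrete, here is a hedged single-process sketch of the same decision with the all_reduce replaced by a pre-filled per-rank tensor. The helper name and its inputs are illustrative; only the two checks (all ranks must opt in, and padding must not at least double the smallest rank's work) mirror the diff.

import torch

def should_ubatch_decision(orig: list[int], padded: list[int],
                           opt_in: list[bool]) -> tuple[bool, torch.Tensor | None]:
    # Rows mimic the all-reduced tensor above: row 0 = original tokens per
    # ubatch, row 1 = padded tokens per ubatch, row 2 = 1 if that DP rank
    # wants to microbatch.
    tensor = torch.tensor([orig, padded, [int(x) for x in opt_in]],
                          dtype=torch.int32)

    # Every DP rank must opt in, otherwise fall back to a single batch.
    if not bool(torch.all(tensor[2] == 1).item()):
        return False, None

    # Abort when padding every rank up to the largest padded ubatch would at
    # least double the smallest rank's real work: the imbalance outweighs the
    # overlap benefit.
    if tensor[1].max().item() >= 2 * tensor[0].min().item():
        return False, None
    return True, tensor[1]

# Rank 1 is small: padding both ranks to 96 tokens >= 2 * 40 -> abort.
print(should_ubatch_decision([64, 40], [96, 96], [True, True]))
# Balanced ranks keep microbatching.
print(should_ubatch_decision([64, 60], [64, 64], [True, True]))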
