
Commit 4006aef

undo unrelated changes
Signed-off-by: Lucas Wilkinson <[email protected]>
1 parent 759e737 commit 4006aef

File tree

7 files changed: +29 -207 lines changed


docs/design/fused_moe_modular_kernel.md

Lines changed: 0 additions & 1 deletion
@@ -57,7 +57,6 @@ The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExperts
 The `FusedMoEPrepareAndFinalize` abstract class exposes `prepare`, `prepare_no_receive` and `finalize` functions.
 The `prepare` function is responsible for input activation Quantization and All2All Dispatch. If implemented, The `prepare_no_receive` is like `prepare` except it does not wait to receive results from other workers. Instead it returns a "receiver" callback that must be invoked to wait for the final results of worker. It is not required that this method is supported by all `FusedMoEPrepareAndFinalize` classes, but if it is available, it can be used to interleave work with the initial all to all communication, e.g. interleaving shared experts with fused experts. The `finalize` function is responsible for invoking the All2All Combine. Additionally the `finalize` function may or may not do the TopK weight application and reduction (Please refer to the TopKWeightAndReduce section)
 
-
 ![](../assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png "FusedMoEPrepareAndFinalize Blocks")
 
 ### FusedMoEPermuteExpertsUnpermute
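
For readers skimming this hunk, the doc text above describes the `prepare` / `prepare_no_receive` / `finalize` contract. A minimal, illustrative sketch of that interface follows; the method names come from the doc, but the class name and argument lists are simplified assumptions, not the actual vLLM signatures.

# Illustrative sketch only -- argument lists are assumptions, not vLLM's API.
from abc import ABC, abstractmethod
from typing import Callable

import torch


class FusedMoEPrepareAndFinalizeSketch(ABC):

    @abstractmethod
    def prepare(self, hidden_states: torch.Tensor,
                topk_ids: torch.Tensor) -> torch.Tensor:
        """Quantize the input activations and run the All2All dispatch,
        blocking until the dispatched tokens have been received."""

    def prepare_no_receive(
            self, hidden_states: torch.Tensor,
            topk_ids: torch.Tensor) -> Callable[[], torch.Tensor]:
        """Optional: start the dispatch and return a "receiver" callback.
        Invoking the callback waits for the dispatched tokens, so the
        caller can overlap other work (e.g. shared experts) with the
        initial all-to-all communication."""
        raise NotImplementedError

    @abstractmethod
    def finalize(self, expert_output: torch.Tensor,
                 topk_weights: torch.Tensor) -> torch.Tensor:
        """Run the All2All combine; may also apply the TopK weights and
        reduce, depending on the TopKWeightAndReduce strategy used."""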

examples/offline_inference/data_parallel.py

Lines changed: 0 additions & 6 deletions
@@ -92,12 +92,6 @@ def parse_args():
         action="store_true",
         help=("Enable microbatched execution")
     )
-    parser.add_argument(
-        "--compilation-config",
-        type=int,
-        default=0,
-        help=("Compilation optimization (O) level 0-3."),
-    )
     parser.add_argument(
         "--compilation-config",
         type=int,

vllm/v1/executor/multiproc_executor.py

Lines changed: 3 additions & 25 deletions
@@ -3,7 +3,6 @@
 import multiprocessing
 import os
 import pickle
-import queue
 import signal
 import threading
 import time
@@ -19,7 +18,6 @@
 from typing import Any, Callable, Optional, Union, cast
 
 import cloudpickle
-import torch
 
 import vllm.envs as envs
 from vllm.config import VllmConfig
@@ -35,8 +33,7 @@
                         get_loopback_ip, get_mp_context, get_open_port,
                         set_process_title)
 from vllm.v1.executor.abstract import Executor, FailureCallback
-from vllm.v1.outputs import (AsyncModelRunnerOutput, DraftTokenIds,
-                             ModelRunnerOutput)
+from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
 from vllm.worker.worker_base import WorkerWrapperBase
 
 logger = init_logger(__name__)
@@ -415,14 +412,6 @@ def __init__(
         # Initializes a message queue for sending the model output
         self.worker_response_mq = MessageQueue(1, 1)
 
-        self.async_output_queue: queue.Queue = queue.Queue()
-        self.async_output_copy_stream = torch.cuda.Stream()
-        self.async_output_copy_thread = Thread(
-            target=self.async_output_busy_loop,
-            daemon=True,
-            name="WorkerAsyncOutputCopy")
-        self.async_output_copy_thread.start()
-
         # Initialize device and loads weights
         self.worker.init_device()
         self.worker.load_model()
@@ -604,18 +593,6 @@ class ResponseStatus(Enum):
         SUCCESS = auto()
         FAILURE = auto()
 
-    def enqueue_worker_output(self, output: Any) -> None:
-        if isinstance(output, AsyncModelRunnerOutput):
-            output = output.serialize(self.async_output_copy_stream)
-        self.worker_response_mq.enqueue(
-            (WorkerProc.ResponseStatus.SUCCESS, output))
-
-    def async_output_busy_loop(self):
-        """Entrypoint for the thread which handles outputs asynchronously."""
-        while True:
-            output = self.async_output_queue.get()
-            self.enqueue_worker_output(output)
-
     def worker_busy_loop(self):
         """Main busy loop for Multiprocessing Workers"""
         while True:
@@ -640,4 +617,5 @@ def worker_busy_loop(self):
                 continue
 
             if output_rank is None or self.rank == output_rank:
-                self.async_output_queue.put(output)
+                self.worker_response_mq.enqueue(
+                    (WorkerProc.ResponseStatus.SUCCESS, output))

vllm/v1/outputs.py

Lines changed: 0 additions & 28 deletions
@@ -114,34 +114,6 @@ class ModelRunnerOutput:
     num_nans_in_logits: Optional[dict[str, int]] = None
 
 
-# ModelRunnerOutput wrapper for async scheduling.
-# Contains GPU tensors which must be serialized before sending
-# to the scheduler process.
-@dataclass
-class AsyncModelRunnerOutput:
-    model_runner_output: ModelRunnerOutput
-
-    # [num_reqs, max_num_generated_tokens]
-    sampled_token_ids: torch.Tensor
-
-    invalid_req_indices: list[int]
-
-    def serialize(self, copy_stream: torch.cuda.Stream) -> ModelRunnerOutput:
-        default_stream = torch.cuda.current_stream()
-        with torch.cuda.stream(copy_stream):
-            copy_stream.wait_stream(default_stream)
-            sampled_token_ids_cpu = self.sampled_token_ids.to(
-                'cpu', non_blocking=True)
-        copy_stream.synchronize()
-        valid_sampled_token_ids = sampled_token_ids_cpu.tolist()
-        for i in self.invalid_req_indices:
-            valid_sampled_token_ids[i].clear()
-
-        output = self.model_runner_output
-        output.sampled_token_ids = valid_sampled_token_ids
-        return output
-
-
 @dataclass
 class DraftTokenIds:
 
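
The `serialize` method removed above uses a common pattern: copy GPU tensors to the CPU on a dedicated side stream so the default compute stream is not blocked. A self-contained sketch of that pattern, with made-up tensor names and shapes, for reference:

# Sketch of the side-stream GPU->CPU copy used by the removed serialize();
# names and shapes here are illustrative only.
import torch


def copy_to_cpu_on_side_stream(sampled_token_ids: torch.Tensor) -> list:
    copy_stream = torch.cuda.Stream()
    default_stream = torch.cuda.current_stream()
    with torch.cuda.stream(copy_stream):
        # Make the copy wait for the kernels that produced the tensor.
        copy_stream.wait_stream(default_stream)
        sampled_token_ids_cpu = sampled_token_ids.to("cpu", non_blocking=True)
    # Block only on the copy stream, not on the default compute stream.
    copy_stream.synchronize()
    return sampled_token_ids_cpu.tolist()


if __name__ == "__main__" and torch.cuda.is_available():
    ids = torch.randint(0, 32000, (4, 1), device="cuda")
    print(copy_to_cpu_on_side_stream(ids))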

vllm/v1/worker/gpu_input_batch.py

Lines changed: 0 additions & 5 deletions
@@ -250,11 +250,6 @@ def __init__(
 
         self.pooling_params: dict[str, PoolingParams] = {}
 
-        # Cached reference to the GPU tensor of previously sampled tokens
-        self.prev_sampled_token_ids: Optional[torch.Tensor] = None
-        self.prev_sampled_token_ids_invalid_indices: Optional[set[int]] = None
-        self.prev_req_id_to_index: Optional[dict[str, int]] = None
-
     @property
     def req_ids(self) -> list[str]:
         # None elements should only be present transiently

vllm/v1/worker/gpu_model_runner.py

Lines changed: 21 additions & 137 deletions
@@ -69,8 +69,8 @@
                                         FullAttentionSpec, KVCacheConfig,
                                         KVCacheGroupSpec, KVCacheSpec,
                                         MambaSpec, SlidingWindowSpec)
-from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput,
-                             DraftTokenIds, LogprobsTensors, ModelRunnerOutput)
+from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, DraftTokenIds,
+                             LogprobsTensors, ModelRunnerOutput)
 from vllm.v1.pool.metadata import PoolingMetadata
 from vllm.v1.sample.logits_processor import LogitsProcessors, build_logitsprocs
 from vllm.v1.sample.metadata import SamplingMetadata
@@ -238,8 +238,6 @@ def __init__(
             is_pooling_model=self.is_pooling_model,
         )
 
-        self.use_async_scheduling = self.scheduler_config.async_scheduling
-
         # TODO(woosuk): Provide an option to tune the max cudagraph batch size.
         # The convention is different.
         # self.cudagraph_batch_sizes sorts in ascending order.
@@ -711,73 +709,6 @@ def _get_cumsum_and_arange(
 
         return cu_num_tokens, arange
 
-    def _prepare_input_ids(self, total_num_scheduled_tokens: int,
-                           cu_num_tokens: np.ndarray) -> None:
-        """Prepare the input IDs for the current batch.
-
-        Carefully handles the `prev_sampled_token_ids` which can be cached
-        from the previous engine iteration, in which case those tokens on the
-        GPU need to be copied into the corresponding slots into input_ids."""
-
-        if self.input_batch.prev_sampled_token_ids is not None:
-            # Async scheduling case, we need to copy the sampled token ids
-            # from the previous iteration.
-            prev_req_id_to_index = self.input_batch.prev_req_id_to_index
-            current_req_id_to_index = self.input_batch.req_id_to_index
-            assert prev_req_id_to_index is not None
-            common_req_ids = set(prev_req_id_to_index.keys()).intersection(
-                set(current_req_id_to_index.keys()))
-            if common_req_ids:
-                current_common_req_indices = [
-                    current_req_id_to_index[req_id]
-                    for req_id in common_req_ids
-                ]
-                prev_common_req_indices = [
-                    prev_req_id_to_index[req_id] for req_id in common_req_ids
-                ]
-                # We need to compute the flattened input_ids index of the
-                # last token in each common request.
-                flattened_indices = [
-                    int(cu_num_tokens[idx]) - 1
-                    for idx in current_common_req_indices
-                ]
-                if len(flattened_indices) < total_num_scheduled_tokens:
-                    # If not all requests are decodes from the last iteration,
-                    # We need to copy the input_ids_cpu to the GPU first.
-                    self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
-                if flattened_indices == prev_common_req_indices and \
-                    set(flattened_indices) == \
-                        set(range(len(flattened_indices))):
-                    # Common-case optimization: the batch is unchanged
-                    # and no reordering happened.
-                    # The indices are both the same permutation of 0..N-1
-                    self.input_ids.gpu[:len(flattened_indices)].copy_(
-                        self.input_batch.prev_sampled_token_ids[:len(
-                            flattened_indices)].squeeze(1),
-                        non_blocking=True)
-                else:
-                    # Upload the index tensors asynchronously
-                    # so the scatter can be non-blocking
-                    input_ids_index_tensor = torch.tensor(
-                        flattened_indices,
-                        dtype=torch.int64,
-                        pin_memory=self.pin_memory).to(self.device,
-                                                       non_blocking=True)
-                    prev_common_req_indices_tensor = torch.tensor(
-                        prev_common_req_indices,
-                        dtype=torch.int64,
-                        pin_memory=self.pin_memory).to(self.device,
-                                                       non_blocking=True)
-                    self.input_ids.gpu.scatter_(
-                        dim=0,
-                        index=input_ids_index_tensor,
-                        src=self.input_batch.prev_sampled_token_ids[
-                            prev_common_req_indices_tensor].squeeze(1))
-            else:
-                self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
-        else:
-            self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
-
     def _prepare_inputs(
         self, scheduler_output: "SchedulerOutput"
     ) -> tuple[PerLayerAttnMetadata, torch.Tensor,
@@ -869,8 +800,7 @@ def _prepare_inputs(
             max_seq_len = self.seq_lens.np[:num_reqs].max().item()
 
         # Copy the tensors to the GPU.
-        self._prepare_input_ids(total_num_scheduled_tokens, cu_num_tokens)
-
+        self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
         if self.uses_mrope:
             # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
             self.mrope_positions.gpu[:, :total_num_scheduled_tokens].copy_(
@@ -986,10 +916,6 @@ def _prepare_inputs(
                 builder,
             )
 
-<<<<<<< HEAD
-
-=======
->>>>>>> nm/sage/dbo-full-cudagraphs
         if ubatch_slices is not None:
             common_attn_metadata_list = split_attn_metadata(
                 ubatch_slices, common_attn_metadata)
@@ -1637,7 +1563,6 @@ def get_dp_padding_ubatch(
             should_ubatch = False
 
         # Note that we compute the number of padded tokens per ubatch
-
        (should_ubatch,
         num_tokens_across_dp) = self.should_ubatch_with_num_tokens(
             should_ubatch, num_tokens_unpadded // 2, num_tokens_per_ubatch)
@@ -1724,7 +1649,7 @@ def execute_model(
         self,
         scheduler_output: "SchedulerOutput",
         intermediate_tensors: Optional[IntermediateTensors] = None,
-    ) -> Union[ModelRunnerOutput, AsyncModelRunnerOutput, IntermediateTensors]:
+    ) -> Union[ModelRunnerOutput, IntermediateTensors]:
         self._update_states(scheduler_output)
         if not scheduler_output.total_num_scheduled_tokens:
             if not has_kv_transfer_group():
@@ -1927,12 +1852,6 @@ def execute_model(
             # so that we could clear the sampled tokens before returning.
             discard_sampled_tokens_req_indices.append(i)
 
-        # Copy some objects so they don't get modified after returning.
-        # This is important when using async scheduling.
-        req_ids_output_copy = self.input_batch.req_ids.copy()
-        req_id_to_index_output_copy = \
-            self.input_batch.req_id_to_index.copy()
-
         # NOTE: GPU -> CPU Sync happens here.
         # Move as many CPU operations as possible before this sync point.
         logprobs_tensors = sampler_output.logprobs_tensors
@@ -1945,54 +1864,29 @@ def execute_model(
             scheduler_output.num_scheduled_tokens,
         )
 
-        num_sampled_tokens = sampler_output.sampled_token_ids.shape[0]
+        # Get the valid generated tokens.
         sampled_token_ids = sampler_output.sampled_token_ids
-        if not self.use_async_scheduling:
-            # Get the valid generated tokens.
-            max_gen_len = sampled_token_ids.shape[-1]
-            if max_gen_len == 1:
-                # No spec decode tokens.
-                valid_sampled_token_ids = self._to_list(sampled_token_ids)
-            else:
-                # Includes spec decode tokens.
-                valid_sampled_token_ids = self.rejection_sampler.parse_output(
-                    sampled_token_ids,
-                    self.input_batch.vocab_size,
-                )
-            # Mask out the sampled tokens that should not be sampled.
-            for i in discard_sampled_tokens_req_indices:
-                valid_sampled_token_ids[i].clear()
+        max_gen_len = sampled_token_ids.shape[-1]
+        if max_gen_len == 1:
+            # No spec decode tokens.
+            valid_sampled_token_ids = self._to_list(sampled_token_ids)
         else:
-            valid_sampled_token_ids = []
-            invalid_req_indices = list(discard_sampled_tokens_req_indices)
-            invalid_req_indices_set = set(invalid_req_indices)
-            assert sampled_token_ids.shape[-1] == 1
-
-            # Cache the sampled tokens on the GPU and avoid CPU sync.
-            # These will be copied into input_ids in the next step
-            # when preparing inputs.
-            self.input_batch.prev_sampled_token_ids = \
-                sampled_token_ids
-            self.input_batch.prev_sampled_token_ids_invalid_indices = \
-                invalid_req_indices_set
-            self.input_batch.prev_req_id_to_index = {
-                req_id: i
-                for i, req_id in enumerate(self.input_batch.req_ids)
-                if i not in invalid_req_indices_set
-            }
+            # Includes spec decode tokens.
+            valid_sampled_token_ids = self.rejection_sampler.parse_output(
+                sampled_token_ids,
+                self.input_batch.vocab_size,
+            )
+        # Mask out the sampled tokens that should not be sampled.
+        for i in discard_sampled_tokens_req_indices:
+            valid_sampled_token_ids[i].clear()
 
         # Cache the sampled tokens in the model runner, so that the scheduler
         # doesn't need to send them back.
         # NOTE(woosuk): As an exception, when using PP, the scheduler sends
         # the sampled tokens back, because there's no direct communication
         # between the first-stage worker and the last-stage worker.
         req_ids = self.input_batch.req_ids
-        for req_idx in range(num_sampled_tokens):
-            if self.use_async_scheduling:
-                sampled_ids = [-1] * 1 if \
-                    req_idx not in invalid_req_indices_set else None
-            else:
-                sampled_ids = valid_sampled_token_ids[req_idx]
+        for req_idx, sampled_ids in enumerate(valid_sampled_token_ids):
             if not sampled_ids:
                 continue
 
@@ -2007,7 +1901,6 @@ def execute_model(
                     start_idx:end_idx] = sampled_ids
             self.input_batch.num_tokens_no_spec[req_idx] = end_idx
             self.input_batch.num_tokens[req_idx] = end_idx
-
             req_id = req_ids[req_idx]
             req_state = self.requests[req_id]
             req_state.output_token_ids.extend(sampled_ids)
@@ -2029,9 +1922,9 @@ def execute_model(
 
         self.eplb_step()
 
-        output = ModelRunnerOutput(
-            req_ids=req_ids_output_copy,
-            req_id_to_index=req_id_to_index_output_copy,
+        return ModelRunnerOutput(
+            req_ids=self.input_batch.req_ids,
+            req_id_to_index=self.input_batch.req_id_to_index,
             sampled_token_ids=valid_sampled_token_ids,
             logprobs=logprobs_lists,
             prompt_logprobs_dict=prompt_logprobs_dict,
@@ -2040,15 +1933,6 @@ def execute_model(
             num_nans_in_logits=num_nans_in_logits,
         )
 
-        if self.use_async_scheduling:
-            return AsyncModelRunnerOutput(
-                model_runner_output=output,
-                sampled_token_ids=sampled_token_ids,
-                invalid_req_indices=invalid_req_indices,
-            )
-
-        return output
-
     def take_draft_token_ids(self) -> Optional[DraftTokenIds]:
         if self._draft_token_ids is None:
             return None
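
The `_prepare_input_ids` method removed earlier in this file scatters previously sampled tokens (still resident on the GPU) into the flattened `input_ids` buffer without a CPU sync. A tiny standalone sketch of that `scatter_` pattern, with illustrative names and values, for reference:

# Illustrative sketch of the non-blocking scatter in the removed
# _prepare_input_ids; all names and values are made up for the example.
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
input_ids_gpu = torch.zeros(8, dtype=torch.int64, device=device)
prev_sampled_token_ids = torch.tensor([[11], [22], [33]], device=device)

flattened_indices = [2, 5, 7]        # last-token slot of each common request
prev_common_req_indices = [0, 1, 2]  # matching rows in prev_sampled_token_ids

index = torch.tensor(flattened_indices, dtype=torch.int64, device=device)
rows = torch.tensor(prev_common_req_indices, dtype=torch.int64, device=device)
input_ids_gpu.scatter_(dim=0, index=index,
                       src=prev_sampled_token_ids[rows].squeeze(1))
print(input_ids_gpu)  # positions 2, 5 and 7 now hold 11, 22 and 33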
