Skip to content

Commit ec9f13d

Browse files
committed
misc review comments
Signed-off-by: Sage Moore <[email protected]>
1 parent 49cdc3d commit ec9f13d

File tree

3 files changed

+2
-13
lines changed

3 files changed

+2
-13
lines changed

vllm/compilation/ubatch_wrapper.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,7 @@ def __init__(self, runnable: Callable, vllm_config: VllmConfig,
         self.runnable = runnable
         self.vllm_config = vllm_config
         self.compilation_config = vllm_config.compilation_config
-        self.comm_stream = torch.cuda.Stream()
-        self.device = device
+        self.comm_stream = torch.cuda.Stream(device=device)
         self.ready_barrier = threading.Barrier(3)

         self.cudagraphs: dict[int, CUDAGraphMetaData] = {}
@@ -204,8 +203,7 @@ def _make_ubatch_metadata(self, ubatch_slices, attn_metadata, input_ids,
             comm_stream=self.comm_stream,
             compute_stream=compute_stream,
             forward_contexts=forward_contexts,
-            ready_barrier=self.ready_barrier,
-            device=self.device)
+            ready_barrier=self.ready_barrier)

         ubatch_metadata: list[UbatchMetadata] = []
         for i, ubatch_slice in enumerate(ubatch_slices):

vllm/v1/worker/gpu_model_runner.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2494,12 +2494,6 @@ def _dummy_run(
             # We only support decode-only cudagraphs
             assert num_reqs == num_tokens
             assert num_tokens % 2 == 0
-            # num_tokens_per_ubatch = num_tokens // 2
-            # dp_size = self.vllm_config.parallel_config.data_parallel_size
-            # num_tokens_across_dp = torch.tensor([num_tokens_per_ubatch] *
-            #                                     dp_size,
-            #                                     device="cpu",
-            #                                     dtype=torch.int32)
             ubatch_slices = [
                 UbatchSlice(slice(0, num_reqs // 2), slice(0,
                                                            num_tokens // 2)),

vllm/v1/worker/ubatching.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,6 @@ def make_ubatch_contexts(
     comm_stream: torch.cuda.Stream,
     forward_contexts: list[ForwardContext],
     ready_barrier: threading.Barrier,
-    device: Optional[torch.device] = None,
     schedule: str = "default",
 ) -> list[UBatchContext]:
     assert num_micro_batches == 2, "only been tested with 2 micro-batches"
@@ -186,8 +185,6 @@ def make_ubatch_contexts(
     gpu_compute_done_events = [
         torch.cuda.Event() for _ in range(num_micro_batches)
     ]
-    device = device or torch.cuda.current_device()
-    # comm_stream = torch.cuda.Stream(device)

     assert len(forward_contexts) == 2

0 commit comments

Comments
 (0)