
Commit 569485c

some working version
1 parent 521eeab commit 569485c

File tree

  examples/offline_inference/torchrun_dp_example.py
  vllm/distributed/parallel_state.py
  vllm/v1/worker/gpu_model_runner.py

3 files changed: +40 -5 lines changed

examples/offline_inference/torchrun_dp_example.py

Lines changed: 9 additions & 4 deletions

@@ -6,8 +6,9 @@
 no internal lb supported in external_launcher mode.
 """
 
-from vllm import LLM, SamplingParams
+from vllm.distributed import cleanup_dist_env_and_memory
 
+from vllm import LLM, SamplingParams
 # Create prompts, the same across all ranks
 prompts = [
     "Hello, my name is",
@@ -26,14 +27,17 @@
 # deterministic across ranks.
 llm = LLM(
     model="/data/local/models/oss/qwen1.5_2.7B_moe_chat",
-    tensor_parallel_size=2,
-    data_parallel_size=4,
+    tensor_parallel_size=1,
+    data_parallel_size=2,
     pipeline_parallel_size=1,
     enable_expert_parallel=True,
     distributed_executor_backend="external_launcher",
     max_model_len=32768,
+    compilation_config={
+        "cudagraph_mode": "FULL",
+    },
     # FIXME: with torch.compile, the torchrun processes do not exit properly
-    enforce_eager=True,
+    # enforce_eager=True,
     seed=1,
 )
 
@@ -55,6 +59,7 @@
     print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}\n")
     print("-" * 50)
 
+cleanup_dist_env_and_memory()
 """
 Further tips:
 
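With these settings (tensor_parallel_size=1, data_parallel_size=2, external_launcher backend), the example expects one process per data-parallel rank, launched via torchrun; the explicit cleanup_dist_env_and_memory() call at the end is what exercises the new teardown path in parallel_state.py below. A launch sketch, assuming a single node with two GPUs and that the hard-coded local model path exists on your machine:

    torchrun --nproc-per-node=2 examples/offline_inference/torchrun_dp_example.py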

vllm/distributed/parallel_state.py

Lines changed: 24 additions & 0 deletions

@@ -820,14 +820,32 @@ def recv(self,
         return self.device_communicator.recv(size, dtype, src)
 
     def destroy(self):
+        print(f"fanglu: Destroying device group, {self.unique_name=}")
+        cudagraph_wrapper = getattr(self, "model", None)
+        if cudagraph_wrapper is not None:
+            print(f"Clean up cudagraph keys")
+            for key in cudagraph_wrapper.concrete_cudagraph_entries:
+                del cudagraph_wrapper.concrete_cudagraph_entries[key].cudagraph
+            torch.cuda.empty_cache()
+            gc.collect()
+            # torch._dynamo.reset_code_caches()
+            # from torch._inductor.cudagraph_trees import reset_cudagraph_trees
+            # reset_cudagraph_trees()
+            # torch._dynamo.reset()
+        print("fanglu: Reset torch._dynamo done")
         if hasattr(self, "device_group"):
             torch.distributed.destroy_process_group(self.device_group)
+            print("fanglu: Destroying device group done")
             del self.device_group
         if hasattr(self, "cpu_group"):
+            print("fanglu: Destroying cpu group")
             torch.distributed.destroy_process_group(self.cpu_group)
+            print("fanglu: Destroying cpu group done")
             del self.cpu_group
         if self.device_communicator is not None:
+            print("fanglu: Destroying device communicator")
             self.device_communicator.destroy()
+            print("fanglu: Destroying device communicator done")
         if self.mq_broadcaster is not None:
             self.mq_broadcaster = None
 
@@ -1317,12 +1335,18 @@ def destroy_distributed_environment():
 
 
 def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
+    print("fanglu: Cleaning up dist env and memory")
+    torch._dynamo.reset()
+    print("fanglu: Reset torch._dynamo done")
     destroy_model_parallel()
+    print("fanglu: Destroy model parallel done")
     destroy_distributed_environment()
+    print("fanglu: Destroy dist env done")
     if shutdown_ray:
         import ray  # Lazy import Ray
         ray.shutdown()
     gc.collect()
+    print("GC done")
     from vllm.platforms import current_platform
     empty_cache = current_platform.empty_cache
     if empty_cache is not None:
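The new teardown logic frees any CUDA graphs that were stashed on the group (see the gpu_model_runner.py change below) before the NCCL/Gloo process groups are destroyed, and cleanup_dist_env_and_memory() now resets torch._dynamo first; per the FIXME in the example, the goal is to let the torchrun processes exit cleanly when full CUDA graphs are enabled. A minimal sketch of that ordering using only plain PyTorch calls (not vLLM's GroupCoordinator); the captured_graphs dict is a hypothetical stand-in for the wrapper's concrete_cudagraph_entries:

    import gc
    import torch
    import torch.distributed as dist

    def release_graphs_then_teardown(captured_graphs: dict) -> None:
        # Drop references to the captured torch.cuda.CUDAGraph objects first,
        # so their memory pools can actually be reclaimed.
        for key in list(captured_graphs):
            del captured_graphs[key]
        gc.collect()
        torch.cuda.empty_cache()
        # Clear torch.compile caches, mirroring cleanup_dist_env_and_memory().
        torch._dynamo.reset()
        # Only then tear down the distributed process group.
        if dist.is_initialized():
            dist.destroy_process_group()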

vllm/v1/worker/gpu_model_runner.py

Lines changed: 7 additions & 1 deletion

@@ -30,7 +30,7 @@
     has_kv_transfer_group)
 from vllm.distributed.kv_transfer.kv_connector.utils import copy_kv_blocks
 from vllm.distributed.parallel_state import (
-    get_pp_group, get_tp_group, graph_capture, is_global_first_rank,
+    get_dp_group, get_pp_group, get_tp_group, graph_capture, is_global_first_rank,
     prepare_communication_buffer_for_model)
 from vllm.forward_context import (BatchDescriptor, DPMetadata,
                                   set_forward_context)
@@ -2442,6 +2442,7 @@ def load_model(self, eep_scale_up: bool = False) -> None:
             self.model = CUDAGraphWrapper(self.model,
                                           self.vllm_config,
                                           runtime_mode=CUDAGraphMode.FULL)
+            setattr(get_dp_group(), "model", self.model)
 
     def reload_weights(self) -> None:
         assert getattr(self, "model", None) is not None, \
@@ -3093,6 +3094,7 @@ def freeze_gc():
         set_cudagraph_capturing_enabled(True)
         with freeze_gc(), graph_capture(device=self.device):
             cudagraph_mode = self.compilation_config.cudagraph_mode
+            print(f"{cudagraph_mode}")
             if cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE:
                 cudagraph_runtime_mode = cudagraph_mode.mixed_mode()
 
@@ -3306,6 +3308,7 @@ def initialize_cudagraph_capture(self) -> None:
         self.cudagraph_dispatcher.initialize_cudagraph_keys(
             self.compilation_config.cudagraph_mode,
             self.uniform_decode_query_len)
+        setattr(get_dp_group(), "dispatcher", self.cudagraph_dispatcher)
 
     def calculate_reorder_batch_threshold(self) -> None:
         """
@@ -3757,3 +3760,6 @@ def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]:
         self.transfer_event.record()
         self.transfer_event.synchronize()
         return pinned.tolist()
+
+    def __del__(self):
+        print("GPU Model Runner is called.")
