[Core] Run garbage collector after CUDA graph capture to fix throughput regression (vllm-project#24128)

micah-wil · gshtras · FeiDaLI · commit e4111a4667f5 · 2025-09-25T18:54:14.000+08:00
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Co-authored-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
@@ -2885,6 +2885,7 @@ def freeze_gc():
             finally:
                 if should_freeze:
                     gc.unfreeze()
+                    gc.collect()
 
         # Trigger CUDA graph capture for specific shapes.
         # Capture the large shapes first so that the smaller shapes