Skip to content

Commit 7e45107

Browse files
authored
[Fix] Fix memory profiling when GPU is used by multiple processes (#2863)
1 parent 0c48b37 commit 7e45107

File tree

1 file changed

+5
-1
lines changed

1 file changed

+5
-1
lines changed

vllm/worker/worker.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,8 @@ def init_model(self, cupy_port: Optional[int] = None) -> None:
8484
torch.cuda.set_device(self.device)
8585

8686
_check_if_gpu_supports_dtype(self.model_config.dtype)
87+
torch.cuda.empty_cache()
88+
self.init_gpu_memory = torch.cuda.mem_get_info()[0]
8789
else:
8890
raise RuntimeError(
8991
f"Not support device type: {self.device_config.device}")
@@ -126,7 +128,9 @@ def profile_num_available_blocks(
126128
# profiled peak memory.
127129
torch.cuda.synchronize()
128130
free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
129-
peak_memory = total_gpu_memory - free_gpu_memory
131+
# NOTE(woosuk): Here we assume that the other processes using the same
132+
# GPU did not change their memory usage during the profiling.
133+
peak_memory = self.init_gpu_memory - free_gpu_memory
130134

131135
cache_block_size = CacheEngine.get_cache_block_size(
132136
block_size, cache_dtype, self.model_config, self.parallel_config)

0 commit comments

Comments (0)