17 changes: 16 additions & 1 deletion ci/L0_multi_gpu_vllm/vllm_backend/vllm_multi_gpu_test.py
@@ -77,14 +77,29 @@ def _test_vllm_multi_gpu_utilization(self, model_name: str):

print("=============== After Loading vLLM Model ===============")
vllm_model_used_gpus = 0
gpu_memory_utilizations = []

for gpu_id in gpu_ids:
memory_utilization = self.get_gpu_memory_utilization(gpu_id)
print(f"GPU {gpu_id} Memory Utilization: {memory_utilization} bytes")
if memory_utilization > mem_util_before_loading_model[gpu_id]:
memory_delta = memory_utilization - mem_util_before_loading_model[gpu_id]
if memory_delta > 0:
vllm_model_used_gpus += 1
gpu_memory_utilizations.append(memory_delta)

self.assertGreaterEqual(vllm_model_used_gpus, 2)

# Check if memory utilization is approximately equal across GPUs
if len(gpu_memory_utilizations) >= 2:
max_memory = max(gpu_memory_utilizations)
min_memory = min(gpu_memory_utilizations)
relative_diff = (max_memory - min_memory) / max_memory
self.assertLessEqual(
relative_diff,
0.1,
f"GPU memory utilization differs by {relative_diff:.2%} which exceeds the 10% threshold",
)

def _test_vllm_model(self, model_name: str, send_parameters_as_tensor: bool = True):
user_data = UserData()
stream = False
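For context, the assertion added above bounds how far the per-GPU memory growth may diverge, measured relative to the largest delta. A minimal standalone sketch of that check (not part of the diff), using hypothetical byte counts:

# Hypothetical per-GPU memory deltas in bytes after loading a 2-GPU tensor-parallel model.
gpu_memory_utilizations = [8_100_000_000, 7_900_000_000]

max_memory = max(gpu_memory_utilizations)
min_memory = min(gpu_memory_utilizations)
relative_diff = (max_memory - min_memory) / max_memory  # 0.2e9 / 8.1e9 ~= 2.47%

# 2.47% is within the 10% tolerance, so the assertion would pass.
assert relative_diff <= 0.1, f"GPU memory utilization differs by {relative_diff:.2%}"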
2 changes: 1 addition & 1 deletion src/model.py
@@ -320,7 +320,7 @@ def _validate_device_config(self):
f"Detected KIND_GPU model instance, explicitly setting GPU device={triton_device_id} for {triton_instance}"
)
# vLLM doesn't currently (v0.4.2) expose device selection in the APIs
torch.cuda.set_device(triton_device_id)
os.environ["CUDA_VISIBLE_DEVICES"] = str(triton_device_id)

def _setup_lora(self):
self.enable_lora = False
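The src/model.py change swaps a per-process device selection for environment-level GPU visibility. A minimal sketch of the difference (not part of the diff), assuming a multi-GPU host and that the variable is set before any CUDA work; the device id below is hypothetical:

import os

# Restricting visibility makes the chosen physical GPU appear as device 0 to every
# CUDA library in this process, so code that never calls a device-selection API
# (the comment above notes vLLM v0.4.2 does not expose one) still lands on the
# intended GPU. torch.cuda.set_device() only changed PyTorch's own default device.
triton_device_id = 1  # hypothetical Triton instance device id
os.environ["CUDA_VISIBLE_DEVICES"] = str(triton_device_id)

import torch  # imported after the env var is set, so any CUDA initialization sees it

print(torch.cuda.device_count())  # expected: 1 -- only the selected GPU is visible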