diff --git a/ci/L0_multi_gpu_vllm/vllm_backend/vllm_multi_gpu_test.py b/ci/L0_multi_gpu_vllm/vllm_backend/vllm_multi_gpu_test.py
index c7d42fcd..21591845 100644
--- a/ci/L0_multi_gpu_vllm/vllm_backend/vllm_multi_gpu_test.py
+++ b/ci/L0_multi_gpu_vllm/vllm_backend/vllm_multi_gpu_test.py
@@ -77,14 +77,29 @@ def _test_vllm_multi_gpu_utilization(self, model_name: str):
         print("=============== After Loading vLLM Model ===============")
         vllm_model_used_gpus = 0
+        gpu_memory_utilizations = []
+
         for gpu_id in gpu_ids:
             memory_utilization = self.get_gpu_memory_utilization(gpu_id)
             print(f"GPU {gpu_id} Memory Utilization: {memory_utilization} bytes")
-            if memory_utilization > mem_util_before_loading_model[gpu_id]:
+            memory_delta = memory_utilization - mem_util_before_loading_model[gpu_id]
+            if memory_delta > 0:
                 vllm_model_used_gpus += 1
+                gpu_memory_utilizations.append(memory_delta)
 
         self.assertGreaterEqual(vllm_model_used_gpus, 2)
 
+        # Check if memory utilization is approximately equal across GPUs
+        if len(gpu_memory_utilizations) >= 2:
+            max_memory = max(gpu_memory_utilizations)
+            min_memory = min(gpu_memory_utilizations)
+            relative_diff = (max_memory - min_memory) / max_memory
+            self.assertLessEqual(
+                relative_diff,
+                0.1,
+                f"GPU memory utilization differs by {relative_diff:.2%} which exceeds the 10% threshold",
+            )
+
     def _test_vllm_model(self, model_name: str, send_parameters_as_tensor: bool = True):
         user_data = UserData()
         stream = False
diff --git a/src/model.py b/src/model.py
index d201244c..7a135dcf 100644
--- a/src/model.py
+++ b/src/model.py
@@ -320,7 +320,7 @@ def _validate_device_config(self):
                 f"Detected KIND_GPU model instance, explicitly setting GPU device={triton_device_id} for {triton_instance}"
             )
             # vLLM doesn't currently (v0.4.2) expose device selection in the APIs
-            torch.cuda.set_device(triton_device_id)
+            os.environ["CUDA_VISIBLE_DEVICES"] = str(triton_device_id)
 
     def _setup_lora(self):
         self.enable_lora = False
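
Note (not part of the patch): a minimal sketch of the behavior the src/model.py change relies on. CUDA_VISIBLE_DEVICES is only read when the CUDA runtime is first initialized in the process, so it must be set before any CUDA call; after that, only the selected physical GPU is visible and it is re-indexed as device 0. The triton_device_id value below is a stand-in for the id model.py derives from the Triton instance config.

import os

triton_device_id = 1  # stand-in for the device id parsed from the Triton instance config
os.environ["CUDA_VISIBLE_DEVICES"] = str(triton_device_id)

import torch  # imported after the env var is set so CUDA initialization honors it

if torch.cuda.is_available():
    print(torch.cuda.device_count())    # expected: 1 (only the selected GPU is visible)
    print(torch.cuda.current_device())  # expected: 0 (the selected GPU appears as cuda:0)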