Skip to content

Commit b40fca4

Browse files
authored
[fix] replacing torch.cuda.set_device with CUDA_VISIBLE_DEVICES (#85)
1 parent 8960b96 commit b40fca4

File tree

2 files changed: +17 additions, −2 deletions

ci/L0_multi_gpu_vllm/vllm_backend/vllm_multi_gpu_test.py

Lines changed: 16 additions & 1 deletion
@@ -77,14 +77,29 @@ def _test_vllm_multi_gpu_utilization(self, model_name: str):

         print("=============== After Loading vLLM Model ===============")
         vllm_model_used_gpus = 0
+        gpu_memory_utilizations = []
+
         for gpu_id in gpu_ids:
             memory_utilization = self.get_gpu_memory_utilization(gpu_id)
             print(f"GPU {gpu_id} Memory Utilization: {memory_utilization} bytes")
-            if memory_utilization > mem_util_before_loading_model[gpu_id]:
+            memory_delta = memory_utilization - mem_util_before_loading_model[gpu_id]
+            if memory_delta > 0:
                 vllm_model_used_gpus += 1
+                gpu_memory_utilizations.append(memory_delta)

         self.assertGreaterEqual(vllm_model_used_gpus, 2)

+        # Check if memory utilization is approximately equal across GPUs
+        if len(gpu_memory_utilizations) >= 2:
+            max_memory = max(gpu_memory_utilizations)
+            min_memory = min(gpu_memory_utilizations)
+            relative_diff = (max_memory - min_memory) / max_memory
+            self.assertLessEqual(
+                relative_diff,
+                0.1,
+                f"GPU memory utilization differs by {relative_diff:.2%} which exceeds the 10% threshold",
+            )
+
     def _test_vllm_model(self, model_name: str, send_parameters_as_tensor: bool = True):
         user_data = UserData()
         stream = False
def _test_vllm_model(self, model_name: str, send_parameters_as_tensor: bool = True):
89104
user_data = UserData()
90105
stream = False

src/model.py

Lines changed: 1 addition & 1 deletion
@@ -320,7 +320,7 @@ def _validate_device_config(self):
                 f"Detected KIND_GPU model instance, explicitly setting GPU device={triton_device_id} for {triton_instance}"
             )
             # vLLM doesn't currently (v0.4.2) expose device selection in the APIs
-            torch.cuda.set_device(triton_device_id)
+            os.environ["CUDA_VISIBLE_DEVICES"] = str(triton_device_id)

     def _setup_lora(self):
        self.enable_lora = False

0 commit comments

Comments
 (0)