File tree Expand file tree Collapse file tree 2 files changed +16
-0
lines changed Expand file tree Collapse file tree 2 files changed +16
-0
lines changed Original file line number Diff line number Diff line change 3535from torch .testing ._internal .autocast_test_lists import AutocastTestLists , TestAutocast
3636from torch .testing ._internal .common_cuda import (
3737 _create_scaling_case ,
38+ HAS_WORKING_NVML ,
3839 SM70OrLater ,
3940 TEST_CUDNN ,
4041 TEST_MULTIGPU ,
@@ -4803,6 +4804,7 @@ def test_nvml_get_handler(self):
48034804 def test_temperature (self ):
48044805 self .assertTrue (0 <= torch .cuda .temperature () <= 150 )
48054806
4807+ @unittest .skipIf (not HAS_WORKING_NVML , "pynvml available but broken" )
48064808 @unittest .skipIf (TEST_WITH_ROCM , "flaky for AMD gpu" )
48074809 @unittest .skipIf (not TEST_PYNVML , "pynvml/amdsmi is not available" )
48084810 def test_device_memory_used (self ):
Original file line number Diff line number Diff line change @@ -376,6 +376,20 @@ def xfailIfSM120OrLater(func):
376376def xfailIfDistributedNotSupported (func ):
377377 return func if not (IS_MACOS or IS_JETSON ) else unittest .expectedFailure (func )
378378
379+ def _check_has_working_nvml () -> bool :
380+ try :
381+ if not torch .cuda .is_available ():
382+ return False
383+ import pynvml
384+ torch .cuda .device_memory_used ()
385+ return True
386+ except ModuleNotFoundError :
387+ return False
388+ except pynvml .NVMLError_NotSupported :
389+ return False
390+
391+ HAS_WORKING_NVML = _check_has_working_nvml ()
392+
379393# Importing this module should NOT eagerly initialize CUDA
380394if not CUDA_ALREADY_INITIALIZED_ON_IMPORT :
381395 assert not torch .cuda .is_initialized ()
You can’t perform that action at this time.
0 commit comments