Merged
Changes from 4 commits
1 change: 1 addition & 0 deletions unified-runtime/source/adapters/cuda/CMakeLists.txt
@@ -119,6 +119,7 @@ target_link_libraries(${TARGET_NAME} PRIVATE
${PROJECT_NAME}::umf
Threads::Threads
cudadrv
CUDA::nvml
)

target_include_directories(${TARGET_NAME} PRIVATE
42 changes: 42 additions & 0 deletions unified-runtime/source/adapters/cuda/common.cpp
@@ -12,6 +12,7 @@
#include "logger/ur_logger.hpp"

#include <cuda.h>
#include <nvml.h>

#include <sstream>

@@ -36,6 +37,23 @@ ur_result_t mapErrorUR(CUresult Result) {
}
}

ur_result_t mapErrorUR(nvmlReturn_t Result) {
switch (Result) {
case NVML_SUCCESS:
return UR_RESULT_SUCCESS;
case NVML_ERROR_NOT_SUPPORTED:
return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
case NVML_ERROR_GPU_IS_LOST:
return UR_RESULT_ERROR_DEVICE_LOST;
case NVML_ERROR_MEMORY:
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
case NVML_ERROR_INSUFFICIENT_RESOURCES:
return UR_RESULT_ERROR_OUT_OF_RESOURCES;
default:
return UR_RESULT_ERROR_UNKNOWN;
}
}

void checkErrorUR(CUresult Result, const char *Function, int Line,
const char *File) {
if (Result == CUDA_SUCCESS || Result == CUDA_ERROR_DEINITIALIZED) {
@@ -63,6 +81,30 @@ void checkErrorUR(CUresult Result, const char *Function, int Line,
throw mapErrorUR(Result);
}

void checkErrorUR(nvmlReturn_t Result, const char *Function, int Line,
const char *File) {
if (Result == NVML_SUCCESS) {
return;
}

const char *ErrorString = nvmlErrorString(Result);
std::stringstream SS;
SS << "\nUR NVML ERROR:"
<< "\n\tValue: " << Result
<< "\n\tDescription: " << ErrorString
<< "\n\tFunction: " << Function << "\n\tSource Location: " << File
<< ":" << Line << "\n";
logger::error("{}", SS.str());

if (std::getenv("PI_CUDA_ABORT") != nullptr ||
std::getenv("UR_CUDA_ABORT") != nullptr) {
std::abort();
}

throw mapErrorUR(Result);
}

void checkErrorUR(ur_result_t Result, const char *Function, int Line,
const char *File) {
if (Result == UR_RESULT_SUCCESS) {
4 changes: 4 additions & 0 deletions unified-runtime/source/adapters/cuda/common.hpp
@@ -10,6 +10,7 @@
#pragma once

#include <cuda.h>
#include <nvml.h>
#include <ur/ur.hpp>

#include <umf/base.h>
@@ -35,6 +36,9 @@ ur_result_t mapErrorUR(CUresult Result);
void checkErrorUR(CUresult Result, const char *Function, int Line,
const char *File);

void checkErrorUR(nvmlReturn_t Result, const char *Function, int Line,
const char *File);

void checkErrorUR(ur_result_t Result, const char *Function, int Line,
const char *File);

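Both checkErrorUR overloads above are reached through the adapter's existing UR_CHECK_ERROR macro, so NVML calls get the same check-and-throw treatment as CUDA driver calls. A minimal sketch, assuming the macro keeps its current shape of forwarding the result together with __func__, __LINE__ and __FILE__ to checkErrorUR (overload resolution then picks the nvmlReturn_t variant); the function name is illustrative only:

#include <nvml.h>
#include "common.hpp" // assumed to declare both overloads and UR_CHECK_ERROR

void fanSpeedCheckedExample(nvmlDevice_t Device) {
  unsigned int Speed = 0;
  // nvmlDeviceGetFanSpeed returns nvmlReturn_t, so the new overload runs: on
  // failure it logs the NVML error string and throws the mapped ur_result_t.
  UR_CHECK_ERROR(nvmlDeviceGetFanSpeed(Device, &Speed));
}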
62 changes: 58 additions & 4 deletions unified-runtime/source/adapters/cuda/device.cpp
@@ -18,6 +18,7 @@
#include "logger/ur_logger.hpp"
#include "platform.hpp"
#include "ur_util.hpp"
#include <nvml.h>

int getAttribute(ur_device_handle_t device, CUdevice_attribute attribute) {
int value;
@@ -1083,11 +1084,64 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
case UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE:
case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU:
case UR_DEVICE_INFO_IP_VERSION:
case UR_DEVICE_INFO_CURRENT_CLOCK_THROTTLE_REASONS:
case UR_DEVICE_INFO_FAN_SPEED:
case UR_DEVICE_INFO_MIN_POWER_LIMIT:
case UR_DEVICE_INFO_MAX_POWER_LIMIT:
return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
case UR_DEVICE_INFO_CURRENT_CLOCK_THROTTLE_REASONS: {
unsigned long long ClocksEventReasons;
UR_CHECK_ERROR(nvmlDeviceGetCurrentClocksEventReasons(hDevice->getNVML(),
&ClocksEventReasons));
ur_device_throttle_reasons_flags_t ThrottleReasons = 0;
constexpr unsigned long long NVMLThrottleFlags[] = {
nvmlClocksThrottleReasonSwPowerCap,
nvmlClocksThrottleReasonHwThermalSlowdown |
nvmlClocksThrottleReasonSwThermalSlowdown,
nvmlClocksThrottleReasonHwPowerBrakeSlowdown,
nvmlClocksThrottleReasonApplicationsClocksSetting};

constexpr ur_device_throttle_reasons_flags_t UrThrottleFlags[] = {
UR_DEVICE_THROTTLE_REASONS_FLAG_POWER_CAP,
UR_DEVICE_THROTTLE_REASONS_FLAG_THERMAL_LIMIT,
UR_DEVICE_THROTTLE_REASONS_FLAG_PSU_ALERT,
UR_DEVICE_THROTTLE_REASONS_FLAG_SW_RANGE};

for (size_t i = 0;
i < sizeof(NVMLThrottleFlags) / sizeof(NVMLThrottleFlags[0]); ++i) {
if (ClocksEventReasons & NVMLThrottleFlags[i]) {
ThrottleReasons |= UrThrottleFlags[i];
ClocksEventReasons &= ~NVMLThrottleFlags[i];
}
}
if (ClocksEventReasons) {
ThrottleReasons |= UR_DEVICE_THROTTLE_REASONS_FLAG_OTHER;
}
return ReturnValue(ThrottleReasons);
}
case UR_DEVICE_INFO_MIN_POWER_LIMIT:
case UR_DEVICE_INFO_MAX_POWER_LIMIT: {
unsigned int minLimit, maxLimit;
auto NVMLHandle = hDevice->getNVML();
auto NVMLError = nvmlDeviceGetPowerManagementLimitConstraints(
NVMLHandle, &minLimit, &maxLimit);
if (NVMLError == NVML_ERROR_NOT_SUPPORTED) {
if (propName == UR_DEVICE_INFO_MAX_POWER_LIMIT) {
UR_CHECK_ERROR(
nvmlDeviceGetPowerManagementLimit(NVMLHandle, &maxLimit));
return ReturnValue(static_cast<int32_t>(maxLimit));
} else if (propName == UR_DEVICE_INFO_MIN_POWER_LIMIT) {
return ReturnValue(static_cast<int32_t>(-1));
}
}
if (propName == UR_DEVICE_INFO_MAX_POWER_LIMIT) {
return ReturnValue(static_cast<int32_t>(maxLimit));
} else if (propName == UR_DEVICE_INFO_MIN_POWER_LIMIT) {
return ReturnValue(static_cast<int32_t>(minLimit));
}
break;
}
case UR_DEVICE_INFO_FAN_SPEED: {
unsigned int Speed;
UR_CHECK_ERROR(nvmlDeviceGetFanSpeed(hDevice->getNVML(), &Speed));
return ReturnValue(static_cast<int32_t>(Speed));
}
case UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP:
return ReturnValue(
static_cast<ur_exp_device_2d_block_array_capability_flags_t>(0));
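For reference, a client-side usage sketch of the properties wired up above, going through the public urDeviceGetInfo entry point. The function name is illustrative, and error handling is omitted; a real caller should check each returned ur_result_t:

#include <ur_api.h>

void queryPowerTelemetry(ur_device_handle_t Device) {
  ur_device_throttle_reasons_flags_t Reasons = 0;
  urDeviceGetInfo(Device, UR_DEVICE_INFO_CURRENT_CLOCK_THROTTLE_REASONS,
                  sizeof(Reasons), &Reasons, nullptr);

  int32_t FanSpeed = 0; // value as reported by nvmlDeviceGetFanSpeed
  urDeviceGetInfo(Device, UR_DEVICE_INFO_FAN_SPEED, sizeof(FanSpeed), &FanSpeed,
                  nullptr);

  int32_t MinLimit = 0, MaxLimit = 0; // MinLimit is -1 when NVML cannot report it
  urDeviceGetInfo(Device, UR_DEVICE_INFO_MIN_POWER_LIMIT, sizeof(MinLimit),
                  &MinLimit, nullptr);
  urDeviceGetInfo(Device, UR_DEVICE_INFO_MAX_POWER_LIMIT, sizeof(MaxLimit),
                  &MaxLimit, nullptr);
}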
19 changes: 19 additions & 0 deletions unified-runtime/source/adapters/cuda/device.hpp
@@ -36,6 +36,8 @@ struct ur_device_handle_t_ {
int MaxChosenLocalMem{0};
bool MaxLocalMemSizeChosen{false};
uint32_t NumComputeUnits{0};
std::once_flag NVMLInitFlag;
bool NVMLUsed{false};

public:
ur_device_handle_t_(native_type cuDevice, CUcontext cuContext, CUevent evBase,
@@ -102,11 +104,28 @@ struct ur_device_handle_t_ {
if (MemoryProviderShared) {
umfMemoryProviderDestroy(MemoryProviderShared);
}
if (NVMLUsed) {
UR_CHECK_ERROR(nvmlShutdown());
}
cuDevicePrimaryCtxRelease(CuDevice);
}

native_type get() const noexcept { return CuDevice; };

nvmlDevice_t getNVML() {
// Initialization happens lazily, once per device object. Calls to nvmlInit()
// from different objects merely increase NVML's internal reference count;
// each object's destructor calls nvmlShutdown(), so the resources are
// released once no NVML users remain.
std::call_once(NVMLInitFlag, [this]() {
UR_CHECK_ERROR(nvmlInit());
NVMLUsed = true;
});
nvmlDevice_t NVMLDevice;
UR_CHECK_ERROR(nvmlDeviceGetHandleByIndex(DeviceIndex, &NVMLDevice));
return NVMLDevice;
};

CUcontext getNativeContext() const noexcept { return CuContext; };

uint32_t getReferenceCount() const noexcept { return RefCount; }
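The getNVML() helper above leans on NVML's reference-counted nvmlInit()/nvmlShutdown() pair. A standalone sketch of the same pattern, with illustrative names, for readers unfamiliar with it:

#include <mutex>
#include <nvml.h>

struct NvmlUser {
  std::once_flag InitFlag;
  bool Used = false;

  nvmlDevice_t deviceByIndex(unsigned int Index) {
    // Initialise NVML at most once per object; repeated nvmlInit() calls from
    // other objects only bump the library's internal reference count.
    std::call_once(InitFlag, [this]() {
      if (nvmlInit() == NVML_SUCCESS)
        Used = true;
    });
    nvmlDevice_t Device{};
    nvmlDeviceGetHandleByIndex(Index, &Device);
    return Device;
  }

  ~NvmlUser() {
    if (Used)
      nvmlShutdown(); // balances the nvmlInit() above
  }
};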
4 changes: 3 additions & 1 deletion unified-runtime/test/adapters/cuda/CMakeLists.txt
@@ -29,4 +29,6 @@ target_include_directories(test-adapter-cuda PRIVATE
${PROJECT_SOURCE_DIR}/source/adapters/cuda
)

target_link_libraries(test-adapter-cuda PRIVATE cudadrv ${PROJECT_NAME}::umf)
find_package(CUDAToolkit 10.1 REQUIRED)

target_link_libraries(test-adapter-cuda PRIVATE cudadrv CUDA::nvml ${PROJECT_NAME}::umf)
12 changes: 12 additions & 0 deletions unified-runtime/test/conformance/device/urDeviceGetInfo.cpp
@@ -2561,6 +2561,9 @@ TEST_P(urDeviceGetInfoTest, SuccessUseNativeAssert) {
}

TEST_P(urDeviceGetInfoTest, SuccessThrottleReasons) {
// TODO: enable when driver/library version mismatch is fixed in CI.
UUR_KNOWN_FAILURE_ON(uur::CUDA{});

size_t property_size = 0;
const ur_device_info_t property_name =
UR_DEVICE_INFO_CURRENT_CLOCK_THROTTLE_REASONS;
@@ -2578,6 +2581,9 @@ TEST_P(urDeviceGetInfoTest, SuccessThrottleReasons) {
}

TEST_P(urDeviceGetInfoTest, SuccessFanSpeed) {
// TODO: enable when driver/library version mismatch is fixed in CI.
UUR_KNOWN_FAILURE_ON(uur::CUDA{});

size_t property_size = 0;
const ur_device_info_t property_name = UR_DEVICE_INFO_FAN_SPEED;

@@ -2595,6 +2601,9 @@ TEST_P(urDeviceGetInfoTest, SuccessFanSpeed) {
}

TEST_P(urDeviceGetInfoTest, SuccessMaxPowerLimit) {
// TODO: enable when driver/library version mismatch is fixed in CI.
UUR_KNOWN_FAILURE_ON(uur::CUDA{});

size_t property_size = 0;
const ur_device_info_t property_name = UR_DEVICE_INFO_MAX_POWER_LIMIT;

@@ -2612,6 +2621,9 @@ TEST_P(urDeviceGetInfoTest, SuccessMaxPowerLimit) {
}

TEST_P(urDeviceGetInfoTest, SuccessMinPowerLimit) {
// TODO: enable when driver/library version mismatch is fixed in CI.
UUR_KNOWN_FAILURE_ON(uur::CUDA{});

size_t property_size = 0;
const ur_device_info_t property_name = UR_DEVICE_INFO_MIN_POWER_LIMIT;
