Skip to content

Commit c827d2c

Browse files
Merge pull request #525 from mpatrou/energy_counters
Support for NVML counters: GPU energy, instant GPU power, avg memory power
2 parents e990fca + 4b26585 commit c827d2c

File tree

2 files changed

+125
-0
lines changed

2 files changed

+125
-0
lines changed

src/components/nvml/linux-nvml.c

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ template.
3434
#include <inttypes.h>
3535
#include <string.h>
3636
#include <dirent.h>
37+
#include <limits.h>
3738
/* Headers required by PAPI */
3839
#include "papi.h"
3940
#include "papi_internal.h"
@@ -70,6 +71,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed(nvmlDevice_t, unsigned int *);
7071
nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo(nvmlDevice_t, nvmlMemory_t *);
7172
nvmlReturn_t DECLDIR nvmlDeviceGetPerformanceState(nvmlDevice_t, nvmlPstates_t *);
7273
nvmlReturn_t DECLDIR nvmlDeviceGetPowerUsage(nvmlDevice_t, unsigned int *);
74+
nvmlReturn_t DECLDIR nvmlDeviceGetTotalEnergyConsumption(nvmlDevice_t, unsigned long long *);
75+
nvmlReturn_t DECLDIR nvmlDeviceGetFieldValues(nvmlDevice_t,int, nvmlFieldValue_t *);
7376
nvmlReturn_t DECLDIR nvmlDeviceGetTemperature(nvmlDevice_t, nvmlTemperatureSensors_t, unsigned int *);
7477
nvmlReturn_t DECLDIR nvmlDeviceGetTotalEccErrors(nvmlDevice_t, nvmlEccBitType_t, nvmlEccCounterType_t, unsigned long long *);
7578
nvmlReturn_t DECLDIR nvmlDeviceGetUtilizationRates(nvmlDevice_t, nvmlUtilization_t *);
@@ -92,6 +95,8 @@ static nvmlReturn_t (*nvmlDeviceGetFanSpeedPtr)(nvmlDevice_t, unsigned int *);
9295
static nvmlReturn_t (*nvmlDeviceGetMemoryInfoPtr)(nvmlDevice_t, nvmlMemory_t *);
9396
static nvmlReturn_t (*nvmlDeviceGetPerformanceStatePtr)(nvmlDevice_t, nvmlPstates_t *);
9497
static nvmlReturn_t (*nvmlDeviceGetPowerUsagePtr)(nvmlDevice_t, unsigned int *);
98+
static nvmlReturn_t (*nvmlDeviceGetTotalEnergyConsumptionPtr)(nvmlDevice_t, unsigned long long *);
99+
static nvmlReturn_t (*nvmlDeviceGetFieldValuesPtr)(nvmlDevice_t,int, nvmlFieldValue_t *);
95100
static nvmlReturn_t (*nvmlDeviceGetTemperaturePtr)(nvmlDevice_t, nvmlTemperatureSensors_t, unsigned int *);
96101
static nvmlReturn_t (*nvmlDeviceGetTotalEccErrorsPtr)(nvmlDevice_t, nvmlEccBitType_t, nvmlEccCounterType_t, unsigned long long *);
97102
static nvmlReturn_t (*nvmlDeviceGetUtilizationRatesPtr)(nvmlDevice_t, nvmlUtilization_t *);
@@ -315,6 +320,35 @@ getPState(nvmlDevice_t dev)
315320
return (unsigned long long)ret;
316321
}
317322

323+
unsigned long long
324+
getTotalEnergyConsumption(nvmlDevice_t dev)
325+
{
326+
unsigned long long energy = 0;
327+
nvmlReturn_t bad;
328+
bad = (*nvmlDeviceGetTotalEnergyConsumptionPtr)(dev, &energy);
329+
330+
if (NVML_SUCCESS != bad) {
331+
SUBDBG("something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
332+
return (unsigned long long) - 1;
333+
}
334+
if (energy > ULLONG_MAX) {
335+
energy = ULLONG_MAX;
336+
}
337+
return energy;
338+
}
339+
340+
unsigned long long
341+
getDeviceFieldValue(nvmlDevice_t dev, int value_count, nvmlFieldValue_t* field_value)
342+
{
343+
nvmlReturn_t bad;
344+
bad = (*nvmlDeviceGetFieldValuesPtr)(dev, value_count,field_value);
345+
if (NVML_SUCCESS != bad) {
346+
SUBDBG("something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
347+
return (unsigned long long) - 1;
348+
}
349+
return field_value->value.ullVal;
350+
}
351+
318352
unsigned long long
319353
getPowerUsage(nvmlDevice_t dev)
320354
{
@@ -454,6 +488,8 @@ nvml_hardware_read(long long *value, int which_one)
454488
nvml_native_event_entry_t *entry;
455489
nvmlDevice_t handle;
456490
int cudaIdx = -1;
491+
int value_count = 1;
492+
nvmlFieldValue_t field_value[value_count];
457493

458494
entry = &nvml_native_table[which_one];
459495
*value = (long long) - 1;
@@ -495,6 +531,23 @@ nvml_hardware_read(long long *value, int which_one)
495531
case FEATURE_POWER:
496532
*value = getPowerUsage(handle);
497533
break;
534+
case FEATURE_TOTAL_ENERGY_CONSUMPTION:
535+
*value = getTotalEnergyConsumption(handle);
536+
break;
537+
#if defined(NVML_FI_DEV_POWER_INSTANT) && defined(NVML_POWER_SCOPE_GPU)
538+
case FEATURE_GPU_INST:
539+
field_value->fieldId = NVML_FI_DEV_POWER_INSTANT;
540+
field_value->scopeId = NVML_POWER_SCOPE_GPU;
541+
*value = getDeviceFieldValue(handle, value_count, field_value);
542+
break;
543+
#endif
544+
#if defined(NVML_FI_DEV_POWER_AVERAGE) && defined(NVML_POWER_SCOPE_MEMORY)
545+
case FEATURE_GPU_MEMORY_AVG:
546+
field_value->fieldId = NVML_FI_DEV_POWER_AVERAGE;
547+
field_value->scopeId = NVML_POWER_SCOPE_MEMORY;
548+
*value = getDeviceFieldValue(handle, value_count, field_value);
549+
break;
550+
#endif
498551
case FEATURE_TEMP:
499552
*value = getTemperature(handle);
500553
break;
@@ -693,6 +746,36 @@ detectDevices()
693746
SUBDBG("nvmlDeviceGetPowerUsage does not appear to be supported on this card. (nvml return code %d)\n", ret);
694747
}
695748

749+
/*Check if energy consumption data are available */
750+
if (getTotalEnergyConsumption(devices[i]) != (unsigned long long) - 1) {
751+
features[i] |= FEATURE_TOTAL_ENERGY_CONSUMPTION;
752+
num_events++;
753+
}
754+
755+
756+
int value_count=1;
757+
nvmlFieldValue_t field_value[value_count];
758+
759+
#if defined(NVML_FI_DEV_POWER_INSTANT) && defined(NVML_POWER_SCOPE_GPU)
760+
// GPU instant power
761+
field_value->fieldId = NVML_FI_DEV_POWER_INSTANT;
762+
field_value->scopeId = NVML_POWER_SCOPE_GPU;
763+
/* Check if the device field for gpu instant power data are available */
764+
if (getDeviceFieldValue(devices[i],value_count, field_value) != (unsigned long long) - 1) {
765+
features[i] |= FEATURE_GPU_INST;
766+
num_events++;
767+
}
768+
#endif
769+
#if defined(NVML_FI_DEV_POWER_AVERAGE) && defined(NVML_POWER_SCOPE_MEMORY)
770+
// GPU Memory average power
771+
field_value->fieldId = NVML_FI_DEV_POWER_AVERAGE;
772+
field_value->scopeId = NVML_POWER_SCOPE_MEMORY;
773+
/* Check if the device field for gpu memory data are available */
774+
if (getDeviceFieldValue(devices[i],value_count, field_value) != (unsigned long long) - 1) {
775+
features[i] |= FEATURE_GPU_MEMORY_AVG;
776+
num_events++;
777+
}
778+
#endif
696779
/* Check if temperature data are available */
697780
if (getTemperature(devices[i]) != (unsigned long long) - 1) {
698781
features[i] |= FEATURE_TEMP;
@@ -986,6 +1069,35 @@ createNativeEvents()
9861069
devTableIdx++;
9871070
}
9881071

1072+
if (HAS_FEATURE(features[i], FEATURE_TOTAL_ENERGY_CONSUMPTION)) {
1073+
sprintf(entry->name, "%s:total_energy_consumption", sanitized_name);
1074+
strncpy(entry->description, "Total energy consumption of the GPU in millijoules since the driver was last reloaded.", PAPI_MAX_STR_LEN);
1075+
entry->type = FEATURE_TOTAL_ENERGY_CONSUMPTION;
1076+
entry++;
1077+
nvml_dev_id_table[devTableIdx] = i;
1078+
devTableIdx++;
1079+
}
1080+
1081+
if (HAS_FEATURE(features[i], FEATURE_GPU_INST)) {
1082+
sprintf(entry->name, "%s:gpu_inst_power", sanitized_name);
1083+
strncpy(entry->units, "mW", PAPI_MIN_STR_LEN);
1084+
strncpy(entry->description, "Instantaneous power usage for GPU.", PAPI_MAX_STR_LEN);
1085+
entry->type = FEATURE_GPU_INST;
1086+
entry++;
1087+
nvml_dev_id_table[devTableIdx] = i;
1088+
devTableIdx++;
1089+
}
1090+
1091+
if (HAS_FEATURE(features[i], FEATURE_GPU_MEMORY_AVG)) {
1092+
sprintf(entry->name, "%s:gpu_memory_avg_power", sanitized_name);
1093+
strncpy(entry->units, "mW", PAPI_MIN_STR_LEN);
1094+
strncpy(entry->description, "Average power usage for GPU Memory.", PAPI_MAX_STR_LEN);
1095+
entry->type = FEATURE_GPU_MEMORY_AVG;
1096+
entry++;
1097+
nvml_dev_id_table[devTableIdx] = i;
1098+
devTableIdx++;
1099+
}
1100+
9891101
if (HAS_FEATURE(features[i], FEATURE_TEMP)) {
9901102
sprintf(entry->name, "%s:temperature", sanitized_name);
9911103
strncpy(entry->description, "Current temperature readings for the device, in degrees C.", PAPI_MAX_STR_LEN);
@@ -1440,6 +1552,16 @@ linkCudaLibraries()
14401552
strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetPowerUsage not found.", PAPI_MAX_STR_LEN);
14411553
return (PAPI_ENOSUPP);
14421554
}
1555+
nvmlDeviceGetTotalEnergyConsumptionPtr = dlsym(dl3, "nvmlDeviceGetTotalEnergyConsumption");
1556+
if (dlerror() != NULL) {
1557+
strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetTotalEnergyConsumption not found.", PAPI_MAX_STR_LEN);
1558+
return (PAPI_ENOSUPP);
1559+
}
1560+
nvmlDeviceGetFieldValuesPtr = dlsym(dl3, "nvmlDeviceGetFieldValues");
1561+
if (dlerror() != NULL) {
1562+
strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetFieldValues not found.", PAPI_MAX_STR_LEN);
1563+
return (PAPI_ENOSUPP);
1564+
}
14431565
nvmlDeviceGetTemperaturePtr = dlsym(dl3, "nvmlDeviceGetTemperature");
14441566
if (dlerror() != NULL) {
14451567
strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetTemperature not found.", PAPI_MAX_STR_LEN);

src/components/nvml/linux-nvml.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@
1616
#define FEATURE_POWER_MANAGEMENT 1024
1717
#define FEATURE_NVML_POWER_MANAGEMENT_LIMIT_CONSTRAINT_MIN 2048
1818
#define FEATURE_NVML_POWER_MANAGEMENT_LIMIT_CONSTRAINT_MAX 4096
19+
#define FEATURE_TOTAL_ENERGY_CONSUMPTION 8192
20+
#define FEATURE_GPU_INST 16384
21+
#define FEATURE_GPU_MEMORY_AVG 32768
1922

2023
/* Evaluates to the bits of `features` selected by `query` (non-zero when any
 * queried bit is set). Both arguments are parenthesised so that expression
 * arguments such as `A | B` are not regrouped by operator precedence. */
#define HAS_FEATURE( features, query ) ( (features) & (query) )
2124

0 commit comments

Comments
 (0)