icl-utk-edu · Treece-Burgess · Jan 25, 2026 · Feb 5, 2026
diff --git a/src/components/cuda/cupti_event_and_metric.c b/src/components/cuda/cupti_event_and_metric.c
@@ -807,7 +807,7 @@ int cuptie_ctx_create(cuptic_info_t thr_info, cuptie_control_t *pstate, uint32_t
             CUcontext internalContext; 
             cudaArtCheckErrors( cudaSetDevicePtr(native_event_info.device), return PAPI_EMISC );
             cudaArtCheckErrors( cudaFreePtr(NULL), return PAPI_EMISC );  
-            cudaCheckErrors( cuCtxGetCurrentPtr(&internalContext), return PAPI_EMISC); 
+            cudaCheckErrors( cuCtxGetCurrentPtr(&internalContext), return PAPI_EMISC);
             thr_info[native_event_info.device].ctx = internalContext;
             // Pop the context off so verify_user_added_event_or_metric functions properly
             cudaCheckErrors( cuCtxPopCurrentPtr(&internalContext), return PAPI_EMISC );
@@ -867,13 +867,26 @@ int cuptie_ctx_start(cuptie_control_t state)
 {
     SUBDBG("ENTERING: Setting up profiling for the Event and Metric APIs.\n");
 
+    CUcontext currentUserContext;
+    cudaCheckErrors( cuCtxGetCurrentPtr(&currentUserContext), return PAPI_EMISC);
+    if (currentUserContext != NULL) {
+        cudaCheckErrors( cuCtxPopCurrentPtr(&currentUserContext), return PAPI_EMISC );
+    }
+
     int deviceIdx;
     for (deviceIdx = 0; deviceIdx < numDevicesOnMachine; deviceIdx++) {
         cuptie_gpu_state_t *gpu_ctl = &(state->gpu_ctl[deviceIdx]);
         if (gpu_ctl->added_events->totalNumberOfUserAddedNativeEvents == 0) {
             continue;
         }
 
+        int papi_errno = cuptic_device_acquire(gpu_ctl->added_events, API_LEGACY);
+        if (papi_errno != PAPI_OK) {
+            SUBDBG("Profiling the same gpu from multiple event sets is not allowed.\n");
+            return papi_errno;
+        }
+
+
         cudaCheckErrors( cuCtxSetCurrentPtr(state->info[deviceIdx].ctx), return PAPI_EMISC );
 
         // Calculate the total number of user added events
@@ -924,6 +937,10 @@ int cuptie_ctx_start(cuptie_control_t state)
         cuptiCheckErrors( cuCtxPopCurrentPtr(&state->info[deviceIdx].ctx), return PAPI_EMISC );
     }
 
+    if (currentUserContext != NULL) {
+        cudaCheckErrors( cuCtxPushCurrentPtr(currentUserContext), return PAPI_EMISC );
+    }
+
     SUBDBG("EXITING: Profiling setup completed.\n");
     return PAPI_OK;
 }
@@ -941,6 +958,12 @@ int cuptie_ctx_read(cuptie_control_t state, long long **counterValues)
 {
     SUBDBG("ENTERING: Reading values for the Event and Metric APIs.\n");
 
+    CUcontext currentUserContext;  // context current on entry; popped below and pushed back before returning
+    cudaCheckErrors( cuCtxGetCurrentPtr(&currentUserContext), return PAPI_EMISC);
+    if (currentUserContext != NULL) {
+        cudaCheckErrors( cuCtxPopCurrentPtr(&currentUserContext), return PAPI_EMISC );
+    }
+
     int numCountersRead = 0;
     long long *readCounterValues = state->counters;
 
@@ -1117,6 +1140,10 @@ int cuptie_ctx_read(cuptie_control_t state, long long **counterValues)
     state->read_count = numCountersRead;
     *counterValues = readCounterValues;
 
+    if (currentUserContext != NULL) {  // put back the context that was popped on entry
+        cudaCheckErrors( cuCtxPushCurrentPtr(currentUserContext), return PAPI_EMISC );
+    }
+
     SUBDBG("EXITING: Reading values completed.\n");
     return PAPI_OK;
 }
@@ -1131,6 +1158,12 @@ int cuptie_ctx_stop(cuptie_control_t state)
 {
     SUBDBG("ENTERING: Disabling and destroying the event group sets created. Collection of events will be stopped.\n");
 
+    CUcontext currentUserContext;
+    cudaCheckErrors( cuCtxGetCurrentPtr(&currentUserContext), return PAPI_EMISC);
+    if (currentUserContext != NULL) {
+        cudaCheckErrors( cuCtxPopCurrentPtr(&currentUserContext), return PAPI_EMISC );
+    }
+
     int deviceIdx;
     for (deviceIdx = 0; deviceIdx < numDevicesOnMachine; deviceIdx++) {
         cuptie_gpu_state_t *gpu_ctl = &(state->gpu_ctl[deviceIdx]);
@@ -1146,9 +1179,18 @@ int cuptie_ctx_stop(cuptie_control_t state)
         cuptiCheckErrors( cuptiEventGroupSetDisablePtr(eventGroupSet), return PAPI_EMISC );
         cuptiCheckErrors( cuptiEventGroupSetsDestroyPtr(eventGroupSets), return PAPI_EMISC );
 
+        int papi_errno = cuptic_device_release(gpu_ctl->added_events, API_LEGACY);
+        if (papi_errno != PAPI_OK) {
+            return papi_errno;
+        }
+
         cudaCheckErrors( cuCtxPopCurrentPtr(&state->info[deviceIdx].ctx), return PAPI_EMISC );
     }
 
+    if (currentUserContext != NULL) {
+        cudaCheckErrors( cuCtxPushCurrentPtr(currentUserContext), return PAPI_EMISC );
+    }
+
     SUBDBG("EXITING: Disabling event group sets completed.\n");
     return PAPI_OK;
 }
@@ -1164,6 +1206,12 @@ int cuptie_ctx_reset(cuptie_control_t state)
 {
     SUBDBG("ENTERING: Resetting counter values.\n");
 
+    CUcontext currentUserContext;
+    cudaCheckErrors( cuCtxGetCurrentPtr(&currentUserContext), return PAPI_EMISC);
+    if (currentUserContext != NULL) {
+        cudaCheckErrors( cuCtxPopCurrentPtr(&currentUserContext), return PAPI_EMISC );
+    }
+
     int counterIdx;
     for (counterIdx = 0; counterIdx < state->read_count; counterIdx++) {
         state->counters[counterIdx] = 0;
@@ -1192,6 +1240,10 @@ int cuptie_ctx_reset(cuptie_control_t state)
         cudaCheckErrors( cuCtxPopCurrentPtr(&state->info[deviceIdx].ctx), return PAPI_EMISC );
     }
 
+    if (currentUserContext != NULL) {
+        cudaCheckErrors( cuCtxPushCurrentPtr(currentUserContext), return PAPI_EMISC );
+    }
+
     SUBDBG("EXITING: Resetting counter values completed.\n");
     return PAPI_OK;
 }
@@ -1425,11 +1477,16 @@ static int verify_user_added_event_or_metric(uint32_t *events_id, int num_events
         }
         totalNumberOfUserAddedEvents++;
         state->gpu_ctl[native_event_info.device].added_events->totalNumberOfUserAddedNativeEvents = totalNumberOfUserAddedEvents;
+        // For a specific device table, get the current event index
+        int idx = state->gpu_ctl[native_event_info.device].added_events->count;
+        state->gpu_ctl[native_event_info.device].added_events->cuda_devs[idx] = native_event_info.device;
+        state->gpu_ctl[native_event_info.device].added_events->count++;
 
         // Pop off the set context
         cudaCheckErrors( cuCtxPopCurrentPtr(&thr_info[native_event_info.device].ctx), return PAPI_EMISC );
     }
 
+
     SUBDBG("EXITING: Checking user added a valid event completed.\n");
     return PAPI_OK;
 }
@@ -1491,6 +1548,7 @@ static int create_event_and_metric_table(int totalNumberOfEntries, cuptiu_event_
         goto fn_fail;
     }
 
+    eventTable->count = 0;
     eventTable->capacity = totalNumberOfEntries;
     eventTable->startTimeStampNs = 0;
     eventTable->totalNumberOfUserAddedNativeEvents = 0;

diff --git a/src/components/cuda/cupti_event_and_metric.h b/src/components/cuda/cupti_event_and_metric.h
@@ -56,6 +56,7 @@ typedef struct event_and_metric_record_s {
 typedef struct event_and_metric_table_s {
     unsigned int count;
     unsigned int capacity;
+    int cuda_devs[PAPI_CUDA_MAX_COUNTERS]; // device index per added event; sized like the other per-event arrays
     CUpti_EventGroupSets *eventGroupSets;
     CUpti_MetricID metricIDs[PAPI_CUDA_MAX_COUNTERS];
     int *idsThatMakeupAUserAddedEventArray[PAPI_CUDA_MAX_COUNTERS];

diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c
@@ -817,7 +817,7 @@ int cuptip_ctx_start(cuptip_control_t state)
         }
 
         LOGDBG("Device num %d: event_count %d, rmr count %d\n", dev_id, gpu_ctl->added_events->count, gpu_ctl->numberOfRawMetricRequests);
-        papi_errno = cuptic_device_acquire(state->gpu_ctl[dev_id].added_events);
+        papi_errno = cuptic_device_acquire(state->gpu_ctl[dev_id].added_events, API_PERFWORKS);
         if (papi_errno != PAPI_OK) {
             ERRDBG("Profiling same gpu from multiple event sets not allowed.\n");
             return papi_errno;
@@ -1134,7 +1134,7 @@ int cuptip_ctx_stop(cuptip_control_t state)
             return papi_errno;
         }
 
-        papi_errno = cuptic_device_release(state->gpu_ctl[dev_id].added_events);
+        papi_errno = cuptic_device_release(state->gpu_ctl[dev_id].added_events, API_PERFWORKS);
         if (papi_errno != PAPI_OK) {
             return papi_errno;
         }

diff --git a/src/components/cuda/papi_cupti_common.c b/src/components/cuda/papi_cupti_common.c
@@ -15,6 +15,7 @@
 
 #include "cupti_config.h"
 #include "papi_cupti_common.h"
+#include "cupti_event_and_metric.h"
 
 static void *dl_drv, *dl_rt;
 
@@ -848,47 +849,100 @@ int cuptic_ctxarr_destroy(cuptic_info_t *pinfo)
     return PAPI_OK;
 }
 
-int _devmask_events_get(cuptiu_event_table_t *evt_table, gpu_occupancy_t *bitmask)
+/* Compute the bitmask of GPUs referenced by an event table.
+ *
+ * evt_table is read as a cuptiu_event_and_metric_table_t when flag is
+ * API_LEGACY and as a cuptiu_event_table_t when flag is API_PERFWORKS. Bit d
+ * of the mask is set for every value d found in the first count entries of
+ * the table's cuda_devs array.
+ *
+ * Returns PAPI_OK and stores the mask in *bitmask, or PAPI_EBUG (leaving
+ * *bitmask untouched) for any other flag.
+ */
+static int devmask_events_get(void *evt_table, int flag, gpu_occupancy_t *bitmask)
 {
     gpu_occupancy_t acq_mask = 0;
-    long i;
-    for (i = 0; i < evt_table->count; i++) {
-        acq_mask |= (1 << evt_table->cuda_devs[i]);
+    unsigned int i;
+
+    switch(flag) {
+        case API_LEGACY:
+        {
+            cuptiu_event_and_metric_table_t *legacy_evt_table = (cuptiu_event_and_metric_table_t *) evt_table;
+            for (i = 0; i < legacy_evt_table->count; i++) {
+                /* Cast the 1 so the shift is evaluated in gpu_occupancy_t rather than int. */
+                acq_mask |= ((gpu_occupancy_t) 1 << legacy_evt_table->cuda_devs[i]);
+            }
+            break;
+        }
+        case API_PERFWORKS:
+        {
+            cuptiu_event_table_t *perfworks_evt_table = (cuptiu_event_table_t *) evt_table;
+            for (i = 0; i < perfworks_evt_table->count; i++) {
+                acq_mask |= ((gpu_occupancy_t) 1 << perfworks_evt_table->cuda_devs[i]);
+            }
+            break;
+        }
+        default:
+            SUBDBG("Provided flag is not accounted for in this switch statement. Code needs to be updated.\n");
+            return PAPI_EBUG;
     }
     *bitmask = acq_mask;
 
     return PAPI_OK;
 }
 
-int cuptic_device_acquire(cuptiu_event_table_t *evt_table)
+/* Mark the GPUs referenced by evt_table as occupied in global_gpu_bitmask.
+ *
+ * Returns PAPI_ECNFLCT without changing global_gpu_bitmask if any of those
+ * GPUs is already marked in it, PAPI_EBUG for an unknown flag, and PAPI_OK
+ * once the bits have been added.
+ */
+int cuptic_device_acquire(void *evt_table, int flag)
 {
     gpu_occupancy_t bitmask;
-    int papi_errno = _devmask_events_get(evt_table, &bitmask);
+    int papi_errno = devmask_events_get(evt_table, flag, &bitmask);
     if (papi_errno != PAPI_OK) {
         return papi_errno;
     }
+
+    /* Test and set under a single hold of _cuda_lock so that two callers
+     * cannot both pass the conflict check for the same GPU. */
+    _papi_hwi_lock(_cuda_lock);
     if (bitmask & global_gpu_bitmask) {
-        return PAPI_ECNFLCT;
+        papi_errno = PAPI_ECNFLCT;
     }
-    _papi_hwi_lock(_cuda_lock);
-    global_gpu_bitmask |= bitmask;
+    else {
+        global_gpu_bitmask |= bitmask;
+    }
     _papi_hwi_unlock(_cuda_lock);
-    return PAPI_OK;
+
+    return papi_errno;
 }
 
-int cuptic_device_release(cuptiu_event_table_t *evt_table)
+/* Clear the occupancy bits of the GPUs referenced by evt_table.
+ *
+ * Returns PAPI_EMISC without changing global_gpu_bitmask if any of those GPUs
+ * is not currently marked in it, PAPI_EBUG for an unknown flag, and PAPI_OK
+ * once the bits have been cleared.
+ */
+int cuptic_device_release(void *evt_table, int flag)
 {
     gpu_occupancy_t bitmask;
-    int papi_errno = _devmask_events_get(evt_table, &bitmask);
+    int papi_errno = devmask_events_get(evt_table, flag, &bitmask);
     if (papi_errno != PAPI_OK) {
         return papi_errno;
     }
+
+    /* Validate and clear under a single hold of _cuda_lock. */
+    _papi_hwi_lock(_cuda_lock);
     if ((bitmask & global_gpu_bitmask) != bitmask) {
-        return PAPI_EMISC;
+        papi_errno = PAPI_EMISC;
     }
-    _papi_hwi_lock(_cuda_lock);
-    global_gpu_bitmask ^= bitmask;
+    else {
+        global_gpu_bitmask &= ~bitmask;
+    }
     _papi_hwi_unlock(_cuda_lock);
-    return PAPI_OK;
+
+    return papi_errno;
 }
 

diff --git a/src/components/cuda/papi_cupti_common.h b/src/components/cuda/papi_cupti_common.h
@@ -76,8 +76,8 @@ int cuptic_ctxarr_get_ctx(cuptic_info_t info, int dev_id, CUcontext *ctx);
 int cuptic_ctxarr_destroy(cuptic_info_t *pinfo);
 
 /* functions to track the occupancy of gpu counters in event sets */
-int cuptic_device_acquire(cuptiu_event_table_t *evt_table);
-int cuptic_device_release(cuptiu_event_table_t *evt_table);
+int cuptic_device_acquire(void *evt_table, int flag);
+int cuptic_device_release(void *evt_table, int flag);
 
 /* device qualifier interfaces */
 int cuptiu_dev_set(cuptiu_bitmap_t *bitmap, int i);