Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 59 additions & 1 deletion src/components/cuda/cupti_event_and_metric.c
Original file line number Diff line number Diff line change
Expand Up @@ -807,7 +807,7 @@ int cuptie_ctx_create(cuptic_info_t thr_info, cuptie_control_t *pstate, uint32_t
CUcontext internalContext;
cudaArtCheckErrors( cudaSetDevicePtr(native_event_info.device), return PAPI_EMISC );
cudaArtCheckErrors( cudaFreePtr(NULL), return PAPI_EMISC );
cudaCheckErrors( cuCtxGetCurrentPtr(&internalContext), return PAPI_EMISC);
cudaCheckErrors( cuCtxGetCurrentPtr(&internalContext), return PAPI_EMISC);
thr_info[native_event_info.device].ctx = internalContext;
// Pop the context off so verify_user_added_event_or_metric functions properly
cudaCheckErrors( cuCtxPopCurrentPtr(&internalContext), return PAPI_EMISC );
Expand Down Expand Up @@ -867,13 +867,26 @@ int cuptie_ctx_start(cuptie_control_t state)
{
SUBDBG("ENTERING: Setting up profiling for the Event and Metric APIs.\n");

CUcontext currentUserContext;
cudaCheckErrors( cuCtxGetCurrentPtr(&currentUserContext), return PAPI_EMISC);
if (currentUserContext != NULL) {
cudaCheckErrors( cuCtxPopCurrentPtr(&currentUserContext), return PAPI_EMISC );
}

int deviceIdx;
for (deviceIdx = 0; deviceIdx < numDevicesOnMachine; deviceIdx++) {
cuptie_gpu_state_t *gpu_ctl = &(state->gpu_ctl[deviceIdx]);
if (gpu_ctl->added_events->totalNumberOfUserAddedNativeEvents == 0) {
continue;
}

int papi_errno = cuptic_device_acquire(gpu_ctl->added_events, API_LEGACY);
if (papi_errno != PAPI_OK) {
SUBDBG("Profiling the same gpu from multiple event sets is not allowed.\n");
return papi_errno;
}


cudaCheckErrors( cuCtxSetCurrentPtr(state->info[deviceIdx].ctx), return PAPI_EMISC );

// Calculate the total number of user added events
Expand Down Expand Up @@ -924,6 +937,10 @@ int cuptie_ctx_start(cuptie_control_t state)
cuptiCheckErrors( cuCtxPopCurrentPtr(&state->info[deviceIdx].ctx), return PAPI_EMISC );
}

if (currentUserContext != NULL) {
cudaCheckErrors( cuCtxPushCurrentPtr(currentUserContext), return PAPI_EMISC );
}

SUBDBG("EXITING: Profiling setup completed.\n");
return PAPI_OK;
}
Expand All @@ -941,6 +958,12 @@ int cuptie_ctx_read(cuptie_control_t state, long long **counterValues)
{
SUBDBG("ENTERING: Reading values for the Event and Metric APIs.\n");

CUcontext currentUserContext;
cudaCheckErrors( cuCtxGetCurrentPtr(&currentUserContext), return PAPI_EMISC);
if (currentUserContext != NULL) {
cuptiCheckErrors( cuCtxPopCurrentPtr(&currentUserContext), return PAPI_EMISC );
}

int numCountersRead = 0;
long long *readCounterValues = state->counters;

Expand Down Expand Up @@ -1117,6 +1140,10 @@ int cuptie_ctx_read(cuptie_control_t state, long long **counterValues)
state->read_count = numCountersRead;
*counterValues = readCounterValues;

if (currentUserContext != NULL) {
cuptiCheckErrors( cuCtxPushCurrentPtr(currentUserContext), return PAPI_EMISC );
}

SUBDBG("EXITING: Reading values completed.\n");
return PAPI_OK;
}
Expand All @@ -1131,6 +1158,12 @@ int cuptie_ctx_stop(cuptie_control_t state)
{
SUBDBG("ENTERING: Disabling and destroying the event group sets created. Collection of events will be stopped.\n");

CUcontext currentUserContext;
cudaCheckErrors( cuCtxGetCurrentPtr(&currentUserContext), return PAPI_EMISC);
if (currentUserContext != NULL) {
cudaCheckErrors( cuCtxPopCurrentPtr(&currentUserContext), return PAPI_EMISC );
}

int deviceIdx;
for (deviceIdx = 0; deviceIdx < numDevicesOnMachine; deviceIdx++) {
cuptie_gpu_state_t *gpu_ctl = &(state->gpu_ctl[deviceIdx]);
Expand All @@ -1146,9 +1179,18 @@ int cuptie_ctx_stop(cuptie_control_t state)
cuptiCheckErrors( cuptiEventGroupSetDisablePtr(eventGroupSet), return PAPI_EMISC );
cuptiCheckErrors( cuptiEventGroupSetsDestroyPtr(eventGroupSets), return PAPI_EMISC );

int papi_errno = cuptic_device_release(gpu_ctl->added_events, API_LEGACY);
if (papi_errno != PAPI_OK) {
return papi_errno;
}

cudaCheckErrors( cuCtxPopCurrentPtr(&state->info[deviceIdx].ctx), return PAPI_EMISC );
}

if (currentUserContext != NULL) {
cudaCheckErrors( cuCtxPushCurrentPtr(currentUserContext), return PAPI_EMISC );
}

SUBDBG("EXITING: Disabling event group sets completed.\n");
return PAPI_OK;
}
Expand All @@ -1164,6 +1206,12 @@ int cuptie_ctx_reset(cuptie_control_t state)
{
SUBDBG("ENTERING: Resetting counter values.\n");

CUcontext currentUserContext;
cudaCheckErrors( cuCtxGetCurrentPtr(&currentUserContext), return PAPI_EMISC);
if (currentUserContext != NULL) {
cudaCheckErrors( cuCtxPopCurrentPtr(&currentUserContext), return PAPI_EMISC );
}

int counterIdx;
for (counterIdx = 0; counterIdx < state->read_count; counterIdx++) {
state->counters[counterIdx] = 0;
Expand Down Expand Up @@ -1192,6 +1240,10 @@ int cuptie_ctx_reset(cuptie_control_t state)
cudaCheckErrors( cuCtxPopCurrentPtr(&state->info[deviceIdx].ctx), return PAPI_EMISC );
}

if (currentUserContext != NULL) {
cudaCheckErrors( cuCtxPushCurrentPtr(currentUserContext), return PAPI_EMISC );
}

SUBDBG("EXITING: Resetting counter values completed.\n");
return PAPI_OK;
}
Expand Down Expand Up @@ -1425,11 +1477,16 @@ static int verify_user_added_event_or_metric(uint32_t *events_id, int num_events
}
totalNumberOfUserAddedEvents++;
state->gpu_ctl[native_event_info.device].added_events->totalNumberOfUserAddedNativeEvents = totalNumberOfUserAddedEvents;
// For a specific device table, get the current event index
int idx = state->gpu_ctl[native_event_info.device].added_events->count;
state->gpu_ctl[native_event_info.device].added_events->cuda_devs[idx] = native_event_info.device;
state->gpu_ctl[native_event_info.device].added_events->count++;

// Pop off the set context
cudaCheckErrors( cuCtxPopCurrentPtr(&thr_info[native_event_info.device].ctx), return PAPI_EMISC );
}


SUBDBG("EXITING: Checking user added a valid event completed.\n");
return PAPI_OK;
}
Expand Down Expand Up @@ -1491,6 +1548,7 @@ static int create_event_and_metric_table(int totalNumberOfEntries, cuptiu_event_
goto fn_fail;
}

eventTable->count = 0;
eventTable->capacity = totalNumberOfEntries;
eventTable->startTimeStampNs = 0;
eventTable->totalNumberOfUserAddedNativeEvents = 0;
Expand Down
1 change: 1 addition & 0 deletions src/components/cuda/cupti_event_and_metric.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ typedef struct event_and_metric_record_s {
typedef struct event_and_metric_table_s {
unsigned int count;
unsigned int capacity;
int cuda_devs[30];
CUpti_EventGroupSets *eventGroupSets;
CUpti_MetricID metricIDs[PAPI_CUDA_MAX_COUNTERS];
int *idsThatMakeupAUserAddedEventArray[PAPI_CUDA_MAX_COUNTERS];
Expand Down
4 changes: 2 additions & 2 deletions src/components/cuda/cupti_profiler.c
Original file line number Diff line number Diff line change
Expand Up @@ -817,7 +817,7 @@ int cuptip_ctx_start(cuptip_control_t state)
}

LOGDBG("Device num %d: event_count %d, rmr count %d\n", dev_id, gpu_ctl->added_events->count, gpu_ctl->numberOfRawMetricRequests);
papi_errno = cuptic_device_acquire(state->gpu_ctl[dev_id].added_events);
papi_errno = cuptic_device_acquire(state->gpu_ctl[dev_id].added_events, API_PERFWORKS);
if (papi_errno != PAPI_OK) {
ERRDBG("Profiling same gpu from multiple event sets not allowed.\n");
return papi_errno;
Expand Down Expand Up @@ -1134,7 +1134,7 @@ int cuptip_ctx_stop(cuptip_control_t state)
return papi_errno;
}

papi_errno = cuptic_device_release(state->gpu_ctl[dev_id].added_events);
papi_errno = cuptic_device_release(state->gpu_ctl[dev_id].added_events, API_PERFWORKS);
if (papi_errno != PAPI_OK) {
return papi_errno;
}
Expand Down
73 changes: 52 additions & 21 deletions src/components/cuda/papi_cupti_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

#include "cupti_config.h"
#include "papi_cupti_common.h"
#include "cupti_event_and_metric.h"

static void *dl_drv, *dl_rt;

Expand Down Expand Up @@ -848,47 +849,77 @@ int cuptic_ctxarr_destroy(cuptic_info_t *pinfo)
return PAPI_OK;
}

int _devmask_events_get(cuptiu_event_table_t *evt_table, gpu_occupancy_t *bitmask)
int cuptic_device_acquire(void *evt_table, int flag)
{
gpu_occupancy_t acq_mask = 0;
long i;
for (i = 0; i < evt_table->count; i++) {
acq_mask |= (1 << evt_table->cuda_devs[i]);
int i;
gpu_occupancy_t bitmask = 0;
switch(flag) {
case API_LEGACY:
{
cuptiu_event_and_metric_table_t *legacy_evt_table = (cuptiu_event_and_metric_table_t *) evt_table;
for (i = 0; i < legacy_evt_table->count; i++) {
bitmask |= (1 << legacy_evt_table->cuda_devs[i]);
}
break;
}
case API_PERFWORKS:
{
cuptiu_event_table_t *perfworks_evt_table = (cuptiu_event_table_t *) evt_table;
for (i = 0; i < perfworks_evt_table->count; i++) {
bitmask |= (1 << perfworks_evt_table->cuda_devs[i]);
}
break;
}
default:
SUBDBG("Provided flag is not accounted for in this switch statement. Code needs to be updated.\n");
return PAPI_EBUG;
}
*bitmask = acq_mask;

return PAPI_OK;
}

int cuptic_device_acquire(cuptiu_event_table_t *evt_table)
{
gpu_occupancy_t bitmask;
int papi_errno = _devmask_events_get(evt_table, &bitmask);
if (papi_errno != PAPI_OK) {
return papi_errno;
}
if (bitmask & global_gpu_bitmask) {
return PAPI_ECNFLCT;
}

_papi_hwi_lock(_cuda_lock);
global_gpu_bitmask |= bitmask;
_papi_hwi_unlock(_cuda_lock);

return PAPI_OK;
}

int cuptic_device_release(cuptiu_event_table_t *evt_table)
int cuptic_device_release(void *evt_table, int flag)
{
gpu_occupancy_t bitmask;
int papi_errno = _devmask_events_get(evt_table, &bitmask);
if (papi_errno != PAPI_OK) {
return papi_errno;
int i;
gpu_occupancy_t bitmask = 0;
switch(flag) {
case API_LEGACY:
{
cuptiu_event_and_metric_table_t *legacy_evt_table = (cuptiu_event_and_metric_table_t *) evt_table;
for (i = 0; i < legacy_evt_table->count; i++) {
bitmask |= (1 << legacy_evt_table->cuda_devs[i]);
}
break;
}
case API_PERFWORKS:
{
cuptiu_event_table_t *perfworks_evt_table = (cuptiu_event_table_t *) evt_table;
for (i = 0; i < perfworks_evt_table->count; i++) {
bitmask |= (1 << perfworks_evt_table->cuda_devs[i]);
}
break;
}
default:
SUBDBG("Provided flag is not accounted for in this switch statement. Code needs to be updated.\n");
return PAPI_EBUG;
}

if ((bitmask & global_gpu_bitmask) != bitmask) {
return PAPI_EMISC;
}

_papi_hwi_lock(_cuda_lock);
global_gpu_bitmask ^= bitmask;
_papi_hwi_unlock(_cuda_lock);

return PAPI_OK;
}

Expand Down
4 changes: 2 additions & 2 deletions src/components/cuda/papi_cupti_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ int cuptic_ctxarr_get_ctx(cuptic_info_t info, int dev_id, CUcontext *ctx);
int cuptic_ctxarr_destroy(cuptic_info_t *pinfo);

/* functions to track the occupancy of gpu counters in event sets */
int cuptic_device_acquire(cuptiu_event_table_t *evt_table);
int cuptic_device_release(cuptiu_event_table_t *evt_table);
int cuptic_device_acquire(void *evt_table, int flag);
int cuptic_device_release(void *evt_table, int flag);

/* device qualifier interfaces */
int cuptiu_dev_set(cuptiu_bitmap_t *bitmap, int i);
Expand Down
Loading