Skip to content

Commit c9eb2f7

Browse files
committed
Enable appdaemon wrapper mode for GPU monitoring
1 parent 200c8f8 commit c9eb2f7

File tree

2 files changed

+317
-104
lines changed

2 files changed

+317
-104
lines changed

src/access-daemon/appDaemon.c

Lines changed: 199 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ static FILE* output_file = NULL;
5353

5454
// Timeline mode
5555
static int stopIssued = 0;
56-
static pthread_mutex_t stopMutex;
56+
static pthread_mutex_t stopMutex = PTHREAD_MUTEX_INITIALIZER;
5757

5858
static int appdaemon_register_exit(appdaemon_exit_func f)
5959
{
@@ -88,7 +88,6 @@ static void prepare_ldpreload()
8888
char* ldpreload = getenv("LD_PRELOAD");
8989
if (ldpreload)
9090
{
91-
printf("Old LD_PRELOAD=%s\n", ldpreload);
9291
bstring bldpre = bfromcstr(ldpreload);
9392
bstring new_bldpre = bfromcstr("");
9493
struct bstrList *liblist = bsplit(bldpre, ':');
@@ -100,7 +99,6 @@ static void prepare_ldpreload()
10099
bconchar(new_bldpre, ':');
101100
}
102101
}
103-
printf("New LD_PRELOAD=%s\n", bdata(new_bldpre));
104102
mysetenv("LD_PRELOAD", bdata(new_bldpre), 1);
105103
bstrListDestroy(liblist);
106104
bdestroy(new_bldpre);
@@ -144,6 +142,59 @@ static int parse_gpustr(char* gpuStr, int* numGpus, int** gpuIds)
144142
return 0;
145143
}
146144

145+
/*
 * Backend-independent description of the measurement results that
 * appdaemon_write_output_file() writes out. The callbacks let the same
 * writer serve different GPU backends.
 */
typedef struct {
    int numDevices;                 /* number of entries in devices */
    int *devices;                   /* device IDs, written verbatim to the file */
    int numGroups;                  /* number of entries in groups */
    int *groups;                    /* group IDs passed to the callbacks */
    double (*getTime)(int group);   /* time value written for a group */
    int (*numEvents)(int group);    /* number of counter values per line */
    /*
     * The writer invokes this as getResult(groupId, eventIndex, deviceIndex),
     * where deviceIndex is the position in devices (not the device ID).
     * NOTE(review): elsewhere in this file rocmon_getResult is called as
     * (gpu, group, event); confirm a backend using that order is adapted
     * before it is plugged in here.
     */
    double (*getResult)(int group, int event, int gpu);
} appdaemon_output_data;

/*
 * Write the results described by `data` to `markerfile` in the MarkerAPI
 * file format so that likwid-perfctr can read them.
 *
 * MarkerAPI File format
 * 1 numberOfGPUs numberOfRegions numberOfGpuGroups
 * 2 regionID:regionTag0
 * 3 regionID:regionTag1
 * 4 regionID groupID gpuID callCount timeActive numEvents countersvalues(space separated)
 * 5 regionID groupID gpuID callCount timeActive numEvents countersvalues(space separated)
 *
 * Parameters:
 *   markerfile  path of the file to create/truncate; may be NULL (for
 *               example an unset environment variable), which is rejected
 *   data        devices, groups and accessor callbacks to dump
 *
 * Returns 0 on success or a negative errno value: -EINVAL for missing
 * arguments/callbacks, the fopen() error if the file cannot be opened, or
 * an I/O error if writing or closing the file fails.
 */
static int appdaemon_write_output_file(const char* markerfile, appdaemon_output_data* data)
{
    if (markerfile == NULL || data == NULL)
    {
        fprintf(stderr, "Cannot write appdaemon output: missing file name or data\n");
        return -EINVAL;
    }

    /* Only require what the loops below will actually dereference. */
    if (data->numGroups > 0)
    {
        int missing = (data->groups == NULL || data->getTime == NULL || data->numEvents == NULL);
        if (data->numDevices > 0 && (data->devices == NULL || data->getResult == NULL))
        {
            missing = 1;
        }
        if (missing)
        {
            fprintf(stderr, "Cannot write appdaemon output: incomplete result description\n");
            return -EINVAL;
        }
    }

    // Open file in write mode
    FILE* file = fopen(markerfile, "w");
    if (file == NULL)
    {
        /* Save errno first: the fprintf calls below may overwrite it. */
        int err = errno;
        fprintf(stderr, "Cannot open file %s\n", markerfile);
        fprintf(stderr, "%s", strerror(err));
        return -err;
    }

    /* Exactly one region is announced; all lines below use region ID 0. */
    fprintf(file, "%d 1 %d\n", data->numDevices, data->numGroups);
    int regionId = 0;

    /*
     * NOTE(review): one tag line is written per group although the header
     * announces a single region; kept as-is, confirm against the reader.
     */
    for (int i = 0; i < data->numGroups; i++)
    {
        fprintf(file, "%d:appdaemon-%d\n", regionId, data->groups[i]);
    }

    for (int i = 0; i < data->numGroups; i++)
    {
        int groupId = data->groups[i];
        int numEvents = data->numEvents(groupId);
        double time = data->getTime(groupId);
        for (int j = 0; j < data->numDevices; j++)
        {
            /* The call count is fixed to 1. */
            fprintf(file, "%d %d %d %d %e %d ", regionId, groupId, data->devices[j], 1, time, numEvents);
            for (int k = 0; k < numEvents; k++)
            {
                fprintf(file, "%e ", data->getResult(groupId, k, j));
            }
            fprintf(file, "\n");
        }
    }

    /* Report failed writes instead of silently leaving a truncated file. */
    int status = 0;
    if (ferror(file))
    {
        status = -EIO;
    }
    errno = 0;
    if (fclose(file) != 0 && status == 0)
    {
        status = (errno != 0) ? -errno : -EIO;
    }
    return status;
}
196+
197+
147198
/*
148199
Nvmon
149200
*/
@@ -153,12 +204,13 @@ static int* nvmon_gpulist = NULL;
153204
static int nvmon_numgpus = 0;
154205
static int* nvmon_gids = NULL;
155206
static int nvmon_numgids = 0;
207+
int likwid_nvmon_verbosity = 0;
156208

157209
static int appdaemon_setup_nvmon(char* gpuStr, char* eventStr)
158210
{
159211
int ret = 0;
160-
printf("Nvmon GPU string: %s\n", gpuStr);
161-
printf("Nvmon Event string: %s\n", eventStr);
212+
GPUDEBUG_PRINT(DEBUGLEV_DEVELOP, "Nvmon GPU string: %s", gpuStr);
213+
GPUDEBUG_PRINT(DEBUGLEV_DEVELOP, "Nvmon Event string: %s", eventStr);
162214

163215
// Parse gpu string
164216
ret = parse_gpustr(gpuStr, &nvmon_numgpus, &nvmon_gpulist);
@@ -257,18 +309,35 @@ static void appdaemon_close_nvmon(void)
257309
}
258310

259311
// Print results
260-
for (int g = 0; g < nvmon_numgids; g++)
312+
if (getenv("LIKWID_NVMON_MARKER_FORMAT") == NULL)
261313
{
262-
int gid = nvmon_gids[g];
263-
for (int i = 0; i < nvmon_getNumberOfEvents(gid); i++)
314+
for (int g = 0; g < nvmon_numgids; g++)
264315
{
265-
for (int j = 0; j < nvmon_numgpus; j++)
316+
int gid = nvmon_gids[g];
317+
for (int i = 0; i < nvmon_getNumberOfEvents(gid); i++)
266318
{
267-
fprintf(output_file, "Nvmon, %d, %f, %s, %f, %f\n", nvmon_gpulist[j], nvmon_getTimeOfGroup(nvmon_gpulist[j]), nvmon_getEventName(gid, i), nvmon_getResult(gid, i, j), nvmon_getLastResult(gid, i, j));
319+
for (int j = 0; j < nvmon_numgpus; j++)
320+
{
321+
fprintf(output_file, "Nvmon, %d, %f, %s, %f, %f\n", nvmon_gpulist[j], nvmon_getTimeOfGroup(gid), nvmon_getEventName(gid, i), nvmon_getResult(gid, i, j), nvmon_getLastResult(gid, i, j));
322+
}
268323
}
269324
}
325+
fflush(output_file);
326+
} else {
327+
appdaemon_output_data data = {
328+
.numDevices = nvmon_numgpus,
329+
.devices = nvmon_gpulist,
330+
.numGroups = nvmon_numgids,
331+
.groups = nvmon_gids,
332+
.getTime = nvmon_getTimeOfGroup,
333+
.getResult = nvmon_getResult,
334+
.numEvents = nvmon_getNumberOfEvents,
335+
};
336+
ret = appdaemon_write_output_file(getenv("LIKWID_NVMON_OUTPUTFILE"), &data);
337+
if (ret < 0) {
338+
ERROR_PRINT("Failed to write appdaemon data to %s", getenv("LIKWID_NVMON_OUTPUTFILE"));
339+
}
270340
}
271-
fflush(output_file);
272341

273342
// Cleanup
274343
if (nvmon_initialized)
@@ -308,7 +377,7 @@ static void appdaemon_read_nvmon(void)
308377
{
309378
for (int j = 0; j < nvmon_numgpus; j++)
310379
{
311-
fprintf(output_file, "Nvmon, %d, %f, %s, %f, %f\n", nvmon_gpulist[j], nvmon_getTimeToLastReadOfGroup(nvmon_gpulist[j]), nvmon_getEventName(gid, i), nvmon_getResult(gid, i, j), nvmon_getLastResult(gid, i, j));
380+
fprintf(output_file, "Nvmon, %d, %f, %s, %f, %f\n", nvmon_gpulist[j], nvmon_getTimeToLastReadOfGroup(gid), nvmon_getEventName(gid, i), nvmon_getResult(gid, i, j), nvmon_getLastResult(gid, i, j));
312381
}
313382
}
314383
}
@@ -328,8 +397,8 @@ static int rocmon_numgids = 0;
328397
static int appdaemon_setup_rocmon(char* gpuStr, char* eventStr)
329398
{
330399
int ret = 0;
331-
printf("Rocmon GPU string: %s\n", gpuStr);
332-
printf("Rocmon Event string: %s\n", eventStr);
400+
GPUDEBUG_PRINT(DEBUGLEV_DEVELOP, "Rocmon GPU string: %s\n", gpuStr);
401+
GPUDEBUG_PRINT(DEBUGLEV_DEVELOP, "Rocmon Event string: %s\n", eventStr);
333402

334403
// Parse gpu string
335404
ret = parse_gpustr(gpuStr, &rocmon_numgpus, &rocmon_gpulist);
@@ -425,17 +494,34 @@ static void appdaemon_close_rocmon(void)
425494
ERROR_PRINT("Failed to stop rocmon");
426495
}
427496

428-
// Print results
429-
for (int g = 0; g < rocmon_numgids; g++)
497+
if (getenv("LIKWID_ROCMON_MARKER_FORMAT") == NULL)
430498
{
431-
int gid = rocmon_gids[g];
432-
for (int i = 0; i < rocmon_getNumberOfEvents(gid); i++)
499+
// Print results
500+
for (int g = 0; g < rocmon_numgids; g++)
433501
{
434-
for (int j = 0; j < rocmon_numgpus; j++)
502+
int gid = rocmon_gids[g];
503+
for (int i = 0; i < rocmon_getNumberOfEvents(gid); i++)
435504
{
436-
fprintf(output_file, "Rocmon, %d, %f, %s, %f, %f\n", rocmon_gpulist[j], rocmon_getTimeOfGroup(rocmon_gpulist[j]), rocmon_getEventName(gid, i), rocmon_getResult(j, gid, i), rocmon_getLastResult(j, gid, i));
505+
for (int j = 0; j < rocmon_numgpus; j++)
506+
{
507+
fprintf(output_file, "Rocmon, %d, %f, %s, %f, %f\n", rocmon_gpulist[j], rocmon_getTimeOfGroup(rocmon_gpulist[j]), rocmon_getEventName(gid, i), rocmon_getResult(j, gid, i), rocmon_getLastResult(j, gid, i));
508+
}
437509
}
438510
}
511+
} else {
512+
appdaemon_output_data data = {
513+
.numDevices = rocmon_numgpus,
514+
.devices = rocmon_gpulist,
515+
.numGroups = rocmon_numgids,
516+
.groups = rocmon_gids,
517+
.getTime = rocmon_getTimeOfGroup,
518+
.getResult = rocmon_getResult,
519+
.numEvents = rocmon_getNumberOfEvents,
520+
};
521+
ret = appdaemon_write_output_file(getenv("LIKWID_ROCMON_OUTPUTFILE"), &data);
522+
if (ret < 0) {
523+
ERROR_PRINT("Failed to write appdaemon data to %s", getenv("LIKWID_ROCMON_OUTPUTFILE"));
524+
}
439525
}
440526

441527
// Cleanup
@@ -484,32 +570,77 @@ static void appdaemon_read_rocmon(void)
484570
#endif
485571

486572

573+
487574
/*
488575
Timeline mode
489576
*/
490577
static void* appdaemon_timeline_main(void* arg)
491578
{
579+
int ret = 0;
492580
int stop = 0;
493581
int target_delay_ms = *((int*)arg);
494-
;
495582

496-
while (1)
583+
#ifdef LIKWID_NVMON
584+
char* nvEventStr = getenv("LIKWID_NVMON_EVENTS");
585+
char* nvGpuStr = getenv("LIKWID_NVMON_GPUS");
586+
if (nvEventStr && nvGpuStr)
587+
{
588+
char *nvVerbosity = getenv("LIKWID_NVMON_VERBOSITY");
589+
if (nvVerbosity) {
590+
likwid_nvmon_verbosity = atoi(nvVerbosity);
591+
nvmon_setVerbosity(likwid_nvmon_verbosity);
592+
GPUDEBUG_PRINT(DEBUGLEV_DEVELOP, "Setting verbosity to %d", likwid_nvmon_verbosity);
593+
}
594+
ret = appdaemon_setup_nvmon(nvGpuStr, nvEventStr);
595+
if (!ret)
596+
{
597+
appdaemon_register_exit(appdaemon_close_nvmon);
598+
} else {
599+
fprintf(stderr, "Failed to setup NVMON: %d\n", ret);
600+
}
601+
}
602+
printf("NVMON initialized\n");
603+
GPUDEBUG_PRINT(DEBUGLEV_DEVELOP, "NVMON initialized");
604+
#endif
605+
606+
#ifdef LIKWID_ROCMON
607+
char* rocmonEventStr = getenv("LIKWID_ROCMON_EVENTS");
608+
char* rocmonGpuStr = getenv("LIKWID_ROCMON_GPUS");
609+
if (rocmonEventStr && rocmonGpuStr)
610+
{
611+
ret = appdaemon_setup_rocmon(rocmonGpuStr, rocmonEventStr);
612+
if (!ret)
613+
{
614+
appdaemon_register_exit(appdaemon_close_rocmon);
615+
}
616+
}
617+
#endif
618+
619+
while (stop == 0)
497620
{
498-
usleep(target_delay_ms * 1E3);
621+
printf("Thread sleeps for %d ms\n", target_delay_ms);
622+
usleep(target_delay_ms / 1000);
499623

500624
// Check stop status
501625
pthread_mutex_lock(&stopMutex);
502626
stop = stopIssued;
503627
pthread_mutex_unlock(&stopMutex);
504628
if (stop > 0) break;
505-
629+
printf("Thread Reads\n");
506630
#ifdef LIKWID_NVMON
507631
appdaemon_read_nvmon();
508632
#endif
509633
#ifdef LIKWID_ROCMON
510634
appdaemon_read_rocmon();
511635
#endif
512636
}
637+
638+
#ifdef LIKWID_NVMON
639+
appdaemon_close_nvmon();
640+
#endif
641+
#ifdef LIKWID_ROCMON
642+
appdaemon_close_rocmon();
643+
#endif
513644
}
514645

515646

@@ -543,11 +674,13 @@ int __libc_start_main(int (*main) (int,char **,char **),
543674
// Get timeline mode info
544675
char* timelineStr = getenv("LIKWID_INTERVAL");
545676
int timelineInterval = -1; // in ms
677+
int gotTimelineInterval = 0;
546678
if (timelineStr != NULL)
547679
{
548680
timelineInterval = atoi(timelineStr);
681+
gotTimelineInterval = 1;
549682
}
550-
if (timelineInterval == 0)
683+
if (gotTimelineInterval && timelineInterval <= 0)
551684
{
552685
fprintf(stderr, "Invalid timeline interval\n");
553686
return -1;
@@ -568,44 +701,59 @@ int __libc_start_main(int (*main) (int,char **,char **),
568701
fprintf(stderr, "%s", strerror(errno));
569702
return -1;
570703
}
571-
fprintf(output_file, "Backend, GPU, Time, Event, Full Value, Last Value\n");
572-
573-
#ifdef LIKWID_NVMON
574-
char* nvEventStr = getenv("LIKWID_NVMON_EVENTS");
575-
char* nvGpuStr = getenv("LIKWID_NVMON_GPUS");
576-
if (nvEventStr && nvGpuStr)
577-
{
578-
ret = appdaemon_setup_nvmon(nvGpuStr, nvEventStr);
579-
if (!ret)
580-
{
581-
appdaemon_register_exit(appdaemon_close_nvmon);
582-
}
704+
if ((getenv("LIKWID_NVMON_MARKER_FORMAT") == NULL) && (getenv("LIKWID_ROCMON_MARKER_FORMAT") == NULL)) {
705+
fprintf(output_file, "Backend, GPU, Time, Event, Full Value, Last Value\n");
583706
}
584-
#endif
585707

586-
#ifdef LIKWID_ROCMON
587-
char* rocmonEventStr = getenv("LIKWID_ROCMON_EVENTS");
588-
char* rocmonGpuStr = getenv("LIKWID_ROCMON_GPUS");
589-
if (rocmonEventStr && rocmonGpuStr)
590-
{
591-
ret = appdaemon_setup_rocmon(rocmonGpuStr, rocmonEventStr);
592-
if (!ret)
593-
{
594-
appdaemon_register_exit(appdaemon_close_rocmon);
595-
}
596-
}
597-
#endif
708+
598709

599710
// Start timeline thread
600-
if (timelineInterval >= 0)
711+
if (timelineInterval > 0)
601712
{
713+
//printf("Start thread with interval %d\n", timelineInterval);
602714
pthread_t tid;
603715
ret = pthread_create(&tid, NULL, &appdaemon_timeline_main, &timelineInterval);
604716
if (ret < 0)
605717
{
606718
fprintf(stderr, "Failed to create timeline thread\n");
607719
return -1;
608720
}
721+
} else {
722+
#ifdef LIKWID_NVMON
723+
char* nvEventStr = getenv("LIKWID_NVMON_EVENTS");
724+
char* nvGpuStr = getenv("LIKWID_NVMON_GPUS");
725+
if (nvEventStr && nvGpuStr)
726+
{
727+
char *nvVerbosity = getenv("LIKWID_NVMON_VERBOSITY");
728+
if (nvVerbosity) {
729+
likwid_nvmon_verbosity = atoi(nvVerbosity);
730+
nvmon_setVerbosity(likwid_nvmon_verbosity);
731+
GPUDEBUG_PRINT(DEBUGLEV_DEVELOP, "Setting verbosity to %d", likwid_nvmon_verbosity);
732+
}
733+
ret = appdaemon_setup_nvmon(nvGpuStr, nvEventStr);
734+
if (!ret)
735+
{
736+
appdaemon_register_exit(appdaemon_close_nvmon);
737+
}
738+
}
739+
#endif
740+
741+
#ifdef LIKWID_ROCMON
742+
char* rocmonEventStr = getenv("LIKWID_ROCMON_EVENTS");
743+
char* rocmonGpuStr = getenv("LIKWID_ROCMON_GPUS");
744+
if (rocmonEventStr && rocmonGpuStr)
745+
{
746+
char *rocmomVerbosity = getenv("LIKWID_ROCMON_VERBOSITY");
747+
if (rocmomVerbosity) {
748+
rocmon_setVerbosity(atoi(rocmomVerbosity));
749+
}
750+
ret = appdaemon_setup_rocmon(rocmonGpuStr, rocmonEventStr);
751+
if (!ret)
752+
{
753+
appdaemon_register_exit(appdaemon_close_rocmon);
754+
}
755+
}
756+
#endif
609757
}
610758

611759
return original__libc_start_main(main,argc,ubp_av,

0 commit comments

Comments
 (0)