@@ -53,7 +53,7 @@ static FILE* output_file = NULL;
5353
5454// Timeline mode
5555static int stopIssued = 0 ;
56- static pthread_mutex_t stopMutex ;
56+ static pthread_mutex_t stopMutex = PTHREAD_MUTEX_INITIALIZER ;
5757
5858static int appdaemon_register_exit (appdaemon_exit_func f )
5959{
@@ -88,7 +88,6 @@ static void prepare_ldpreload()
8888 char * ldpreload = getenv ("LD_PRELOAD" );
8989 if (ldpreload )
9090 {
91- printf ("Old LD_PRELOAD=%s\n" , ldpreload );
9291 bstring bldpre = bfromcstr (ldpreload );
9392 bstring new_bldpre = bfromcstr ("" );
9493 struct bstrList * liblist = bsplit (bldpre , ':' );
@@ -100,7 +99,6 @@ static void prepare_ldpreload()
10099 bconchar (new_bldpre , ':' );
101100 }
102101 }
103- printf ("New LD_PRELOAD=%s\n" , bdata (new_bldpre ));
104102 mysetenv ("LD_PRELOAD" , bdata (new_bldpre ), 1 );
105103 bstrListDestroy (liblist );
106104 bdestroy (new_bldpre );
@@ -144,6 +142,59 @@ static int parse_gpustr(char* gpuStr, int* numGpus, int** gpuIds)
144142 return 0 ;
145143}
146144
/*
 * Backend-independent description of the results handed to
 * appdaemon_write_output_file().
 *
 * NOTE(review): getResult is declared as (gpu, group, event), but the writer
 * below invokes it as (group, event, deviceIndex). That matches how
 * nvmon_getResult is called elsewhere in this file, but not how
 * rocmon_getResult is called (deviceIndex, group, event). Confirm the intended
 * order and add a per-backend adapter if the two APIs really differ.
 */
typedef struct {
    int numDevices;                 /* number of entries in devices */
    int* devices;                   /* device IDs, printed as the gpuID column */
    int numGroups;                  /* number of entries in groups */
    int* groups;                    /* group IDs passed to the callbacks below */
    double (*getTime)(int group);   /* value printed as timeActive of a group */
    int (*numEvents)(int group);    /* number of counter values of a group */
    double (*getResult)(int gpu, int group, int event);
} appdaemon_output_data;

/*
 * Write the collected results to 'markerfile' in the MarkerAPI file format so
 * that likwid-perfctr can pick them up.
 *
 * markerfile  path of the file to create/truncate
 * data        devices, groups and accessor callbacks for the results
 *
 * Returns 0 on success and a negative errno value on failure:
 * -EINVAL if markerfile or data is NULL, the negated fopen() error if the
 * file cannot be opened, -EIO (or the negated fclose() error) if writing
 * the file failed.
 */
static int appdaemon_write_output_file(const char* markerfile, appdaemon_output_data* data)
{
    /* MarkerAPI File format
     * 1 numberOfGPUs numberOfRegions numberOfGpuGroups
     * 2 regionID:regionTag0
     * 3 regionID:regionTag1
     * 4 regionID groupID gpuID callCount timeActive numEvents countersvalues(space separated)
     * 5 regionID groupID gpuID callCount timeActive numEvents countersvalues(space separated)
     */
    /* Here we use it to hand over the results to likwid-perfctr */

    // Callers pass getenv() results straight through, so the name may be NULL
    if (markerfile == NULL || data == NULL)
    {
        fprintf(stderr, "Cannot write appdaemon results: no output file or data given\n");
        return -EINVAL;
    }

    // Open file in write mode
    FILE* file = fopen(markerfile, "w");
    if (file == NULL)
    {
        // Save errno first, the fprintf() calls below may overwrite it
        int err = errno;
        fprintf(stderr, "Cannot open file %s\n", markerfile);
        fprintf(stderr, "%s\n", strerror(err));
        return -err;
    }

    // The whole run is reported as a single region with ID 0
    const int regionId = 0;
    fprintf(file, "%d 1 %d\n", data->numDevices, data->numGroups);

    // NOTE(review): this emits one tag line per group, all with region ID 0,
    // although the header announces exactly one region - confirm the reader
    // tolerates that.
    for (int i = 0; i < data->numGroups; i++)
    {
        fprintf(file, "%d:appdaemon-%d\n", regionId, data->groups[i]);
    }

    for (int i = 0; i < data->numGroups; i++)
    {
        int groupId = data->groups[i];
        int numEvents = data->numEvents(groupId);
        double time = data->getTime(groupId);
        for (int j = 0; j < data->numDevices; j++)
        {
            // callCount is always 1: the application run is measured once
            fprintf(file, "%d %d %d %u %e %d ", regionId, groupId, data->devices[j], 1u, time, numEvents);
            for (int k = 0; k < numEvents; k++)
            {
                fprintf(file, "%e ", data->getResult(groupId, k, j));
            }
            fprintf(file, "\n");
        }
    }

    // fclose() flushes; report buffered write errors instead of dropping them
    int ret = ferror(file) ? -EIO : 0;
    if (fclose(file) != 0 && ret == 0)
    {
        ret = (errno != 0) ? -errno : -EIO;
    }
    return ret;
}
196+
197+
147198/*
148199Nvmon
149200*/
@@ -153,12 +204,13 @@ static int* nvmon_gpulist = NULL;
153204static int nvmon_numgpus = 0 ;
154205static int * nvmon_gids = NULL ;
155206static int nvmon_numgids = 0 ;
207+ int likwid_nvmon_verbosity = 0 ;
156208
157209static int appdaemon_setup_nvmon (char * gpuStr , char * eventStr )
158210{
159211 int ret = 0 ;
160- printf ( "Nvmon GPU string: %s\n " , gpuStr );
161- printf ( "Nvmon Event string: %s\n " , eventStr );
212+ GPUDEBUG_PRINT ( DEBUGLEV_DEVELOP , "Nvmon GPU string: %s" , gpuStr );
213+ GPUDEBUG_PRINT ( DEBUGLEV_DEVELOP , "Nvmon Event string: %s" , eventStr );
162214
163215 // Parse gpu string
164216 ret = parse_gpustr (gpuStr , & nvmon_numgpus , & nvmon_gpulist );
@@ -257,18 +309,35 @@ static void appdaemon_close_nvmon(void)
257309 }
258310
259311 // Print results
260- for ( int g = 0 ; g < nvmon_numgids ; g ++ )
312+ if ( getenv ( "LIKWID_NVMON_MARKER_FORMAT" ) == NULL )
261313 {
262- int gid = nvmon_gids [g ];
263- for (int i = 0 ; i < nvmon_getNumberOfEvents (gid ); i ++ )
314+ for (int g = 0 ; g < nvmon_numgids ; g ++ )
264315 {
265- for (int j = 0 ; j < nvmon_numgpus ; j ++ )
316+ int gid = nvmon_gids [g ];
317+ for (int i = 0 ; i < nvmon_getNumberOfEvents (gid ); i ++ )
266318 {
267- fprintf (output_file , "Nvmon, %d, %f, %s, %f, %f\n" , nvmon_gpulist [j ], nvmon_getTimeOfGroup (nvmon_gpulist [j ]), nvmon_getEventName (gid , i ), nvmon_getResult (gid , i , j ), nvmon_getLastResult (gid , i , j ));
319+ for (int j = 0 ; j < nvmon_numgpus ; j ++ )
320+ {
321+ fprintf (output_file , "Nvmon, %d, %f, %s, %f, %f\n" , nvmon_gpulist [j ], nvmon_getTimeOfGroup (gid ), nvmon_getEventName (gid , i ), nvmon_getResult (gid , i , j ), nvmon_getLastResult (gid , i , j ));
322+ }
268323 }
269324 }
325+ fflush (output_file );
326+ } else {
327+ appdaemon_output_data data = {
328+ .numDevices = nvmon_numgpus ,
329+ .devices = nvmon_gpulist ,
330+ .numGroups = nvmon_numgids ,
331+ .groups = nvmon_gids ,
332+ .getTime = nvmon_getTimeOfGroup ,
333+ .getResult = nvmon_getResult ,
334+ .numEvents = nvmon_getNumberOfEvents ,
335+ };
336+ ret = appdaemon_write_output_file (getenv ("LIKWID_NVMON_OUTPUTFILE" ), & data );
337+ if (ret < 0 ) {
338+ ERROR_PRINT ("Failed to write appdaemon data to %s" , getenv ("LIKWID_NVMON_OUTPUTFILE" ));
339+ }
270340 }
271- fflush (output_file );
272341
273342 // Cleanup
274343 if (nvmon_initialized )
@@ -308,7 +377,7 @@ static void appdaemon_read_nvmon(void)
308377 {
309378 for (int j = 0 ; j < nvmon_numgpus ; j ++ )
310379 {
311- fprintf (output_file , "Nvmon, %d, %f, %s, %f, %f\n" , nvmon_gpulist [j ], nvmon_getTimeToLastReadOfGroup (nvmon_gpulist [ j ] ), nvmon_getEventName (gid , i ), nvmon_getResult (gid , i , j ), nvmon_getLastResult (gid , i , j ));
380+ fprintf (output_file , "Nvmon, %d, %f, %s, %f, %f\n" , nvmon_gpulist [j ], nvmon_getTimeToLastReadOfGroup (gid ), nvmon_getEventName (gid , i ), nvmon_getResult (gid , i , j ), nvmon_getLastResult (gid , i , j ));
312381 }
313382 }
314383 }
@@ -328,8 +397,8 @@ static int rocmon_numgids = 0;
328397static int appdaemon_setup_rocmon (char * gpuStr , char * eventStr )
329398{
330399 int ret = 0 ;
331- printf ( "Rocmon GPU string: %s\n" , gpuStr );
332- printf ( "Rocmon Event string: %s\n" , eventStr );
400+ GPUDEBUG_PRINT ( DEBUGLEV_DEVELOP , "Rocmon GPU string: %s\n" , gpuStr );
401+ GPUDEBUG_PRINT ( DEBUGLEV_DEVELOP , "Rocmon Event string: %s\n" , eventStr );
333402
334403 // Parse gpu string
335404 ret = parse_gpustr (gpuStr , & rocmon_numgpus , & rocmon_gpulist );
@@ -425,17 +494,34 @@ static void appdaemon_close_rocmon(void)
425494 ERROR_PRINT ("Failed to stop rocmon" );
426495 }
427496
428- // Print results
429- for (int g = 0 ; g < rocmon_numgids ; g ++ )
497+ if (getenv ("LIKWID_ROCMON_MARKER_FORMAT" ) == NULL )
430498 {
431- int gid = rocmon_gids [ g ];
432- for (int i = 0 ; i < rocmon_getNumberOfEvents ( gid ); i ++ )
499+ // Print results
500+ for (int g = 0 ; g < rocmon_numgids ; g ++ )
433501 {
434- for (int j = 0 ; j < rocmon_numgpus ; j ++ )
502+ int gid = rocmon_gids [g ];
503+ for (int i = 0 ; i < rocmon_getNumberOfEvents (gid ); i ++ )
435504 {
436- fprintf (output_file , "Rocmon, %d, %f, %s, %f, %f\n" , rocmon_gpulist [j ], rocmon_getTimeOfGroup (rocmon_gpulist [j ]), rocmon_getEventName (gid , i ), rocmon_getResult (j , gid , i ), rocmon_getLastResult (j , gid , i ));
505+ for (int j = 0 ; j < rocmon_numgpus ; j ++ )
506+ {
507+ fprintf (output_file , "Rocmon, %d, %f, %s, %f, %f\n" , rocmon_gpulist [j ], rocmon_getTimeOfGroup (rocmon_gpulist [j ]), rocmon_getEventName (gid , i ), rocmon_getResult (j , gid , i ), rocmon_getLastResult (j , gid , i ));
508+ }
437509 }
438510 }
511+ } else {
512+ appdaemon_output_data data = {
513+ .numDevices = rocmon_numgpus ,
514+ .devices = rocmon_gpulist ,
515+ .numGroups = rocmon_numgids ,
516+ .groups = rocmon_gids ,
517+ .getTime = rocmon_getTimeOfGroup ,
518+ .getResult = rocmon_getResult ,
519+ .numEvents = rocmon_getNumberOfEvents ,
520+ };
521+ ret = appdaemon_write_output_file (getenv ("LIKWID_ROCMON_OUTPUTFILE" ), & data );
522+ if (ret < 0 ) {
523+ ERROR_PRINT ("Failed to write appdaemon data to %s" , getenv ("LIKWID_ROCMON_OUTPUTFILE" ));
524+ }
439525 }
440526
441527 // Cleanup
@@ -484,32 +570,77 @@ static void appdaemon_read_rocmon(void)
484570#endif
485571
486572
573+
487574/*
488575Timeline mode
489576*/
490577static void * appdaemon_timeline_main (void * arg )
491578{
579+ int ret = 0 ;
492580 int stop = 0 ;
493581 int target_delay_ms = * ((int * )arg );
494- ;
495582
496- while (1 )
583+ #ifdef LIKWID_NVMON
584+ char * nvEventStr = getenv ("LIKWID_NVMON_EVENTS" );
585+ char * nvGpuStr = getenv ("LIKWID_NVMON_GPUS" );
586+ if (nvEventStr && nvGpuStr )
587+ {
588+ char * nvVerbosity = getenv ("LIKWID_NVMON_VERBOSITY" );
589+ if (nvVerbosity ) {
590+ likwid_nvmon_verbosity = atoi (nvVerbosity );
591+ nvmon_setVerbosity (likwid_nvmon_verbosity );
592+ GPUDEBUG_PRINT (DEBUGLEV_DEVELOP , "Setting verbosity to %d" , likwid_nvmon_verbosity );
593+ }
594+ ret = appdaemon_setup_nvmon (nvGpuStr , nvEventStr );
595+ if (!ret )
596+ {
597+ appdaemon_register_exit (appdaemon_close_nvmon );
598+ } else {
599+ fprintf (stderr , "Failed to setup NVMON: %d\n" , ret );
600+ }
601+ }
602+ printf ("NVMON initialized\n" );
603+ GPUDEBUG_PRINT (DEBUGLEV_DEVELOP , "NVMON initialized" );
604+ #endif
605+
606+ #ifdef LIKWID_ROCMON
607+ char * rocmonEventStr = getenv ("LIKWID_ROCMON_EVENTS" );
608+ char * rocmonGpuStr = getenv ("LIKWID_ROCMON_GPUS" );
609+ if (rocmonEventStr && rocmonGpuStr )
610+ {
611+ ret = appdaemon_setup_rocmon (rocmonGpuStr , rocmonEventStr );
612+ if (!ret )
613+ {
614+ appdaemon_register_exit (appdaemon_close_rocmon );
615+ }
616+ }
617+ #endif
618+
619+ while (stop == 0 )
497620 {
498- usleep (target_delay_ms * 1E3 );
621+ printf ("Thread sleeps for %d ms\n" , target_delay_ms );
622+ usleep (target_delay_ms / 1000 );
499623
500624 // Check stop status
501625 pthread_mutex_lock (& stopMutex );
502626 stop = stopIssued ;
503627 pthread_mutex_unlock (& stopMutex );
504628 if (stop > 0 ) break ;
505-
629+ printf ( "Thread Reads\n" );
506630#ifdef LIKWID_NVMON
507631 appdaemon_read_nvmon ();
508632#endif
509633#ifdef LIKWID_ROCMON
510634 appdaemon_read_rocmon ();
511635#endif
512636 }
637+
638+ #ifdef LIKWID_NVMON
639+ appdaemon_close_nvmon ();
640+ #endif
641+ #ifdef LIKWID_ROCMON
642+ appdaemon_close_rocmon ();
643+ #endif
513644}
514645
515646
@@ -543,11 +674,13 @@ int __libc_start_main(int (*main) (int,char **,char **),
543674 // Get timeline mode info
544675 char * timelineStr = getenv ("LIKWID_INTERVAL" );
545676 int timelineInterval = -1 ; // in ms
677+ int gotTimelineInterval = 0 ;
546678 if (timelineStr != NULL )
547679 {
548680 timelineInterval = atoi (timelineStr );
681+ gotTimelineInterval = 1 ;
549682 }
550- if (timelineInterval = = 0 )
683+ if (gotTimelineInterval && timelineInterval < = 0 )
551684 {
552685 fprintf (stderr , "Invalid timeline interval\n" );
553686 return -1 ;
@@ -568,44 +701,59 @@ int __libc_start_main(int (*main) (int,char **,char **),
568701 fprintf (stderr , "%s" , strerror (errno ));
569702 return -1 ;
570703 }
571- fprintf (output_file , "Backend, GPU, Time, Event, Full Value, Last Value\n" );
572-
573- #ifdef LIKWID_NVMON
574- char * nvEventStr = getenv ("LIKWID_NVMON_EVENTS" );
575- char * nvGpuStr = getenv ("LIKWID_NVMON_GPUS" );
576- if (nvEventStr && nvGpuStr )
577- {
578- ret = appdaemon_setup_nvmon (nvGpuStr , nvEventStr );
579- if (!ret )
580- {
581- appdaemon_register_exit (appdaemon_close_nvmon );
582- }
704+ if ((getenv ("LIKWID_NVMON_MARKER_FORMAT" ) == NULL ) && (getenv ("LIKWID_ROCMON_MARKER_FORMAT" ) == NULL )) {
705+ fprintf (output_file , "Backend, GPU, Time, Event, Full Value, Last Value\n" );
583706 }
584- #endif
585707
586- #ifdef LIKWID_ROCMON
587- char * rocmonEventStr = getenv ("LIKWID_ROCMON_EVENTS" );
588- char * rocmonGpuStr = getenv ("LIKWID_ROCMON_GPUS" );
589- if (rocmonEventStr && rocmonGpuStr )
590- {
591- ret = appdaemon_setup_rocmon (rocmonGpuStr , rocmonEventStr );
592- if (!ret )
593- {
594- appdaemon_register_exit (appdaemon_close_rocmon );
595- }
596- }
597- #endif
708+
598709
599710 // Start timeline thread
600- if (timelineInterval >= 0 )
711+ if (timelineInterval > 0 )
601712 {
713+ //printf("Start thread with interval %d\n", timelineInterval);
602714 pthread_t tid ;
603715 ret = pthread_create (& tid , NULL , & appdaemon_timeline_main , & timelineInterval );
604716 if (ret < 0 )
605717 {
606718 fprintf (stderr , "Failed to create timeline thread\n" );
607719 return -1 ;
608720 }
721+ } else {
722+ #ifdef LIKWID_NVMON
723+ char * nvEventStr = getenv ("LIKWID_NVMON_EVENTS" );
724+ char * nvGpuStr = getenv ("LIKWID_NVMON_GPUS" );
725+ if (nvEventStr && nvGpuStr )
726+ {
727+ char * nvVerbosity = getenv ("LIKWID_NVMON_VERBOSITY" );
728+ if (nvVerbosity ) {
729+ likwid_nvmon_verbosity = atoi (nvVerbosity );
730+ nvmon_setVerbosity (likwid_nvmon_verbosity );
731+ GPUDEBUG_PRINT (DEBUGLEV_DEVELOP , "Setting verbosity to %d" , likwid_nvmon_verbosity );
732+ }
733+ ret = appdaemon_setup_nvmon (nvGpuStr , nvEventStr );
734+ if (!ret )
735+ {
736+ appdaemon_register_exit (appdaemon_close_nvmon );
737+ }
738+ }
739+ #endif
740+
741+ #ifdef LIKWID_ROCMON
742+ char * rocmonEventStr = getenv ("LIKWID_ROCMON_EVENTS" );
743+ char * rocmonGpuStr = getenv ("LIKWID_ROCMON_GPUS" );
744+ if (rocmonEventStr && rocmonGpuStr )
745+ {
746+ char * rocmomVerbosity = getenv ("LIKWID_ROCMON_VERBOSITY" );
747+ if (rocmomVerbosity ) {
748+ rocmon_setVerbosity (atoi (rocmomVerbosity ));
749+ }
750+ ret = appdaemon_setup_rocmon (rocmonGpuStr , rocmonEventStr );
751+ if (!ret )
752+ {
753+ appdaemon_register_exit (appdaemon_close_rocmon );
754+ }
755+ }
756+ #endif
609757 }
610758
611759 return original__libc_start_main (main ,argc ,ubp_av ,
0 commit comments