From f2909654bfede2aec356b4a9ae170fdb7cb04884 Mon Sep 17 00:00:00 2001 From: Thomas Naughton Date: Tue, 29 Nov 2022 12:16:05 -0500 Subject: [PATCH 1/4] add OMPI_SPC_TIME_ALLTOALL for MPI_Alltoall timings Signed-off-by: Thomas Naughton --- ompi/mpi/c/alltoall.c | 6 ++++++ ompi/runtime/ompi_spc.c | 1 + ompi/runtime/ompi_spc.h | 1 + 3 files changed, 8 insertions(+) diff --git a/ompi/mpi/c/alltoall.c b/ompi/mpi/c/alltoall.c index 41bf608c890..eec88870fb8 100644 --- a/ompi/mpi/c/alltoall.c +++ b/ompi/mpi/c/alltoall.c @@ -52,6 +52,7 @@ int MPI_Alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, { int err; size_t recvtype_size; + opal_timer_t timer = 0; /* SPC */ SPC_RECORD(OMPI_SPC_ALLTOALL, 1); @@ -116,10 +117,15 @@ int MPI_Alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, } } + SPC_TIMER_START(OMPI_SPC_TIME_ALLTOALL, &timer); + /* Invoke the coll component to perform the back-end operation */ err = comm->c_coll->coll_alltoall(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm, comm->c_coll->coll_alltoall_module); + + SPC_TIMER_STOP(OMPI_SPC_TIME_ALLTOALL, &timer); + OMPI_ERRHANDLER_RETURN(err, comm, err, FUNC_NAME); } diff --git a/ompi/runtime/ompi_spc.c b/ompi/runtime/ompi_spc.c index 6f1d8aa7d6a..12fe1b9becb 100644 --- a/ompi/runtime/ompi_spc.c +++ b/ompi/runtime/ompi_spc.c @@ -143,6 +143,7 @@ static const ompi_spc_event_t ompi_spc_events_desc[OMPI_SPC_NUM_COUNTERS] = { SET_COUNTER_ARRAY(OMPI_SPC_TESTALL, "The number of times MPI_Testall was called.", false, false), SET_COUNTER_ARRAY(OMPI_SPC_TESTANY, "The number of times MPI_Testany was called.", false, false), SET_COUNTER_ARRAY(OMPI_SPC_TESTSOME, "The number of times MPI_Testsome was called.", false, false), + SET_COUNTER_ARRAY(OMPI_SPC_TIME_ALLTOALL, "The number microseconds spent performing the MPI_Alltoall operation. Note: The timer used on the back end is in cycles, which could potentially be problematic on a system where the clock frequency can change. 
On such a system, this counter could be inaccurate since we assume a fixed clock rate.", false, true), SET_COUNTER_ARRAY(OMPI_SPC_WAIT, "The number of times MPI_Wait was called.", false, false), SET_COUNTER_ARRAY(OMPI_SPC_WAITALL, "The number of times MPI_Waitall was called.", false, false), SET_COUNTER_ARRAY(OMPI_SPC_WAITANY, "The number of times MPI_Waitany was called.", false, false), diff --git a/ompi/runtime/ompi_spc.h b/ompi/runtime/ompi_spc.h index 76ec7f25f16..59c05af7b75 100644 --- a/ompi/runtime/ompi_spc.h +++ b/ompi/runtime/ompi_spc.h @@ -130,6 +130,7 @@ typedef enum ompi_spc_counters { OMPI_SPC_TESTALL, OMPI_SPC_TESTANY, OMPI_SPC_TESTSOME, + OMPI_SPC_TIME_ALLTOALL, OMPI_SPC_WAIT, OMPI_SPC_WAITALL, OMPI_SPC_WAITANY, From d724c7d0eee7810e461320331057cd91ef4bdf11 Mon Sep 17 00:00:00 2001 From: Thomas Naughton Date: Tue, 29 Nov 2022 12:21:34 -0500 Subject: [PATCH 2/4] add OMPI_SPC_TIME_ALLTOALLV for MPI_Alltoallv timings Signed-off-by: Thomas Naughton --- ompi/mpi/c/alltoallv.c | 5 +++++ ompi/runtime/ompi_spc.c | 1 + ompi/runtime/ompi_spc.h | 1 + 3 files changed, 7 insertions(+) diff --git a/ompi/mpi/c/alltoallv.c b/ompi/mpi/c/alltoallv.c index 4106d96d249..36f99e16508 100644 --- a/ompi/mpi/c/alltoallv.c +++ b/ompi/mpi/c/alltoallv.c @@ -50,6 +50,7 @@ int MPI_Alltoallv(const void *sendbuf, const int sendcounts[], MPI_Datatype recvtype, MPI_Comm comm) { int i, size, err; + opal_timer_t timer = 0; /* SPC */ SPC_RECORD(OMPI_SPC_ALLTOALLV, 1); @@ -135,10 +136,14 @@ int MPI_Alltoallv(const void *sendbuf, const int sendcounts[], } #endif + SPC_TIMER_START(OMPI_SPC_TIME_ALLTOALLV, &timer); + /* Invoke the coll component to perform the back-end operation */ err = comm->c_coll->coll_alltoallv(sendbuf, sendcounts, sdispls, sendtype, recvbuf, recvcounts, rdispls, recvtype, comm, comm->c_coll->coll_alltoallv_module); + SPC_TIMER_STOP(OMPI_SPC_TIME_ALLTOALLV, &timer); + OMPI_ERRHANDLER_RETURN(err, comm, err, FUNC_NAME); } diff --git a/ompi/runtime/ompi_spc.c 
b/ompi/runtime/ompi_spc.c index 12fe1b9becb..e1fd046ed50 100644 --- a/ompi/runtime/ompi_spc.c +++ b/ompi/runtime/ompi_spc.c @@ -144,6 +144,7 @@ static const ompi_spc_event_t ompi_spc_events_desc[OMPI_SPC_NUM_COUNTERS] = { SET_COUNTER_ARRAY(OMPI_SPC_TESTANY, "The number of times MPI_Testany was called.", false, false), SET_COUNTER_ARRAY(OMPI_SPC_TESTSOME, "The number of times MPI_Testsome was called.", false, false), SET_COUNTER_ARRAY(OMPI_SPC_TIME_ALLTOALL, "The number microseconds spent performing the MPI_Alltoall operation. Note: The timer used on the back end is in cycles, which could potentially be problematic on a system where the clock frequency can change. On such a system, this counter could be inaccurate since we assume a fixed clock rate.", false, true), + SET_COUNTER_ARRAY(OMPI_SPC_TIME_ALLTOALLV, "The number microseconds spent performing the MPI_Alltoallv operation. Note: The timer used on the back end is in cycles, which could potentially be problematic on a system where the clock frequency can change. 
On such a system, this counter could be inaccurate since we assume a fixed clock rate.", false, true),
     SET_COUNTER_ARRAY(OMPI_SPC_WAIT, "The number of times MPI_Wait was called.", false, false),
     SET_COUNTER_ARRAY(OMPI_SPC_WAITALL, "The number of times MPI_Waitall was called.", false, false),
     SET_COUNTER_ARRAY(OMPI_SPC_WAITANY, "The number of times MPI_Waitany was called.", false, false),
diff --git a/ompi/runtime/ompi_spc.h b/ompi/runtime/ompi_spc.h
index 59c05af7b75..9e475864f29 100644
--- a/ompi/runtime/ompi_spc.h
+++ b/ompi/runtime/ompi_spc.h
@@ -131,6 +131,7 @@ typedef enum ompi_spc_counters {
     OMPI_SPC_TESTANY,
     OMPI_SPC_TESTSOME,
     OMPI_SPC_TIME_ALLTOALL,
+    OMPI_SPC_TIME_ALLTOALLV,
     OMPI_SPC_WAIT,
     OMPI_SPC_WAITALL,
     OMPI_SPC_WAITANY,
From b4bd007b6933b07c3cb873a1e663827ab20f39a9 Mon Sep 17 00:00:00 2001
From: Thomas Naughton
Date: Tue, 29 Nov 2022 12:18:00 -0500
Subject: [PATCH 3/4] add ompi_spc_value_diff() SPC utility function

revise spc_diff utility fn, now returns converted microseconds instead of cycles, and resets any highwater events

Signed-off-by: Thomas Naughton
---
 ompi/runtime/ompi_spc.c | 67 +++++++++++++++++++++++++++++++++++++++++
 ompi/runtime/ompi_spc.h |  6 ++++
 2 files changed, 73 insertions(+)

diff --git a/ompi/runtime/ompi_spc.c b/ompi/runtime/ompi_spc.c
index e1fd046ed50..eb56d513009 100644
--- a/ompi/runtime/ompi_spc.c
+++ b/ompi/runtime/ompi_spc.c
@@ -399,6 +399,73 @@ static void ompi_spc_dump(void)
     ompi_spc_comm->c_coll->coll_barrier(ompi_spc_comm, ompi_spc_comm->c_coll->coll_barrier_module);
 }

+
+/*
+ * Read the current value of a named SPC and compare it with a prior value.
+ *
+ * Given a specific SPC name and prior value, we
+ * get the new value and return the difference between
+ * the prior and new values (diff = new - prev).
+ * If you do not care about the diff you can pass NULL for diff,
+ * and will simply get the new value in *cur_value.
+ *
+ * Note: The value of a timer event is converted to microseconds.
+ * Note: Any high-watermark event is reset to zero after being read.
+ *
+ * Returns MPI_SUCCESS; returns -1 if spc_name or cur_value is NULL or no counter has that name.
+ */
+int ompi_spc_value_diff(char *spc_name,
+                        long long prev_value,
+                        long long *cur_value,
+                        long long *diff)
+{
+    int i;
+    long long value = -1;
+    int found = 0;
+
+    /* NOTE(review): ompi_spc_events looks like a static array in ompi_spc.h, which would make its NULL test always false -- confirm */
+    if (NULL == ompi_spc_events || NULL == spc_name || NULL == cur_value) {
+        return -1;
+    }
+
+    /* Find the index of given SPC. */
+    for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) {
+
+        /* If this is our requested counter */
+        if( 0 == strcmp(ompi_spc_events_desc[i].counter_name, spc_name) ) {
+
+            value = (long long)ompi_spc_events[i].value;
+
+            /* If this is a timer-based counter, convert from cycles to microseconds */
+            if( ompi_spc_events[i].is_timer_event ) {
+                value = ompi_spc_cycles_to_usecs_internal(value);
+            }
+
+            /* If this is a high watermark counter, reset it after it has been read */
+            if( ompi_spc_events[i].is_high_watermark) {
+                ompi_spc_events[i].value = 0;
+            }
+
+            found = 1;
+            break;
+        }
+    }
+
+    if (found != 1) {
+        fprintf(stderr, "Error: Failed to find SPC counter '%s'\n", spc_name);
+        return -1;
+    }
+
+    *cur_value = value;
+
+    if (NULL != diff) {
+        *diff = value - prev_value;
+    }
+
+    return MPI_SUCCESS;
+}
+
+
 /* Frees any dynamically allocated OMPI SPC data structures */
 void ompi_spc_fini(void)
 {
diff --git a/ompi/runtime/ompi_spc.h b/ompi/runtime/ompi_spc.h
index 9e475864f29..7400df0772a 100644
--- a/ompi/runtime/ompi_spc.h
+++ b/ompi/runtime/ompi_spc.h
@@ -183,6 +183,7 @@ typedef struct ompi_spc_s{
 void ompi_spc_init(void);
 void ompi_spc_fini(void);
 void ompi_spc_cycles_to_usecs(opal_timer_t *cycles);
+int ompi_spc_value_diff(char *name, long long prev_value, long long *cur_value, long long *diff);

 /* An array of event structures to store the event data value, attachments, flags)
  * The memory is statically allocated to reduce the number of loads required.
@@ -214,6 +215,8 @@ ompi_spc_t ompi_spc_events[OMPI_SPC_NUM_COUNTERS] __opal_attribute_aligned__(siz

 #define SPC_UPDATE_WATERMARK(watermark_enum, value_enum) \
     ompi_spc_update_watermark(watermark_enum, value_enum)
+#define SPC_VALUE_DIFF(name, prev_value, cur_value, diff) \
+    ompi_spc_value_diff(name, prev_value, cur_value, diff)

 /* Records an update to a counter using an atomic add operation. */
 static inline
@@ -313,6 +316,9 @@ void ompi_spc_timer_stop(unsigned int event_id, opal_timer_t *cycles)

 #define SPC_UPDATE_WATERMARK(watermark_enum, value_enum) \
     ((void)0)
+#define SPC_VALUE_DIFF(name, prev_value, cur_value, diff) \
+    ((void)0)
+
 #endif

 #endif
From ef257b66d8cd74645b532325b2fc45cb40f73830 Mon Sep 17 00:00:00 2001
From: Thomas Naughton
Date: Tue, 29 Nov 2022 20:15:27 -0500
Subject: [PATCH 4/4] contrib: add SPC example (Alltoall looper)

Signed-off-by: Thomas Naughton
---
 contrib/spc-a2a-looper/README.md    |  45 +++++++
 contrib/spc-a2a-looper/a2a_looper.c | 193 ++++++++++++++++++++++++++++
 2 files changed, 238 insertions(+)
 create mode 100644 contrib/spc-a2a-looper/README.md
 create mode 100644 contrib/spc-a2a-looper/a2a_looper.c

diff --git a/contrib/spc-a2a-looper/README.md b/contrib/spc-a2a-looper/README.md
new file mode 100644
index 00000000000..55264ba1fb5
--- /dev/null
+++ b/contrib/spc-a2a-looper/README.md
@@ -0,0 +1,45 @@
+Example SPC Alltoall Looper
+---------------------------
+
+Simple example that loops over `MPI_Alltoall()`. It tests
+the `OMPI_SPC_TIME_ALLTOALL` counter. The test calculates
+the diff per-rank at the App level and shows this info each
+loop iteration (at all ranks).
+
+The counter accumulates and shows the full time for all Alltoall calls,
+but the per-rank view shows the diff per loop.
+
+Pre-reqs
+--------
+ - Patch with `OMPI_SPC_TIME_ALLTOALL` counter
+ - OMPI build with `--enable-spc`
+
+Usage
+-----
+
+```sh
+    mpirun -np $nprocs ./a2a_looper [N]
+
+    # (optional) arg1 - positive-integer for number of loops
+```
+
+Example
+-------
+
+Run for just 9 loops:
+
+```sh
+    mpirun \
+      -np 4 \
+      --mca mpi_spc_attach OMPI_SPC_TIME_ALLTOALL \
+      --mca mpi_spc_dump_enabled true \
+      ./a2a_looper 9
+```
+
+Notes
+-----
+ - Runs of 20 loops or fewer print every loop;
+   longer runs print every 10th loop.
+
+ - Initial SPC code bits adapted from `ompi/examples/spc_example.c`
+
diff --git a/contrib/spc-a2a-looper/a2a_looper.c b/contrib/spc-a2a-looper/a2a_looper.c
new file mode 100644
index 00000000000..c5f2ba01772
--- /dev/null
+++ b/contrib/spc-a2a-looper/a2a_looper.c
@@ -0,0 +1,193 @@
+/*
+ * Tue Nov 29 2022 Thomas Naughton
+ *
+ * Loops over MPI_Alltoall() 'MAX_NLOOP' times.
+ *
+ * Usage: mpirun -np $nprocs ./a2a_looper [N]
+ *
+ * Optional position-sensitive argument:
+ *   arg1 - positive-integer for number of loops
+ *
+ * If no argument is provided the program runs 100 loops (the MAX_NLOOP default).
+ *
+ * Note: Initial SPC code bits adapted from 'ompi/examples/spc_example.c'
+ *
+ * TJN: Modified to have only one counter (OMPI_SPC_TIME_ALLTOALL),
+ *      also we calculate the diff per-rank at the App level and show
+ *      this info each run (at all ranks).
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <mpi.h>
+
+/* Declared in ompi/runtime/ompi_spc.h; repeated here so the call below
+ * never relies on an implicit function declaration. */
+int ompi_spc_value_diff(char *name, long long prev_value, long long *cur_value, long long *diff);
+
+int MAX_NLOOP = 100;
+
+/*
+ * Runs MAX_NLOOP rounds of MPI_Alltoall() and, on every rank, prints the
+ * OMPI_SPC_TIME_ALLTOALL counter read through MPI_T together with the
+ * per-loop difference reported by ompi_spc_value_diff().
+ */
+int main (int argc, char **argv)
+{
+    int rank, size;
+    int *inbuf = NULL;
+    int *outbuf = NULL;
+    int i;
+    int nloop;
+    int exit_code = 0;
+
+    int rc;
+    int provided, num, name_len, desc_len, verbosity, bind, var_class, readonly, continuous, atomic, count;
+    char name[256], description[256];
+    MPI_Datatype datatype;
+    MPI_T_enum enumtype;
+    long long value = 0;
+    int found = 0;
+    int num_elem = 1024;
+    long long _time_alltoall_past_value = 0;
+
+    if (argc > 1) {
+        MAX_NLOOP = atoi(argv[1]);
+        if (MAX_NLOOP <= 0) {
+            fprintf(stderr, "Usage: %s [N]  (N: positive number of loops)\n", argv[0]);
+            return 1;
+        }
+    }
+
+    MPI_Init (&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    /* MPI_T name of the one SPC that every rank reads */
+    /* (See also: ompi_spc_counters_t for list) */
+    char *counter_name = "runtime_spc_OMPI_SPC_TIME_ALLTOALL";
+    MPI_T_pvar_handle handle;
+    MPI_T_pvar_session session;
+
+    MPI_T_init_thread(MPI_THREAD_SINGLE, &provided);
+
+    /* Determine the MPI_T pvar index of the OMPI_SPC_TIME_ALLTOALL SPC */
+    MPI_T_pvar_get_num(&num);
+
+    rc = MPI_T_pvar_session_create(&session);
+
+    for(i = 0; i < num; i++) {
+        name_len = desc_len = 256;
+        rc = MPI_T_pvar_get_info(i, name, &name_len, &verbosity,
+                &var_class, &datatype, &enumtype, description, &desc_len, &bind,
+                &readonly, &continuous, &atomic);
+        if( MPI_SUCCESS != rc )
+            continue;
+
+        if(strcmp(name, counter_name) == 0) {
+            /* Create the MPI_T handle for the counter and start the counter */
+            rc = MPI_T_pvar_handle_alloc(session, i, NULL, &handle, &count);
+            if( MPI_SUCCESS == rc ) {
+                /* The start result stays unchecked, as it was before */
+                MPI_T_pvar_start(session, handle);
+                found = 1;
+            }
+            /* Stop at the first match so only one handle is ever allocated */
+            break;
+        }
+    }
+
+    /* Make sure we found the counter */
+    if(found == 0) {
+        fprintf(stderr, "ERROR: Couldn't find the appropriate SPC counter in the MPI_T pvars.\n");
+        MPI_Abort(MPI_COMM_WORLD, -1);
+    }
+
+    /* A local allocation failure aborts the whole job: leaving quietly would
+     * strand the other ranks in their next collective call. */
+    inbuf = malloc( (size_t)size * num_elem * sizeof(int) );
+    if (NULL == inbuf) {
+        fprintf(stderr, "Error: malloc failed (inbuf)\n");
+        MPI_Abort(MPI_COMM_WORLD, -1);
+    }
+
+    outbuf = malloc( (size_t)size * num_elem * sizeof(int) );
+    if (NULL == outbuf) {
+        fprintf(stderr, "Error: malloc failed (outbuf)\n");
+        MPI_Abort(MPI_COMM_WORLD, -1);
+    }
+
+    for (i=0; i < size * num_elem; i++) {
+        inbuf[i] = 100 + rank;
+        outbuf[i] = 0;
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    for (nloop=0; nloop < MAX_NLOOP; nloop++) {
+        long long tmp_max;
+        int global_rc;
+        long long new_value = 0;
+        long long diff = 0;
+
+        MPI_Barrier(MPI_COMM_WORLD);
+        fflush(NULL);
+
+        rc = MPI_Alltoall(inbuf, num_elem, MPI_INT, outbuf, num_elem, MPI_INT, MPI_COMM_WORLD);
+
+        /* Check if alltoall had any problems.  Every rank holds the same
+         * reduced status, so every rank takes the same exit path; a rank
+         * leaving alone would strand the others in the next collective. */
+        MPI_Allreduce( &rc, &global_rc, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD );
+        if (global_rc != 0) {
+            if (rank == 0) {
+                fprintf(stderr, "Error: Alltoall failed! (rc=%d)\n", global_rc);
+            }
+            exit_code = 1;
+            goto cleanup;
+        }
+
+        MPI_T_pvar_read(session, handle, &value);
+        MPI_Allreduce(&value, &tmp_max, 1, MPI_LONG_LONG, MPI_MAX, MPI_COMM_WORLD);
+
+        rc = ompi_spc_value_diff("OMPI_SPC_TIME_ALLTOALL",
+                                 _time_alltoall_past_value,
+                                 &new_value,
+                                 &diff);
+
+        MPI_Barrier(MPI_COMM_WORLD);
+
+        if ((MAX_NLOOP <= 20) || ( !(nloop % 10) )) {
+            int usecs = 250000; /* 0.25 sec */
+
+            printf("%12s: Rank: %5d Size: %5d Loop: %8d %s: %lld max: %lld prev_value: %lld new_value: %lld diff: %lld -- SLEEP: %dus\n",
+                   "a2a_looper", rank, size, nloop, counter_name, value, tmp_max, _time_alltoall_past_value, new_value, diff, usecs);
+            usleep(usecs);
+        }
+
+        _time_alltoall_past_value = new_value;
+
+        fflush(NULL);
+        MPI_Barrier(MPI_COMM_WORLD);
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+cleanup:
+    /* Stop the counter, free the handle, and then free the session.  The
+     * label sits before these calls so the error path releases them too. */
+    MPI_T_pvar_stop(session, handle);
+    MPI_T_pvar_handle_free(session, &handle);
+    MPI_T_pvar_session_free(&session);
+
+    free(inbuf);
+    free(outbuf);
+
+    MPI_T_finalize();
+    MPI_Finalize();
+
+    return exit_code;
+}