From dc761bb7bd774acc8e265b4be1f4cb0125f52237 Mon Sep 17 00:00:00 2001 From: Thomas Naughton Date: Thu, 15 Oct 2020 15:56:47 -0400 Subject: [PATCH 1/5] add timer 'OMPI_SPC_TIME_ALLTOALL' for alltoall Signed-off-by: Thomas Naughton --- ompi/mpi/c/alltoall.c | 6 ++++++ ompi/runtime/ompi_spc.c | 2 ++ ompi/runtime/ompi_spc.h | 1 + 3 files changed, 9 insertions(+) diff --git a/ompi/mpi/c/alltoall.c b/ompi/mpi/c/alltoall.c index 303b82a2e81..41030f9470a 100644 --- a/ompi/mpi/c/alltoall.c +++ b/ompi/mpi/c/alltoall.c @@ -51,6 +51,7 @@ int MPI_Alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, { int err; size_t recvtype_size; + opal_timer_t timer = 0; /* SPC */ SPC_RECORD(OMPI_SPC_ALLTOALL, 1); @@ -106,10 +107,15 @@ int MPI_Alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, OPAL_CR_ENTER_LIBRARY(); + SPC_TIMER_START(OMPI_SPC_TIME_ALLTOALL, &timer); + /* Invoke the coll component to perform the back-end operation */ err = comm->c_coll->coll_alltoall(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm, comm->c_coll->coll_alltoall_module); + + SPC_TIMER_STOP(OMPI_SPC_TIME_ALLTOALL, &timer); + OMPI_ERRHANDLER_RETURN(err, comm, err, FUNC_NAME); } diff --git a/ompi/runtime/ompi_spc.c b/ompi/runtime/ompi_spc.c index 099934c658c..d4e70165bac 100644 --- a/ompi/runtime/ompi_spc.c +++ b/ompi/runtime/ompi_spc.c @@ -120,6 +120,7 @@ static ompi_spc_event_t ompi_spc_events_names[OMPI_SPC_NUM_COUNTERS] = { SET_COUNTER_ARRAY(OMPI_SPC_TESTALL, "The number of times MPI_Testall was called."), SET_COUNTER_ARRAY(OMPI_SPC_TESTANY, "The number of times MPI_Testany was called."), SET_COUNTER_ARRAY(OMPI_SPC_TESTSOME, "The number of times MPI_Testsome was called."), + SET_COUNTER_ARRAY(OMPI_SPC_TIME_ALLTOALL, "The number microseconds spent performing the MPI_Alltoall operation. Note: The timer used on the back end is in cycles, which could potentially be problematic on a system where the clock frequency can change. 
On such a system, this counter could be inaccurate since we assume a fixed clock rate."), SET_COUNTER_ARRAY(OMPI_SPC_WAIT, "The number of times MPI_Wait was called."), SET_COUNTER_ARRAY(OMPI_SPC_WAITALL, "The number of times MPI_Waitall was called."), SET_COUNTER_ARRAY(OMPI_SPC_WAITANY, "The number of times MPI_Waitany was called."), @@ -331,6 +332,7 @@ void ompi_spc_init(void) /* If this is a timer event, set the corresponding timer_event entry */ SET_SPC_BIT(ompi_spc_timer_event, OMPI_SPC_MATCH_TIME); + SET_SPC_BIT(ompi_spc_timer_event, OMPI_SPC_TIME_ALLTOALL); opal_argv_free(arg_strings); } diff --git a/ompi/runtime/ompi_spc.h b/ompi/runtime/ompi_spc.h index 5d040511c34..5ba88acc6cc 100644 --- a/ompi/runtime/ompi_spc.h +++ b/ompi/runtime/ompi_spc.h @@ -138,6 +138,7 @@ typedef enum ompi_spc_counters { OMPI_SPC_TESTALL, OMPI_SPC_TESTANY, OMPI_SPC_TESTSOME, + OMPI_SPC_TIME_ALLTOALL, OMPI_SPC_WAIT, OMPI_SPC_WAITALL, OMPI_SPC_WAITANY, From b32119e2369d6cc261428d6fa9140573fe037d7a Mon Sep 17 00:00:00 2001 From: Thomas Naughton Date: Mon, 26 Oct 2020 16:33:18 -0400 Subject: [PATCH 2/5] add MCA params for congestion stuff Example: - `--mca coll_tuned_alltoall_congest_algorithm 3` - `--mca coll_tuned_alltoall_congest_threshold 200` Signed-off-by: Thomas Naughton --- ompi/mca/coll/tuned/coll_tuned.h | 4 ++++ ompi/mca/coll/tuned/coll_tuned_component.c | 24 ++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/ompi/mca/coll/tuned/coll_tuned.h b/ompi/mca/coll/tuned/coll_tuned.h index e4d66cc6004..4cb9d5d8131 100644 --- a/ompi/mca/coll/tuned/coll_tuned.h +++ b/ompi/mca/coll/tuned/coll_tuned.h @@ -47,6 +47,10 @@ extern int ompi_coll_tuned_scatter_large_msg; extern int ompi_coll_tuned_scatter_min_procs; extern int ompi_coll_tuned_scatter_blocking_send_ratio; +/* Congestion variables */ +extern int ompi_coll_tuned_alltoall_congest_algorithm; +extern int ompi_coll_tuned_alltoall_congest_threshold; + /* forced algorithm choices */ /* this structure is for storing 
the indexes to the forced algorithm mca params... */ /* we get these at component query (so that registered values appear in ompi_infoi) */ diff --git a/ompi/mca/coll/tuned/coll_tuned_component.c b/ompi/mca/coll/tuned/coll_tuned_component.c index 7f6764d5f98..a572cdbc7df 100644 --- a/ompi/mca/coll/tuned/coll_tuned_component.c +++ b/ompi/mca/coll/tuned/coll_tuned_component.c @@ -65,6 +65,12 @@ int ompi_coll_tuned_alltoall_large_msg = 3000; int ompi_coll_tuned_alltoall_min_procs = 0; /* disable by default */ int ompi_coll_tuned_alltoall_max_requests = 0; /* no limit for alltoall by default */ + +/* Congestion variables */ +int ompi_coll_tuned_alltoall_congest_algorithm = 3; /* Default algo during congestion */ +int ompi_coll_tuned_alltoall_congest_threshold = 100; /* Threshold to decide on congestion */ + + /* Disable by default */ int ompi_coll_tuned_scatter_intermediate_msg = 0; int ompi_coll_tuned_scatter_large_msg = 0; @@ -191,6 +197,24 @@ static int tuned_register(void) MCA_BASE_VAR_SCOPE_READONLY, &ompi_coll_tuned_dynamic_rules_filename); + ompi_coll_tuned_alltoall_congest_threshold = 100; + (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, + "alltoall_congest_threshold", + "Threshold (if SPCs enabled) to decide if congestion present in alltoall algorithm (integer: num cycles diff)", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_coll_tuned_alltoall_congest_threshold); + + ompi_coll_tuned_alltoall_congest_algorithm = 3; + (void) mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version, + "alltoall_congest_algorithm", + "Algorithm to use when congestion is present in alltoall operation (integer: 0-5)", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_coll_tuned_alltoall_congest_algorithm); + /* register forced params */ ompi_coll_tuned_allreduce_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLREDUCE]); 
ompi_coll_tuned_alltoall_intra_check_forced_init(&ompi_coll_tuned_forced_params[ALLTOALL]); From 35c4d6041fe49cd13db599f7fea6e9ae3c83dd36 Mon Sep 17 00:00:00 2001 From: Thomas Naughton Date: Mon, 26 Oct 2020 16:44:27 -0400 Subject: [PATCH 3/5] add isCongested/get_congest_algo/ompi_spc_value_diff bits Signed-off-by: Thomas Naughton --- ompi/mca/coll/tuned/coll_tuned.h | 4 + .../coll/tuned/coll_tuned_alltoall_decision.c | 25 +++ ompi/mca/coll/tuned/coll_tuned_module.c | 160 ++++++++++++++++++ ompi/runtime/ompi_spc.c | 147 ++++++++++++++++ ompi/runtime/ompi_spc.h | 2 + 5 files changed, 338 insertions(+) diff --git a/ompi/mca/coll/tuned/coll_tuned.h b/ompi/mca/coll/tuned/coll_tuned.h index 4cb9d5d8131..ee7f0b2d910 100644 --- a/ompi/mca/coll/tuned/coll_tuned.h +++ b/ompi/mca/coll/tuned/coll_tuned.h @@ -188,6 +188,10 @@ int ompi_coll_tuned_scan_intra_check_forced_init (coll_tuned_force_algorithm_mca int mca_coll_tuned_ft_event(int state); +/* Congestion functions */ +int ompi_coll_tuned_get_congest_algo(void); +int ompi_coll_tuned_isCongested(struct ompi_communicator_t *comm); + struct mca_coll_tuned_component_t { /** Base coll component */ mca_coll_base_component_2_0_0_t super; diff --git a/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c b/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c index b63037e1237..094fcb39f82 100644 --- a/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_alltoall_decision.c @@ -161,9 +161,34 @@ int ompi_coll_tuned_alltoall_intra_do_this(const void *sbuf, int scount, int algorithm, int faninout, int segsize, int max_requests) { + + /* + * TJN: Check if congested, and if so change the algorithm. + * TODO: Should likely make this coll-type specific, + * i.e., ompi_coll_tuned_alltoall_isCongested() + * They can share code, but likely different heuristics + * for different coll types. 
+ */ + if ( ompi_coll_tuned_isCongested(comm) ) { + int new_alg; + int comm_rank = 0; + comm_rank = ompi_comm_rank(comm); + + new_alg = ompi_coll_tuned_get_congest_algo(); + + if (new_alg >= 0) { + OPAL_OUTPUT((ompi_coll_tuned_stream, " # (Rank %d) DBG: intra_do_this CONGESTED OVERRIDE algorithm = %d with new_alg = %d\n", comm_rank, algorithm, new_alg)); + //fprintf(stderr, " # (Rank %d) DBG: intra_do_this CONGESTED OVERRIDE algorithm = %d with new_alg = %d\n", comm_rank, algorithm, new_alg); + algorithm = new_alg; + } + } + OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_do_this selected algorithm %d topo faninout %d segsize %d", algorithm, faninout, segsize)); + //fprintf(stderr, " # (Rank %d) DBG: coll:tuned:alltoall_intra_do_this selected algorithm %d topo faninout %d segsize %d\n", + // comm_rank, algorithm, faninout, segsize); + switch (algorithm) { case (0): return ompi_coll_tuned_alltoall_intra_dec_fixed(sbuf, scount, sdtype, rbuf, rcount, rdtype, comm, module); diff --git a/ompi/mca/coll/tuned/coll_tuned_module.c b/ompi/mca/coll/tuned/coll_tuned_module.c index bf2c7da1434..d39d8b44a36 100644 --- a/ompi/mca/coll/tuned/coll_tuned_module.c +++ b/ompi/mca/coll/tuned/coll_tuned_module.c @@ -27,6 +27,7 @@ #include "mpi.h" #include "ompi/communicator/communicator.h" +#include "ompi/runtime/ompi_spc.h" #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/base.h" #include "ompi/mca/coll/base/coll_base_topo.h" @@ -119,6 +120,141 @@ ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority) /* We put all routines that handle the MCA user forced algorithm and parameter choices here */ /* recheck the setting of forced, called on module create (i.e. for each new comm) */ + +/* Congestion - past value for alltoall */ +static int _congest_spc_time_alltoall_past_value = 0; + + + +/* + * Congestion - detection function + * + * TJN: This is a very simplistic congestion detection method. 
+ * It has several flaws, but shows a basic threshold-based + * method for determining "congested". + * The threshold can be adjusted via an MCA parameter. + * There are also a few magic envvars to override things + * for testing. + * + * NOTE: This is specific to Alltoall for now. + * + * - MCA: coll_tuned_alltoall_congest_threshold + * The threshold at which point we decide the difference + * in current/past value of SPC indicates "congested". + * + * - EnvVar: 'OMPIX_SKIP_CONGESTED_ALLREDUCE' + * This will not detect congestion, because we only do a + * local check and need to have consensus across the comm. + * This is just a way to show the difference at each + * rank, but always returns "not-congested" overall. + * + * - EnvVar: 'OMPIX_FORCE_CONGESTED' + * This is just a hardcoded flag to force the congestion + * check to return true regardless of the actual status + * of the network. + */ +int +ompi_coll_tuned_isCongested(struct ompi_communicator_t *comm) +{ + long long new_value = 0; + long long diff = 0; + int rc; + int comm_rank; + long long diff_max = 0; + + /* Get our local congestion info */ + rc = ompi_spc_value_diff("OMPI_SPC_TIME_ALLTOALL", + _congest_spc_time_alltoall_past_value, + &new_value, + &diff); + if (0 != rc) { + return 0; /* Ignore error for now (treat as not congested) */ + } + + comm_rank = ompi_comm_rank(comm); + + OPAL_OUTPUT((ompi_coll_tuned_stream, " #-- DBG: (Rank %d) MY-CONGESTION-INFO (thresh=%d, past_value=%d, new_value=%d, diff=%d)\n", comm_rank, ompi_coll_tuned_alltoall_congest_threshold, _congest_spc_time_alltoall_past_value, new_value, diff)); + fprintf(stderr, " #-- DBG: (Rank %d) MY-CONGESTION-INFO (thresh=%d, past_value=%d, new_value=%d, diff=%d)\n", comm_rank, ompi_coll_tuned_alltoall_congest_threshold, _congest_spc_time_alltoall_past_value, new_value, diff); + + _congest_spc_time_alltoall_past_value = new_value; + + /* + * TJN: Skip the allreduce and do *only* the local diff check, + * but in this case we will not adjust the
algorithm, just + * report the difference and move on. + */ + if (NULL != getenv("OMPIX_SKIP_CONGESTED_ALLREDUCE")) { + + /* diff: (local-only) Decide how different from past */ + if ((0 != diff) && (diff > ompi_coll_tuned_alltoall_congest_threshold)) { + OPAL_OUTPUT((ompi_coll_tuned_stream, " #-- DBG: (Rank %d) LOCAL-ONLY CONGESTION SKIP-ALLREDUCE (thresh=%d, new_value=%d, diff=%d)!\n", comm_rank, ompi_coll_tuned_alltoall_congest_threshold, new_value, diff)); + fprintf(stderr, " #-- DBG: (Rank %d) LOCAL-ONLY CONGESTION SKIP-ALLREDUCE (thresh=%d, new_value=%d, diff=%d)!\n", comm_rank, ompi_coll_tuned_alltoall_congest_threshold, new_value, diff); + + return 0; /* Always return 'not congested' for this case */ + } + + } else { + comm_rank = ompi_comm_rank(comm); + + /* + * Aggregate all of the information using MPI_Allreduce(MAX) + * on diff value to see if any rank in comm exceeded the + * max threshold. + * + * TODO TJN: Change this when we add congestion checks for MPI_Reduce()! + */ + (void)comm->c_coll->coll_allreduce(&diff, &diff_max, + 1, MPI_LONG_LONG, MPI_MAX, + comm, + comm->c_coll->coll_allreduce_module); + + (void)comm->c_coll->coll_barrier(comm, comm->c_coll->coll_barrier_module); + + /* diff_max: (global max) Decide how different from past */ + if ((0 != diff_max) && (diff_max > ompi_coll_tuned_alltoall_congest_threshold)) { + OPAL_OUTPUT((ompi_coll_tuned_stream, " #-- DBG: (Rank %d) EXCEED CONGESTION THRESHOLD -- CONGESTED (thresh=%d, new_value=%d, diff=%d, diff_max=%d)!\n", comm_rank, ompi_coll_tuned_alltoall_congest_threshold, new_value, diff, diff_max)); + //fprintf(stderr, " #-- DBG: (Rank %d) EXCEED CONGESTION THRESHOLD -- CONGESTED (thresh=%d, new_value=%d, diff=%d, diff_max=%d)!\n", comm_rank, ompi_coll_tuned_alltoall_congest_threshold, new_value, diff, diff_max); + return 1; /* Yes congested */ + } + } + +#if 1 + /* XXX: for now if have env var set we call it congested */ + if (NULL != getenv("OMPIX_FORCE_CONGESTED")) { + fprintf(stderr, " #-- 
DBG: TJN_HACK_CONGESTED CONGESTION FORCED -- CONGESTED!\n"); + return 1; /* Yes congested */ + } +#endif + + return 0; /* Not congested */ +} + +/* + * Congestion - get algorithm to use when congested + * + * TJN: This is a very simplistic method to return the registered + * default algorithm to use when congestion is detected. + * This is set via an MCA parameter, which can be overridden + * at runtime. + * + * NOTE: This is specific to Alltoall for now. + * + * - MCA: coll_tuned_alltoall_congest_algorithm + * The alltoall algorithm to use when we detect + * congestion, i.e., ompi_coll_tuned_isCongested() is true. + */ +int +ompi_coll_tuned_get_congest_algo(void) +{ + int alg = -1; + + /* TODO: Should check this is a valid alltoall algo */ + alg = ompi_coll_tuned_alltoall_congest_algorithm; + + return (alg); +} + + static int ompi_coll_tuned_forced_getvalues( enum COLLTYPE type, coll_tuned_force_algorithm_params_t *forced_values ) @@ -136,6 +272,30 @@ ompi_coll_tuned_forced_getvalues( enum COLLTYPE type, mca_base_var_get_value(mca_params->algorithm_param_index, &tmp, NULL, NULL); forced_values->algorithm = tmp ?
tmp[0] : 0; +#if 0 + /* Congestion stuff (likely cut this) */ + /* TJN: We are only changing the algorithm for ALLTOALL (if congested) */ + if( ALLTOALL == type ) { + + //fprintf(stderr, " #-- DBG: FORCED_GETVALUES (cur) algorithm = %d\n", + // forced_values->algorithm); + + if ( ompi_coll_tuned_isCongested() ) { + int new_alg; + new_alg = ompi_coll_tuned_get_congest_algo(); + if (new_alg >= 0) { + forced_values->algorithm = new_alg; + + //fprintf(stderr, " #-- DBG: HACK OVERRIDE ALLTOALL FORCED_GETVALUES algorithm = %d (new_alg=%d)\n", + // forced_values->algorithm, new_alg); + } + } + + //fprintf(stderr, " #-- DBG: FORCED_GETVALUES (new) algorithm = %d\n", + // forced_values->algorithm); + } +#endif + if( BARRIER != type ) { mca_base_var_get_value(mca_params->segsize_param_index, &tmp, NULL, NULL); if (tmp) forced_values->segsize = tmp[0]; diff --git a/ompi/runtime/ompi_spc.c b/ompi/runtime/ompi_spc.c index d4e70165bac..bbe114d3e66 100644 --- a/ompi/runtime/ompi_spc.c +++ b/ompi/runtime/ompi_spc.c @@ -21,6 +21,7 @@ static opal_timer_t sys_clock_freq_mhz = 0; static void ompi_spc_dump(void); +//static void ompi_spc_dump_diff(char *spc_name, long long spc_prev_value); /* Array for converting from SPC indices to MPI_T indices */ static bool mpi_t_enabled = false; @@ -405,11 +406,157 @@ static void ompi_spc_dump(void) ompi_spc_comm->c_coll->coll_barrier(ompi_spc_comm, ompi_spc_comm->c_coll->coll_barrier_module); } + +/* + * Congestion - helper function for checking diff w/ SPCs + * + * Given a specific SPC name and prior value, we + * get the new value and return the difference between + * the prior and new values (diff = new - prev). + * If you do not care about the diff, you can pass NULL for spc_diff + * and will simply get the new value in spc_new_value. + * + * Note: Timer counters are divided by sys_clock_freq_mhz (cycles -> usecs). + * + * On success, return MPI_SUCCESS, otherwise return -1.
+ */ +int ompi_spc_value_diff(char *spc_name, + long long spc_prev_value, + long long *spc_new_value, + long long *spc_diff) +{ + int i; + long long value = -1; + int found = 0; + + if (NULL == ompi_spc_events) { + //fprintf(stderr, " #-- DBG: WARN: SPC system not available\n"); + return -1; + } + + /* Find the index of given SPC. */ + for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) { + if( 0 == strcmp(ompi_spc_events[i].name, spc_name) ) { + + //OPAL_THREAD_LOCK(&_spc_mutex); + + /* + * TJN: Not using SPC_CYCLES_TO_USECS() macro b/c it + * appears to have side-effects. :-/ + */ + if( IS_SPC_BIT_SET(ompi_spc_timer_event, i) ) { + value = (long long)ompi_spc_events[i].value; + //fprintf(stderr, " #-- DBG: %s (tmp) value = %d sys_clock_freq_mhz = %d\n", spc_name, value, sys_clock_freq_mhz); + value = value / sys_clock_freq_mhz; + } + + //fprintf(stderr, " #-- DBG: %s value = %d\n", spc_name, value); + + //OPAL_THREAD_UNLOCK(&_spc_mutex); + + found = 1; + break; + } + } + + if (found != 1) { + printf("Error: Failed to find SPC counter '%s'\n", spc_name); + return -1; + } + + *spc_new_value = value; + + if (NULL != spc_diff) { + *spc_diff = value - spc_prev_value; + } + + return MPI_SUCCESS; +} + +#if 0 +/* + * Congestion - helper function for dumping diff SPC at all ranks. + * + * Gathers a given SPC data onto rank 0 of MPI_COMM_WORLD + * and compare with previous counter value. Showing diff to stdout. + */ +static void ompi_spc_dump_diff(char *spc_name, long long spc_prev_value) +{ + int i, j, world_size, offset; + long long *recv_buffer = NULL; + long long send_data; + int index = -1; + + int rank = ompi_comm_rank(ompi_spc_comm); + world_size = ompi_comm_size(ompi_spc_comm); + + /* Find the index of given SPC. 
*/ + for(i = 0; i < OMPI_SPC_NUM_COUNTERS; i++) { + if (0 == strcmp(spc_name, ompi_spc_events[i].name)) { + //printf("DBG: spc[%d].name = %s\n", i, ompi_spc_events[i].name); + index = i; + /* If time-based counter: Convert from cycles to usecs */ + /* XXX: TJN: If called again (after ompi_spc_dump(), will + * clobber data as SPC_CYCLES_TO_USECS() writes back + * to value again. And we get another divide-by small + * result (usually results in value being 0). + */ +// if( IS_SPC_BIT_SET(ompi_spc_timer_event, i) ) { +// SPC_CYCLES_TO_USECS(&ompi_spc_events[i].value); +// } + break; + } + } + + if (index == -1) { + printf("Error: Failed to find SPC counter '%s'\n", spc_name); + return; + } + + send_data = (long long)ompi_spc_events[index].value; + + //opal_output(0, "(%d) DBG: send_data = %lld\n", rank, send_data); + + if( 0 == rank ) { + recv_buffer = (long long*)malloc(world_size * 1 * sizeof(long long)); + if (NULL == recv_buffer) { + opal_show_help("help-mpi-runtime.txt", "lib-call-fail", true, + "malloc", __FILE__, __LINE__); + return; + } + } + (void)ompi_spc_comm->c_coll->coll_gather(&send_data, 1, MPI_LONG_LONG, + recv_buffer, 1, MPI_LONG_LONG, + 0, ompi_spc_comm, + ompi_spc_comm->c_coll->coll_gather_module); + + /* Once rank 0 has the information, compare and print the diff for each rank in order */ + if(rank == 0) { + opal_output(0, "Open MPI Software-based Performance Counter Diff:\n"); + for(j = 0; j < world_size; j++) { + opal_output(0, "MPI_COMM_WORLD Rank %d:\n", j); + + opal_output(0, "%s: %lld (previous) -> %lld (current)\n", + ompi_spc_events[index].name, + spc_prev_value, + recv_buffer[j]); + } + printf("###########################################################################\n"); + + if (NULL != recv_buffer) + free(recv_buffer); + } + + ompi_spc_comm->c_coll->coll_barrier(ompi_spc_comm, ompi_spc_comm->c_coll->coll_barrier_module); +} +#endif + /* Frees any dynamically alocated OMPI SPC data structures */ void ompi_spc_fini(void) { if (SPC_ENABLE 
== 1 && ompi_mpi_spc_dump_enabled) { ompi_spc_dump(); + //ompi_spc_dump_diff("OMPI_SPC_TIME_ALLTOALL", 0.0); } free(ompi_spc_events); ompi_spc_events = NULL; diff --git a/ompi/runtime/ompi_spc.h b/ompi/runtime/ompi_spc.h index 5ba88acc6cc..66b304fc7a7 100644 --- a/ompi/runtime/ompi_spc.h +++ b/ompi/runtime/ompi_spc.h @@ -188,6 +188,8 @@ void ompi_spc_user_or_mpi(int tag, ompi_spc_value_t value, unsigned int user_enu void ompi_spc_cycles_to_usecs(ompi_spc_value_t *cycles); void ompi_spc_update_watermark(unsigned int watermark_enum, unsigned int value_enum); +int ompi_spc_value_diff(char *spc_name, long long spc_prev_value, long long *spc_new_value, long long *spc_diff); + /* Macros for using the SPC utility functions throughout the codebase. * If SPC_ENABLE is not 1, the macros become no-ops. */ From 687c05f53ec0d743957aa8c4e4e0703a2841c859 Mon Sep 17 00:00:00 2001 From: Thomas Naughton Date: Tue, 27 Oct 2020 09:36:31 -0400 Subject: [PATCH 4/5] congest output tweaks & env OMPIX_SHOW_CONGEST_INFO Signed-off-by: Thomas Naughton --- ompi/mca/coll/tuned/coll_tuned_module.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ompi/mca/coll/tuned/coll_tuned_module.c b/ompi/mca/coll/tuned/coll_tuned_module.c index d39d8b44a36..829827b8281 100644 --- a/ompi/mca/coll/tuned/coll_tuned_module.c +++ b/ompi/mca/coll/tuned/coll_tuned_module.c @@ -162,6 +162,8 @@ ompi_coll_tuned_isCongested(struct ompi_communicator_t *comm) int comm_rank; long long diff_max = 0; + OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned: alltoall_congest_threshold = %d\n", ompi_coll_tuned_alltoall_congest_threshold)); + /* Get our local congestion info */ rc = ompi_spc_value_diff("OMPI_SPC_TIME_ALLTOALL", _congest_spc_time_alltoall_past_value, @@ -174,7 +176,10 @@ ompi_coll_tuned_isCongested(struct ompi_communicator_t *comm) comm_rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_tuned_stream, " #-- DBG: (Rank %d) MY-CONGESTION-INFO (thresh=%d, past_value=%d, new_value=%d, 
diff=%d)\n", comm_rank, ompi_coll_tuned_alltoall_congest_threshold, _congest_spc_time_alltoall_past_value, new_value, diff)); - fprintf(stderr, " #-- DBG: (Rank %d) MY-CONGESTION-INFO (thresh=%d, past_value=%d, new_value=%d, diff=%d)\n", comm_rank, ompi_coll_tuned_alltoall_congest_threshold, _congest_spc_time_alltoall_past_value, new_value, diff); + /* TJN: quick hack to see congest diff info per rank (w/o full verbose) */ + if (NULL != getenv("OMPIX_SHOW_CONGEST_INFO")) { + fprintf(stderr, " #-- DBG: (Rank %d) MY-CONGESTION-INFO (thresh=%d, past_value=%d, new_value=%d, diff=%d)\n", comm_rank, ompi_coll_tuned_alltoall_congest_threshold, _congest_spc_time_alltoall_past_value, new_value, diff); + } _congest_spc_time_alltoall_past_value = new_value; @@ -251,6 +256,8 @@ ompi_coll_tuned_get_congest_algo(void) /* TODO: Should check this is a valid alltoall algo */ alg = ompi_coll_tuned_alltoall_congest_algorithm; + OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned: alltoall_congest_algorithm = %d\n", alg)); + return (alg); } From f35cb12c569985c176d23e52f90a5430026b28b3 Mon Sep 17 00:00:00 2001 From: Thomas Naughton Date: Tue, 27 Oct 2020 11:08:00 -0400 Subject: [PATCH 5/5] avoid false-positive congest on first check Signed-off-by: Thomas Naughton --- ompi/mca/coll/tuned/coll_tuned_module.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ompi/mca/coll/tuned/coll_tuned_module.c b/ompi/mca/coll/tuned/coll_tuned_module.c index 829827b8281..61c89cfd948 100644 --- a/ompi/mca/coll/tuned/coll_tuned_module.c +++ b/ompi/mca/coll/tuned/coll_tuned_module.c @@ -181,6 +181,14 @@ ompi_coll_tuned_isCongested(struct ompi_communicator_t *comm) fprintf(stderr, " #-- DBG: (Rank %d) MY-CONGESTION-INFO (thresh=%d, past_value=%d, new_value=%d, diff=%d)\n", comm_rank, ompi_coll_tuned_alltoall_congest_threshold, _congest_spc_time_alltoall_past_value, new_value, diff); } + /* Check if this is our first measurement */ + if ((diff == new_value) && (0 == 
_congest_spc_time_alltoall_past_value)) { + + _congest_spc_time_alltoall_past_value = new_value; + + return 0; /* Ignore first measurement as not congested */ + } + _congest_spc_time_alltoall_past_value = new_value; /*