Skip to content

Commit 6509f60

Browse files
author
Ralph Castain
committed
Complete the memprobe support. This provides a new scaling tool called "mpi_memprobe" that samples the memory footprint of the local daemon and the client procs, and then reports the results. The output contains the footprint of the daemon on each node, plus the average footprint of the client procs on that node.
Samples are taken after MPI_Init, and then again after MPI_Barrier. This allows the user to see memory consumption caused by add_procs, as well as any modex contribution from forming connections if pmix_base_async_modex is given.

Using the probe simply involves executing it via mpirun, with however many copies you want per node. Example:

$ mpirun -npernode 2 ./mpi_memprobe
Sampling memory usage after MPI_Init
Data for node rhc001
    Daemon: 12.483398
    Client: 6.514648

Data for node rhc002
    Daemon: 11.865234
    Client: 4.643555

Sampling memory usage after MPI_Barrier
Data for node rhc001
    Daemon: 12.520508
    Client: 6.576660

Data for node rhc002
    Daemon: 11.879883
    Client: 4.703125

Note that the client value on node rhc001 is larger - this is where rank=0 is housed, and apparently it gets a larger footprint for some reason.

Signed-off-by: Ralph Castain <[email protected]>
1 parent b4088c3 commit 6509f60

File tree

19 files changed

+468
-175
lines changed

19 files changed

+468
-175
lines changed

contrib/scaling/Makefile

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,14 @@ all: $(PROGS)
44

55
CFLAGS = -O
66

7-
orte_no_op:
7+
orte_no_op: orte_no_op.c
88
ortecc -o orte_no_op orte_no_op.c
99

10-
mpi_no_op:
10+
mpi_no_op: mpi_no_op.c
1111
mpicc -o mpi_no_op mpi_no_op.c
1212

13-
mpi_memprobe:
14-
mpicc -o mpi_memprobe mpi_memprobe.c -lopen-pal
13+
mpi_memprobe: mpi_memprobe.c
14+
mpicc -o mpi_memprobe mpi_memprobe.c -lopen-pal -lopen-rte
1515

1616
clean:
1717
rm -f $(PROGS) *~

contrib/scaling/mpi_memprobe.c

Lines changed: 125 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
#include "orte/mca/errmgr/errmgr.h"
1818

1919
static int rank, size;
20-
static volatile int active;
2120
static volatile bool wait_for_release = true;
2221
#define MEMPROBE_RELEASE 12345
2322

@@ -27,7 +26,6 @@ static void _release_fn(int status,
2726
opal_pmix_notification_complete_fn_t cbfunc,
2827
void *cbdata)
2928
{
30-
fprintf(stderr, "Rank %d: Release recvd\n", rank);
3129
/* must let the notifier know we are done */
3230
if (NULL != cbfunc) {
3331
cbfunc(OPAL_ERR_HANDLERS_COMPLETE, NULL, NULL, NULL, cbdata);
@@ -58,7 +56,6 @@ static void qcbfunc(int status,
5856
opal_list_t *results = (opal_list_t*)cbdata;
5957
opal_value_t *kv;
6058

61-
fprintf(stderr, "Rank %d: Query returned status %d\n", rank, status);
6259
if (NULL != info) {
6360
while (NULL != (kv = (opal_value_t*)opal_list_remove_first(info))) {
6461
opal_list_append(results, &kv->super);
@@ -70,61 +67,90 @@ static void qcbfunc(int status,
7067
wait_for_release = false;
7168
}
7269

73-
int main(int argc, char* argv[])
70+
static void notifycbfunc(int status, void *cbdata)
7471
{
75-
opal_list_t *codes;
76-
opal_value_t *kv;
77-
opal_pmix_query_t *q;
78-
opal_list_t query, response;
79-
80-
MPI_Init(&argc, &argv);
81-
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
82-
MPI_Comm_size(MPI_COMM_WORLD, &size);
72+
volatile int *active = (volatile int*)cbdata;
73+
*active = status;
74+
}
8375

84-
/* everyone registers their event handler */
85-
codes = OBJ_NEW(opal_list_t);
76+
static void sample(void)
77+
{
78+
opal_value_t *kv, *ival;
79+
opal_pmix_query_t *q;
80+
opal_list_t query, response, *lt;
81+
volatile int active;
82+
char **answer = NULL, *tmp, *msg;
83+
84+
OBJ_CONSTRUCT(&query, opal_list_t);
85+
OBJ_CONSTRUCT(&response, opal_list_t);
86+
q = OBJ_NEW(opal_pmix_query_t);
87+
opal_list_append(&query, &q->super);
88+
opal_argv_append_nosize(&q->keys, OPAL_PMIX_QUERY_MEMORY_USAGE);
89+
/* qualify that we just want local avg, min/max values reported */
8690
kv = OBJ_NEW(opal_value_t);
87-
kv->key = strdup("errorcode");
88-
kv->type = OPAL_INT;
89-
kv->data.integer = MEMPROBE_RELEASE;
90-
opal_list_append(codes, &kv->super);
91+
kv->key = strdup(OPAL_PMIX_QUERY_LOCAL_ONLY);
92+
kv->type = OPAL_BOOL;
93+
kv->data.flag = true;
94+
opal_list_append(&q->qualifiers, &kv->super);
95+
kv = OBJ_NEW(opal_value_t);
96+
kv->key = strdup(OPAL_PMIX_QUERY_REPORT_AVG);
97+
kv->type = OPAL_BOOL;
98+
kv->data.flag = true;
99+
opal_list_append(&q->qualifiers, &kv->super);
100+
kv = OBJ_NEW(opal_value_t);
101+
kv->key = strdup(OPAL_PMIX_QUERY_REPORT_MINMAX);
102+
kv->type = OPAL_BOOL;
103+
kv->data.flag = true;
104+
opal_list_append(&q->qualifiers, &kv->super);
105+
/* issue the request */
106+
wait_for_release = true;
107+
opal_pmix.query(&query, qcbfunc, (void*)&response);
108+
/* wait for the query to complete */
109+
while (wait_for_release) {
110+
usleep(10);
111+
}
112+
wait_for_release = true;
113+
/* log my own results as a single string so the output
114+
* doesn't get garbled on the other end */
115+
asprintf(&tmp, "Data for node %s", orte_process_info.nodename);
116+
opal_argv_append_nosize(&answer, tmp);
117+
free(tmp);
118+
OPAL_LIST_FOREACH(kv, &response, opal_value_t) {
119+
lt = (opal_list_t*)kv->data.ptr;
120+
OPAL_LIST_FOREACH(ival, lt, opal_value_t) {
121+
if (0 == strcmp(ival->key, OPAL_PMIX_DAEMON_MEMORY)) {
122+
asprintf(&tmp, "\tDaemon: %f", ival->data.fval);
123+
opal_argv_append_nosize(&answer, tmp);
124+
free(tmp);
125+
} else if (0 == strcmp(ival->key, OPAL_PMIX_CLIENT_AVG_MEMORY)) {
126+
asprintf(&tmp, "\tClient: %f", ival->data.fval);
127+
opal_argv_append_nosize(&answer, tmp);
128+
free(tmp);
129+
} else {
130+
fprintf(stderr, "\tUnknown key: %s", ival->key);
131+
}
132+
}
133+
}
134+
opal_argv_append_nosize(&answer, "\n");
135+
OPAL_LIST_DESTRUCT(&response);
91136

137+
/* construct the log output */
138+
OBJ_CONSTRUCT(&response, opal_list_t);
139+
kv = OBJ_NEW(opal_value_t);
140+
kv->key = strdup(OPAL_PMIX_LOG_STDOUT);
141+
kv->type = OPAL_STRING;
142+
kv->data.string = opal_argv_join(answer, '\n');
143+
opal_list_append(&response, &kv->super);
144+
opal_argv_free(answer);
92145
active = -1;
93-
opal_pmix.register_evhandler(codes, NULL, _release_fn, _register_fn, (void*)&active);
146+
opal_pmix.log(&response, notifycbfunc, (void*)&active);
94147
while (-1 == active) {
95148
usleep(10);
96149
}
150+
OPAL_LIST_DESTRUCT(&response);
151+
97152

98-
/* rank 0 asks for memory to be sampled, while everyone else waits */
99153
if (0 == rank) {
100-
fprintf(stderr, "Sampling memory usage after MPI_Init\n");
101-
OBJ_CONSTRUCT(&query, opal_list_t);
102-
OBJ_CONSTRUCT(&response, opal_list_t);
103-
q = OBJ_NEW(opal_pmix_query_t);
104-
opal_list_append(&query, &q->super);
105-
opal_argv_append_nosize(&q->keys, OPAL_PMIX_QUERY_MEMORY_USAGE);
106-
/* qualify that we just want avg, min/max values reported */
107-
kv = OBJ_NEW(opal_value_t);
108-
kv->key = strdup(OPAL_PMIX_QUERY_REPORT_AVG);
109-
kv->type = OPAL_BOOL;
110-
kv->data.flag = true;
111-
opal_list_append(&q->qualifiers, &kv->super);
112-
kv = OBJ_NEW(opal_value_t);
113-
kv->key = strdup(OPAL_PMIX_QUERY_REPORT_MINMAX);
114-
kv->type = OPAL_BOOL;
115-
kv->data.flag = true;
116-
opal_list_append(&q->qualifiers, &kv->super);
117-
/* issue the request */
118-
wait_for_release = true;
119-
opal_pmix.query(&query, qcbfunc, (void*)&response);
120-
while (wait_for_release) {
121-
usleep(10);
122-
}
123-
/* output the results */
124-
OPAL_LIST_FOREACH(kv, &response, opal_value_t) {
125-
fprintf(stderr, "\tResults: %s\n", kv->key);
126-
}
127-
OPAL_LIST_DESTRUCT(&response);
128154
/* send the notification to release the other procs */
129155
wait_for_release = true;
130156
OBJ_CONSTRUCT(&response, opal_list_t);
@@ -133,16 +159,58 @@ int main(int argc, char* argv[])
133159
kv->type = OPAL_BOOL;
134160
kv->data.flag = true;
135161
opal_list_append(&response, &kv->super);
162+
active = -1;
136163
if (OPAL_SUCCESS != opal_pmix.notify_event(MEMPROBE_RELEASE, NULL,
137164
OPAL_PMIX_RANGE_GLOBAL, &response,
138-
NULL, NULL)) {
165+
notifycbfunc, (void*)&active)) {
139166
fprintf(stderr, "Notify event failed\n");
140167
exit(1);
141168
}
142-
while (wait_for_release) {
169+
while (-1 == active) {
143170
usleep(10);
144171
}
145172
OPAL_LIST_DESTRUCT(&response);
173+
}
174+
175+
/* now wait for notification */
176+
while (wait_for_release) {
177+
usleep(10);
178+
}
179+
}
180+
181+
int main(int argc, char* argv[])
182+
{
183+
opal_list_t *codes;
184+
opal_value_t *kv;
185+
volatile int active;
186+
187+
MPI_Init(&argc, &argv);
188+
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
189+
MPI_Comm_size(MPI_COMM_WORLD, &size);
190+
191+
if (0 == rank) {
192+
fprintf(stderr, "Sampling memory usage after MPI_Init\n");
193+
}
194+
195+
/* everyone registers their event handler */
196+
codes = OBJ_NEW(opal_list_t);
197+
kv = OBJ_NEW(opal_value_t);
198+
kv->key = strdup("errorcode");
199+
kv->type = OPAL_INT;
200+
kv->data.integer = MEMPROBE_RELEASE;
201+
opal_list_append(codes, &kv->super);
202+
203+
active = -1;
204+
opal_pmix.register_evhandler(codes, NULL, _release_fn, _register_fn, (void*)&active);
205+
while (-1 == active) {
206+
usleep(10);
207+
}
208+
209+
/* if I am the local leader (i.e., local_rank=0), then I ask
210+
* my daemon to report the local memory usage, and send it
211+
* to rank=0 */
212+
if (0 == orte_process_info.my_local_rank) {
213+
sample();
146214
} else {
147215
/* now wait for notification */
148216
while (wait_for_release) {
@@ -157,60 +225,21 @@ int main(int argc, char* argv[])
157225

158226
if (0 == rank) {
159227
fprintf(stderr, "\n\nSampling memory usage after MPI_Barrier\n");
160-
OBJ_CONSTRUCT(&query, opal_list_t);
161-
OBJ_CONSTRUCT(&response, opal_list_t);
162-
q = OBJ_NEW(opal_pmix_query_t);
163-
opal_list_append(&query, &q->super);
164-
opal_argv_append_nosize(&q->keys, OPAL_PMIX_QUERY_MEMORY_USAGE);
165-
/* qualify that we just want avg, min/max values reported */
166-
kv = OBJ_NEW(opal_value_t);
167-
kv->key = strdup(OPAL_PMIX_QUERY_REPORT_AVG);
168-
kv->type = OPAL_BOOL;
169-
kv->data.flag = true;
170-
opal_list_append(&q->qualifiers, &kv->super);
171-
kv = OBJ_NEW(opal_value_t);
172-
kv->key = strdup(OPAL_PMIX_QUERY_REPORT_MINMAX);
173-
kv->type = OPAL_BOOL;
174-
kv->data.flag = true;
175-
opal_list_append(&q->qualifiers, &kv->super);
176-
/* issue the request */
177-
wait_for_release = true;
178-
opal_pmix.query(&query, qcbfunc, (void*)&response);
179-
while (wait_for_release) {
180-
usleep(10);
181-
}
182-
/* output the results */
183-
OPAL_LIST_FOREACH(kv, &response, opal_value_t) {
184-
fprintf(stderr, "\tResults: %s\n", kv->key);
185-
}
186-
OPAL_LIST_DESTRUCT(&response);
187-
/* send the notification to release the other procs */
188-
wait_for_release = true;
189-
OBJ_CONSTRUCT(&response, opal_list_t);
190-
kv = OBJ_NEW(opal_value_t);
191-
kv->key = strdup(OPAL_PMIX_EVENT_NON_DEFAULT);
192-
kv->type = OPAL_BOOL;
193-
kv->data.flag = true;
194-
opal_list_append(&response, &kv->super);
195-
if (OPAL_SUCCESS != opal_pmix.notify_event(MEMPROBE_RELEASE, NULL,
196-
OPAL_PMIX_RANGE_GLOBAL, &response,
197-
NULL, NULL)) {
198-
fprintf(stderr, "Notify event failed\n");
199-
exit(1);
200-
}
201-
while (wait_for_release) {
202-
usleep(10);
228+
}
229+
230+
if (0 == orte_process_info.my_local_rank) {
231+
if (0 != rank) {
232+
/* wait a little */
233+
usleep(1000);
203234
}
204-
OPAL_LIST_DESTRUCT(&response);
235+
sample();
205236
} else {
206237
/* wait again while memory is sampled */
207238
while (wait_for_release) {
208239
usleep(10);
209240
}
210241
}
211242

212-
fprintf(stderr, "%d: FINALIZING\n", rank);
213-
fflush(stderr);
214243
MPI_Finalize();
215244
return 0;
216245
}

opal/mca/pmix/pmix2x/pmix/src/buffer_ops/unpack.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2012 Los Alamos National Security, Inc. All rights reserved.
13-
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
13+
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
1414
* Copyright (c) 2015 Research Organization for Information Science
1515
* and Technology (RIST). All rights reserved.
1616
* Copyright (c) 2016 Mellanox Technologies, Inc.
@@ -688,6 +688,7 @@ pmix_status_t pmix_bfrop_unpack_status(pmix_buffer_t *buffer, void *dest,
688688
return PMIX_ERR_NOMEM;
689689
}
690690
if (PMIX_SUCCESS != (ret = pmix_bfrop_unpack_buffer(buffer, val->data.darray, &m, PMIX_DATA_ARRAY))) {
691+
PMIX_ERROR_LOG(ret);
691692
return ret;
692693
}
693694
break;
@@ -1274,6 +1275,9 @@ pmix_status_t pmix_bfrop_unpack_darray(pmix_buffer_t *buffer, void *dest,
12741275
case PMIX_COMPRESSED_STRING:
12751276
nbytes = sizeof(pmix_byte_object_t);
12761277
break;
1278+
case PMIX_INFO:
1279+
nbytes = sizeof(pmix_info_t);
1280+
break;
12771281
case PMIX_PERSIST:
12781282
nbytes = sizeof(pmix_persistence_t);
12791283
break;

opal/mca/pmix/pmix2x/pmix/src/common/pmix_query.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
22
/*
3-
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
3+
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
44
* Copyright (c) 2016 Mellanox Technologies, Inc.
55
* All rights reserved.
66
* Copyright (c) 2016 IBM Corporation. All rights reserved.
@@ -78,7 +78,11 @@ static void query_cbfunc(struct pmix_peer_t *peer,
7878
cnt = results->ninfo;
7979
if (PMIX_SUCCESS != (rc = pmix_bfrop.unpack(buf, results->info, &cnt, PMIX_INFO))) {
8080
PMIX_ERROR_LOG(rc);
81-
goto complete;
81+
pmix_output(0, "TYPE: %d", results->info[0].value.type);
82+
results->status = rc;
83+
PMIX_INFO_FREE(results->info, results->ninfo);
84+
results->info = NULL;
85+
results->ninfo = 0;
8286
}
8387
}
8488

0 commit comments

Comments
 (0)