Skip to content

Commit 39d086e

Browse files
author
rhc54
authored
Merge pull request #2035 from rhc54/topic/memprofile
Provide a mechanism for obtaining memory profiles of daemons and application processes for use in studying our memory footprint
2 parents 39992d1 + c1050bc commit 39d086e

File tree

11 files changed

+230
-2
lines changed

11 files changed

+230
-2
lines changed

opal/dss/dss_copy.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ int opal_dss_copy_pstat(opal_pstats_t **dest, opal_pstats_t *src,
219219
p->time = src->time;
220220
p->priority = src->priority;
221221
p->num_threads = src->num_threads;
222+
p->pss = src->pss;
222223
p->vsize = src->vsize;
223224
p->rss = src->rss;
224225
p->peak_vsize = src->peak_vsize;

opal/dss/dss_open_close.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ static void opal_pstat_construct(opal_pstats_t *obj)
156156
obj->time.tv_usec = 0;
157157
obj->priority = -1;
158158
obj->num_threads = -1;
159+
obj->pss = 0.0;
159160
obj->vsize = 0.0;
160161
obj->rss = 0.0;
161162
obj->peak_vsize = 0.0;

opal/dss/dss_pack.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -499,6 +499,9 @@ int opal_dss_pack_pstat(opal_buffer_t *buffer, const void *src,
499499
if (OPAL_SUCCESS != (ret = opal_dss_pack_buffer(buffer, &ptr[i]->num_threads, 1, OPAL_INT16))) {
500500
return ret;
501501
}
502+
if (OPAL_SUCCESS != (ret = opal_dss_pack_float(buffer, &ptr[i]->pss, 1, OPAL_FLOAT))) {
503+
return ret;
504+
}
502505
if (OPAL_SUCCESS != (ret = opal_dss_pack_float(buffer, &ptr[i]->vsize, 1, OPAL_FLOAT))) {
503506
return ret;
504507
}

opal/dss/dss_print.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -654,10 +654,10 @@ int opal_dss_print_pstat(char **output, char *prefix, opal_pstats_t *src, opal_d
654654
return OPAL_SUCCESS;
655655
}
656656
asprintf(output, "%sOPAL_PSTATS SAMPLED AT: %ld.%06ld\n%snode: %s rank: %d pid: %d cmd: %s state: %c pri: %d #threads: %d Processor: %d\n"
657-
"%s\ttime: %ld.%06ld cpu: %5.2f VMsize: %8.2f PeakVMSize: %8.2f RSS: %8.2f\n",
657+
"%s\ttime: %ld.%06ld cpu: %5.2f PSS: %8.2f VMsize: %8.2f PeakVMSize: %8.2f RSS: %8.2f\n",
658658
prefx, (long)src->sample_time.tv_sec, (long)src->sample_time.tv_usec,
659659
prefx, src->node, src->rank, src->pid, src->cmd, src->state[0], src->priority, src->num_threads, src->processor,
660-
prefx, (long)src->time.tv_sec, (long)src->time.tv_usec, src->percent_cpu, src->vsize, src->peak_vsize, src->rss);
660+
prefx, (long)src->time.tv_sec, (long)src->time.tv_usec, src->percent_cpu, src->pss, src->vsize, src->peak_vsize, src->rss);
661661
if (prefx != prefix) {
662662
free(prefx);
663663
}

opal/dss/dss_types.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,7 @@ typedef struct {
182182
float percent_cpu;
183183
int32_t priority;
184184
int16_t num_threads;
185+
float pss; /* in MBytes */
185186
float vsize; /* in MBytes */
186187
float rss; /* in MBytes */
187188
float peak_vsize; /* in MBytes */

opal/dss/dss_unpack.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -643,6 +643,11 @@ int opal_dss_unpack_pstat(opal_buffer_t *buffer, void *dest,
643643
return ret;
644644
}
645645
m=1;
646+
if (OPAL_SUCCESS != (ret = opal_dss_unpack_float(buffer, &ptr[i]->pss, &m, OPAL_FLOAT))) {
647+
OPAL_ERROR_LOG(ret);
648+
return ret;
649+
}
650+
m=1;
646651
if (OPAL_SUCCESS != (ret = opal_dss_unpack_float(buffer, &ptr[i]->vsize, &m, OPAL_FLOAT))) {
647652
OPAL_ERROR_LOG(ret);
648653
return ret;

opal/mca/pstat/linux/pstat_linux_module.c

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,31 @@ static int query(pid_t pid,
310310
}
311311
}
312312
fclose(fp);
313+
314+
/* now create the smaps filename for this proc */
315+
memset(data, 0, sizeof(data));
316+
numchars = snprintf(data, sizeof(data), "/proc/%d/smaps", pid);
317+
if (numchars >= sizeof(data)) {
318+
return OPAL_ERR_VALUE_OUT_OF_BOUNDS;
319+
}
320+
321+
if (NULL == (fp = fopen(data, "r"))) {
322+
/* smaps could not be opened - treat this as non-fatal and return without a Pss value */
323+
return OPAL_SUCCESS;
324+
}
325+
326+
/* parse it to find lines that start with "Pss" */
327+
while (NULL != (dptr = local_getline(fp))) {
328+
if (NULL == (value = local_stripper(dptr))) {
329+
/* cannot process */
330+
continue;
331+
}
332+
/* look for Pss */
333+
if (0 == strncmp(dptr, "Pss", strlen("Pss"))) {
334+
stats->pss += convert_value(value);
335+
}
336+
}
337+
fclose(fp);
313338
}
314339

315340
if (NULL != nstats) {

orte/mca/odls/odls_types.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,9 @@ typedef uint8_t orte_daemon_cmd_flag_t;
8383
/* for debug purposes, get stack traces from all application procs */
8484
#define ORTE_DAEMON_GET_STACK_TRACES (orte_daemon_cmd_flag_t) 31
8585

86+
/* for memory profiling: ask a daemon to report its own PSS and the average PSS of its local children */
87+
#define ORTE_DAEMON_GET_MEMPROFILE (orte_daemon_cmd_flag_t) 32
88+
8689
/*
8790
* Struct written up the pipe from the child to the parent.
8891
*/

orte/mca/rml/rml_types.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,9 @@ BEGIN_C_DECLS
166166
/* stacktrace for debug */
167167
#define ORTE_RML_TAG_STACK_TRACE 60
168168

169+
/* memory profile reports sent by daemons back to the HNP */
170+
#define ORTE_RML_TAG_MEMPROFILE 61
171+
169172
#define ORTE_RML_TAG_MAX 100
170173

171174
#define ORTE_RML_TAG_NTOH(t) ntohl(t)

orte/orted/orted_comm.c

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545

4646
#include "opal/mca/event/event.h"
4747
#include "opal/mca/base/base.h"
48+
#include "opal/mca/pstat/pstat.h"
4849
#include "opal/util/output.h"
4950
#include "opal/util/opal_environ.h"
5051
#include "opal/util/path.h"
@@ -115,6 +116,8 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
115116
FILE *fp;
116117
char gscmd[256], path[1035], *pathptr;
117118
char string[256], *string_ptr = string;
119+
float pss;
120+
opal_pstats_t pstat;
118121

119122
/* unpack the command */
120123
n = 1;
@@ -1151,6 +1154,44 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
11511154
}
11521155
break;
11531156

1157+
case ORTE_DAEMON_GET_MEMPROFILE:
1158+
answer = OBJ_NEW(opal_buffer_t);
1159+
/* pack our hostname so they know where it came from */
1160+
opal_dss.pack(answer, &orte_process_info.nodename, 1, OPAL_STRING);
1161+
/* collect my memory usage */
1162+
OBJ_CONSTRUCT(&pstat, opal_pstats_t);
1163+
opal_pstat.query(orte_process_info.pid, &pstat, NULL);
1164+
opal_dss.pack(answer, &pstat.pss, 1, OPAL_FLOAT);
1165+
OBJ_DESTRUCT(&pstat);
1166+
/* collect the memory usage of all my children */
1167+
pss = 0.0;
1168+
num_replies = 0;
1169+
for (i=0; i < orte_local_children->size; i++) {
1170+
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
1171+
ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
1172+
/* collect the stats on this proc */
1173+
OBJ_CONSTRUCT(&pstat, opal_pstats_t);
1174+
if (OPAL_SUCCESS == opal_pstat.query(proct->pid, &pstat, NULL)) {
1175+
pss += pstat.pss;
1176+
++num_replies;
1177+
}
1178+
OBJ_DESTRUCT(&pstat);
1179+
}
1180+
}
1181+
/* compute the average PSS over the children whose stats were successfully collected */
1182+
if (0 < num_replies) {
1183+
pss /= (float)num_replies;
1184+
}
1185+
opal_dss.pack(answer, &pss, 1, OPAL_FLOAT);
1186+
/* send it back */
1187+
if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, answer,
1188+
ORTE_RML_TAG_MEMPROFILE,
1189+
orte_rml_send_callback, NULL))) {
1190+
ORTE_ERROR_LOG(ret);
1191+
OBJ_RELEASE(answer);
1192+
}
1193+
break;
1194+
11541195
default:
11551196
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
11561197
}
@@ -1222,6 +1263,9 @@ static char *get_orted_comm_cmd_str(int command)
12221263
case ORTE_DAEMON_GET_STACK_TRACES:
12231264
return strdup("ORTE_DAEMON_GET_STACK_TRACES");
12241265

1266+
case ORTE_DAEMON_GET_MEMPROFILE:
1267+
return strdup("ORTE_DAEMON_GET_MEMPROFILE");
1268+
12251269
default:
12261270
return strdup("Unknown Command!");
12271271
}

0 commit comments

Comments
 (0)