Skip to content

Commit a8b34d9

Browse files
author
Ralph Castain
authored
Merge pull request #3603 from rhc54/topic/fds
Add some debug code for detecting leaking file descriptors.
2 parents 8e583dc + f3ab326 commit a8b34d9

File tree

4 files changed

+132
-3
lines changed

4 files changed

+132
-3
lines changed

orte/mca/state/base/state_base_fns.c

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,13 @@
1313
#include "orte_config.h"
1414
#include "orte/constants.h"
1515

16+
#if HAVE_UNISTD_H
17+
#include <unistd.h>
18+
#endif
19+
#if HAVE_FCNTL_H
20+
#include <fcntl.h>
21+
#endif
22+
1623
#include "opal/class/opal_list.h"
1724
#include "opal/mca/event/event.h"
1825
#include "opal/mca/pmix/pmix.h"
@@ -714,6 +721,10 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
714721
/* track job status */
715722
jdata->num_terminated++;
716723
if (jdata->num_terminated == jdata->num_procs) {
724+
/* if requested, check fd status for leaks */
725+
if (orte_state_base_run_fdcheck) {
726+
orte_state_base_check_fds(jdata);
727+
}
717728
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
718729
/* if they requested notification upon completion, provide it */
719730
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NOTIFY_COMPLETION, NULL, OPAL_BOOL)) {
@@ -1016,3 +1027,99 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
10161027

10171028
OBJ_RELEASE(caddy);
10181029
}
1030+
1031+
1032+
/**
 * Debug aid for detecting leaked file descriptors.
 *
 * Walks every slot of this process's descriptor table and, for each
 * descriptor that is open, collects the target of its /proc/self/fd/<n>
 * symlink together with its descriptor flags, file status flags and any
 * conflicting file lock. The resulting report is emitted via opal_output().
 *
 * Descriptors whose /proc/self/fd entry cannot be read are skipped, so on
 * systems without that directory the report lists no descriptors.
 *
 * @param jdata  The job that just completed; only its jobid is read, to
 *               label the report.
 */
void orte_state_base_check_fds(orte_job_t *jdata)
{
    int nfds, i, fdflags, flflags;
    char path[1024], info[256], **list=NULL, *status, *result, *r2;
    ssize_t rc;
    struct flock fl;
    int cnt = 0;

    /* get the number of available file descriptors
     * for this daemon */
    nfds = getdtablesize();
    result = NULL;
    /* loop over them and get their info */
    for (i=0; i < nfds; i++) {
        fdflags = fcntl(i, F_GETFD);
        if (-1 == fdflags) {
            /* no open fd in that slot */
            continue;
        }
        flflags = fcntl(i, F_GETFL);
        if (-1 == flflags) {
            /* no open fd in that slot */
            continue;
        }
        snprintf(path, sizeof(path), "/proc/self/fd/%d", i);
        memset(info, 0, sizeof(info));
        /* read the info about this fd - readlink does not NUL-terminate,
         * so leave the last byte of the zeroed buffer untouched */
        rc = readlink(path, info, sizeof(info) - 1);
        if (-1 == rc) {
            /* this fd is unavailable */
            continue;
        }
        /* get any file locking status: ask whether a write lock on the
         * whole file would conflict with an existing lock */
        memset(&fl, 0, sizeof(fl));
        fl.l_type = F_WRLCK;
        fl.l_whence = SEEK_SET;
        fl.l_start = 0;
        fl.l_len = 0;
        if (-1 == fcntl(i, F_GETLK, &fl)) {
            /* the query failed (e.g., the fd does not support locking),
             * so l_type still holds our own F_WRLCK request - do not
             * report that as a lock */
            fl.l_type = F_UNLCK;
        }
        /* construct the list of capabilities */
        if (fdflags & FD_CLOEXEC) {
            opal_argv_append_nosize(&list, "cloexec");
        }
        if (flflags & O_APPEND) {
            opal_argv_append_nosize(&list, "append");
        }
        if (flflags & O_NONBLOCK) {
            opal_argv_append_nosize(&list, "nonblock");
        }
        /* the access mode is a multi-bit field, not a set of independent
         * flags (O_RDONLY is typically 0), so it must be extracted with
         * O_ACCMODE and compared rather than bit-tested */
        switch (flflags & O_ACCMODE) {
        case O_RDONLY:
            opal_argv_append_nosize(&list, "rdonly");
            break;
        case O_RDWR:
            opal_argv_append_nosize(&list, "rdwr");
            break;
        case O_WRONLY:
            opal_argv_append_nosize(&list, "wronly");
            break;
        default:
            break;
        }
        /* the sync flags are not defined on every platform, and where
         * they are, O_SYNC/O_RSYNC may be multi-bit values that contain
         * the O_DSYNC bit - require all bits to match for those */
#ifdef O_DSYNC
        if (flflags & O_DSYNC) {
            opal_argv_append_nosize(&list, "dsync");
        }
#endif
#ifdef O_RSYNC
        if ((flflags & O_RSYNC) == O_RSYNC) {
            opal_argv_append_nosize(&list, "rsync");
        }
#endif
#ifdef O_SYNC
        if ((flflags & O_SYNC) == O_SYNC) {
            opal_argv_append_nosize(&list, "sync");
        }
#endif
        if (F_UNLCK != fl.l_type) {
            if (F_WRLCK == fl.l_type) {
                opal_argv_append_nosize(&list, "wrlock");
            } else {
                opal_argv_append_nosize(&list, "rdlock");
            }
        }
        if (NULL != list) {
            status = opal_argv_join(list, ' ');
            opal_argv_free(list);
            list = NULL;
            if (NULL == result) {
                if (0 > asprintf(&result, "    %d\t(%s)\t%s\n", i, info,
                                 (NULL == status) ? "" : status)) {
                    /* contents of result are undefined on failure */
                    result = NULL;
                }
            } else {
                if (0 <= asprintf(&r2, "%s    %d\t(%s)\t%s\n", result, i, info,
                                  (NULL == status) ? "" : status)) {
                    free(result);
                    result = r2;
                }
                /* on failure keep what we have accumulated so far */
            }
            free(status);
        }
        ++cnt;
    }
    /* result is NULL if no descriptor produced an entry - never hand
     * a NULL pointer to %s */
    if (0 <= asprintf(&r2, "%s: %d open file descriptors after job %d completed\n%s",
                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cnt, ORTE_LOCAL_JOBID(jdata->jobid),
                      (NULL == result) ? "" : result)) {
        opal_output(0, "%s", r2);
        free(r2);
    }
    free(result);
}

orte/mca/state/base/state_base_frame.c

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
* All rights reserved.
55
* Copyright (c) 2015 Research Organization for Information Science
66
* and Technology (RIST). All rights reserved.
7+
* Copyright (c) 2017 Intel, Inc. All rights reserved.
78
* $COPYRIGHT$
89
*
910
* Additional copyrights may follow
@@ -41,6 +42,20 @@
4142
* Globals
4243
*/
4344
orte_state_base_module_t orte_state = {0};
45+
bool orte_state_base_run_fdcheck = false;
46+
47+
/*
 * Framework register hook: exposes the "check_fds" boolean MCA variable
 * (registered under project "orte", framework "state", component "base")
 * and binds it to orte_state_base_run_fdcheck.
 *
 * The flags argument is not used. Always returns ORTE_SUCCESS.
 */
static int orte_state_base_register(mca_base_register_flag_t flags)
{
    /* default value: fd checking stays off unless the variable is set */
    orte_state_base_run_fdcheck = false;

    /* the registration result is intentionally not examined */
    (void) mca_base_var_register("orte", "state", "base", "check_fds",
                                 "Daemons should check fds for leaks after each job completes",
                                 MCA_BASE_VAR_TYPE_BOOL,
                                 NULL,
                                 0,
                                 0,
                                 OPAL_INFO_LVL_9,
                                 MCA_BASE_VAR_SCOPE_READONLY,
                                 &orte_state_base_run_fdcheck);

    return ORTE_SUCCESS;
}
4459

4560
static int orte_state_base_close(void)
4661
{
@@ -62,7 +77,8 @@ static int orte_state_base_open(mca_base_open_flag_t flags)
6277
return mca_base_framework_components_open(&orte_state_base_framework, flags);
6378
}
6479

65-
MCA_BASE_FRAMEWORK_DECLARE(orte, state, "ORTE State Machine", NULL,
80+
MCA_BASE_FRAMEWORK_DECLARE(orte, state, "ORTE State Machine",
81+
orte_state_base_register,
6682
orte_state_base_open, orte_state_base_close,
6783
mca_state_base_static_components, 0);
6884

@@ -95,4 +111,3 @@ OBJ_CLASS_INSTANCE(orte_state_caddy_t,
95111
opal_object_t,
96112
orte_state_caddy_construct,
97113
orte_state_caddy_destruct);
98-

orte/mca/state/base/state_private.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/*
22
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
33
* All rights reserved.
4+
* Copyright (c) 2017 Intel, Inc. All rights reserved.
45
* $COPYRIGHT$
56
*
67
* Additional copyrights may follow
@@ -31,6 +32,7 @@
3132

3233
BEGIN_C_DECLS
3334

35+
extern bool orte_state_base_run_fdcheck;
3436
/*
3537
* Base functions
3638
*/
@@ -75,7 +77,7 @@ ORTE_DECLSPEC void orte_state_base_cleanup_job(int fd, short argc, void *cbdata)
7577
ORTE_DECLSPEC void orte_state_base_report_progress(int fd, short argc, void *cbdata);
7678
ORTE_DECLSPEC void orte_state_base_track_procs(int fd, short argc, void *cbdata);
7779
ORTE_DECLSPEC void orte_state_base_check_all_complete(int fd, short args, void *cbdata);
78-
80+
ORTE_DECLSPEC void orte_state_base_check_fds(orte_job_t *jdata);
7981

8082
END_C_DECLS
8183
#endif

orte/mca/state/orted/state_orted.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -484,6 +484,11 @@ static void track_procs(int fd, short argc, void *cbdata)
484484
jdata->map = NULL;
485485
}
486486

487+
/* if requested, check fd status for leaks */
488+
if (orte_state_base_run_fdcheck) {
489+
orte_state_base_check_fds(jdata);
490+
}
491+
487492
/* cleanup the job info */
488493
opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, NULL);
489494
OBJ_RELEASE(jdata);

0 commit comments

Comments
 (0)