|
13 | 13 | #include "orte_config.h"
|
14 | 14 | #include "orte/constants.h"
|
15 | 15 |
|
| 16 | +#if HAVE_UNISTD_H |
| 17 | +#include <unistd.h> |
| 18 | +#endif |
| 19 | +#if HAVE_FCNTL_H |
| 20 | +#include <fcntl.h> |
| 21 | +#endif |
| 22 | + |
16 | 23 | #include "opal/class/opal_list.h"
|
17 | 24 | #include "opal/mca/event/event.h"
|
18 | 25 | #include "opal/mca/pmix/pmix.h"
|
@@ -714,6 +721,10 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
|
714 | 721 | /* track job status */
|
715 | 722 | jdata->num_terminated++;
|
716 | 723 | if (jdata->num_terminated == jdata->num_procs) {
|
| 724 | + /* if requested, check fd status for leaks */ |
| 725 | + if (orte_state_base_run_fdcheck) { |
| 726 | + orte_state_base_check_fds(jdata); |
| 727 | + } |
717 | 728 | ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
|
718 | 729 | /* if they requested notification upon completion, provide it */
|
719 | 730 | if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NOTIFY_COMPLETION, NULL, OPAL_BOOL)) {
|
@@ -1016,3 +1027,99 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
|
1016 | 1027 |
|
1017 | 1028 | OBJ_RELEASE(caddy);
|
1018 | 1029 | }
|
| 1030 | + |
| 1031 | + |
| 1032 | +void orte_state_base_check_fds(orte_job_t *jdata) |
| 1033 | +{ |
| 1034 | + int nfds, i, fdflags, flflags; |
| 1035 | + char path[1024], info[256], **list=NULL, *status, *result, *r2; |
| 1036 | + ssize_t rc; |
| 1037 | + struct flock fl; |
| 1038 | + int cnt = 0; |
| 1039 | + |
| 1040 | + /* get the number of available file descriptors |
| 1041 | + * for this daemon */ |
| 1042 | + nfds = getdtablesize(); |
| 1043 | + result = NULL; |
| 1044 | + /* loop over them and get their info */ |
| 1045 | + for (i=0; i < nfds; i++) { |
| 1046 | + fdflags = fcntl(i, F_GETFD); |
| 1047 | + if (-1 == fdflags) { |
| 1048 | + /* no open fd in that slot */ |
| 1049 | + continue; |
| 1050 | + } |
| 1051 | + flflags = fcntl(i, F_GETFL); |
| 1052 | + if (-1 == flflags) { |
| 1053 | + /* no open fd in that slot */ |
| 1054 | + continue; |
| 1055 | + } |
| 1056 | + snprintf(path, 1024, "/proc/self/fd/%d", i); |
| 1057 | + memset(info, 0, 256); |
| 1058 | + /* read the info about this fd */ |
| 1059 | + rc = readlink(path, info, 256); |
| 1060 | + if (-1 == rc) { |
| 1061 | + /* this fd is unavailable */ |
| 1062 | + continue; |
| 1063 | + } |
| 1064 | + /* get any file locking status */ |
| 1065 | + fl.l_type = F_WRLCK; |
| 1066 | + fl.l_whence = 0; |
| 1067 | + fl.l_start = 0; |
| 1068 | + fl.l_len = 0; |
| 1069 | + fcntl(i, F_GETLK, &fl); |
| 1070 | + /* construct the list of capabilities */ |
| 1071 | + if (fdflags & FD_CLOEXEC) { |
| 1072 | + opal_argv_append_nosize(&list, "cloexec"); |
| 1073 | + } |
| 1074 | + if (flflags & O_APPEND) { |
| 1075 | + opal_argv_append_nosize(&list, "append"); |
| 1076 | + } |
| 1077 | + if (flflags & O_NONBLOCK) { |
| 1078 | + opal_argv_append_nosize(&list, "nonblock"); |
| 1079 | + } |
| 1080 | + if (flflags & O_RDONLY) { |
| 1081 | + opal_argv_append_nosize(&list, "rdonly"); |
| 1082 | + } |
| 1083 | + if (flflags & O_RDWR) { |
| 1084 | + opal_argv_append_nosize(&list, "rdwr"); |
| 1085 | + } |
| 1086 | + if (flflags & O_WRONLY) { |
| 1087 | + opal_argv_append_nosize(&list, "wronly"); |
| 1088 | + } |
| 1089 | + if (flflags & O_DSYNC) { |
| 1090 | + opal_argv_append_nosize(&list, "dsync"); |
| 1091 | + } |
| 1092 | + if (flflags & O_RSYNC) { |
| 1093 | + opal_argv_append_nosize(&list, "rsync"); |
| 1094 | + } |
| 1095 | + if (flflags & O_SYNC) { |
| 1096 | + opal_argv_append_nosize(&list, "sync"); |
| 1097 | + } |
| 1098 | + if (F_UNLCK != fl.l_type) { |
| 1099 | + if (F_WRLCK == fl.l_type) { |
| 1100 | + opal_argv_append_nosize(&list, "wrlock"); |
| 1101 | + } else { |
| 1102 | + opal_argv_append_nosize(&list, "rdlock"); |
| 1103 | + } |
| 1104 | + } |
| 1105 | + if (NULL != list) { |
| 1106 | + status = opal_argv_join(list, ' '); |
| 1107 | + opal_argv_free(list); |
| 1108 | + list = NULL; |
| 1109 | + if (NULL == result) { |
| 1110 | + asprintf(&result, " %d\t(%s)\t%s\n", i, info, status); |
| 1111 | + } else { |
| 1112 | + asprintf(&r2, "%s %d\t(%s)\t%s\n", result, i, info, status); |
| 1113 | + free(result); |
| 1114 | + result = r2; |
| 1115 | + } |
| 1116 | + free(status); |
| 1117 | + } |
| 1118 | + ++cnt; |
| 1119 | + } |
| 1120 | + asprintf(&r2, "%s: %d open file descriptors after job %d completed\n%s", |
| 1121 | + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cnt, ORTE_LOCAL_JOBID(jdata->jobid), result); |
| 1122 | + opal_output(0, "%s", r2); |
| 1123 | + free(result); |
| 1124 | + free(r2); |
| 1125 | +} |
0 commit comments