Skip to content

Commit d9ad918

Browse files
committed
orte/iof: Address the case when output is a regular file
Regular files are always write-ready, so non-blocking I/O does not give any benefits for them. More than that - if libevent is using "epoll" to track fd events, epoll_ctl will refuse attempt to add an fd pointing to a regular file descriptor with EPERM. This fix checks the object referenced by fd and avoids event_add using event_active instead. In the original configuration that uncovered this issue "epoll" was used in libevent, it was triggering the following warning message: "[warn] Epoll ADD(1) on fd 0 failed. Old events were 0; read change was 1 (add); write change was 0 (none): Operation not permitted" And the side effect was accumulation of all output in mpirun memory and actually writing it only at mpirun exit. Signed-off-by: Artem Polyakov <[email protected]>
1 parent d1c5955 commit d9ad918

File tree

5 files changed

+112
-3
lines changed

5 files changed

+112
-3
lines changed

opal/util/fd.c

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/*
22
* Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
33
* Copyright (c) 2009 Sandia National Laboratories. All rights reserved.
4+
* Copyright (c) 2017 Mellanox Technologies. All rights reserved.
45
*
56
* $COPYRIGHT$
67
*
@@ -11,6 +12,14 @@
1112

1213
#include "opal_config.h"
1314

15+
#ifdef HAVE_SYS_TYPES_H
16+
#include <sys/types.h>
17+
#endif
18+
#ifdef HAVE_SYS_STAT_H
19+
#include <sys/stat.h>
20+
#endif
21+
22+
1423
#ifdef HAVE_UNISTD_H
1524
#include <unistd.h>
1625
#endif
@@ -89,3 +98,31 @@ int opal_fd_set_cloexec(int fd)
8998

9099
return OPAL_SUCCESS;
91100
}
101+
102+
bool opal_fd_is_regular(int fd)
103+
{
104+
struct stat buf;
105+
if (fstat(fd, &buf)) {
106+
return false;
107+
}
108+
return S_ISREG(buf.st_mode);
109+
}
110+
111+
bool opal_fd_is_chardev(int fd)
112+
{
113+
struct stat buf;
114+
if (fstat(fd, &buf)) {
115+
return false;
116+
}
117+
return S_ISCHR(buf.st_mode);
118+
}
119+
120+
bool opal_fd_is_blkdev(int fd)
121+
{
122+
struct stat buf;
123+
if (fstat(fd, &buf)) {
124+
return false;
125+
}
126+
return S_ISBLK(buf.st_mode);
127+
}
128+

opal/util/fd.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/*
22
* Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
33
* Copyright (c) 2009 Sandia National Laboratories. All rights reserved.
4+
* Copyright (c) 2017 Mellanox Technologies. All rights reserved.
45
*
56
* $COPYRIGHT$
67
*
@@ -63,6 +64,37 @@ OPAL_DECLSPEC int opal_fd_write(int fd, int len, const void *buffer);
6364
*/
6465
OPAL_DECLSPEC int opal_fd_set_cloexec(int fd);
6566

67+
/**
68+
* Convenience function to check if fd point to an accessible regular file.
69+
*
70+
* @param fd File descriptor
71+
*
72+
* @returns true if "fd" points to a regular file.
73+
* @returns false otherwise.
74+
*/
75+
OPAL_DECLSPEC bool opal_fd_is_regular(int fd);
76+
77+
/**
78+
* Convenience function to check if fd point to an accessible character device.
79+
*
80+
* @param fd File descriptor
81+
*
82+
* @returns true if "fd" points to a regular file.
83+
* @returns false otherwise.
84+
*/
85+
OPAL_DECLSPEC bool opal_fd_is_chardev(int fd);
86+
87+
/**
88+
* Convenience function to check if fd point to an accessible block device.
89+
*
90+
* @param fd File descriptor
91+
*
92+
* @returns true if "fd" points to a regular file.
93+
* @returns false otherwise.
94+
*/
95+
OPAL_DECLSPEC bool opal_fd_is_blkdev(int fd);
96+
97+
6698
END_C_DECLS
6799

68100
#endif

orte/mca/iof/base/base.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
* All rights reserved.
1515
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
1616
* Copyright (c) 2017 IBM Corporation. All rights reserved.
17+
* Copyright (c) 2017 Mellanox Technologies. All rights reserved.
1718
* $COPYRIGHT$
1819
*
1920
* Additional copyrights may follow
@@ -48,6 +49,7 @@
4849
#include "opal/class/opal_bitmap.h"
4950
#include "orte/mca/mca.h"
5051
#include "opal/mca/event/event.h"
52+
#include "opal/util/fd.h"
5153

5254
#include "orte/mca/iof/iof.h"
5355
#include "orte/runtime/orte_globals.h"
@@ -84,6 +86,7 @@ ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_iof_job_t);
8486
typedef struct {
8587
opal_list_item_t super;
8688
bool pending;
89+
bool always_writable;
8790
opal_event_t *ev;
8891
int fd;
8992
opal_list_t outputs;
@@ -157,6 +160,9 @@ typedef struct orte_iof_base_t orte_iof_base_t;
157160
ep->tag = (tg); \
158161
if (0 <= (fid)) { \
159162
ep->wev->fd = (fid); \
163+
ep->wev->always_writable = opal_fd_is_regular(fid) || \
164+
opal_fd_is_chardev(fid) || \
165+
opal_fd_is_blkdev(fid); \
160166
opal_event_set(orte_event_base, \
161167
ep->wev->ev, ep->wev->fd, \
162168
OPAL_EV_WRITE, \

orte/mca/iof/base/iof_base_frame.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
* Copyright (c) 2015-2017 Research Organization for Information Science
1616
* and Technology (RIST). All rights reserved.
1717
* Copyright (c) 2017 IBM Corporation. All rights reserved.
18+
* Copyright (c) 2017 Mellanox Technologies. All rights reserved.
1819
* $COPYRIGHT$
1920
*
2021
* Additional copyrights may follow
@@ -298,6 +299,7 @@ OBJ_CLASS_INSTANCE(orte_iof_read_event_t,
298299
static void orte_iof_base_write_event_construct(orte_iof_write_event_t* wev)
299300
{
300301
wev->pending = false;
302+
wev->always_writable = false;
301303
wev->fd = -1;
302304
OBJ_CONSTRUCT(&wev->outputs, opal_list_t);
303305
wev->ev = opal_event_alloc();

orte/mca/iof/base/iof_base_output.c

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
1313
* Copyright (c) 2017 Intel, Inc. All rights reserved.
14+
* Copyright (c) 2017 Mellanox Technologies. All rights reserved.
1415
* $COPYRIGHT$
1516
*
1617
* Additional copyrights may follow
@@ -259,13 +260,22 @@ int orte_iof_base_write_output(const orte_process_name_t *name, orte_iof_tag_t s
259260

260261
/* is the write event issued? */
261262
if (!channel->pending) {
263+
int rc = -1;
262264
/* issue it */
263265
OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
264266
"%s write:output adding write event",
265267
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
266268
channel->pending = true;
267269
ORTE_POST_OBJECT(channel);
268-
opal_event_add(channel->ev, 0);
270+
if (channel->always_writable) {
271+
/* Regular is always write ready. Activate the handler. */
272+
opal_event_active (channel->ev, OPAL_EV_WRITE, 1);
273+
} else {
274+
rc = opal_event_add(channel->ev, 0);
275+
if (rc) {
276+
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
277+
}
278+
}
269279
}
270280

271281
return num_buffered;
@@ -297,13 +307,14 @@ void orte_iof_base_static_dump_output(orte_iof_read_event_t *rev)
297307
}
298308
}
299309

310+
#define ORTE_IOF_REGULARF_BLOCK (1024)
300311
void orte_iof_base_write_handler(int fd, short event, void *cbdata)
301312
{
302313
orte_iof_sink_t *sink = (orte_iof_sink_t*)cbdata;
303314
orte_iof_write_event_t *wev = sink->wev;
304315
opal_list_item_t *item;
305316
orte_iof_write_output_t *output;
306-
int num_written;
317+
int num_written, total_written = 0;
307318

308319
ORTE_ACQUIRE_OBJECT(sink);
309320

@@ -333,6 +344,10 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata)
333344
/* leave the write event running so it will call us again
334345
* when the fd is ready.
335346
*/
347+
if(wev->always_writable){
348+
/* Schedule another event */
349+
opal_event_active (wev->ev, OPAL_EV_WRITE, 1);
350+
}
336351
return;
337352
}
338353
/* otherwise, something bad happened so all we can do is abort
@@ -356,12 +371,29 @@ void orte_iof_base_write_handler(int fd, short event, void *cbdata)
356371
/* leave the write event running so it will call us again
357372
* when the fd is ready
358373
*/
374+
if(wev->always_writable){
375+
/* Schedule another event */
376+
opal_event_active (wev->ev, OPAL_EV_WRITE, 1);
377+
378+
}
359379
return;
360380
}
361381
OBJ_RELEASE(output);
382+
383+
total_written += num_written;
384+
if(wev->always_writable && (ORTE_IOF_REGULARF_BLOCK <= total_written)){
385+
/* If this is a regular file it will never tell us it will block
386+
* Write no more than ORTE_IOF_REGULARF_BLOCK at a time allowing
387+
* other fds to progress
388+
*/
389+
opal_event_active (wev->ev, OPAL_EV_WRITE, 1);
390+
return;
391+
}
362392
}
363393
ABORT:
364-
opal_event_del(wev->ev);
394+
if (!wev->always_writable){
395+
opal_event_del(wev->ev);
396+
}
365397
wev->pending = false;
366398
ORTE_POST_OBJECT(wev);
367399
}

0 commit comments

Comments
 (0)