Skip to content

Commit 86337c4

Browse files
committed
Merge remote-tracking branch 'origin/549-open-files'
* origin/549-open-files: issue #603: Revert "ci: update to Ansible 2.8.3" Fix unit_Test.ClientTest following 108015a service: clean up log messages, especially at shutdown remove unused imports flagged by lgtm [linear2]: merge fallout flaggged by LGTM issue #549: docs: update Changelog issue #549: increase open file limit automatically if possible ansible: improve process.py docs docs: remove old list link. docs: migrate email list docs: changelog tweaks parent: decode logged stdout as UTF-8. scripts: import affin.sh ci: update to Ansible 2.8.3 tests: terraform tweaks unix: include more IO in the try/except for connection failure tests: update gcloud.py to match terraform config tests: hide ugly error during Ansible tests tests/ansible/gcloud: terraform conf for load testing ansible: gracefully handle failure to connect to MuxProcess ansible: fix affinity tests for 5ae45f6612390bbc888b65964fb5c218feed1679 ansible: pin per-CPU muxes to their corresponding CPU ansible: reap mux processes on shut down
2 parents adbad76 + cd2689a commit 86337c4

28 files changed

+442
-147
lines changed

ansible_mitogen/affinity.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ def assign_controller(self):
142142
Assign the Ansible top-level policy to this process.
143143
"""
144144

145-
def assign_muxprocess(self):
145+
def assign_muxprocess(self, index):
146146
"""
147147
Assign the MuxProcess policy to this process.
148148
"""
@@ -224,7 +224,7 @@ def _balance(self):
224224
))
225225

226226
def _set_cpu(self, cpu):
227-
self._set_affinity(1 << cpu)
227+
self._set_affinity(1 << (cpu % self.cpu_count))
228228

229229
def _clear(self):
230230
all_cpus = (1 << self.cpu_count) - 1
@@ -236,8 +236,8 @@ def assign_controller(self):
236236
else:
237237
self._balance()
238238

239-
def assign_muxprocess(self):
240-
self._set_cpu(0)
239+
def assign_muxprocess(self, index):
240+
self._set_cpu(index)
241241

242242
def assign_worker(self):
243243
self._balance()

ansible_mitogen/connection.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,16 +37,13 @@
3737
import sys
3838
import time
3939

40-
import jinja2.runtime
41-
from ansible.module_utils import six
4240
import ansible.constants as C
4341
import ansible.errors
4442
import ansible.plugins.connection
4543
import ansible.utils.shlex
4644

4745
import mitogen.core
4846
import mitogen.fork
49-
import mitogen.unix
5047
import mitogen.utils
5148

5249
import ansible_mitogen.parsing

ansible_mitogen/mixins.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -182,14 +182,6 @@ def _generate_tmp_path(self):
182182
)
183183
)
184184

185-
def _generate_tmp_path(self):
186-
return os.path.join(
187-
self._connection.get_good_temp_dir(),
188-
'ansible_mitogen_action_%016x' % (
189-
random.getrandbits(8*8),
190-
)
191-
)
192-
193185
def _make_tmp_path(self, remote_user=None):
194186
"""
195187
Create a temporary subdirectory as a child of the temporary directory

ansible_mitogen/process.py

Lines changed: 74 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,13 @@
2828

2929
from __future__ import absolute_import
3030
import atexit
31-
import errno
3231
import logging
3332
import multiprocessing
3433
import os
34+
import resource
3535
import signal
3636
import socket
3737
import sys
38-
import time
3938

4039
try:
4140
import faulthandler
@@ -79,6 +78,14 @@
7978
'"mitogen_*" or "operon_*" strategies are active.'
8079
)
8180

81+
shutting_down_msg = (
82+
'The task worker cannot connect. Ansible may be shutting down, or '
83+
'the maximum open files limit may have been exceeded. If this occurs '
84+
'midway through a run, please retry after increasing the open file '
85+
'limit (ulimit -n). Original error: %s'
86+
)
87+
88+
8289
#: The worker model as configured by the currently running strategy. This is
8390
#: managed via :func:`get_worker_model` / :func:`set_worker_model` functions by
8491
#: :class:`StrategyMixin`.
@@ -229,9 +236,27 @@ def _setup_responder(responder):
229236
)
230237

231238

239+
def increase_open_file_limit():
240+
"""
241+
#549: in order to reduce the possibility of hitting an open files limit,
242+
increase :data:`resource.RLIMIT_NOFILE` from its soft limit to its hard
243+
limit, if they differ.
244+
245+
It is common that a low soft limit is configured by default, where the hard
246+
limit is much higher.
247+
"""
248+
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
249+
if soft < hard:
250+
LOG.debug('raising soft open file limit from %d to %d', soft, hard)
251+
resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
252+
else:
253+
LOG.debug('cannot increase open file limit; existing limit is %d', hard)
254+
255+
232256
def common_setup(enable_affinity=True, _init_logging=True):
233257
save_pid('controller')
234258
ansible_mitogen.logging.set_process_name('top')
259+
235260
if enable_affinity:
236261
ansible_mitogen.affinity.policy.assign_controller()
237262

@@ -247,6 +272,7 @@ def common_setup(enable_affinity=True, _init_logging=True):
247272
mitogen.core.enable_profiling()
248273

249274
MuxProcess.cls_original_env = dict(os.environ)
275+
increase_open_file_limit()
250276

251277

252278
def get_cpu_count(default=None):
@@ -269,10 +295,25 @@ def get_cpu_count(default=None):
269295

270296

271297
class Binding(object):
298+
"""
299+
Represent a bound connection for a particular inventory hostname. When
300+
operating in sharded mode, the actual MuxProcess implementing a connection
301+
varies according to the target machine. Depending on the particular
302+
implementation, this class represents a binding to the correct MuxProcess.
303+
"""
272304
def get_child_service_context(self):
273305
"""
274306
Return the :class:`mitogen.core.Context` to which children should
275-
direct ContextService requests, or :data:`None` for the local process.
307+
direct requests for services such as FileService, or :data:`None` for
308+
the local process.
309+
310+
This can be different from :meth:`get_service_context` where MuxProcess
311+
and WorkerProcess are combined, and it is discovered a task is
312+
delegated after being assigned to its initial worker for the original
313+
un-delegated hostname. In that case, connection management and
314+
expensive services like file transfer must be implemented by the
315+
MuxProcess connected to the target, rather than routed to the
316+
MuxProcess responsible for executing the task.
276317
"""
277318
raise NotImplementedError()
278319

@@ -358,8 +399,8 @@ def __init__(self, _init_logging=True):
358399

359400
def _listener_for_name(self, name):
360401
"""
361-
Given a connection stack, return the UNIX listener that should be used
362-
to communicate with it. This is a simple hash of the inventory name.
402+
Given an inventory hostname, return the UNIX listener that should
403+
communicate with it. This is a simple hash of the inventory name.
363404
"""
364405
if len(self._muxes) == 1:
365406
return self._muxes[0].path
@@ -376,21 +417,26 @@ def _reconnect(self, path):
376417
self.parent = None
377418
self.router = None
378419

379-
self.router, self.parent = mitogen.unix.connect(
380-
path=path,
381-
broker=self.broker,
382-
)
420+
try:
421+
self.router, self.parent = mitogen.unix.connect(
422+
path=path,
423+
broker=self.broker,
424+
)
425+
except mitogen.unix.ConnectError as e:
426+
# This is not AnsibleConnectionFailure since we want to break
427+
# with_items loops.
428+
raise ansible.errors.AnsibleError(shutting_down_msg % (e,))
429+
383430
self.listener_path = path
384431

385432
def on_process_exit(self, sock):
386433
"""
387434
This is an :mod:`atexit` handler installed in the top-level process.
388435
389436
Shut the write end of `sock`, causing the receive side of the socket in
390-
every worker process to wake up with a 0-byte reads, and causing their
391-
main threads to wake up and initiate shutdown. After shutting the
392-
socket down, wait for a 0-byte read from the read end, which will occur
393-
after the last child closes the descriptor on exit.
437+
every worker process to return 0-byte reads, and causing their main
438+
threads to wake and initiate shutdown. After shutting the socket down,
439+
wait on each child to finish exiting.
394440
395441
This is done using :mod:`atexit` since Ansible lacks any better hook to
396442
run code during exit, and unless some synchronization exists with
@@ -407,14 +453,21 @@ def on_process_exit(self, sock):
407453
mitogen.core.io_op(sock.recv, 1)
408454
sock.close()
409455

456+
for mux in self._muxes:
457+
_, status = os.waitpid(mux.pid, 0)
458+
status = mitogen.fork._convert_exit_status(status)
459+
LOG.debug('mux %d PID %d %s', mux.index, mux.pid,
460+
mitogen.parent.returncode_to_str(status))
461+
410462
def _initialize(self):
411463
"""
412-
Arrange for classic process model connection multiplexer child
413-
processes to be started, if they are not already running.
464+
Arrange for classic model multiplexers to be started, if they are not
465+
already running.
414466
415-
The parent process picks a UNIX socket path the child will use prior to
416-
fork, creates a socketpair used essentially as a semaphore, then blocks
417-
waiting for the child to indicate the UNIX socket is ready for use.
467+
The parent process picks a UNIX socket path each child will use prior
468+
to fork, creates a socketpair used essentially as a semaphore, then
469+
blocks waiting for the child to indicate the UNIX socket is ready for
470+
use.
418471
419472
:param bool _init_logging:
420473
For testing, if :data:`False`, don't initialize logging.
@@ -513,8 +566,8 @@ class MuxProcess(object):
513566
Implement a subprocess forked from the Ansible top-level, as a safe place
514567
to contain the Mitogen IO multiplexer thread, keeping its use of the
515568
logging package (and the logging package's heavy use of locks) far away
516-
from the clutches of os.fork(), which is used continuously by the
517-
multiprocessing package in the top-level process.
569+
from os.fork(), which is used continuously by the multiprocessing package
570+
in the top-level process.
518571
519572
The problem with running the multiplexer in that process is that should the
520573
multiplexer thread be in the process of emitting a log entry (and holding
@@ -555,7 +608,6 @@ def start(self):
555608
mitogen.core.io_op(MuxProcess.cls_parent_sock.recv, 1)
556609
return
557610

558-
save_pid('mux')
559611
ansible_mitogen.logging.set_process_name('mux:' + str(self.index))
560612
if setproctitle:
561613
setproctitle.setproctitle('mitogen mux:%s (%s)' % (
@@ -581,7 +633,7 @@ def worker_main(self):
581633
"""
582634
save_pid('mux')
583635
ansible_mitogen.logging.set_process_name('mux')
584-
ansible_mitogen.affinity.policy.assign_muxprocess()
636+
ansible_mitogen.affinity.policy.assign_muxprocess(self.index)
585637

586638
self._setup_master()
587639
self._setup_services()

docs/ansible_detailed.rst

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -79,21 +79,24 @@ Installation
7979

8080
.. raw:: html
8181

82-
<form action="https://www.freelists.org/cgi-bin/subscription.cgi" method="post">
82+
<form action="https://networkgenomics.com/save-email/" method="post" id="emailform">
83+
<input type=hidden name="list_name" value="mitogen-announce">
84+
8385
Releases occur frequently and often include important fixes. Subscribe
84-
to the <a
85-
href="https://www.freelists.org/list/mitogen-announce">mitogen-announce
86-
mailing list</a> be notified of new releases.
86+
to the mitogen-announce list to stay updated.
8787

8888
<p>
8989
<input type="email" placeholder="E-mail Address" name="email" style="font-size: 105%;">
90-
<input type=hidden name="list" value="mitogen-announce">
91-
<!-- <input type=hidden name="url_or_message" value="https://mitogen.readthedocs.io/en/stable/ansible.html#installation">-->
92-
<input type="hidden" name="action" value="subscribe">
9390
<button type="submit" style="font-size: 105%;">
9491
Subscribe
9592
</button>
9693
</p>
94+
95+
<div id="emailthanks" style="display:none">
96+
Thanks!
97+
</div>
98+
99+
<p>
97100
</form>
98101

99102

@@ -1375,3 +1378,19 @@ Despite the small margin for optimization, Mitogen still manages **6.2x less
13751378
bandwidth and 1.8x less time**.
13761379

13771380
.. image:: images/ansible/pcaps/costapp-uk-india.svg
1381+
1382+
1383+
.. raw:: html
1384+
1385+
<script src="https://networkgenomics.com/static/js/public_all.js?92d49a3a"></script>
1386+
<script>
1387+
NetGen = {
1388+
public: {
1389+
page_id: "operon",
1390+
urls: {
1391+
save_email: "https://networkgenomics.com/save-email/",
1392+
}
1393+
}
1394+
};
1395+
setupEmailForm();
1396+
</script>

docs/changelog.rst

Lines changed: 24 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,10 @@ Enhancements
3232
are not yet handled.
3333

3434
* The ``MITOGEN_CPU_COUNT`` environment variable shards the connection
35-
multiplexer into per-CPU worker processes. This improves throughput for large
36-
runs especially involving file transfer, and is a prerequisite to future
37-
in-process SSH support. To match the behaviour of older releases, only one
38-
multiplexer is started by default.
35+
multiplexer into per-CPU workers. This improves throughput for large runs
36+
especially involving file transfer, and is a prerequisite for future
37+
in-process SSH support. One multiplexer starts by default, to match existing
38+
behaviour.
3939

4040
* `#419 <https://github.com/dw/mitogen/issues/419>`_,
4141
`#470 <https://github.com/dw/mitogen/issues/470>`_, file descriptor usage
@@ -56,13 +56,19 @@ Enhancements
5656
some hot paths, and locks that must be taken are held for less time.
5757

5858

59-
Fixes
60-
^^^^^
59+
Mitogen for Ansible
60+
^^^^^^^^^^^^^^^^^^^
6161

6262
* `#363 <https://github.com/dw/mitogen/issues/363>`_: fix an obscure race
6363
matching *Permission denied* errors from some versions of ``su`` running on
6464
heavily loaded machines.
6565

66+
* `#549 <https://github.com/dw/mitogen/issues/549>`_: the open file descriptor
67+
limit for the Ansible process is increased to the available hard limit. It is
68+
common for distributions to ship with a much higher hard limit than their
69+
default soft limit, allowing *"too many open files"* errors to be avoided
70+
more often in large runs without user configuration.
71+
6672
* `#578 <https://github.com/dw/mitogen/issues/578>`_: the extension could crash
6773
while rendering an error message, due to an incorrect format string.
6874

@@ -93,14 +99,14 @@ Fixes
9399
Core Library
94100
~~~~~~~~~~~~
95101

96-
* Logs are more readable, and many :func:`repr` strings are more descriptive.
97-
The old pseudo-function-call format is slowly migrating to human-readable
98-
output where possible. For example, *"Stream(ssh:123).connect()"* might
99-
be written *"connecting to ssh:123"*.
102+
* Log readability is improving, and many :func:`repr` strings are more
103+
descriptive. The old pseudo-function-call format is slowly migrating to
104+
human-readable output where possible. For example,
105+
*"Stream(ssh:123).connect()"* might be written *"connecting to ssh:123"*.
100106

101107
* :func:`bytearray` was removed from the list of supported serialization types.
102108
It was never portable between Python versions, unused, and never made much
103-
sense to support as a wire type.
109+
sense to support.
104110

105111
* `#170 <https://github.com/dw/mitogen/issues/170>`_: to improve subprocess
106112
management and asynchronous connect, a :class:`mitogen.parent.TimerList`
@@ -123,13 +129,12 @@ Core Library
123129
Python.
124130

125131
* `#256 <https://github.com/dw/mitogen/issues/256>`_,
126-
127-
`#419 <https://github.com/dw/mitogen/issues/419>`_: most :func:`os.dup` was
128-
eliminated, along with almost all manual file descriptor management.
129-
Descriptors are trapped in :func:`os.fdopen` objects when they are created,
130-
ensuring a leaked object will close itself, and ensuring every descriptor is
131-
fused to a `closed` flag, preventing historical bugs where a double close
132-
could destroy descriptors belonging to unrelated streams.
132+
`#419 <https://github.com/dw/mitogen/issues/419>`_: most :func:`os.dup` use
133+
was eliminated, along with almost all manual file descriptor management.
134+
Descriptors are trapped in :func:`os.fdopen` objects at creation, ensuring a
135+
leaked object will close itself, and ensuring every descriptor is fused to a
136+
`closed` flag, preventing historical bugs where a double close could destroy
137+
descriptors belonging to unrelated streams.
133138

134139
* `a5536c35 <https://github.com/dw/mitogen/commit/a5536c35>`_: avoid quadratic
135140
buffer management when logging lines received from a child's redirected
@@ -141,6 +146,7 @@ Thanks!
141146

142147
Mitogen would not be possible without the support of users. A huge thanks for
143148
bug reports, testing, features and fixes in this release contributed by
149+
`Andreas Hubert <https://github.com/peshay>`_.
144150
`Anton Markelov <https://github.com/strangeman>`_,
145151
`Nigel Metheringham <https://github.com/nigelm>`_,
146152
`Orion Poplawski <https://github.com/opoplawski>`_,

0 commit comments

Comments
 (0)