Skip to content

Commit 7e5e3f6

Browse files
committed
Tree: fix error handling when gateway channel is closing
Fix PropagationChannel.ev_close(), where gateway channel termination is handled. If we get an actual rc > 0, it comes from the gateway command itself, which means the gateway is defective or misconfigured; in that case, we mark it as unreachable at the Task level. In addition, if the remote commands have not been launched yet, they are redistributed to other available gateways. rc=None is now handled as a normal termination of the propagation channel, and the corresponding gateway is no longer marked as unreachable. Fixes #566.
1 parent 014a201 commit 7e5e3f6

File tree

2 files changed

+16
-7
lines changed

2 files changed

+16
-7
lines changed

lib/ClusterShell/Propagation.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,7 @@ def recv_ctl(self, msg):
394394

395395
def ev_hup(self, worker, node, rc):
396396
"""Channel command is closing"""
397+
self.logger.debug("ev_hup gateway=%s %s", str(worker.nodes), self)
397398
self._rc = rc
398399

399400
def ev_close(self, worker, timedout):
@@ -402,16 +403,24 @@ def ev_close(self, worker, timedout):
402403
# common stream names
403404
gateway = str(worker.nodes)
404405
self.logger.debug("ev_close gateway=%s %s", gateway, self)
406+
407+
# NOTE: self._rc is set None when we abort the channel
405408
self.logger.debug("ev_close rc=%s", self._rc) # may be None
406409

407-
# NOTE: self._rc may be None if the communication channel has aborted
408-
if self._rc != 0:
409-
self.logger.debug("error on gateway %s (setup=%s)", gateway,
410-
self.setup)
410+
if self._rc is None and not self.setup:
411+
# aborting before the channel is setup would be weird...
412+
self.logger.warning("ev_close rc=%s and self.setup=%s", self._rc,
413+
self.setup)
414+
415+
if self._rc is not None and self._rc != 0:
416+
# handle gateway channel error
417+
self.logger.debug("error on gateway %s (rc=%s, setup=%s)", gateway,
418+
self._rc, self.setup)
411419
self.task.router.mark_unreachable(gateway)
412420
self.logger.debug("gateway %s now set as unreachable", gateway)
413421

414422
if not self.setup:
415-
# channel was not set up: we can safely repropagate commands
423+
# channel was not set up: we can safely redistribute commands
424+
self.logger.debug("channel was not set up: redistributing...")
416425
for mw in set(self.task.gateways[gateway][1]):
417426
mw._relaunch(gateway)

lib/ClusterShell/Worker/Tree.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -430,8 +430,8 @@ def _on_remote_node_msgline(self, node, msg, sname, gateway):
430430
def _on_remote_node_close(self, node, rc, gateway):
431431
"""remote node closing with return code"""
432432
DistantWorker._on_node_close(self, node, rc)
433-
self.logger.debug("_on_remote_node_close %s %s via gw %s", node,
434-
self._close_count, gateway)
433+
self.logger.debug("_on_remote_node_close %s %s via gw %s rc=%s", node,
434+
self._close_count, gateway, rc)
435435

436436
# finalize rcopy: extract tar data
437437
if self.source and self.reverse:

0 commit comments

Comments
 (0)