Skip to content

Commit 55613fd

Browse files
committed
Tree: fix error handling when gateway channel is closing
Fix PropagationChannel.ev_close(), where gateway channel termination is handled. If we get an actual rc > 0, it comes from the gateway command itself and means the gateway is defective or misconfigured; in that case, we mark it as unreachable at the Task level. In addition, if we have not launched the remote commands yet, they are redistributed to other available gateways. rc=None is now handled as a normal termination of the propagation channel, and the corresponding gateway is no longer marked as unreachable. Fixes #566.
1 parent d547257 commit 55613fd

File tree

2 files changed

+18
-10
lines changed

2 files changed

+18
-10
lines changed

lib/ClusterShell/Propagation.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -394,24 +394,32 @@ def recv_ctl(self, msg):
394394

395395
def ev_hup(self, worker, node, rc):
396396
"""Channel command is closing"""
397+
self.logger.debug("ev_hup gateway=%s %s", str(worker.nodes), self)
397398
self._rc = rc
398399

399400
def ev_close(self, worker, timedout):
400401
"""Channel is closing"""
401402
# do not use worker buffer or rc accessors here as we don't use
402403
# common stream names
403404
gateway = str(worker.nodes)
404-
self.logger.debug("ev_close gateway=%s %s", gateway, self)
405-
self.logger.debug("ev_close rc=%s", self._rc) # may be None
406-
407-
# NOTE: self._rc may be None if the communication channel has aborted
408-
if self._rc != 0:
409-
self.logger.debug("error on gateway %s (setup=%s)", gateway,
410-
self.setup)
405+
self.logger.debug("ev_close gateway=%s rc=%s %s", gateway, self._rc,
406+
self)
407+
408+
# NOTE: self._rc is set None when _we_ close the channel (abort)
409+
if self._rc is None and not self.setup:
410+
# aborting before the channel is setup is worth a warning
411+
self.logger.warning("ev_close: rc=%s with channel not setup",
412+
self._rc)
413+
414+
if self._rc is not None and self._rc != 0:
415+
# handle gateway channel error
416+
self.logger.debug("error on gateway %s (rc=%s, setup=%s)", gateway,
417+
self._rc, self.setup)
411418
self.task.router.mark_unreachable(gateway)
412419
self.logger.debug("gateway %s now set as unreachable", gateway)
413420

414421
if not self.setup:
415-
# channel was not set up: we can safely repropagate commands
422+
# channel was not set up: we can safely redistribute commands
423+
self.logger.debug("channel was not set up: redistributing...")
416424
for mw in set(self.task.gateways[gateway][1]):
417425
mw._relaunch(gateway)

lib/ClusterShell/Worker/Tree.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -430,8 +430,8 @@ def _on_remote_node_msgline(self, node, msg, sname, gateway):
430430
def _on_remote_node_close(self, node, rc, gateway):
431431
"""remote node closing with return code"""
432432
DistantWorker._on_node_close(self, node, rc)
433-
self.logger.debug("_on_remote_node_close %s %s via gw %s", node,
434-
self._close_count, gateway)
433+
self.logger.debug("_on_remote_node_close %s %s via gw %s rc=%s", node,
434+
self._close_count, gateway, rc)
435435

436436
# finalize rcopy: extract tar data
437437
if self.source and self.reverse:

0 commit comments

Comments
 (0)