Skip to content

Commit c106cff

Browse files
authored
Merge pull request #751 from netenglabs/fix-exceptions
Fix exception handling in the poller
2 parents 1e3c5c9 + 0670a19 commit c106cff

File tree

4 files changed

+35
-19
lines changed

4 files changed

+35
-19
lines changed

suzieq/poller/sq_poller.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,12 @@ async def start_controller(user_args: argparse.Namespace, config_data: Dict):
5252
if not log_stdout:
5353
print(f"ERROR: {error}")
5454
logger.error(error)
55-
sys.exit(-1)
55+
sys.exit(1)
56+
except Exception as error:
57+
if not log_stdout:
58+
traceback.print_exc()
59+
logger.critical(f'{error}\n{traceback.format_exc()}')
60+
sys.exit(1)
5661

5762

5863
def controller_main():
@@ -198,8 +203,6 @@ def controller_main():
198203
asyncio.run(start_controller(args, cfg))
199204
except (KeyboardInterrupt, RuntimeError):
200205
pass
201-
except Exception: # pylint: disable=broad-except
202-
traceback.print_exc()
203206

204207

205208
if __name__ == '__main__':

suzieq/poller/worker/nodes/node.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -146,8 +146,7 @@ async def initialize(self, **kwargs) -> TNode:
146146
passphrase)
147147
if not self.jump_host_key:
148148
raise SqPollerConfError('Unable to read private key file'
149-
f' at {pvtkey_file}'
150-
)
149+
f' at {pvtkey_file}')
151150
else:
152151
self.jump_host = None
153152
self.jump_host_key = None
@@ -463,6 +462,15 @@ async def _detect_node_type(self):
463462
# that sets the device type
464463
devtype = self.devtype
465464
except Exception:
465+
self.logger.exception(f'{self.address}:{self.port}: Node '
466+
'discovery failed due to exception')
467+
# All the exceptions related to timeouts and authentication
468+
# problems are already catched inside. If we get an
469+
# exception here, this is unexpected and most likely something
470+
# went wrong with the command output parsing.
471+
# In this case there is not point in retrying discovery, it is
472+
# likely a bug.
473+
self._retry = 0
466474
devtype = None
467475

468476
if not devtype:
@@ -840,7 +848,6 @@ async def _exec_cmd(self, service_callback, cmd_list, cb_token,
840848
if only_one is True, commands are executed until the first one that
841849
succeeds, and the rest are ignored.
842850
'''
843-
844851
if self.transport == "ssh":
845852
await self._ssh_gather(service_callback, cmd_list, cb_token,
846853
oformat, timeout, only_one)
@@ -1220,7 +1227,7 @@ async def _rest_gather(self, service_callback, cmd_list, cb_token,
12201227
result.append(self._create_error(cmd))
12211228
self.logger.error(
12221229
f"{self.transport}://{self.hostname}:{self.port}: Unable "
1223-
"to communicate with node due to str(e)")
1230+
f"to communicate with node due to {str(e)}")
12241231

12251232
await service_callback(result, cb_token)
12261233

@@ -1331,8 +1338,8 @@ async def _init_rest(self):
13311338
except Exception as e:
13321339
self.current_exception = e
13331340
self.logger.error(
1334-
"{self.transport}://{self.hostname}:{self.port}: Unable to"
1335-
" communicate with node due to {str(e)}")
1341+
f"{self.transport}://{self.hostname}:{self.port}: Unable "
1342+
f"to communicate with node due to {str(e)}")
13361343

13371344
async def _rest_gather(self, service_callback, cmd_list, cb_token,
13381345
oformat='json', timeout=None):
@@ -1372,8 +1379,8 @@ async def _rest_gather(self, service_callback, cmd_list, cb_token,
13721379
self.current_exception = e
13731380
result.append(self._create_error(cmd_list))
13741381
self.logger.error(
1375-
"{self.transport}://{self.hostname}:{self.port}: Unable "
1376-
"to communicate with node due to {str(e)}")
1382+
f"{self.transport}://{self.hostname}:{self.port}: Unable "
1383+
f"to communicate with node due to {str(e)}")
13771384

13781385
await service_callback(result, cb_token)
13791386

@@ -1700,10 +1707,10 @@ async def _ssh_gather(self, service_callback, cmd_list, cb_token, oformat,
17001707
await self._close_connection()
17011708
self.logger.debug("Closed conn successfully for "
17021709
f"{self.hostname}")
1703-
except Exception as e1:
1710+
except Exception as close_exc:
17041711
self.logger.error(
17051712
f"Caught an exception closing {self.hostname}"
1706-
f" for {cmd}: {e1}")
1713+
f" for {cmd}: {close_exc}")
17071714
else:
17081715
self.logger.error(
17091716
f"Unable to connect to {self.hostname} {cmd} "

suzieq/poller/worker/sq_worker.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,10 @@
88
from typing import Dict
99

1010
import uvloop
11-
1211
from suzieq.poller.worker.worker import Worker
1312
from suzieq.poller.worker.writers.output_worker import OutputWorker
1413
from suzieq.shared.exceptions import InventorySourceError, SqPollerConfError
15-
from suzieq.shared.utils import poller_log_params, init_logger, load_sq_config
14+
from suzieq.shared.utils import init_logger, load_sq_config, poller_log_params
1615

1716

1817
async def start_worker(userargs: argparse.Namespace, cfg: Dict):
@@ -36,8 +35,14 @@ async def start_worker(userargs: argparse.Namespace, cfg: Dict):
3635
await worker.init_worker()
3736
await worker.run()
3837
except (SqPollerConfError, InventorySourceError) as error:
38+
if not log_stdout:
39+
print(error)
3940
logger.error(error)
40-
print(error)
41+
sys.exit(1)
42+
except Exception as error:
43+
if not log_stdout:
44+
traceback.print_exc()
45+
logger.critical(f'{error}\n{traceback.format_exc()}')
4146
sys.exit(1)
4247

4348

@@ -130,8 +135,6 @@ def worker_main():
130135
asyncio.run(start_worker(userargs, cfg))
131136
except (KeyboardInterrupt, RuntimeError):
132137
pass
133-
except Exception: # pylint: disable=broad-except
134-
traceback.print_exc()
135138

136139
sys.exit(0)
137140

suzieq/poller/worker/worker.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,8 +138,11 @@ async def run(self):
138138
tasks = await self._pop_waiting_worker_tasks()
139139
while tasks:
140140
try:
141-
_, pending = await asyncio.wait(
141+
done, pending = await asyncio.wait(
142142
tasks, return_when=asyncio.FIRST_COMPLETED)
143+
for d in done:
144+
if d.exception():
145+
raise d.exception()
143146
tasks = list(pending)
144147
running_svcs = self.service_manager.running_services
145148
if tasks and any(i._coro in running_svcs

0 commit comments

Comments
 (0)