Skip to content

Commit 43c5fb6

Browse files
authored
fix(azure-iot-device): ReconnectStage State Machine Overhaul (#844)
* Auto-reconnect and SAS renewals should no longer cause problems if one or the other is turned off * Operations now fail if they require a connection, and one cannot be automatically established * All user-initiated actions should now return errors in case of failure rather than any kind of indefinite retry * Created completely separate flows for automatic reconnection vs other connections * Added more explicitness in conditional logic and improved documentation for clarity
1 parent adfe9ac commit 43c5fb6

File tree

7 files changed

+1477
-1652
lines changed

7 files changed

+1477
-1652
lines changed

azure-iot-device/azure/iot/device/common/pipeline/pipeline_ops_base.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -254,10 +254,6 @@ class ReauthorizeConnectionOperation(PipelineOperation):
254254
Even though this is an base operation, it will most likely be handled by a more specific stage (such as an IoTHub or MQTT stage).
255255
"""
256256

257-
def __init__(self, callback):
258-
self.watchdog_timer = None
259-
super(ReauthorizeConnectionOperation, self).__init__(callback)
260-
261257
pass
262258

263259

@@ -270,7 +266,9 @@ class DisconnectOperation(PipelineOperation):
270266
Even though this is an base operation, it will most likely be handled by a more specific stage (such as an IoTHub or MQTT stage).
271267
"""
272268

273-
pass
269+
def __init__(self, callback):
270+
self.hard = True # Indicates if this is a "hard" disconnect that kills in-flight ops
271+
super(DisconnectOperation, self).__init__(callback)
274272

275273

276274
class EnableFeatureOperation(PipelineOperation):

azure-iot-device/azure/iot/device/common/pipeline/pipeline_stages_base.py

Lines changed: 350 additions & 251 deletions
Large diffs are not rendered by default.

azure-iot-device/azure/iot/device/common/pipeline/pipeline_stages_mqtt.py

Lines changed: 56 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -41,24 +41,26 @@ def __init__(self):
4141

4242
# The transport will be instantiated upon receiving the InitializePipelineOperation
4343
self.transport = None
44-
44+
# The current in-progress op that affects connection state (Connect, Disconnect, Reauthorize)
4545
self._pending_connection_op = None
46+
# Waitable event indicating the disconnect portion of the reauthorization is complete
47+
self._reauth_disconnection_complete = threading.Event()
4648

4749
@pipeline_thread.runs_on_pipeline_thread
4850
def _cancel_pending_connection_op(self, error=None):
4951
"""
50-
Cancel any running connect, disconnect or reauthorize_connection op. Since our ability to "cancel" is fairly limited,
52+
Cancel any running connect, disconnect or reauthorize connection op. Since our ability to "cancel" is fairly limited,
5153
all this does (for now) is to fail the operation
5254
"""
5355

5456
op = self._pending_connection_op
5557
if op:
5658
# NOTE: This code path should NOT execute in normal flow. There should never already be a pending
57-
# connection op when another is added, due to the SerializeConnectOps stage.
59+
# connection op when another is added, due to the ConnectionLock stage.
5860
# If this block does execute, there is a bug in the codebase.
5961
if not error:
6062
error = pipeline_exceptions.OperationCancelled(
61-
"Cancelling because new ConnectOperation, DisconnectOperation, or ReauthorizeConnectionOperation was issued"
63+
"Cancelling because new ConnectOperation or DisconnectOperationwas issued"
6264
)
6365
self._cancel_connection_watchdog(op)
6466
op.complete(error=error)
@@ -211,29 +213,40 @@ def _run_op(self, op):
211213
# already doing.
212214

213215
try:
214-
self.transport.disconnect(clear_inflight=True)
216+
# The connect after the disconnect will be triggered upon completion of the
217+
# disconnect in the on_disconnected handler
218+
self.transport.disconnect(clear_inflight=op.hard)
215219
except Exception as e:
216220
logger.info("transport.disconnect raised error while disconnecting")
217221
logger.info(traceback.format_exc())
218222
self._pending_connection_op = None
219223
op.complete(error=e)
220224

221225
elif isinstance(op, pipeline_ops_base.ReauthorizeConnectionOperation):
222-
logger.debug("{}({}): reauthorizing".format(self.name, op.name))
226+
logger.debug(
227+
"{}({}): reauthorizing. Will issue disconnect and then a connect".format(
228+
self.name, op.name
229+
)
230+
)
231+
self_weakref = weakref.ref(self)
232+
reauth_op = op # rename for clarity
223233

224-
self._cancel_pending_connection_op()
225-
self._pending_connection_op = op
226-
# We don't need a watchdog on disconnect because there's no callback to wait for
227-
# and we respond to a watchdog timeout by calling disconnect, which is what we're
228-
# already doing.
234+
def on_disconnect_complete(op, error):
235+
this = self_weakref()
236+
if error:
237+
# Failing a disconnect should still get us disconnected, so can proceed anyway
238+
logger.debug(
239+
"Disconnect failed during reauthorization, continuing with connect"
240+
)
241+
connect_op = reauth_op.spawn_worker_op(pipeline_ops_base.ConnectOperation)
229242

230-
try:
231-
self.transport.disconnect()
232-
except Exception as e:
233-
logger.info("transport.disconnect raised error while reauthorizing")
234-
logger.info(traceback.format_exc())
235-
self._pending_connection_op = None
236-
op.complete(error=e)
243+
# the problem is this doens't unset the disconnect from being the pending op before continuing
244+
this.run_op(connect_op)
245+
246+
disconnect_op = pipeline_ops_base.DisconnectOperation(callback=on_disconnect_complete)
247+
disconnect_op.hard = False
248+
249+
self.run_op(disconnect_op)
237250

238251
elif isinstance(op, pipeline_ops_mqtt.MQTTPublishOperation):
239252
logger.debug("{}({}): publishing on {}".format(self.name, op.name, op.topic))
@@ -331,7 +344,7 @@ def _on_mqtt_connected(self):
331344
self.send_event_up(pipeline_events_base.ConnectedEvent())
332345

333346
if isinstance(self._pending_connection_op, pipeline_ops_base.ConnectOperation):
334-
logger.debug("completing connect op")
347+
logger.debug("{}: completing connect op".format(self.name))
335348
op = self._pending_connection_op
336349
self._cancel_connection_watchdog(op)
337350
self._pending_connection_op = None
@@ -340,7 +353,9 @@ def _on_mqtt_connected(self):
340353
# This should indicate something odd is going on.
341354
# If this occurs, either a connect was completed while there was no pending op,
342355
# OR that a connect was completed while a disconnect op was pending
343-
logger.info("Connection was unexpected")
356+
logger.info(
357+
"{}: Connection was unexpected (no connection op pending)".format(self.name)
358+
)
344359

345360
@pipeline_thread.invoke_on_pipeline_thread_nowait
346361
def _on_mqtt_connection_failure(self, cause):
@@ -378,47 +393,47 @@ def _on_mqtt_disconnected(self, cause=None):
378393

379394
# Send an event to tell other pipeline stages that we're disconnected. Do this before
380395
# we do anything else (in case upper stages have any "are we connected" logic.)
396+
# NOTE: Other stages rely on the fact that this occurs before any op that may be in
397+
# progress is completed. Be careful with changing the order things occur here.
381398
self.send_event_up(pipeline_events_base.DisconnectedEvent())
382399

383-
# Complete any pending connection ops
384400
if self._pending_connection_op:
385-
# on_mqtt_disconnected will cause any pending connect op to complete. This is how Paho
386-
# behaves when there is a connection error, and it also makes sense that on_mqtt_disconnected
387-
# would cause a pending connection op to fail.
388-
logger.debug(
389-
"{}: completing pending {} op".format(self.name, self._pending_connection_op.name)
390-
)
401+
391402
op = self._pending_connection_op
392-
self._cancel_connection_watchdog(op)
393-
self._pending_connection_op = None
394403

395-
if isinstance(op, pipeline_ops_base.DisconnectOperation) or isinstance(
396-
op, pipeline_ops_base.ReauthorizeConnectionOperation
397-
):
404+
if isinstance(op, pipeline_ops_base.DisconnectOperation):
405+
logger.debug(
406+
"{}: Expected disconnect - completing pending disconnect op".format(self.name)
407+
)
398408
# Swallow any errors if we intended to disconnect - even if something went wrong, we
399409
# got to the state we wanted to be in!
400-
#
401-
# NOTE: ReauthorizeConnectionOperation currently completes on disconnect, not when
402-
# the connection is re-established (it leaves connection retry to automatically
403-
# re-establish). This needs to change because it is inaccurate and means that if
404-
# a SASToken expires while connection retry is disabled, a reauthorization cannot
405-
# complete (!!!!)
406-
# TODO: Fix that!
407410
if cause:
408411
handle_exceptions.swallow_unraised_exception(
409412
cause,
410-
log_msg="Unexpected disconnect with error while disconnecting - swallowing error",
413+
log_msg="Unexpected error while disconnecting - swallowing error",
411414
)
415+
# Disconnect complete, no longer pending
416+
self._pending_connection_op = None
412417
op.complete()
418+
413419
else:
420+
logger.debug(
421+
"{}: Unexpected disconnect - completing pending {} operation".format(
422+
self.name, op.name
423+
)
424+
)
425+
# Cancel any potential connection watchdog, and clear the pending op
426+
self._cancel_connection_watchdog(op)
427+
self._pending_connection_op = None
428+
# Complete
414429
if cause:
415430
op.complete(error=cause)
416431
else:
417432
op.complete(
418433
error=transport_exceptions.ConnectionDroppedError("transport disconnected")
419434
)
420435
else:
421-
logger.info("{}: disconnection was unexpected".format(self.name))
436+
logger.info("{}: Unexpected disconnect (no pending connection op)".format(self.name))
422437

423438
# If there is no connection retry, cancel any transport operations waiting on response
424439
# so that they do not get stuck there.

azure-iot-device/tests/common/pipeline/test_pipeline_ops_base.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -105,18 +105,10 @@ def init_kwargs(self, mocker):
105105
return kwargs
106106

107107

108-
class ReauthorizeConnectionOperationInstantiationTests(ReauthorizeConnectionOperationTestConfig):
109-
@pytest.mark.it("Initializes 'watchdog_timer' attribute to 'None'")
110-
def test_retry_timer(self, cls_type, init_kwargs):
111-
op = cls_type(**init_kwargs)
112-
assert op.watchdog_timer is None
113-
114-
115108
pipeline_ops_test.add_operation_tests(
116109
test_module=this_module,
117110
op_class_under_test=pipeline_ops_base.ReauthorizeConnectionOperation,
118111
op_test_config_class=ReauthorizeConnectionOperationTestConfig,
119-
extended_op_instantiation_test_class=ReauthorizeConnectionOperationInstantiationTests,
120112
)
121113

122114

0 commit comments

Comments
 (0)