
Commit 24a0639

[OVN] Hash Ring: Better handle Neutron worker failures
This patch implements a more resilient approach to handling the case where Neutron API workers are killed and restarted. Instead of marking all nodes for that host as offline, the patch tries to remove only the worker that was killed from the Hash Ring, leaving all other nodes for that host online.

In case we fail to remove the node and another entry is added when the worker restarts, this patch also logs a clear critical message to alert the operator that there are more Hash Ring nodes than API workers (they are expected to match) and that OVSDB events could go missing if they are routed to the stale node that failed to be removed from the ring.

Closes-Bug: #2024205
Change-Id: I4b7376cf7df45fcc6e487970b068d06b4e74e319
Signed-off-by: Lucas Alvares Gomes <[email protected]>
(cherry picked from commit 9e8e3a7)
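For illustration, below is a minimal, self-contained sketch of the shutdown-cleanup pattern the patch relies on: each worker registers an atexit hook and a SIGTERM handler that remove only that worker's own node from the ring. HashRingRegistry, Worker and their in-memory storage are hypothetical stand-ins used for the sketch; the actual change uses the Neutron DB helpers shown in the diffs below.

# Hypothetical sketch of the per-worker cleanup pattern described above.
# HashRingRegistry/Worker are stand-ins; only the atexit/SIGTERM wiring
# and the "remove my own node only" behavior mirror the change.
import atexit
import signal
import uuid


class HashRingRegistry:
    """Toy registry mapping node UUIDs to hostnames (stands in for the DB)."""

    def __init__(self):
        self.nodes = {}

    def add_node(self, hostname):
        node_uuid = uuid.uuid4().hex
        self.nodes[node_uuid] = hostname
        return node_uuid

    def remove_node_by_uuid(self, node_uuid):
        # Remove only this worker's entry, leaving any other nodes
        # registered by the same host untouched.
        self.nodes.pop(node_uuid, None)


class Worker:
    def __init__(self, registry, hostname):
        self.registry = registry
        self.node_uuid = registry.add_node(hostname)
        # Run the cleanup both on normal interpreter exit and on SIGTERM.
        atexit.register(self._remove_node_from_hash_ring)
        signal.signal(signal.SIGTERM, self._remove_node_from_hash_ring)

    def _remove_node_from_hash_ring(self, *args, **kwargs):
        # Workers that never registered have no node to clean up.
        if self.node_uuid is None:
            return
        # A real service would also continue with its normal shutdown here.
        self.registry.remove_node_by_uuid(self.node_uuid)


if __name__ == '__main__':
    registry = HashRingRegistry()
    worker = Worker(registry, hostname='node-1')
    print(registry.nodes)  # one entry for this worker, removed at exit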
1 parent d0c5478 commit 24a0639

6 files changed, 53 insertions(+), 20 deletions(-)

neutron/cmd/ovn/neutron_ovn_db_sync_util.py

Lines changed: 4 additions & 4 deletions
@@ -57,12 +57,12 @@ def post_fork_initialize(self, resource, event, trigger, **kwargs):
     def ovn_client(self):
         return self._ovn_client
 
-    def _set_hash_ring_nodes_offline(self):
-        """Don't set hash ring nodes as offline.
+    def _remove_node_from_hash_ring(self):
+        """Don't remove the node from the Hash Ring.
 
         If this method was not overridden, cleanup would be performed when
-        calling the db sync and running neutron server would mark all the
-        nodes from the ring as offline.
+        calling the db sync and running neutron server would remove the
+        nodes from the Hash Ring.
         """
 
         # Since we are not using the ovn mechanism driver while syncing,

neutron/db/ovn_hash_ring_db.py

Lines changed: 13 additions & 4 deletions
@@ -50,6 +50,13 @@ def remove_nodes_from_host(context, group_name):
              CONF.host, group_name)
 
 
+def remove_node_by_uuid(context, node_uuid):
+    with db_api.CONTEXT_WRITER.using(context):
+        context.session.query(ovn_models.OVNHashRing).filter(
+            ovn_models.OVNHashRing.node_uuid == node_uuid).delete()
+    LOG.info('Node "%s" removed from the Hash Ring', node_uuid)
+
+
 def _touch(context, updated_at=None, **filter_args):
     if updated_at is None:
         updated_at = timeutils.utcnow()

@@ -96,7 +103,9 @@ def count_offline_nodes(context, interval, group_name):
     return query.count()
 
 
-def set_nodes_from_host_as_offline(context, group_name):
-    timestamp = datetime.datetime(day=26, month=10, year=1985, hour=9)
-    _touch(context, updated_at=timestamp, hostname=CONF.host,
-           group_name=group_name)
+@db_api.CONTEXT_READER
+def count_nodes_from_host(context, group_name):
+    query = context.session.query(ovn_models.OVNHashRing).filter(
+        ovn_models.OVNHashRing.group_name == group_name,
+        ovn_models.OVNHashRing.hostname == CONF.host)
+    return query.count()
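As a usage note, both new helpers take an admin context. The sketch below is illustrative only: drop_worker_node is a hypothetical wrapper and the import paths are assumptions; only the two ovn_hash_ring_db calls come from the diff above.

# Hypothetical usage sketch of the new Hash Ring DB helpers.
from neutron_lib import context as n_context

from neutron.db import ovn_hash_ring_db


def drop_worker_node(node_uuid, group_name):
    admin_context = n_context.get_admin_context()

    # Delete only the Hash Ring entry that belongs to the stopped worker.
    ovn_hash_ring_db.remove_node_by_uuid(admin_context, node_uuid)

    # Count what this host still has registered for the group; the
    # maintenance task compares this number against the API worker count.
    return ovn_hash_ring_db.count_nodes_from_host(admin_context, group_name)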

neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py

Lines changed: 11 additions & 7 deletions
@@ -270,17 +270,17 @@ def subscribe(self):
                            resources.SECURITY_GROUP_RULE,
                            events.BEFORE_DELETE)
 
-    def _set_hash_ring_nodes_offline(self, *args, **kwargs):
+    def _remove_node_from_hash_ring(self, *args, **kwargs):
+        # The node_uuid attribute will be empty for worker types
+        # that are not added to the Hash Ring and can be skipped
+        if self.node_uuid is None:
+            return
         admin_context = n_context.get_admin_context()
-        ovn_hash_ring_db.set_nodes_from_host_as_offline(
-            admin_context, self.hash_ring_group)
-        LOG.info('Hash Ring nodes from host "%s" marked as offline',
-                 cfg.CONF.host)
+        ovn_hash_ring_db.remove_node_by_uuid(
+            admin_context, self.node_uuid)
 
     def pre_fork_initialize(self, resource, event, trigger, payload=None):
         """Pre-initialize the ML2/OVN driver."""
-        atexit.register(self._set_hash_ring_nodes_offline)
-        signal.signal(signal.SIGTERM, self._set_hash_ring_nodes_offline)
         ovn_utils.create_neutron_pg_drop()
 
     @staticmethod

@@ -298,6 +298,10 @@ def _setup_hash_ring(self):
         thread for this host. Subsequently workers just need to register
         themselves to the hash ring.
         """
+        # Attempt to remove the node from the ring when the worker stops
+        atexit.register(self._remove_node_from_hash_ring)
+        signal.signal(signal.SIGTERM, self._remove_node_from_hash_ring)
+
         admin_context = n_context.get_admin_context()
         if not self._hash_ring_probe_event.is_set():
             # Clear existing entries

neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/maintenance.py

Lines changed: 18 additions & 0 deletions
@@ -42,6 +42,7 @@
 from neutron.db import segments_db
 from neutron.objects import router as router_obj
 from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import ovn_db_sync
+from neutron import service
 from neutron.services.logapi.drivers.ovn import driver as log_driver
 
 

@@ -1067,3 +1068,20 @@ def touch_hash_ring_nodes(self):
         # here because we want the maintenance tasks from each instance to
         # execute this task.
         hash_ring_db.touch_nodes_from_host(self.ctx, self._group)
+
+        # Check the number of the nodes in the ring and log a message in
+        # case they are out of sync. See LP #2024205 for more information
+        # on this issue.
+        api_workers = service._get_api_workers()
+        num_nodes = hash_ring_db.count_nodes_from_host(self.ctx, self._group)
+
+        if num_nodes > api_workers:
+            LOG.critical(
+                'The number of nodes in the Hash Ring (%d) is higher than '
+                'the number of API workers (%d) for host "%s". Something is '
+                'not right and OVSDB events could be missed because of this. '
+                'Please check the status of the Neutron processes, this can '
+                'happen when the API workers are killed and restarted. '
+                'Restarting the service should fix the issue, see LP '
+                '#2024205 for more information.',
+                num_nodes, api_workers, cfg.CONF.host)

neutron/tests/functional/base.py

Lines changed: 1 addition & 1 deletion
@@ -363,7 +363,7 @@ def trigger(self):
             # NOTE(ralonsoh): do not access to the DB at exit when the SQL
             # connection is already closed, to avoid useless exception messages.
             mock.patch.object(
-                self.mech_driver, '_set_hash_ring_nodes_offline').start()
+                self.mech_driver, '_remove_node_from_hash_ring').start()
             self.mech_driver.pre_fork_initialize(
                 mock.ANY, mock.ANY, trigger_cls.trigger)

neutron/tests/unit/db/test_ovn_hash_ring_db.py

Lines changed: 6 additions & 4 deletions
@@ -270,16 +270,18 @@ def test_count_offline_nodes(self):
         self.assertEqual(0, ovn_hash_ring_db.count_offline_nodes(
             self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP))
 
-    def test_set_nodes_from_host_as_offline(self):
+    def test_remove_node_by_uuid(self):
         self._add_nodes_and_assert_exists(count=3)
 
         active_nodes = ovn_hash_ring_db.get_active_nodes(
             self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP)
         self.assertEqual(3, len(active_nodes))
 
-        ovn_hash_ring_db.set_nodes_from_host_as_offline(
-            self.admin_ctx, HASH_RING_TEST_GROUP)
+        node_to_remove = active_nodes[0].node_uuid
+        ovn_hash_ring_db.remove_node_by_uuid(
+            self.admin_ctx, node_to_remove)
 
         active_nodes = ovn_hash_ring_db.get_active_nodes(
             self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP)
-        self.assertEqual(0, len(active_nodes))
+        self.assertEqual(2, len(active_nodes))
+        self.assertNotIn(node_to_remove, [n.node_uuid for n in active_nodes])
