Skip to content

Commit f2e5b2b

Browse files
Zuulopenstack-gerrit
authored andcommitted
Merge "[OVN] Hash Ring: Better handle Neutron worker failures" into stable/zed
2 parents 2e50aef + ebd1980 commit f2e5b2b

File tree

6 files changed

+53
-20
lines changed

6 files changed

+53
-20
lines changed

neutron/cmd/ovn/neutron_ovn_db_sync_util.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,12 +58,12 @@ def post_fork_initialize(self, resource, event, trigger, **kwargs):
5858
def ovn_client(self):
5959
return self._ovn_client
6060

61-
def _set_hash_ring_nodes_offline(self):
62-
"""Don't set hash ring nodes as offline.
61+
def _remove_node_from_hash_ring(self):
62+
"""Don't remove the node from the Hash Ring.
6363
6464
If this method was not overridden, cleanup would be performed when
65-
calling the db sync and running neutron server would mark all the
66-
nodes from the ring as offline.
65+
calling the db sync and running neutron server would remove the
66+
nodes from the Hash Ring.
6767
"""
6868

6969
# Since we are not using the ovn mechanism driver while syncing,

neutron/db/ovn_hash_ring_db.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,13 @@ def remove_nodes_from_host(context, group_name):
5050
CONF.host, group_name)
5151

5252

53+
def remove_node_by_uuid(context, node_uuid):
54+
with db_api.CONTEXT_WRITER.using(context):
55+
context.session.query(ovn_models.OVNHashRing).filter(
56+
ovn_models.OVNHashRing.node_uuid == node_uuid).delete()
57+
LOG.info('Node "%s" removed from the Hash Ring', node_uuid)
58+
59+
5360
def _touch(context, updated_at=None, **filter_args):
5461
if updated_at is None:
5562
updated_at = timeutils.utcnow()
@@ -96,7 +103,9 @@ def count_offline_nodes(context, interval, group_name):
96103
return query.count()
97104

98105

99-
def set_nodes_from_host_as_offline(context, group_name):
100-
timestamp = datetime.datetime(day=26, month=10, year=1985, hour=9)
101-
_touch(context, updated_at=timestamp, hostname=CONF.host,
102-
group_name=group_name)
106+
@db_api.CONTEXT_READER
107+
def count_nodes_from_host(context, group_name):
108+
query = context.session.query(ovn_models.OVNHashRing).filter(
109+
ovn_models.OVNHashRing.group_name == group_name,
110+
ovn_models.OVNHashRing.hostname == CONF.host)
111+
return query.count()

neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -278,17 +278,17 @@ def subscribe(self):
278278
resources.SECURITY_GROUP_RULE,
279279
events.BEFORE_DELETE)
280280

281-
def _set_hash_ring_nodes_offline(self, *args, **kwargs):
281+
def _remove_node_from_hash_ring(self, *args, **kwargs):
282+
# The node_uuid attribute will be empty for worker types
283+
# that are not added to the Hash Ring and can be skipped
284+
if self.node_uuid is None:
285+
return
282286
admin_context = n_context.get_admin_context()
283-
ovn_hash_ring_db.set_nodes_from_host_as_offline(
284-
admin_context, self.hash_ring_group)
285-
LOG.info('Hash Ring nodes from host "%s" marked as offline',
286-
cfg.CONF.host)
287+
ovn_hash_ring_db.remove_node_by_uuid(
288+
admin_context, self.node_uuid)
287289

288290
def pre_fork_initialize(self, resource, event, trigger, payload=None):
289291
"""Pre-initialize the ML2/OVN driver."""
290-
atexit.register(self._set_hash_ring_nodes_offline)
291-
signal.signal(signal.SIGTERM, self._set_hash_ring_nodes_offline)
292292
ovn_utils.create_neutron_pg_drop()
293293

294294
@staticmethod
@@ -306,6 +306,10 @@ def _setup_hash_ring(self):
306306
thread for this host. Subsequently workers just need to register
307307
themselves to the hash ring.
308308
"""
309+
# Attempt to remove the node from the ring when the worker stops
310+
atexit.register(self._remove_node_from_hash_ring)
311+
signal.signal(signal.SIGTERM, self._remove_node_from_hash_ring)
312+
309313
admin_context = n_context.get_admin_context()
310314
if not self._hash_ring_probe_event.is_set():
311315
# Clear existing entries

neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/maintenance.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
from neutron.objects import ports as ports_obj
4242
from neutron.objects import router as router_obj
4343
from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import ovn_db_sync
44+
from neutron import service
4445
from neutron.services.logapi.drivers.ovn import driver as log_driver
4546

4647

@@ -1045,3 +1046,20 @@ def touch_hash_ring_nodes(self):
10451046
# here because we want the maintenance tasks from each instance to
10461047
# execute this task.
10471048
hash_ring_db.touch_nodes_from_host(self.ctx, self._group)
1049+
1050+
# Check the number of the nodes in the ring and log a message in
1051+
# case they are out of sync. See LP #2024205 for more information
1052+
# on this issue.
1053+
api_workers = service._get_api_workers()
1054+
num_nodes = hash_ring_db.count_nodes_from_host(self.ctx, self._group)
1055+
1056+
if num_nodes > api_workers:
1057+
LOG.critical(
1058+
'The number of nodes in the Hash Ring (%d) is higher than '
1059+
'the number of API workers (%d) for host "%s". Something is '
1060+
'not right and OVSDB events could be missed because of this. '
1061+
'Please check the status of the Neutron processes, this can '
1062+
'happen when the API workers are killed and restarted. '
1063+
'Restarting the service should fix the issue, see LP '
1064+
'#2024205 for more information.',
1065+
num_nodes, api_workers, cfg.CONF.host)

neutron/tests/functional/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -363,7 +363,7 @@ def trigger(self):
363363
# NOTE(ralonsoh): do not access to the DB at exit when the SQL
364364
# connection is already closed, to avoid useless exception messages.
365365
mock.patch.object(
366-
self.mech_driver, '_set_hash_ring_nodes_offline').start()
366+
self.mech_driver, '_remove_node_from_hash_ring').start()
367367
self.mech_driver.pre_fork_initialize(
368368
mock.ANY, mock.ANY, trigger_cls.trigger)
369369

neutron/tests/unit/db/test_ovn_hash_ring_db.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -270,16 +270,18 @@ def test_count_offline_nodes(self):
270270
self.assertEqual(0, ovn_hash_ring_db.count_offline_nodes(
271271
self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP))
272272

273-
def test_set_nodes_from_host_as_offline(self):
273+
def test_remove_node_by_uuid(self):
274274
self._add_nodes_and_assert_exists(count=3)
275275

276276
active_nodes = ovn_hash_ring_db.get_active_nodes(
277277
self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP)
278278
self.assertEqual(3, len(active_nodes))
279279

280-
ovn_hash_ring_db.set_nodes_from_host_as_offline(
281-
self.admin_ctx, HASH_RING_TEST_GROUP)
280+
node_to_remove = active_nodes[0].node_uuid
281+
ovn_hash_ring_db.remove_node_by_uuid(
282+
self.admin_ctx, node_to_remove)
282283

283284
active_nodes = ovn_hash_ring_db.get_active_nodes(
284285
self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP)
285-
self.assertEqual(0, len(active_nodes))
286+
self.assertEqual(2, len(active_nodes))
287+
self.assertNotIn(node_to_remove, [n.node_uuid for n in active_nodes])

0 commit comments

Comments
 (0)