Skip to content

Commit b58abc6

Browse files
Zuulopenstack-gerrit
authored andcommitted
Merge "[OVN] Hash Ring: Better handle Neutron worker failures" into stable/yoga
2 parents 7bc6c4a + 24a0639 commit b58abc6

File tree

6 files changed

+53
-20
lines changed

6 files changed

+53
-20
lines changed

neutron/cmd/ovn/neutron_ovn_db_sync_util.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,12 +57,12 @@ def post_fork_initialize(self, resource, event, trigger, **kwargs):
5757
def ovn_client(self):
5858
return self._ovn_client
5959

60-
def _set_hash_ring_nodes_offline(self):
61-
"""Don't set hash ring nodes as offline.
60+
def _remove_node_from_hash_ring(self):
61+
"""Don't remove the node from the Hash Ring.
6262
6363
If this method was not overridden, cleanup would be performed when
64-
calling the db sync and running neutron server would mark all the
65-
nodes from the ring as offline.
64+
calling the db sync and running neutron server would remove the
65+
nodes from the Hash Ring.
6666
"""
6767

6868
# Since we are not using the ovn mechanism driver while syncing,

neutron/db/ovn_hash_ring_db.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,13 @@ def remove_nodes_from_host(context, group_name):
5050
CONF.host, group_name)
5151

5252

53+
def remove_node_by_uuid(context, node_uuid):
54+
with db_api.CONTEXT_WRITER.using(context):
55+
context.session.query(ovn_models.OVNHashRing).filter(
56+
ovn_models.OVNHashRing.node_uuid == node_uuid).delete()
57+
LOG.info('Node "%s" removed from the Hash Ring', node_uuid)
58+
59+
5360
def _touch(context, updated_at=None, **filter_args):
5461
if updated_at is None:
5562
updated_at = timeutils.utcnow()
@@ -96,7 +103,9 @@ def count_offline_nodes(context, interval, group_name):
96103
return query.count()
97104

98105

99-
def set_nodes_from_host_as_offline(context, group_name):
100-
timestamp = datetime.datetime(day=26, month=10, year=1985, hour=9)
101-
_touch(context, updated_at=timestamp, hostname=CONF.host,
102-
group_name=group_name)
106+
@db_api.CONTEXT_READER
107+
def count_nodes_from_host(context, group_name):
108+
query = context.session.query(ovn_models.OVNHashRing).filter(
109+
ovn_models.OVNHashRing.group_name == group_name,
110+
ovn_models.OVNHashRing.hostname == CONF.host)
111+
return query.count()

neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -270,17 +270,17 @@ def subscribe(self):
270270
resources.SECURITY_GROUP_RULE,
271271
events.BEFORE_DELETE)
272272

273-
def _set_hash_ring_nodes_offline(self, *args, **kwargs):
273+
def _remove_node_from_hash_ring(self, *args, **kwargs):
274+
# The node_uuid attribute will be empty for worker types
275+
# that are not added to the Hash Ring and can be skipped
276+
if self.node_uuid is None:
277+
return
274278
admin_context = n_context.get_admin_context()
275-
ovn_hash_ring_db.set_nodes_from_host_as_offline(
276-
admin_context, self.hash_ring_group)
277-
LOG.info('Hash Ring nodes from host "%s" marked as offline',
278-
cfg.CONF.host)
279+
ovn_hash_ring_db.remove_node_by_uuid(
280+
admin_context, self.node_uuid)
279281

280282
def pre_fork_initialize(self, resource, event, trigger, payload=None):
281283
"""Pre-initialize the ML2/OVN driver."""
282-
atexit.register(self._set_hash_ring_nodes_offline)
283-
signal.signal(signal.SIGTERM, self._set_hash_ring_nodes_offline)
284284
ovn_utils.create_neutron_pg_drop()
285285

286286
@staticmethod
@@ -298,6 +298,10 @@ def _setup_hash_ring(self):
298298
thread for this host. Subsequently workers just need to register
299299
themselves to the hash ring.
300300
"""
301+
# Attempt to remove the node from the ring when the worker stops
302+
atexit.register(self._remove_node_from_hash_ring)
303+
signal.signal(signal.SIGTERM, self._remove_node_from_hash_ring)
304+
301305
admin_context = n_context.get_admin_context()
302306
if not self._hash_ring_probe_event.is_set():
303307
# Clear existing entries

neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/maintenance.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
from neutron.db import segments_db
4343
from neutron.objects import router as router_obj
4444
from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import ovn_db_sync
45+
from neutron import service
4546
from neutron.services.logapi.drivers.ovn import driver as log_driver
4647

4748

@@ -1067,3 +1068,20 @@ def touch_hash_ring_nodes(self):
10671068
# here because we want the maintenance tasks from each instance to
10681069
# execute this task.
10691070
hash_ring_db.touch_nodes_from_host(self.ctx, self._group)
1071+
1072+
# Check the number of the nodes in the ring and log a message in
1073+
# case they are out of sync. See LP #2024205 for more information
1074+
# on this issue.
1075+
api_workers = service._get_api_workers()
1076+
num_nodes = hash_ring_db.count_nodes_from_host(self.ctx, self._group)
1077+
1078+
if num_nodes > api_workers:
1079+
LOG.critical(
1080+
'The number of nodes in the Hash Ring (%d) is higher than '
1081+
'the number of API workers (%d) for host "%s". Something is '
1082+
'not right and OVSDB events could be missed because of this. '
1083+
'Please check the status of the Neutron processes, this can '
1084+
'happen when the API workers are killed and restarted. '
1085+
'Restarting the service should fix the issue, see LP '
1086+
'#2024205 for more information.',
1087+
num_nodes, api_workers, cfg.CONF.host)

neutron/tests/functional/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -363,7 +363,7 @@ def trigger(self):
363363
# NOTE(ralonsoh): do not access to the DB at exit when the SQL
364364
# connection is already closed, to avoid useless exception messages.
365365
mock.patch.object(
366-
self.mech_driver, '_set_hash_ring_nodes_offline').start()
366+
self.mech_driver, '_remove_node_from_hash_ring').start()
367367
self.mech_driver.pre_fork_initialize(
368368
mock.ANY, mock.ANY, trigger_cls.trigger)
369369

neutron/tests/unit/db/test_ovn_hash_ring_db.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -270,16 +270,18 @@ def test_count_offline_nodes(self):
270270
self.assertEqual(0, ovn_hash_ring_db.count_offline_nodes(
271271
self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP))
272272

273-
def test_set_nodes_from_host_as_offline(self):
273+
def test_remove_node_by_uuid(self):
274274
self._add_nodes_and_assert_exists(count=3)
275275

276276
active_nodes = ovn_hash_ring_db.get_active_nodes(
277277
self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP)
278278
self.assertEqual(3, len(active_nodes))
279279

280-
ovn_hash_ring_db.set_nodes_from_host_as_offline(
281-
self.admin_ctx, HASH_RING_TEST_GROUP)
280+
node_to_remove = active_nodes[0].node_uuid
281+
ovn_hash_ring_db.remove_node_by_uuid(
282+
self.admin_ctx, node_to_remove)
282283

283284
active_nodes = ovn_hash_ring_db.get_active_nodes(
284285
self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP)
285-
self.assertEqual(0, len(active_nodes))
286+
self.assertEqual(2, len(active_nodes))
287+
self.assertNotIn(node_to_remove, [n.node_uuid for n in active_nodes])

0 commit comments

Comments
 (0)