Skip to content

Commit 73ba302

Browse files
committed
[OVN] Improve Hash Ring logs
Debugging Hash Ring problems can be difficult challenge given that prior to this patch the logs were very limited. This patch improves the logging for this feature as follow: 1. Log when a node is added to the ring 2. Log when nodes are removed from the ring 3. Keep track the number of offline nodes and log it upon loading the ring 4. Improve the "Hash Ring is empty" exception with the number of offline nodes found (if 0, means the ovn_hash_ring table has no entries) Closes-Bug: #2023670 Change-Id: Ic90432b5ddea8cf176de159ec7eaafd5fd7bdd6e Signed-off-by: Lucas Alvares Gomes <[email protected]> (cherry picked from commit afa20fa) [OVN] The all() and count() methods should be inside a DB txn The ``ovn_hash_ring_db`` methods ``get_active_nodes`` and ``count_offline_nodes`` are sending SQL requests that should be issued from inside a READER context. Closes-Bug: #2024447 Change-Id: If06c372a9d5cb1dc1ec1af768abb61f52c2c5abd (cherry picked from commit 0c66dfa)
1 parent 4764ed5 commit 73ba302

File tree

4 files changed

+72
-12
lines changed

4 files changed

+72
-12
lines changed

neutron/common/ovn/exceptions.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ class StandardAttributeIDNotFound(n_exc.NeutronException):
3333

3434

3535
class HashRingIsEmpty(n_exc.NeutronException):
36-
message = _('Hash Ring returned empty when hashing "%(key)s". '
37-
'This should never happen in a normal situation, please '
38-
'check the status of your cluster')
36+
message = _('Hash Ring returned empty when hashing "%(key)s". All '
37+
'%(node_count)d nodes were found offline. This should never '
38+
'happen in a normal situation, please check the status '
39+
'of your cluster')

neutron/common/ovn/hash_ring_manager.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ def __init__(self, group_name):
3838
# Flag to rate limit the caching log
3939
self._prev_num_nodes = -1
4040
self.admin_ctx = context.get_admin_context()
41+
self._offline_node_count = 0
4142

4243
@property
4344
def _wait_startup_before_caching(self):
@@ -92,6 +93,11 @@ def _load_hash_ring(self, refresh=False):
9293
self._hash_ring = hashring.HashRing({node.node_uuid
9394
for node in nodes})
9495
self._last_time_loaded = timeutils.utcnow()
96+
self._offline_node_count = db_hash_ring.count_offline_nodes(
97+
self.admin_ctx, constants.HASH_RING_NODES_TIMEOUT,
98+
self._group)
99+
LOG.debug("Hash Ring loaded. %d active nodes. %d offline nodes",
100+
len(nodes), self._offline_node_count)
95101

96102
def refresh(self):
97103
self._load_hash_ring(refresh=True)
@@ -108,4 +114,5 @@ def get_node(self, key):
108114
# KeyError is raised
109115
return self._hash_ring[key].pop()
110116
except KeyError:
111-
raise exceptions.HashRingIsEmpty(key=key)
117+
raise exceptions.HashRingIsEmpty(
118+
key=key, node_count=self._offline_node_count)

neutron/db/ovn_hash_ring_db.py

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,14 @@
1717

1818
from neutron_lib.db import api as db_api
1919
from oslo_config import cfg
20+
from oslo_log import log
2021
from oslo_utils import timeutils
2122
from oslo_utils import uuidutils
2223

2324
from neutron.db.models import ovn as ovn_models
2425

2526
CONF = cfg.CONF
27+
LOG = log.getLogger(__name__)
2628

2729

2830
# NOTE(ralonsoh): this was migrated from networking-ovn to neutron and should
@@ -34,6 +36,8 @@ def add_node(context, group_name, node_uuid=None):
3436
with db_api.CONTEXT_WRITER.using(context):
3537
context.session.add(ovn_models.OVNHashRing(
3638
node_uuid=node_uuid, hostname=CONF.host, group_name=group_name))
39+
LOG.info('Node %s from host "%s" and group "%s" added to the Hash Ring',
40+
node_uuid, CONF.host, group_name)
3741
return node_uuid
3842

3943

@@ -42,6 +46,8 @@ def remove_nodes_from_host(context, group_name):
4246
context.session.query(ovn_models.OVNHashRing).filter(
4347
ovn_models.OVNHashRing.hostname == CONF.host,
4448
ovn_models.OVNHashRing.group_name == group_name).delete()
49+
LOG.info('Nodes from host "%s" and group "%s" removed from the Hash Ring',
50+
CONF.host, group_name)
4551

4652

4753
def _touch(context, **filter_args):
@@ -58,12 +64,31 @@ def touch_node(context, node_uuid):
5864
_touch(context, node_uuid=node_uuid)
5965

6066

61-
def get_active_nodes(context, interval, group_name, from_host=False):
67+
def _get_nodes_query(context, interval, group_name, offline=False,
68+
from_host=False):
6269
limit = timeutils.utcnow() - datetime.timedelta(seconds=interval)
63-
with db_api.CONTEXT_READER.using(context):
64-
query = context.session.query(ovn_models.OVNHashRing).filter(
65-
ovn_models.OVNHashRing.updated_at >= limit,
66-
ovn_models.OVNHashRing.group_name == group_name)
67-
if from_host:
68-
query = query.filter_by(hostname=CONF.host)
69-
return query.all()
70+
query = context.session.query(ovn_models.OVNHashRing).filter(
71+
ovn_models.OVNHashRing.group_name == group_name)
72+
73+
if offline:
74+
query = query.filter(ovn_models.OVNHashRing.updated_at < limit)
75+
else:
76+
query = query.filter(ovn_models.OVNHashRing.updated_at >= limit)
77+
78+
if from_host:
79+
query = query.filter_by(hostname=CONF.host)
80+
81+
return query
82+
83+
84+
@db_api.CONTEXT_READER
85+
def get_active_nodes(context, interval, group_name, from_host=False):
86+
query = _get_nodes_query(context, interval, group_name,
87+
from_host=from_host)
88+
return query.all()
89+
90+
91+
@db_api.CONTEXT_READER
92+
def count_offline_nodes(context, interval, group_name):
93+
query = _get_nodes_query(context, interval, group_name, offline=True)
94+
return query.count()

neutron/tests/unit/db/test_ovn_hash_ring_db.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,3 +242,30 @@ def test_touch_nodes_from_host_different_groups(self):
242242
for node in group2:
243243
node_db = self._get_node_row(node)
244244
self.assertEqual(node_db.created_at, node_db.updated_at)
245+
246+
def test_count_offline_nodes(self):
247+
self._add_nodes_and_assert_exists(count=3)
248+
249+
# Assert no nodes are considered offline
250+
self.assertEqual(0, ovn_hash_ring_db.count_offline_nodes(
251+
self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP))
252+
253+
# Subtract 60 seconds from utcnow() and touch the nodes to make
254+
# them to appear offline
255+
fake_utcnow = timeutils.utcnow() - datetime.timedelta(seconds=60)
256+
with mock.patch.object(timeutils, 'utcnow') as mock_utcnow:
257+
mock_utcnow.return_value = fake_utcnow
258+
ovn_hash_ring_db.touch_nodes_from_host(self.admin_ctx,
259+
HASH_RING_TEST_GROUP)
260+
261+
# Now assert that all nodes from our host are seeing as offline
262+
self.assertEqual(3, ovn_hash_ring_db.count_offline_nodes(
263+
self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP))
264+
265+
# Touch the nodes again without faking utcnow()
266+
ovn_hash_ring_db.touch_nodes_from_host(self.admin_ctx,
267+
HASH_RING_TEST_GROUP)
268+
269+
# Assert no nodes are considered offline
270+
self.assertEqual(0, ovn_hash_ring_db.count_offline_nodes(
271+
self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP))

0 commit comments

Comments
 (0)