Skip to content

Commit 406b1e0

Browse files
committed
Lower spacing time of the OVN maintenance tasks which should be run once
Some of the OVN maintenance tasks are expected to be run just once and then they raise periodic.NeverAgain() to not be run anymore. Those tasks also require to have acquried ovn db lock so that only one of the maintenance workers really runs them. All those tasks had set 600 seconds as a spacing time so they were run every 600 seconds. This works fine usually but that may cause small issue in the environments were Neutron is run in POD as k8s/openshift application. In such case, when e.g. configuration of neutron is updated, it may happen that first new POD with Neutron is spawned and only once it is already running, k8s will stop old POD. Because of that maintenance worker running in the new neutron-server POD will not acquire lock on the OVN DB (old POD still holds the lock) and will not run all those maintenance tasks immediately. After old POD will be terminated, one of the new PODs will at some point acquire that lock and then will run all those maintenance tasks but this would cause 600 seconds delay in running them. To avoid such long waiting time to run those maintenance tasks, this patch lowers its spacing time from 600 to just 5 seconds. Additionally maintenance tasks which are supposed to be run only once and only by the maintenance worker which has acquired ovn db lock will now be stopped (periodic.NeverAgain will be raised) after 100 attempts of run. This will avoid running them every 5 seconds forever on the workers which don't acquire lock on the OVN DB at all. Conflicts: neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/maintenance.py Closes-bug: #2074209 Change-Id: Iabb4bb427588c1a5da27a5d313f75b5bd23805b2 (cherry picked from commit 04c217b) (cherry picked from commit 78900c1) (cherry picked from commit 9108218)
1 parent 5072af6 commit 406b1e0

File tree

3 files changed

+134
-16
lines changed

3 files changed

+134
-16
lines changed

neutron/common/ovn/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,8 @@
274274
TYPE_SECURITY_GROUP_RULES)
275275

276276
DB_CONSISTENCY_CHECK_INTERVAL = 300 # 5 minutes
277+
MAINTENANCE_TASK_RETRY_LIMIT = 100 # times
278+
MAINTENANCE_ONE_RUN_TASK_SPACING = 5 # seconds
277279

278280
# The order in which the resources should be created or updated by the
279281
# maintenance task: Root ones first and leafs at the end.

neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/maintenance.py

Lines changed: 75 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -54,14 +54,28 @@
5454
INCONSISTENCY_TYPE_DELETE = 'delete'
5555

5656

57-
def has_lock_periodic(*args, **kwargs):
57+
def has_lock_periodic(*args, periodic_run_limit=0, **kwargs):
5858
def wrapper(f):
59+
_retries = 0
60+
5961
@functools.wraps(f)
6062
@periodics.periodic(*args, **kwargs)
6163
def decorator(self, *args, **kwargs):
6264
# This periodic task is included in DBInconsistenciesPeriodics
6365
# since it uses the lock to ensure only one worker is executing
66+
# additonally, if periodic_run_limit parameter with value > 0 is
67+
# provided and lock is not acquired for periodic_run_limit
68+
# times, task will not be run anymore by this maintenance worker
69+
nonlocal _retries
6470
if not self.has_lock:
71+
if periodic_run_limit > 0:
72+
if _retries >= periodic_run_limit:
73+
LOG.debug("Have not been able to acquire lock to run "
74+
"task '%s' after %s tries, limit reached. "
75+
"No more attempts will be made.",
76+
f, _retries)
77+
raise periodics.NeverAgain()
78+
_retries += 1
6579
return
6680
return f(self, *args, **kwargs)
6781
return decorator
@@ -481,7 +495,10 @@ def _delete_floatingip_and_pf(self, context, fip_id):
481495

482496
# A static spacing value is used here, but this method will only run
483497
# once per lock due to the use of periodics.NeverAgain().
484-
@has_lock_periodic(spacing=600, run_immediately=True)
498+
@has_lock_periodic(
499+
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
500+
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
501+
run_immediately=True)
485502
def check_global_dhcp_opts(self):
486503
if (not ovn_conf.get_global_dhcpv4_opts() and
487504
not ovn_conf.get_global_dhcpv6_opts()):
@@ -511,7 +528,10 @@ def check_global_dhcp_opts(self):
511528

512529
# A static spacing value is used here, but this method will only run
513530
# once per lock due to the use of periodics.NeverAgain().
514-
@has_lock_periodic(spacing=600, run_immediately=True)
531+
@has_lock_periodic(
532+
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
533+
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
534+
run_immediately=True)
515535
def check_for_igmp_snoop_support(self):
516536
with self._nb_idl.transaction(check_error=True) as txn:
517537
value = ('true' if ovn_conf.is_igmp_snooping_enabled()
@@ -531,7 +551,10 @@ def check_for_igmp_snoop_support(self):
531551
# TODO(czesla): Remove this in the A+4 cycle
532552
# A static spacing value is used here, but this method will only run
533553
# once per lock due to the use of periodics.NeverAgain().
534-
@has_lock_periodic(spacing=600, run_immediately=True)
554+
@has_lock_periodic(
555+
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
556+
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
557+
run_immediately=True)
535558
def check_port_has_address_scope(self):
536559
ports = self._nb_idl.db_find_rows(
537560
"Logical_Switch_Port", ("type", "!=", ovn_const.LSP_TYPE_LOCALNET)
@@ -574,7 +597,10 @@ def _delete_default_ha_chassis_group(self, txn):
574597

575598
# A static spacing value is used here, but this method will only run
576599
# once per lock due to the use of periodics.NeverAgain().
577-
@has_lock_periodic(spacing=600, run_immediately=True)
600+
@has_lock_periodic(
601+
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
602+
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
603+
run_immediately=True)
578604
def check_for_ha_chassis_group(self):
579605
# If external ports is not supported stop running
580606
# this periodic task
@@ -604,7 +630,10 @@ def check_for_ha_chassis_group(self):
604630
# TODO(lucasagomes): Remove this in the B+3 cycle
605631
# A static spacing value is used here, but this method will only run
606632
# once per lock due to the use of periodics.NeverAgain().
607-
@has_lock_periodic(spacing=600, run_immediately=True)
633+
@has_lock_periodic(
634+
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
635+
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
636+
run_immediately=True)
608637
def check_for_mcast_flood_reports(self):
609638
cmds = []
610639
for port in self._nb_idl.lsp_list().execute(check_error=True):
@@ -703,7 +732,10 @@ def update_port_qos_with_external_ids_reference(self):
703732

704733
# A static spacing value is used here, but this method will only run
705734
# once per lock due to the use of periodics.NeverAgain().
706-
@has_lock_periodic(spacing=600, run_immediately=True)
735+
@has_lock_periodic(
736+
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
737+
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
738+
run_immediately=True)
707739
def check_redirect_type_router_gateway_ports(self):
708740
"""Check OVN router gateway ports
709741
Check for the option "redirect-type=bridged" value for
@@ -761,7 +793,10 @@ def check_redirect_type_router_gateway_ports(self):
761793

762794
# A static spacing value is used here, but this method will only run
763795
# once per lock due to the use of periodics.NeverAgain().
764-
@has_lock_periodic(spacing=600, run_immediately=True)
796+
@has_lock_periodic(
797+
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
798+
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
799+
run_immediately=True)
765800
def check_vlan_distributed_ports(self):
766801
"""Check VLAN distributed ports
767802
Check for the option "reside-on-redirect-chassis" value for
@@ -802,7 +837,10 @@ def check_vlan_distributed_ports(self):
802837
# a gateway (that means, that has "external_ids:OVN_GW_PORT_EXT_ID_KEY").
803838
# A static spacing value is used here, but this method will only run
804839
# once per lock due to the use of periodics.NeverAgain().
805-
@has_lock_periodic(spacing=600, run_immediately=True)
840+
@has_lock_periodic(
841+
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
842+
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
843+
run_immediately=True)
806844
def update_logical_router_with_gateway_network_id(self):
807845
"""Update all OVN logical router registers with the GW network ID"""
808846
cmds = []
@@ -829,7 +867,10 @@ def update_logical_router_with_gateway_network_id(self):
829867

830868
# A static spacing value is used here, but this method will only run
831869
# once per lock due to the use of periodics.NeverAgain().
832-
@has_lock_periodic(spacing=600, run_immediately=True)
870+
@has_lock_periodic(
871+
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
872+
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
873+
run_immediately=True)
833874
def check_baremetal_ports_dhcp_options(self):
834875
"""Update baremetal ports DHCP options
835876
@@ -912,7 +953,10 @@ def update_port_virtual_type(self):
912953
raise periodics.NeverAgain()
913954

914955
# TODO(ralonsoh): Remove this in the Antelope+4 cycle
915-
@has_lock_periodic(spacing=600, run_immediately=True)
956+
@has_lock_periodic(
957+
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
958+
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
959+
run_immediately=True)
916960
def create_router_extra_attributes_registers(self):
917961
"""Create missing ``RouterExtraAttributes`` registers.
918962
@@ -933,7 +977,10 @@ def create_router_extra_attributes_registers(self):
933977
raise periodics.NeverAgain()
934978

935979
# TODO(slaweq): Remove this in the E cycle (C+2 as it will be next SLURP)
936-
@periodics.periodic(spacing=600, run_immediately=True)
980+
@has_lock_periodic(
981+
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
982+
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
983+
run_immediately=True)
937984
def add_gw_port_info_to_logical_router_port(self):
938985
"""Add info if LRP is connecting internal subnet or ext gateway."""
939986
cmds = []
@@ -969,7 +1016,10 @@ def add_gw_port_info_to_logical_router_port(self):
9691016
txn.add(cmd)
9701017
raise periodics.NeverAgain()
9711018

972-
@has_lock_periodic(spacing=600, run_immediately=True)
1019+
@has_lock_periodic(
1020+
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
1021+
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
1022+
run_immediately=True)
9731023
def check_router_default_route_empty_dst_ip(self):
9741024
"""Check routers with default route with empty dst-ip (LP: #2002993).
9751025
"""
@@ -994,7 +1044,10 @@ def check_router_default_route_empty_dst_ip(self):
9941044
raise periodics.NeverAgain()
9951045

9961046
# TODO(ralonsoh): Remove this in the Antelope+4 cycle
997-
@has_lock_periodic(spacing=600, run_immediately=True)
1047+
@has_lock_periodic(
1048+
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
1049+
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
1050+
run_immediately=True)
9981051
def add_vnic_type_and_pb_capabilities_to_lsp(self):
9991052
"""Add the port VNIC type and port binding capabilities to the LSP.
10001053
@@ -1026,7 +1079,10 @@ def add_vnic_type_and_pb_capabilities_to_lsp(self):
10261079

10271080
raise periodics.NeverAgain()
10281081

1029-
@has_lock_periodic(spacing=600, run_immediately=True)
1082+
@has_lock_periodic(
1083+
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
1084+
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
1085+
run_immediately=True)
10301086
def check_fair_meter_consistency(self):
10311087
"""Update the logging meter after neutron-server reload
10321088
@@ -1108,7 +1164,10 @@ def cleanup_old_hash_ring_nodes(self):
11081164
context = n_context.get_admin_context()
11091165
hash_ring_db.cleanup_old_nodes(context, days=5)
11101166

1111-
@periodics.periodic(spacing=600, run_immediately=True)
1167+
@has_lock_periodic(
1168+
periodic_run_limit=ovn_const.MAINTENANCE_TASK_RETRY_LIMIT,
1169+
spacing=ovn_const.MAINTENANCE_ONE_RUN_TASK_SPACING,
1170+
run_immediately=True)
11121171
def update_nat_floating_ip_with_gateway_port_reference(self):
11131172
"""Set NAT rule gateway_port column to any floating IP without
11141173
router gateway port uuid reference - LP#2035281.

neutron/tests/unit/plugins/ml2/drivers/ovn/mech_driver/ovsdb/test_maintenance.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,69 @@
3232
from neutron.objects import ports as ports_obj
3333
from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import maintenance
3434
from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import ovn_db_sync
35+
from neutron.tests import base
3536
from neutron.tests.unit import fake_resources as fakes
3637
from neutron.tests.unit.plugins.ml2 import test_security_group as test_sg
3738
from neutron.tests.unit import testlib_api
3839
from neutron_lib import exceptions as n_exc
3940

4041

42+
class TestHasLockPeriodicDecorator(base.BaseTestCase):
43+
44+
def test_decorator_no_limit_have_lock(self):
45+
run_counter = 0
46+
47+
@maintenance.has_lock_periodic(
48+
periodic_run_limit=0, spacing=30)
49+
def test_maintenance_task(worker):
50+
nonlocal run_counter
51+
run_counter += 1
52+
53+
worker_mock = mock.MagicMock()
54+
worker_mock.has_lock = True
55+
56+
for _ in range(3):
57+
test_maintenance_task(worker_mock)
58+
self.assertEqual(3, run_counter)
59+
60+
def test_decorator_no_lock_no_limit(self):
61+
run_counter = 0
62+
63+
@maintenance.has_lock_periodic(
64+
periodic_run_limit=0, spacing=30)
65+
def test_maintenance_task(worker):
66+
nonlocal run_counter
67+
run_counter += 1
68+
69+
worker_mock = mock.MagicMock()
70+
has_lock_values = [False, False, True]
71+
72+
for has_lock in has_lock_values:
73+
worker_mock.has_lock = has_lock
74+
test_maintenance_task(worker_mock)
75+
self.assertEqual(1, run_counter)
76+
77+
def test_decorator_no_lock_with_limit(self):
78+
run_counter = 0
79+
80+
@maintenance.has_lock_periodic(
81+
periodic_run_limit=1, spacing=30)
82+
def test_maintenance_task(worker):
83+
nonlocal run_counter
84+
run_counter += 1
85+
86+
worker_mock = mock.MagicMock()
87+
88+
worker_mock.has_lock = False
89+
test_maintenance_task(worker_mock)
90+
self.assertEqual(0, run_counter)
91+
92+
worker_mock.has_lock = False
93+
self.assertRaises(periodics.NeverAgain,
94+
test_maintenance_task, worker_mock)
95+
self.assertEqual(0, run_counter)
96+
97+
4198
class TestSchemaAwarePeriodicsBase(testlib_api.SqlTestCaseLight):
4299

43100
def test__set_schema_aware_periodics(self):

0 commit comments

Comments
 (0)