Skip to content

Commit 106d4f4

Browse files
otherwiseguymnasiadka
authored andcommitted
Avoid race condition by using persist_uuid
When creating a network and adding a port, the API requests will often be handled by different workers. Since each worker maintains its own internal copy of the OVN DBs, it is possible that the request for creating the port arrives at a worker that has not yet received notification that the Logical Switch for the network has been created. When the code for adding the Logical Switch Port is called, it tries to look up the LS, and fails. Currently, a retry-and-wait solution is in place for the LS/LSP issue in create_port(), but a new feature introduced in OVS 3.1 allows specifying the UUID of the LS, so that we can use a value known to the other worker, and add the LSP to the LS by that UUID even if the LS isn't in our local DB cache. This allows the ovsdb-server to be the source of truth on the existance of the LS. There are lots of other interactions that could result in this kind of race condition, any create/update of objects that happen rapidly on systems under high load for example. This method of creating and referencing objects should be generalizable. In this patch, the UUID for the LS is defined as the UUID for the neutron network. The name of the LS continues to be neutron-$UUID to match existing usage and to keep lookups by name working. Change-Id: Icc27c2b8825d7f96c9dac87dec8bbb55d493d942
1 parent 094c9f1 commit 106d4f4

File tree

8 files changed

+129
-24
lines changed

8 files changed

+129
-24
lines changed

neutron/common/ovn/utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@
6868
'HAChassisGroupInfo', ['group_name', 'chassis_list', 'az_hints',
6969
'ignore_chassis', 'external_ids'])
7070

71+
_OVS_PERSIST_UUID = _SENTINEL = object()
72+
7173

7274
class OvsdbClientCommand:
7375
_CONNECTION = 0

neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@
7070
from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import maintenance
7171
from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import ovn_client
7272
from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import ovn_db_sync
73+
from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import ovs_fixes
7374
from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import worker
7475
from neutron import service
7576
from neutron.services.logapi.drivers.ovn import driver as log_driver
@@ -419,6 +420,9 @@ def post_fork_initialize(self, resource, event, trigger, payload=None):
419420
self._post_fork_event.clear()
420421
self._ovn_client_inst = None
421422

423+
# Patch python-ovs for fixes not yet released
424+
ovs_fixes.apply_ovs_fixes()
425+
422426
if worker_class == wsgi.WorkerService:
423427
self._setup_hash_ring()
424428

neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/commands.py

Lines changed: 52 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,10 @@
1717
import uuid
1818

1919
from oslo_utils import timeutils
20+
from ovs.db import idl as ovs_idl
2021
from ovsdbapp.backend.ovs_idl import command
2122
from ovsdbapp.backend.ovs_idl import idlutils
23+
from ovsdbapp.backend.ovs_idl import rowview
2224
from ovsdbapp.schema.ovn_northbound import commands as ovn_nb_commands
2325
from ovsdbapp import utils as ovsdbapp_utils
2426

@@ -155,21 +157,64 @@ def run_idl(self, txn):
155157
self.result = self.api.nb_global.nb_cfg
156158

157159

160+
class AddNetworkCommand(command.AddCommand):
161+
table_name = 'Logical_Switch'
162+
163+
def __init__(self, api, network_id, may_exist=False, **columns):
164+
super().__init__(api)
165+
self.network_uuid = uuid.UUID(str(network_id))
166+
self.may_exist = may_exist
167+
self.columns = columns
168+
169+
def run_idl(self, txn):
170+
table = self.api.tables[self.table_name]
171+
try:
172+
ls = table.rows[self.network_uuid]
173+
if self.may_exist:
174+
self.result = rowview.RowView(ls)
175+
return
176+
msg = _("Switch %s already exists") % self.network_uuid
177+
raise RuntimeError(msg)
178+
except KeyError:
179+
# Adding a new LS
180+
if utils.ovs_persist_uuid_supported(txn.idl):
181+
ls = txn.insert(table, new_uuid=self.network_uuid,
182+
persist_uuid=True)
183+
else:
184+
ls = txn.insert(table)
185+
self.set_columns(ls, **self.columns)
186+
ls.name = utils.ovn_name(self.network_uuid)
187+
self.result = ls.uuid
188+
189+
158190
class AddLSwitchPortCommand(command.BaseCommand):
159-
def __init__(self, api, lport, lswitch, may_exist, **columns):
191+
def __init__(self, api, lport, lswitch, may_exist, network_id=None,
192+
**columns):
160193
super().__init__(api)
161194
self.lport = lport
162195
self.lswitch = lswitch
163196
self.may_exist = may_exist
197+
self.network_uuid = uuid.UUID(str(network_id)) if network_id else None
164198
self.columns = columns
165199

166200
def run_idl(self, txn):
167201
try:
202+
# We must look in the local cache first, because the LS may have
203+
# been created as part of the current transaction. or in the case
204+
# of adding an LSP to a LS that was created before persist_uuid
168205
lswitch = idlutils.row_by_value(self.api.idl, 'Logical_Switch',
169206
'name', self.lswitch)
170207
except idlutils.RowNotFound:
171-
msg = _("Logical Switch %s does not exist") % self.lswitch
172-
raise RuntimeError(msg)
208+
if self.network_uuid and utils.ovs_persist_uuid_supported(txn.idl):
209+
# Create a "fake" row with the right UUID so python-ovs creates
210+
# a transaction referencing the Row, even though we might not
211+
# have received the update for the row ourselves.
212+
lswitch = ovs_idl.Row(self.api.idl,
213+
self.api.tables['Logical_Switch'],
214+
uuid=self.network_uuid, data={})
215+
else:
216+
msg = _("Logical Switch %s does not exist") % self.lswitch
217+
raise RuntimeError(msg)
173218
if self.may_exist:
174219
port = idlutils.row_by_value(self.api.idl,
175220
'Logical_Switch_Port', 'name',
@@ -255,8 +300,8 @@ def run_idl(self, txn):
255300
else:
256301
new_port_dhcp_opts.add(dhcpv6_options.result)
257302
port.dhcpv6_options = [dhcpv6_options.result]
258-
for uuid in cur_port_dhcp_opts - new_port_dhcp_opts:
259-
self.api._tables['DHCP_Options'].rows[uuid].delete()
303+
for uuid_ in cur_port_dhcp_opts - new_port_dhcp_opts:
304+
self.api._tables['DHCP_Options'].rows[uuid_].delete()
260305

261306
external_ids_update = self.external_ids_update or {}
262307
external_ids = getattr(port, 'external_ids', {})
@@ -338,8 +383,8 @@ def run_idl(self, txn):
338383
# Delete DHCP_Options records no longer referred by this port.
339384
cur_port_dhcp_opts = get_lsp_dhcp_options_uuids(
340385
lport, self.lport)
341-
for uuid in cur_port_dhcp_opts:
342-
self.api._tables['DHCP_Options'].rows[uuid].delete()
386+
for uuid_ in cur_port_dhcp_opts:
387+
self.api._tables['DHCP_Options'].rows[uuid_].delete()
343388

344389
_delvalue_from_list(lswitch, 'ports', lport)
345390
self.api._tables['Logical_Switch_Port'].rows[lport.uuid].delete()

neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/impl_idl_ovn.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -275,10 +275,17 @@ def transaction(self, *args, **kwargs):
275275
if revision_mismatch_raise:
276276
raise e
277277

278+
def ls_add(self, switch=None, may_exist=False, network_id=None, **columns):
279+
if network_id is None:
280+
return super().ls_add(switch, may_exist, **columns)
281+
return cmd.AddNetworkCommand(self, network_id, may_exist=may_exist,
282+
**columns)
283+
278284
def create_lswitch_port(self, lport_name, lswitch_name, may_exist=True,
279-
**columns):
285+
network_id=None, **columns):
280286
return cmd.AddLSwitchPortCommand(self, lport_name, lswitch_name,
281-
may_exist, **columns)
287+
may_exist, network_id=network_id,
288+
**columns)
282289

283290
def set_lswitch_port(self, lport_name, external_ids_update=None,
284291
if_exists=True, **columns):

neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/ovn_client.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -590,9 +590,11 @@ def create_port(self, context, port):
590590
# controller does not yet see that network in its local cache of the
591591
# OVN northbound database. Check if the logical switch is present
592592
# or not in the idl's local copy of the database before creating
593-
# the lswitch port.
594-
self._nb_idl.check_for_row_by_value_and_retry(
595-
'Logical_Switch', 'name', lswitch_name)
593+
# the lswitch port. Once we require an ovs version with working
594+
# persist_uuid support, this can be removed.
595+
if not utils.ovs_persist_uuid_supported(self._nb_idl):
596+
self._nb_idl.check_for_row_by_value_and_retry(
597+
'Logical_Switch', 'name', lswitch_name)
596598

597599
with self._nb_idl.transaction(check_error=True) as txn:
598600
dhcpv4_options, dhcpv6_options = self.update_port_dhcp_options(
@@ -604,6 +606,7 @@ def create_port(self, context, port):
604606
kwargs = {
605607
'lport_name': port['id'],
606608
'lswitch_name': lswitch_name,
609+
'network_id': port['network_id'],
607610
'addresses': port_info.addresses,
608611
'external_ids': external_ids,
609612
'parent_name': port_info.parent_name,
@@ -2074,6 +2077,7 @@ def create_provnet_port(self, network_id, segment, txn=None,
20742077
cmd = self._nb_idl.create_lswitch_port(
20752078
lport_name=utils.ovn_provnet_port_name(segment['id']),
20762079
lswitch_name=utils.ovn_name(network_id),
2080+
network_id=network_id,
20772081
addresses=[ovn_const.UNKNOWN_ADDR],
20782082
external_ids={},
20792083
type=ovn_const.LSP_TYPE_LOCALNET,
@@ -2136,14 +2140,13 @@ def create_network(self, context, network):
21362140
# UUID. This provides an easy way to refer to the logical switch
21372141
# without having to track what UUID OVN assigned to it.
21382142
lswitch_params = self._gen_network_parameters(network)
2139-
lswitch_name = utils.ovn_name(network['id'])
21402143
# NOTE(mjozefcz): Remove this workaround when bug
21412144
# 1869877 will be fixed.
21422145
segments = segments_db.get_network_segments(
21432146
context, network['id'])
21442147
with self._nb_idl.transaction(check_error=True) as txn:
2145-
txn.add(self._nb_idl.ls_add(lswitch_name, **lswitch_params,
2146-
may_exist=True))
2148+
txn.add(self._nb_idl.ls_add(network_id=network['id'],
2149+
**lswitch_params, may_exist=True))
21472150
for segment in segments:
21482151
if segment.get(segment_def.PHYSICAL_NETWORK):
21492152
self.create_provnet_port(network['id'], segment, txn=txn,
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Copyright (c) 2024
2+
# All Rights Reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License"); you may
5+
# not use this file except in compliance with the License. You may obtain
6+
# a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12+
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13+
# License for the specific language governing permissions and limitations
14+
# under the License.
15+
16+
from ovs.db import idl
17+
import ovs.ovsuuid
18+
19+
20+
# Temporarily fix ovs.db.idl.Transaction._substitute_uuids to support handling
21+
# the persist_uuid feature
22+
def _substitute_uuids(self, json):
23+
if isinstance(json, (list, tuple)):
24+
if (len(json) == 2 and
25+
json[0] == 'uuid' and
26+
ovs.ovsuuid.is_valid_string(json[1])):
27+
uuid = ovs.ovsuuid.from_string(json[1])
28+
row = self._txn_rows.get(uuid, None)
29+
if row and row._data is None and not row._persist_uuid:
30+
return ["named-uuid", idl._uuid_name_from_uuid(uuid)]
31+
else:
32+
return [self._substitute_uuids(elem) for elem in json]
33+
return json
34+
35+
36+
def apply_ovs_fixes():
37+
idl.Transaction._substitute_uuids = _substitute_uuids

neutron/tests/unit/plugins/ml2/drivers/ovn/mech_driver/test_mech_driver.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,8 @@ def setUp(self):
106106
neutron_agent.AgentCache().get_agents.return_value = [agent1]
107107
self.mock_vp_parents = mock.patch.object(
108108
ovn_utils, 'get_virtual_port_parents', return_value=None).start()
109+
mock.patch.object(ovn_utils, 'ovs_persist_uuid_supported',
110+
return_value=True).start()
109111

110112
def _add_chassis_private(self, nb_cfg, name=None):
111113
chassis_private = mock.Mock()
@@ -881,15 +883,19 @@ def test_network_precommit(self):
881883
self.mech_driver.update_network_precommit,
882884
fake_network_context)
883885

886+
def _verify_ls_add(self, net_id, may_exist=True, **kwargs):
887+
ls_add = self.mech_driver._ovn_client._nb_idl.ls_add
888+
ls_add.assert_called_once_with(
889+
external_ids=mock.ANY,
890+
may_exist=may_exist, network_id=net_id, **kwargs)
891+
884892
def _create_network_igmp_snoop(self, enabled):
885893
cfg.CONF.set_override('igmp_snooping_enable', enabled, group='OVS')
886-
nb_idl = self.mech_driver._ovn_client._nb_idl
887894
net = self._make_network(self.fmt, name='net1',
888895
admin_state_up=True)['network']
889896
value = 'true' if enabled else 'false'
890-
nb_idl.ls_add.assert_called_once_with(
891-
ovn_utils.ovn_name(net['id']), external_ids=mock.ANY,
892-
may_exist=True,
897+
self._verify_ls_add(
898+
net_id=net['id'],
893899
other_config={ovn_const.MCAST_SNOOP: value,
894900
ovn_const.MCAST_FLOOD_UNREGISTERED: 'false',
895901
ovn_const.VLAN_PASSTHRU: 'false'})
@@ -901,7 +907,6 @@ def test_create_network_igmp_snoop_disabled(self):
901907
self._create_network_igmp_snoop(enabled=False)
902908

903909
def _create_network_vlan_passthru(self, vlan_transparent, qinq):
904-
nb_idl = self.mech_driver._ovn_client._nb_idl
905910
net = self._make_network(
906911
self.fmt, name='net1',
907912
as_admin=True,
@@ -917,9 +922,8 @@ def _create_network_vlan_passthru(self, vlan_transparent, qinq):
917922
'provider:physical_network': 'physnet1'})['network']
918923
value = 'true' if vlan_transparent or qinq else 'false'
919924
expected_fdb_age_treshold = ovn_conf.get_fdb_age_threshold()
920-
nb_idl.ls_add.assert_called_once_with(
921-
ovn_utils.ovn_name(net['id']), external_ids=mock.ANY,
922-
may_exist=True,
925+
self._verify_ls_add(
926+
net_id=net['id'],
923927
other_config={
924928
ovn_const.MCAST_SNOOP: 'false',
925929
ovn_const.MCAST_FLOOD_UNREGISTERED: 'false',
@@ -958,6 +962,7 @@ def test_create_network_create_localnet_port_physical_network_type(self):
958962
self.context, net['id'])
959963
nb_idl.create_lswitch_port.assert_called_once_with(
960964
addresses=[ovn_const.UNKNOWN_ADDR],
965+
network_id=net['id'],
961966
external_ids={},
962967
lport_name=ovn_utils.ovn_provnet_port_name(segments[0]['id']),
963968
lswitch_name=ovn_utils.ovn_name(net['id']),
@@ -3477,6 +3482,7 @@ def test_create_segment_create_localnet_port(self):
34773482
segmentation_id=200, network_type='vlan')['segment']
34783483
ovn_nb_api.create_lswitch_port.assert_called_once_with(
34793484
addresses=[ovn_const.UNKNOWN_ADDR],
3485+
network_id=net['id'],
34803486
external_ids={},
34813487
lport_name=ovn_utils.ovn_provnet_port_name(new_segment['id']),
34823488
lswitch_name=ovn_utils.ovn_name(net['id']),
@@ -3495,6 +3501,7 @@ def test_create_segment_create_localnet_port(self):
34953501
segmentation_id=300, network_type='vlan')['segment']
34963502
ovn_nb_api.create_lswitch_port.assert_called_once_with(
34973503
addresses=[ovn_const.UNKNOWN_ADDR],
3504+
network_id=net['id'],
34983505
external_ids={},
34993506
lport_name=ovn_utils.ovn_provnet_port_name(new_segment['id']),
35003507
lswitch_name=ovn_utils.ovn_name(net['id']),

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ oslo.versionedobjects>=1.35.1 # Apache-2.0
4444
osprofiler>=2.3.0 # Apache-2.0
4545
os-ken>=3.0.0 # Apache-2.0
4646
os-resource-classes>=1.1.0 # Apache-2.0
47-
ovs>=2.12.0 # Apache-2.0
47+
ovs>3.3.0 # Apache-2.0
4848
ovsdbapp>=2.11.0 # Apache-2.0
4949
psutil>=6.1.0 # BSD
5050
pyroute2>=0.7.3;sys_platform!='win32' # Apache-2.0 (+ dual licensed GPL2)

0 commit comments

Comments
 (0)