Skip to content

Commit 483f6d7

Browse files
committed
Modify startup scripts for ovn-controller-ovs
In order to minimize the downtime during an update of the ovn-controller-ovs pods, we're modifying the update strategy so that it does not allow any unavailable pod. This means that a new ovn-controller-ovs pod is created while the old one is still running. This commit allows two ovn-controller-ovs pods to coexist. It accomplishes this by modifying the start-up scripts of all containers so that they check whether a pod is already running and gradually stop the old pod in a controlled fashion while the new one starts. The logic relies on a temporary file, created by the init container, that tells the ovsdb-server/ovs-vswitchd containers whether they are in an update scenario or a normal start. The temporary file is deleted once the ovs-vswitchd start-up has finished, so when the ovnController CR is deleted, the volumes won't have any leftovers. Related: OSPRH-11636 Jira: OSPRH-10821 Depends-on: lib-common#611
1 parent 602aa84 commit 483f6d7

File tree

8 files changed

+202
-10
lines changed

8 files changed

+202
-10
lines changed

controllers/ovncontroller_controller.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -744,6 +744,11 @@ func (r *OVNControllerReconciler) generateServiceConfigMaps(
744744
} else {
745745
templateParameters["OVNEncapNIC"] = "eth0"
746746
}
747+
if instance.Spec.TLS.Enabled() {
748+
templateParameters["TLS"] = "Enabled"
749+
} else {
750+
templateParameters["TLS"] = "Disabled"
751+
}
747752
cms := []util.Template{
748753
// ScriptsConfigMap
749754
{

pkg/ovncontroller/daemonset.go

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
appsv1 "k8s.io/api/apps/v1"
2525
corev1 "k8s.io/api/core/v1"
2626
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
27+
"k8s.io/apimachinery/pkg/util/intstr"
2728
"k8s.io/utils/ptr"
2829
)
2930

@@ -236,6 +237,27 @@ func CreateOVSDaemonSet(
236237
envVars := map[string]env.Setter{}
237238
envVars["CONFIG_HASH"] = env.SetValue(configHash)
238239

240+
volumes := []corev1.Volume{}
241+
mounts := []corev1.VolumeMount{}
242+
243+
// add OVN dbs cert and CA
244+
if instance.Spec.TLS.Enabled() {
245+
svc := tls.Service{
246+
SecretName: *instance.Spec.TLS.GenericService.SecretName,
247+
CertMount: ptr.To(ovn_common.OVNDbCertPath),
248+
KeyMount: ptr.To(ovn_common.OVNDbKeyPath),
249+
CaMount: ptr.To(ovn_common.OVNDbCaCertPath),
250+
}
251+
volumes = append(volumes, svc.CreateVolume(ovnv1.ServiceNameOVS))
252+
mounts = append(mounts, svc.CreateVolumeMounts(ovnv1.ServiceNameOVS)...)
253+
254+
// add CA bundle if defined
255+
if instance.Spec.TLS.CaBundleSecretName != "" {
256+
volumes = append(volumes, instance.Spec.TLS.CreateVolume())
257+
mounts = append(mounts, instance.Spec.TLS.CreateVolumeMounts(nil)...)
258+
}
259+
}
260+
239261
initContainers := []corev1.Container{
240262
{
241263
Name: "ovsdb-server-init",
@@ -250,7 +272,7 @@ func CreateOVSDaemonSet(
250272
Privileged: &privileged,
251273
},
252274
Env: env.MergeEnvs([]corev1.EnvVar{}, envVars),
253-
VolumeMounts: GetOVSDbVolumeMounts(),
275+
VolumeMounts: append(GetOVSDbVolumeMounts(), mounts...),
254276
},
255277
}
256278

@@ -276,7 +298,7 @@ func CreateOVSDaemonSet(
276298
Privileged: &privileged,
277299
},
278300
Env: env.MergeEnvs([]corev1.EnvVar{}, envVars),
279-
VolumeMounts: GetOVSDbVolumeMounts(),
301+
VolumeMounts: append(GetOVSDbVolumeMounts(), mounts...),
280302
// TODO: consider the fact that resources are now double booked
281303
Resources: instance.Spec.Resources,
282304
LivenessProbe: ovsDbLivenessProbe,
@@ -303,7 +325,7 @@ func CreateOVSDaemonSet(
303325
Privileged: &privileged,
304326
},
305327
Env: env.MergeEnvs([]corev1.EnvVar{}, envVars),
306-
VolumeMounts: GetVswitchdVolumeMounts(),
328+
VolumeMounts: append(GetVswitchdVolumeMounts(), mounts...),
307329
// TODO: consider the fact that resources are now double booked
308330
Resources: instance.Spec.Resources,
309331
LivenessProbe: ovsVswitchdLivenessProbe,
@@ -312,6 +334,9 @@ func CreateOVSDaemonSet(
312334
},
313335
}
314336

337+
maxUnavailable := intstr.FromInt32(0)
338+
maxSurge := intstr.FromInt32(1)
339+
315340
daemonset := &appsv1.DaemonSet{
316341
ObjectMeta: metav1.ObjectMeta{
317342
Name: ovnv1.ServiceNameOVS,
@@ -327,9 +352,17 @@ func CreateOVSDaemonSet(
327352
},
328353
Spec: corev1.PodSpec{
329354
ServiceAccountName: instance.RbacResourceName(),
355+
HostPID: true,
330356
InitContainers: initContainers,
331357
Containers: containers,
332-
Volumes: GetOVSVolumes(instance.Name, instance.Namespace),
358+
Volumes: append(GetOVSVolumes(instance.Name, instance.Namespace), volumes...),
359+
},
360+
},
361+
UpdateStrategy: appsv1.DaemonSetUpdateStrategy{
362+
Type: appsv1.RollingUpdateDaemonSetStrategyType,
363+
RollingUpdate: &appsv1.RollingUpdateDaemonSet{
364+
MaxUnavailable: &maxUnavailable,
365+
MaxSurge: &maxSurge,
333366
},
334367
},
335368
},

templates/ovncontroller/bin/functions

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,15 @@ FLOWS_RESTORE_SCRIPT=$ovs_dir/flows-script
2929
FLOWS_RESTORE_DIR=$ovs_dir/saved-flows
3030
SAFE_TO_STOP_OVSDB_SERVER_SEMAPHORE=$ovs_dir/is_safe_to_stop_ovsdb_server
3131

32+
# Variables declaration used by start-up optimization
33+
ovs_vswitchd_pid_file=/var/run/openvswitch/ovs-vswitchd.pid
34+
ovsdb_server_pid_file=/var/run/openvswitch/ovsdb-server.pid
35+
update_semaphore_file=/var/lib/openvswitch/update
36+
stop_vswitchd_script_file=/usr/local/bin/container-scripts/stop-vswitchd.sh
37+
stop_ovsdb_server_script_file=/usr/local/bin/container-scripts/stop-ovsdb-server.sh
38+
skip_ovsdb_server_stop_file=/var/lib/openvswitch/skip_stop_ovsdbserver
39+
skip_vswitchd_stop_file=/var/lib/openvswitch/skip_stop_vswitchd
40+
3241
function cleanup_ovsdb_server_semaphore() {
3342
rm -f $SAFE_TO_STOP_OVSDB_SERVER_SEMAPHORE 2>&1 > /dev/null
3443
}

templates/ovncontroller/bin/init-ovsdb-server.sh

Lines changed: 53 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,62 @@ set -ex
1818
source $(dirname $0)/functions
1919
trap wait_for_db_creation EXIT
2020

21+
# Debug
22+
sleep 3
23+
2124
# If db file is empty, remove it; otherwise service won't start.
2225
# See https://issues.redhat.com/browse/FDP-689 for more details.
2326
if ! [ -s ${DB_FILE} ]; then
2427
rm -f ${DB_FILE}
2528
fi
26-
# Initialize or upgrade database if needed
27-
CTL_ARGS="--system-id=random --no-ovs-vswitchd"
28-
/usr/share/openvswitch/scripts/ovs-ctl start $CTL_ARGS
29-
/usr/share/openvswitch/scripts/ovs-ctl stop $CTL_ARGS
3029

31-
wait_for_db_creation
32-
trap - EXIT
30+
# Check if it's a normal start or an update
31+
# Normal start: ovsdb-server & ovs-vswitchd are not running, start normal
32+
# Update: ovsdb-server & ovs-vswitchd still running, need different approach
33+
if [ -f $ovs_vswitchd_pid_file ] || [ -f $ovsdb_server_pid_file ]; then
34+
# Some process is running, so it's an update. Create semaphore
35+
echo "UPDATE" > $update_semaphore_file
36+
# To modify the priority and minimize the dataplane outages we need to connect
37+
# to the nb db, check if TLS is used or not
38+
if [ {{ .TLS }} == "Enabled" ]; then
39+
# TLS is used
40+
TLSOptions="--certificate=/etc/pki/tls/certs/ovndb.crt --private-key=/etc/pki/tls/private/ovndb.key --ca-cert=/etc/pki/tls/certs/ovndbca.crt"
41+
DBOptions="--db ssl:ovsdbserver-nb.openstack.svc.cluster.local:6641"
42+
else
43+
# Normal TCP is used
44+
TLSOptions=""
45+
DBOptions="--db tcp:ovsdbserver-nb.openstack.svc.cluster.local:6641"
46+
fi
47+
# No need to initialize ovs-vswitchd in this path, as this has been done before
48+
# TODO: check what happens if during the update an update to the ovs db is needed
49+
chassis_id=$(ovs-vsctl get Open_Vswitch . external_ids:system-id)
50+
nb_output=$(ovn-nbctl --no-leader-only $DBOptions $TLSOptions --columns=_uuid,priority find Gateway_Chassis chassis_name=$chassis_id)
51+
# Check that nbctl was executed correctly
52+
if [ $? -ne 0 ]; then
53+
echo "ERROR: ovn-nbctl find command failed"
54+
exit 1
55+
fi
56+
row_uuid=$(echo "$nb_output" | grep "_uuid" | cut -d':' -f2 | xargs)
57+
priority=$(echo "$nb_output" | grep "priority" | cut -d':' -f2 | xargs)
58+
# Save priority to be able to restore it later (it's overwriting, not appending, hence no check)
59+
echo $priority > /var/lib/openvswitch/old_priority
60+
# Set lower priority (lowest value possible 0)
61+
ovn-nbctl --no-leader-only $DBOptions $TLSOptions set Gateway_Chassis $row_uuid priority=0
62+
# Check that nbctl was executed correctly
63+
if [ $? -ne 0 ]; then
64+
echo "ERROR: ovn-nbctl set command failed"
65+
exit 1
66+
fi
67+
else
68+
# In case something went wrong last run, ensure that the semaphore file is not present in this path
69+
if [ -f $update_semaphore_file ]; then
70+
rm $update_semaphore_file
71+
fi
72+
# Initialize or upgrade database if needed
73+
CTL_ARGS="--system-id=random --no-ovs-vswitchd"
74+
/usr/share/openvswitch/scripts/ovs-ctl start $CTL_ARGS
75+
/usr/share/openvswitch/scripts/ovs-ctl stop $CTL_ARGS
76+
77+
wait_for_db_creation
78+
trap - EXIT
79+
fi

templates/ovncontroller/bin/start-ovsdb-server.sh

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,43 @@ source $(dirname $0)/functions
2020
# Remove the obsolete semaphore file in case it still exists.
2121
cleanup_ovsdb_server_semaphore
2222

23+
# Check if we're on the update path
24+
if [ -f $update_semaphore_file ]; then
25+
echo "In the middle of an upgrade"
26+
# Need to stop vsitchd
27+
echo "Stopping vswitchd"
28+
bash $stop_vswitchd_script_file
29+
# with this script the current lflows should be already stored in a file
30+
# and vswitchd should be stopped.
31+
# Need to wait until vswitchd is stopped in order to stop also the ovsdb-server
32+
while true; do
33+
if [ ! -f $ovs_vswitchd_pid_file ]; then
34+
break
35+
fi
36+
# TODO change it to .1
37+
sleep 1
38+
done
39+
# ovs-vswitchd was already restarted, need to skip the preStop from the openshift
40+
# lifecycle when the old pod gets deleted
41+
echo "Creating flag file to skip ovs-vswitchd stop"
42+
touch $skip_vswitchd_stop_file
43+
# Run stop-ovsdbserver script to ensure lflows semaphore is cleaned correctly
44+
bash $stop_ovsdb_server_script_file
45+
# Need to create a flag-file to skip ovsdb-server stop
46+
# to avoid triggering it again when openshift triggers the preStop script.
47+
# TODO: check that flag-file is deleted correctly.
48+
echo "Creating flag file to skip ovsdb-server stop"
49+
touch $skip_ovsdb_server_stop_file
50+
# Ensure that ovsdb-server is stopped
51+
while true; do
52+
if [ ! -f $ovsdb_server_pid_file ]; then
53+
break
54+
fi
55+
# TODO change it to .1
56+
sleep 1
57+
done
58+
fi
59+
2360
# Start the service
2461
ovsdb-server ${DB_FILE} \
2562
--pidfile \

templates/ovncontroller/bin/start-vswitchd.sh

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,30 @@
1515
# under the License.
1616

1717
source $(dirname $0)/functions
18+
19+
# Check which connection is used
20+
if [ {{ .TLS }} == "Enabled" ]; then
21+
# TLS is used
22+
TLSOptions="--certificate=/etc/pki/tls/certs/ovndb.crt --private-key=/etc/pki/tls/private/ovndb.key --ca-cert=/etc/pki/tls/certs/ovndbca.crt"
23+
DBOptions="--db ssl:ovsdbserver-nb.openstack.svc.cluster.local:6641"
24+
else
25+
# Normal TCP is used
26+
TLSOptions=""
27+
DBOptions="--db tcp:ovsdbserver-nb.openstack.svc.cluster.local:6641"
28+
fi
29+
30+
# If we're on an update wait until past vswitchd process is stopped correctly
31+
if [ -f $update_semaphore_file ]; then
32+
# In the middle of an update, wait until vswitchd is already stopped
33+
while true; do
34+
if [ ! -f $ovs_vswitchd_pid_file ]; then
35+
break
36+
fi
37+
# TODO change it to .1
38+
sleep 1
39+
done
40+
fi
41+
1842
wait_for_ovsdb_server
1943

2044
# The order - first wait for db server, then set -ex - is important. Otherwise,
@@ -28,6 +52,27 @@ ovs-vsctl --no-wait set open . external-ids:ovn-encap-ip=${OVNEncapIP}
2852
# Before starting vswitchd, block it from flushing existing datapath flows.
2953
ovs-vsctl --no-wait set open_vswitch . other_config:flow-restore-wait=true
3054

55+
# Restore the priority if this was changed during the update
56+
if [ -f /var/lib/openvswitch/old_priority ]; then
57+
echo "Using DBOptions: $DBOptions"
58+
echo "Using TLSOptions: $TLSOptions"
59+
priority=$(cat /var/lib/openvswitch/old_priority)
60+
echo "Restoring old priority, which was: $priority"
61+
chassis_id=$(ovs-vsctl get Open_Vswitch . external_ids:system-id)
62+
nb_output=$(ovn-nbctl --no-leader-only $DBOptions $TLSOptions --columns=_uuid,priority find Gateway_Chassis chassis_name=$chassis_id)
63+
err=$?
64+
if [ $err -ne 0 ]; then
65+
echo "Error while getting gateway chassis uuid $err"
66+
fi
67+
row_uuid=$(echo "$nb_output" | grep "_uuid" | cut -d':' -f2 | xargs)
68+
rm /var/lib/openvswitch/old_priority
69+
ovn-nbctl --no-leader-only $DBOptions $TLSOptions set Gateway_Chassis $row_uuid priority=$priority
70+
err=$?
71+
if [ $err -ne 0 ]; then
72+
echo "Error while setting gateway chassis priority ($priority), error: $err"
73+
fi
74+
fi
75+
3176
# It's safe to start vswitchd now. Do it.
3277
# --detach to allow the execution to continue to restoring the flows.
3378
/usr/sbin/ovs-vswitchd --pidfile --mlockall --detach
@@ -49,6 +94,10 @@ cleanup_flows_backup
4994
# Now, inform vswitchd that we are done.
5095
ovs-vsctl remove open_vswitch . other_config flow-restore-wait
5196

97+
# At this point, ovsdb-server and vswitchd are already running, update (if it was the case)
98+
# is already done. Delete update file
99+
rm $update_semaphore_file || true
100+
52101
# This is container command script. Block it from exiting, otherwise k8s will
53102
# restart the container again.
54103
sleep infinity

templates/ovncontroller/bin/stop-ovsdb-server.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@
1717
set -ex
1818
source $(dirname $0)/functions
1919

20+
# If file is present, skip stop script
21+
if [ -f $skip_ovsdb_server_stop_file ]; then
22+
rm $skip_ovsdb_server_stop_file
23+
exit 0
24+
fi
25+
2026
# The ovs_vswitchd container has to terminate before ovsdb-server because it
2127
# needs access to db in its preStop script. The preStop script backs up flows
2228
# for restoration during the next startup. This semaphore ensures the vswitchd

templates/ovncontroller/bin/stop-vswitchd.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@
1717
set -ex
1818
source $(dirname $0)/functions
1919

20+
# If file is present, skip stop script
21+
if [ -f $skip_vswitchd_stop_file ]; then
22+
rm $skip_vswitchd_stop_file
23+
exit 0
24+
fi
25+
2026
# Clean up any previously created flow backups to avoid conflict with newly
2127
# generated backup.
2228
cleanup_flows_backup

0 commit comments

Comments
 (0)