Skip to content

Commit 5416cdf

Browse files
committed
Modify startup scripts for ovn-controller-ovs
In order to minimize the downtime during the update of the ovn-controller-ovs pods, we're modifying the update strategy so it doesn't accept any Unavailable pod. This means that it will create a new ovn-controller-ovs pod while the old one is still running. This commit enables two ovn-controller-ovs pods to coexist. It accomplishes this by modifying the start-up scripts of all containers so that they check whether a pod is already running and stop the old pods in a controlled fashion while the new ones start. The logic relies on a temporary file, created by the init container, that informs the ovsdb-server/ovs-vswitchd containers whether they are in an update scenario or a normal one. The temporary file is deleted at the end of the ovs-vswitchd start-up script, so when the ovnController CR is deleted the volumes won't have any leftovers. Related: OSPRH-11636 Jira: OSPRH-10821 Depends-on: lib-common#611
1 parent 602aa84 commit 5416cdf

File tree

8 files changed

+193
-10
lines changed

8 files changed

+193
-10
lines changed

controllers/ovncontroller_controller.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -744,6 +744,11 @@ func (r *OVNControllerReconciler) generateServiceConfigMaps(
744744
} else {
745745
templateParameters["OVNEncapNIC"] = "eth0"
746746
}
747+
if instance.Spec.TLS.Enabled() {
748+
templateParameters["TLS"] = "Enabled"
749+
} else {
750+
templateParameters["TLS"] = "Disabled"
751+
}
747752
cms := []util.Template{
748753
// ScriptsConfigMap
749754
{

pkg/ovncontroller/daemonset.go

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
appsv1 "k8s.io/api/apps/v1"
2525
corev1 "k8s.io/api/core/v1"
2626
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
27+
"k8s.io/apimachinery/pkg/util/intstr"
2728
"k8s.io/utils/ptr"
2829
)
2930

@@ -236,6 +237,27 @@ func CreateOVSDaemonSet(
236237
envVars := map[string]env.Setter{}
237238
envVars["CONFIG_HASH"] = env.SetValue(configHash)
238239

240+
volumes := []corev1.Volume{}
241+
mounts := []corev1.VolumeMount{}
242+
243+
// add OVN dbs cert and CA
244+
if instance.Spec.TLS.Enabled() {
245+
svc := tls.Service{
246+
SecretName: *instance.Spec.TLS.GenericService.SecretName,
247+
CertMount: ptr.To(ovn_common.OVNDbCertPath),
248+
KeyMount: ptr.To(ovn_common.OVNDbKeyPath),
249+
CaMount: ptr.To(ovn_common.OVNDbCaCertPath),
250+
}
251+
volumes = append(volumes, svc.CreateVolume(ovnv1.ServiceNameOVS))
252+
mounts = append(mounts, svc.CreateVolumeMounts(ovnv1.ServiceNameOVS)...)
253+
254+
// add CA bundle if defined
255+
if instance.Spec.TLS.CaBundleSecretName != "" {
256+
volumes = append(volumes, instance.Spec.TLS.CreateVolume())
257+
mounts = append(mounts, instance.Spec.TLS.CreateVolumeMounts(nil)...)
258+
}
259+
}
260+
239261
initContainers := []corev1.Container{
240262
{
241263
Name: "ovsdb-server-init",
@@ -250,7 +272,7 @@ func CreateOVSDaemonSet(
250272
Privileged: &privileged,
251273
},
252274
Env: env.MergeEnvs([]corev1.EnvVar{}, envVars),
253-
VolumeMounts: GetOVSDbVolumeMounts(),
275+
VolumeMounts: append(GetOVSDbVolumeMounts(), mounts...),
254276
},
255277
}
256278

@@ -276,7 +298,7 @@ func CreateOVSDaemonSet(
276298
Privileged: &privileged,
277299
},
278300
Env: env.MergeEnvs([]corev1.EnvVar{}, envVars),
279-
VolumeMounts: GetOVSDbVolumeMounts(),
301+
VolumeMounts: append(GetOVSDbVolumeMounts(), mounts...),
280302
// TODO: consider the fact that resources are now double booked
281303
Resources: instance.Spec.Resources,
282304
LivenessProbe: ovsDbLivenessProbe,
@@ -303,7 +325,7 @@ func CreateOVSDaemonSet(
303325
Privileged: &privileged,
304326
},
305327
Env: env.MergeEnvs([]corev1.EnvVar{}, envVars),
306-
VolumeMounts: GetVswitchdVolumeMounts(),
328+
VolumeMounts: append(GetVswitchdVolumeMounts(), mounts...),
307329
// TODO: consider the fact that resources are now double booked
308330
Resources: instance.Spec.Resources,
309331
LivenessProbe: ovsVswitchdLivenessProbe,
@@ -312,6 +334,9 @@ func CreateOVSDaemonSet(
312334
},
313335
}
314336

337+
maxUnavailable := intstr.FromInt32(0)
338+
maxSurge := intstr.FromInt32(1)
339+
315340
daemonset := &appsv1.DaemonSet{
316341
ObjectMeta: metav1.ObjectMeta{
317342
Name: ovnv1.ServiceNameOVS,
@@ -327,9 +352,17 @@ func CreateOVSDaemonSet(
327352
},
328353
Spec: corev1.PodSpec{
329354
ServiceAccountName: instance.RbacResourceName(),
355+
HostPID: true,
330356
InitContainers: initContainers,
331357
Containers: containers,
332-
Volumes: GetOVSVolumes(instance.Name, instance.Namespace),
358+
Volumes: append(GetOVSVolumes(instance.Name, instance.Namespace), volumes...),
359+
},
360+
},
361+
UpdateStrategy: appsv1.DaemonSetUpdateStrategy{
362+
Type: appsv1.RollingUpdateDaemonSetStrategyType,
363+
RollingUpdate: &appsv1.RollingUpdateDaemonSet{
364+
MaxUnavailable: &maxUnavailable,
365+
MaxSurge: &maxSurge,
333366
},
334367
},
335368
},

templates/ovncontroller/bin/functions

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,15 @@ FLOWS_RESTORE_SCRIPT=$ovs_dir/flows-script
2929
FLOWS_RESTORE_DIR=$ovs_dir/saved-flows
3030
SAFE_TO_STOP_OVSDB_SERVER_SEMAPHORE=$ovs_dir/is_safe_to_stop_ovsdb_server
3131

32+
# Variables declaration used by start-up optimization
33+
ovs_vswitchd_pid_file=/var/run/openvswitch/ovs-vswitchd.pid
34+
ovsdb_server_pid_file=/var/run/openvswitch/ovsdb-server.pid
35+
update_semaphore_file=/var/lib/openvswitch/update
36+
stop_vswitchd_script_file=/usr/local/bin/container-scripts/stop-vswitchd.sh
37+
stop_ovsdb_server_script_file=/usr/local/bin/container-scripts/stop-ovsdb-server.sh
38+
skip_ovsdb_server_stop_file=/var/lib/openvswitch/skip_stop_ovsdbserver
39+
skip_vswitchd_stop_file=/var/lib/openvswitch/skip_stop_vswitchd
40+
3241
function cleanup_ovsdb_server_semaphore() {
3342
rm -f $SAFE_TO_STOP_OVSDB_SERVER_SEMAPHORE 2>&1 > /dev/null
3443
}

templates/ovncontroller/bin/init-ovsdb-server.sh

Lines changed: 50 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,54 @@ trap wait_for_db_creation EXIT
2323
if ! [ -s ${DB_FILE} ]; then
2424
rm -f ${DB_FILE}
2525
fi
26-
# Initialize or upgrade database if needed
27-
CTL_ARGS="--system-id=random --no-ovs-vswitchd"
28-
/usr/share/openvswitch/scripts/ovs-ctl start $CTL_ARGS
29-
/usr/share/openvswitch/scripts/ovs-ctl stop $CTL_ARGS
3026

31-
wait_for_db_creation
32-
trap - EXIT
27+
# Check if it's a normal start or an update
28+
# Normal start: ovsdb-server & ovs-vswitchd are not running, start normal
29+
# Update: ovsdb-server & ovs-vswitchd still running, need different approach
30+
if [ -f $ovs_vswitchd_pid_file ] || [ -f $ovsdb_server_pid_file ]; then
31+
# Some process is still running, so this is an update. Create the semaphore
32+
echo "UPDATE" > $update_semaphore_file
33+
# Modifying the priority to minimize dataplane outages requires us to connect
34+
# to the nb db, check if TLS is used or not
35+
if [ {{ .TLS }} == "Enabled" ]; then
36+
# TLS is used
37+
TLSOptions="--certificate=/etc/pki/tls/certs/ovndb.crt --private-key=/etc/pki/tls/private/ovndb.key --ca-cert=/etc/pki/tls/certs/ovndbca.crt"
38+
DBOptions="--db ssl:ovsdbserver-nb.openstack.svc.cluster.local:6641"
39+
else
40+
# Normal TCP is used
41+
TLSOptions=""
42+
DBOptions="--db tcp:ovsdbserver-nb.openstack.svc.cluster.local:6641"
43+
fi
44+
# No need to initialize ovs-vswitchd in this path, as this has been done before
45+
# TODO: check what happens if during the update an update to the ovs db is needed
46+
chassis_id=$(ovs-vsctl get Open_Vswitch . external_ids:system-id)
47+
nb_output=$(ovn-nbctl --no-leader-only $DBOptions $TLSOptions --columns=_uuid,priority find Gateway_Chassis chassis_name=$chassis_id)
48+
# Check that nbctl was executed correctly
49+
if [ $? -ne 0 ]; then
50+
echo "ERROR: ovn-nbctl find command failed"
51+
exit 1
52+
fi
53+
row_uuid=$(echo "$nb_output" | grep "_uuid" | cut -d':' -f2 | xargs)
54+
priority=$(echo "$nb_output" | grep "priority" | cut -d':' -f2 | xargs)
55+
# Save priority to be able to restore it later (it's overwriting, not appending, hence no check)
56+
echo $priority > /var/lib/openvswitch/old_priority
57+
# Set lower priority (lowest value possible 0)
58+
ovn-nbctl --no-leader-only $DBOptions $TLSOptions set Gateway_Chassis $row_uuid priority=0
59+
# Check that nbctl was executed correctly
60+
if [ $? -ne 0 ]; then
61+
echo "ERROR: ovn-nbctl set command failed"
62+
exit 1
63+
fi
64+
else
65+
# In case something went wrong during the last run, ensure that the update semaphore file is not present in this path
66+
if [ -f $update_semaphore_file ]; then
67+
rm $update_semaphore_file
68+
fi
69+
# Initialize or upgrade database if needed
70+
CTL_ARGS="--system-id=random --no-ovs-vswitchd"
71+
/usr/share/openvswitch/scripts/ovs-ctl start $CTL_ARGS
72+
/usr/share/openvswitch/scripts/ovs-ctl stop $CTL_ARGS
73+
74+
wait_for_db_creation
75+
trap - EXIT
76+
fi

templates/ovncontroller/bin/start-ovsdb-server.sh

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,40 @@ source $(dirname $0)/functions
2020
# Remove the obsolete semaphore file in case it still exists.
2121
cleanup_ovsdb_server_semaphore
2222

23+
# Check if we're on the update path
24+
if [ -f $update_semaphore_file ]; then
25+
echo "In the middle of an upgrade"
26+
# Need to stop vswitchd
27+
echo "Stopping vswitchd"
28+
bash $stop_vswitchd_script_file
29+
# with this script the current lflows should be already stored in a file
30+
# and vswitchd should be stopped.
31+
# Need to wait until vswitchd is stopped in order to also stop the ovsdb-server
32+
while true; do
33+
if [ ! -f $ovs_vswitchd_pid_file ]; then
34+
break
35+
fi
36+
sleep 0.1
37+
done
38+
# ovs-vswitchd was already restarted, need to skip the preStop from the openshift
39+
# lifecycle when the old pod gets deleted
40+
echo "Creating flag file to skip ovs-vswitchd stop"
41+
touch $skip_vswitchd_stop_file
42+
# Run stop-ovsdb-server script to ensure the lflows semaphore is cleaned up correctly
43+
bash $stop_ovsdb_server_script_file
44+
# Need to create a flag-file to skip ovsdb-server stop
45+
# to avoid triggering it again when openshift triggers the preStop script.
46+
echo "Creating flag file to skip ovsdb-server stop"
47+
touch $skip_ovsdb_server_stop_file
48+
# Ensure that ovsdb-server is stopped
49+
while true; do
50+
if [ ! -f $ovsdb_server_pid_file ]; then
51+
break
52+
fi
53+
sleep 0.1
54+
done
55+
fi
56+
2357
# Start the service
2458
ovsdb-server ${DB_FILE} \
2559
--pidfile \

templates/ovncontroller/bin/start-vswitchd.sh

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,29 @@
1515
# under the License.
1616

1717
source $(dirname $0)/functions
18+
19+
# Check which connection is used
20+
if [ {{ .TLS }} == "Enabled" ]; then
21+
# TLS is used
22+
TLSOptions="--certificate=/etc/pki/tls/certs/ovndb.crt --private-key=/etc/pki/tls/private/ovndb.key --ca-cert=/etc/pki/tls/certs/ovndbca.crt"
23+
DBOptions="--db ssl:ovsdbserver-nb.openstack.svc.cluster.local:6641"
24+
else
25+
# Normal TCP is used
26+
TLSOptions=""
27+
DBOptions="--db tcp:ovsdbserver-nb.openstack.svc.cluster.local:6641"
28+
fi
29+
30+
# If we're on an update wait until past vswitchd process is stopped correctly
31+
if [ -f $update_semaphore_file ]; then
32+
# In the middle of an update, wait until vswitchd is already stopped
33+
while true; do
34+
if [ ! -f $ovs_vswitchd_pid_file ]; then
35+
break
36+
fi
37+
sleep 0.1
38+
done
39+
fi
40+
1841
wait_for_ovsdb_server
1942

2043
# The order - first wait for db server, then set -ex - is important. Otherwise,
@@ -28,6 +51,25 @@ ovs-vsctl --no-wait set open . external-ids:ovn-encap-ip=${OVNEncapIP}
2851
# Before starting vswitchd, block it from flushing existing datapath flows.
2952
ovs-vsctl --no-wait set open_vswitch . other_config:flow-restore-wait=true
3053

54+
# Restore the priority if this was changed during the update
55+
if [ -f /var/lib/openvswitch/old_priority ]; then
56+
priority=$(cat /var/lib/openvswitch/old_priority)
57+
echo "Restoring old priority, which was: $priority"
58+
chassis_id=$(ovs-vsctl get Open_Vswitch . external_ids:system-id)
59+
nb_output=$(ovn-nbctl --no-leader-only $DBOptions $TLSOptions --columns=_uuid,priority find Gateway_Chassis chassis_name=$chassis_id)
60+
err=$?
61+
if [ $err -ne 0 ]; then
62+
echo "Error while getting gateway chassis uuid $err"
63+
fi
64+
row_uuid=$(echo "$nb_output" | grep "_uuid" | cut -d':' -f2 | xargs)
65+
rm /var/lib/openvswitch/old_priority
66+
ovn-nbctl --no-leader-only $DBOptions $TLSOptions set Gateway_Chassis $row_uuid priority=$priority
67+
err=$?
68+
if [ $err -ne 0 ]; then
69+
echo "Error while setting gateway chassis priority ($priority), error: $err"
70+
fi
71+
fi
72+
3173
# It's safe to start vswitchd now. Do it.
3274
# --detach to allow the execution to continue to restoring the flows.
3375
/usr/sbin/ovs-vswitchd --pidfile --mlockall --detach
@@ -49,6 +91,10 @@ cleanup_flows_backup
4991
# Now, inform vswitchd that we are done.
5092
ovs-vsctl remove open_vswitch . other_config flow-restore-wait
5193

94+
# At this point, ovsdb-server and vswitchd are already running, update (if it was the case)
95+
# is already done. Delete update file
96+
rm $update_semaphore_file || true
97+
5298
# This is container command script. Block it from exiting, otherwise k8s will
5399
# restart the container again.
54100
sleep infinity

templates/ovncontroller/bin/stop-ovsdb-server.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@
1717
set -ex
1818
source $(dirname $0)/functions
1919

20+
# If file is present, skip stop script
21+
if [ -f $skip_ovsdb_server_stop_file ]; then
22+
rm $skip_ovsdb_server_stop_file
23+
exit 0
24+
fi
25+
2026
# The ovs_vswitchd container has to terminate before ovsdb-server because it
2127
# needs access to db in its preStop script. The preStop script backs up flows
2228
# for restoration during the next startup. This semaphore ensures the vswitchd

templates/ovncontroller/bin/stop-vswitchd.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@
1717
set -ex
1818
source $(dirname $0)/functions
1919

20+
# If file is present, skip stop script
21+
if [ -f $skip_vswitchd_stop_file ]; then
22+
rm $skip_vswitchd_stop_file
23+
exit 0
24+
fi
25+
2026
# Clean up any previously created flow backups to avoid conflict with newly
2127
# generated backup.
2228
cleanup_flows_backup

0 commit comments

Comments
 (0)