Commit e6ae9e3

Implement per-cluster maintenance window for Postgres automatic upgrade (#2710)
* implement maintenance window for major version upgrade
* e2e test: fix major version upgrade test and extend with the time window
* unit test: add iteration to test isInMaintenanceWindow
* UI: show the window and enable edit via UI
1 parent ce15d10 commit e6ae9e3

10 files changed: +216 -21 lines changed

docs/reference/cluster_manifest.md

Lines changed: 6 additions & 0 deletions
@@ -114,6 +114,12 @@ These parameters are grouped directly under the `spec` key in the manifest.
   this parameter. Optional, when empty the load balancer service becomes
   inaccessible from outside of the Kubernetes cluster.
 
+* **maintenanceWindows**
+  a list that defines specific time frames when major version upgrades are
+  permitted, restricting them to these designated periods only. Accepted
+  formats include "01:00-06:00" for daily maintenance windows or
+  "Sat:00:00-04:00" for specific days, with all times in UTC.
+
 * **users**
   a map of usernames to user flags for the users that should be created in the
   cluster by the operator. User flags are a list, allowed elements are
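
The two accepted formats above imply a small amount of parsing: an optional three-letter weekday prefix, then two "HH:MM" bounds. As a rough illustration only (hypothetical helper and type names, not the operator's actual unmarshalling code), a window string could be parsed like this in Go:

package main

import (
	"fmt"
	"strings"
	"time"
)

// maintenanceWindow is a hypothetical stand-in for the operator's type.
type maintenanceWindow struct {
	everyday   bool
	weekday    time.Weekday
	start, end time.Time
}

var weekdays = map[string]time.Weekday{
	"Sun": time.Sunday, "Mon": time.Monday, "Tue": time.Tuesday, "Wed": time.Wednesday,
	"Thu": time.Thursday, "Fri": time.Friday, "Sat": time.Saturday,
}

// parseMaintenanceWindow accepts "01:00-06:00" (every day) or
// "Sat:00:00-04:00" (one weekday); all times are interpreted as UTC.
func parseMaintenanceWindow(s string) (maintenanceWindow, error) {
	w := maintenanceWindow{everyday: true}
	parts := strings.Split(s, "-")
	if len(parts) != 2 {
		return w, fmt.Errorf("incorrect maintenance window format: %q", s)
	}
	if from := strings.SplitN(parts[0], ":", 2); len(from) == 2 {
		if day, ok := weekdays[from[0]]; ok { // leading "Sat:" etc.
			w.everyday = false
			w.weekday = day
			parts[0] = from[1]
		}
	}
	var err error
	if w.start, err = time.Parse("15:04", parts[0]); err != nil {
		return w, fmt.Errorf("invalid start time: %v", err)
	}
	if w.end, err = time.Parse("15:04", parts[1]); err != nil {
		return w, fmt.Errorf("invalid end time: %v", err)
	}
	return w, nil
}

func main() {
	for _, s := range []string{"01:00-06:00", "Sat:00:00-04:00"} {
		w, err := parseMaintenanceWindow(s)
		fmt.Println(w, err)
	}
}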

e2e/tests/k8s_api.py

Lines changed: 0 additions & 2 deletions
@@ -218,7 +218,6 @@ def wait_for_pod_failover(self, failover_targets, labels, namespace='default'):
         pod_phase = 'Failing over'
         new_pod_node = ''
         pods_with_update_flag = self.count_pods_with_rolling_update_flag(labels, namespace)
-
         while (pod_phase != 'Running') or (new_pod_node not in failover_targets):
             pods = self.api.core_v1.list_namespaced_pod(namespace, label_selector=labels).items
             if pods:
@@ -525,7 +524,6 @@ def wait_for_pod_failover(self, failover_targets, labels, namespace='default'):
         pod_phase = 'Failing over'
         new_pod_node = ''
         pods_with_update_flag = self.count_pods_with_rolling_update_flag(labels, namespace)
-
         while (pod_phase != 'Running') or (new_pod_node not in failover_targets):
             pods = self.api.core_v1.list_namespaced_pod(namespace, label_selector=labels).items
             if pods:

e2e/tests/test_e2e.py

Lines changed: 76 additions & 11 deletions
@@ -14,6 +14,7 @@
 
 SPILO_CURRENT = "registry.opensource.zalan.do/acid/spilo-16-e2e:0.1"
 SPILO_LAZY = "registry.opensource.zalan.do/acid/spilo-16-e2e:0.2"
+SPILO_FULL_IMAGE = "ghcr.io/zalando/spilo-16:3.2-p3"
 
 
 def to_selector(labels):
@@ -115,6 +116,7 @@ def setUpClass(cls):
             configmap = yaml.safe_load(f)
             configmap["data"]["workers"] = "1"
             configmap["data"]["docker_image"] = SPILO_CURRENT
+            configmap["data"]["major_version_upgrade_mode"] = "full"
 
         with open("manifests/configmap.yaml", 'w') as f:
             yaml.dump(configmap, f, Dumper=yaml.Dumper)
@@ -1181,31 +1183,94 @@ def get_docker_image():
         self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running")
 
     @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
-    @unittest.skip("Skipping this test until fixed")
     def test_major_version_upgrade(self):
+        """
+        Test major version upgrade
+        """
+        def check_version():
+            p = k8s.patroni_rest("acid-upgrade-test-0", "")
+            version = p.get("server_version", 0) // 10000
+            return version
+
         k8s = self.k8s
-        result = k8s.create_with_kubectl("manifests/minimal-postgres-manifest-12.yaml")
-        self.eventuallyEqual(lambda: k8s.count_running_pods(labels="application=spilo,cluster-name=acid-upgrade-test"), 2, "No 2 pods running")
+        cluster_label = 'application=spilo,cluster-name=acid-upgrade-test'
+
+        with open("manifests/minimal-postgres-manifest-12.yaml", 'r+') as f:
+            upgrade_manifest = yaml.safe_load(f)
+            upgrade_manifest["spec"]["dockerImage"] = SPILO_FULL_IMAGE
+
+        with open("manifests/minimal-postgres-manifest-12.yaml", 'w') as f:
+            yaml.dump(upgrade_manifest, f, Dumper=yaml.Dumper)
+
+        k8s.create_with_kubectl("manifests/minimal-postgres-manifest-12.yaml")
+        self.eventuallyEqual(lambda: k8s.count_running_pods(labels=cluster_label), 2, "No 2 pods running")
         self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
+        self.eventuallyEqual(check_version, 12, "Version is not correct")
 
-        pg_patch_version = {
+        master_nodes, _ = k8s.get_cluster_nodes(cluster_labels=cluster_label)
+        # should upgrade immediately
+        pg_patch_version_14 = {
             "spec": {
-                "postgres": {
+                "postgresql": {
                     "version": "14"
                 }
             }
         }
         k8s.api.custom_objects_api.patch_namespaced_custom_object(
-            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version)
+            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_14)
+        self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
+
+        # should have finished failover
+        k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
+        k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
+        k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
+        self.eventuallyEqual(check_version, 14, "Version should be upgraded from 12 to 14")
 
+        # should not upgrade because current time is not in maintenanceWindow
+        current_time = datetime.now()
+        maintenance_window_future = f"{(current_time+timedelta(minutes=60)).strftime('%H:%M')}-{(current_time+timedelta(minutes=120)).strftime('%H:%M')}"
+        pg_patch_version_15 = {
+            "spec": {
+                "postgresql": {
+                    "version": "15"
+                },
+                "maintenanceWindows": [
+                    maintenance_window_future
+                ]
+            }
+        }
+        k8s.api.custom_objects_api.patch_namespaced_custom_object(
+            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15)
         self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
 
-        def check_version_14():
-            p = k8s.get_patroni_state("acid-upgrade-test-0")
-            version = p["server_version"][0:2]
-            return version
+        # should have finished failover
+        k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label)
+        k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
+        k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
+        self.eventuallyEqual(check_version, 14, "Version should not be upgraded")
+
+        # change the version again to trigger operator sync
+        maintenance_window_current = f"{(current_time-timedelta(minutes=30)).strftime('%H:%M')}-{(current_time+timedelta(minutes=30)).strftime('%H:%M')}"
+        pg_patch_version_16 = {
+            "spec": {
+                "postgresql": {
+                    "version": "16"
+                },
+                "maintenanceWindows": [
+                    maintenance_window_current
+                ]
+            }
+        }
 
-        self.eventuallyEqual(check_version_14, "14", "Version was not upgrade to 14")
+        k8s.api.custom_objects_api.patch_namespaced_custom_object(
+            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_16)
+        self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
+
+        # should have finished failover
+        k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
+        k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
+        k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
+        self.eventuallyEqual(check_version, 16, "Version should be upgraded from 14 to 16")
 
     @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
     def test_persistent_volume_claim_retention_policy(self):

pkg/cluster/majorversionupgrade.go

Lines changed: 5 additions & 0 deletions
@@ -74,6 +74,11 @@ func (c *Cluster) majorVersionUpgrade() error {
 		return nil
 	}
 
+	if !c.isInMainternanceWindow() {
+		c.logger.Infof("skipping major version upgrade, not in maintenance window")
+		return nil
+	}
+
 	pods, err := c.listPods()
 	if err != nil {
 		return err
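
Note that the new guard returns nil rather than an error, so the desired version simply stays in the spec and the upgrade is retried on a subsequent sync once the clock falls inside a window. A minimal, self-contained Go sketch of that retry-on-next-sync behavior (hypothetical trimmed-down types, not the operator's actual structs):

package main

import (
	"fmt"
	"time"
)

// window and cluster are hypothetical stand-ins for the operator's types.
type window struct{ start, end string } // "15:04"-formatted bounds, UTC

type cluster struct {
	windows       []window
	version       int
	targetVersion int
}

func (c *cluster) inMaintenanceWindow(now time.Time) bool {
	if len(c.windows) == 0 {
		return true // no windows configured: upgrades may run any time
	}
	t := now.UTC().Format("15:04")
	for _, w := range c.windows {
		if t >= w.start && t <= w.end {
			return true
		}
	}
	return false
}

// sync mirrors the early return above: outside the window nothing happens,
// but the target stays recorded, so a later sync retries the upgrade.
func (c *cluster) sync(now time.Time) {
	if c.version < c.targetVersion && !c.inMaintenanceWindow(now) {
		fmt.Println(now.Format(time.RFC3339), "skipping major version upgrade, not in maintenance window")
		return
	}
	c.version = c.targetVersion
	fmt.Println(now.Format(time.RFC3339), "running major version upgrade to", c.version)
}

func main() {
	c := &cluster{windows: []window{{"01:00", "06:00"}}, version: 14, targetVersion: 16}
	c.sync(time.Date(2024, 5, 1, 12, 0, 0, 0, time.UTC)) // outside the window: skipped
	c.sync(time.Date(2024, 5, 2, 3, 0, 0, 0, time.UTC))  // inside the window: upgrades
}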

pkg/cluster/util.go

Lines changed: 21 additions & 0 deletions
@@ -662,3 +662,24 @@ func parseResourceRequirements(resourcesRequirement v1.ResourceRequirements) (ac
 	}
 	return resources, nil
 }
+
+func (c *Cluster) isInMainternanceWindow() bool {
+	if c.Spec.MaintenanceWindows == nil {
+		return true
+	}
+	now := time.Now()
+	currentDay := now.Weekday()
+	currentTime := now.Format("15:04")
+
+	for _, window := range c.Spec.MaintenanceWindows {
+		startTime := window.StartTime.Format("15:04")
+		endTime := window.EndTime.Format("15:04")
+
+		if window.Everyday || window.Weekday == currentDay {
+			if currentTime >= startTime && currentTime <= endTime {
+				return true
+			}
+		}
+	}
+	return false
+}
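
A note on the comparison in isInMainternanceWindow: Go's "15:04" layout always produces zero-padded 24-hour strings, so lexicographic comparison agrees with clock order and no time arithmetic is needed. One consequence of the same comparison is that a window crossing midnight (for example "22:00-02:00") can never match. A tiny standalone demonstration with hypothetical values:

package main

import (
	"fmt"
	"time"
)

func main() {
	// "15:04" yields zero-padded 24-hour strings such as "09:30",
	// so plain string comparison orders the same way as clock time.
	now := time.Date(2024, 5, 1, 9, 30, 0, 0, time.UTC).Format("15:04")

	fmt.Println(now >= "01:00" && now <= "06:00") // false: outside the window
	fmt.Println(now >= "09:00" && now <= "12:00") // true: inside the window
	fmt.Println(now >= "22:00" && now <= "02:00") // false: never true for any clock time
}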

pkg/cluster/util_test.go

Lines changed: 89 additions & 0 deletions
@@ -27,6 +27,15 @@ import (
 
 var externalAnnotations = map[string]string{"existing": "annotation"}
 
+func mustParseTime(s string) metav1.Time {
+	v, err := time.Parse("15:04", s)
+	if err != nil {
+		panic(err)
+	}
+
+	return metav1.Time{Time: v.UTC()}
+}
+
 func newFakeK8sAnnotationsClient() (k8sutil.KubernetesClient, *k8sFake.Clientset) {
 	clientSet := k8sFake.NewSimpleClientset()
 	acidClientSet := fakeacidv1.NewSimpleClientset()
@@ -521,3 +530,83 @@ func Test_trimCronjobName(t *testing.T) {
 		})
 	}
 }
+
+func TestIsInMaintenanceWindow(t *testing.T) {
+	client, _ := newFakeK8sStreamClient()
+
+	var cluster = New(
+		Config{
+			OpConfig: config.Config{
+				PodManagementPolicy: "ordered_ready",
+				Resources: config.Resources{
+					ClusterLabels:        map[string]string{"application": "spilo"},
+					ClusterNameLabel:     "cluster-name",
+					DefaultCPURequest:    "300m",
+					DefaultCPULimit:      "300m",
+					DefaultMemoryRequest: "300Mi",
+					DefaultMemoryLimit:   "300Mi",
+					PodRoleLabel:         "spilo-role",
+				},
+			},
+		}, client, pg, logger, eventRecorder)
+
+	now := time.Now()
+	futureTimeStart := now.Add(1 * time.Hour)
+	futureTimeStartFormatted := futureTimeStart.Format("15:04")
+	futureTimeEnd := now.Add(2 * time.Hour)
+	futureTimeEndFormatted := futureTimeEnd.Format("15:04")
+
+	tests := []struct {
+		name     string
+		windows  []acidv1.MaintenanceWindow
+		expected bool
+	}{
+		{
+			name:     "no maintenance windows",
+			windows:  nil,
+			expected: true,
+		},
+		{
+			name: "maintenance windows with everyday",
+			windows: []acidv1.MaintenanceWindow{
+				{
+					Everyday:  true,
+					StartTime: mustParseTime("00:00"),
+					EndTime:   mustParseTime("23:59"),
+				},
+			},
+			expected: true,
+		},
+		{
+			name: "maintenance windows with weekday",
+			windows: []acidv1.MaintenanceWindow{
+				{
+					Weekday:   now.Weekday(),
+					StartTime: mustParseTime("00:00"),
+					EndTime:   mustParseTime("23:59"),
+				},
+			},
+			expected: true,
+		},
+		{
+			name: "maintenance windows with future interval time",
+			windows: []acidv1.MaintenanceWindow{
+				{
+					Weekday:   now.Weekday(),
+					StartTime: mustParseTime(futureTimeStartFormatted),
+					EndTime:   mustParseTime(futureTimeEndFormatted),
+				},
+			},
+			expected: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			cluster.Spec.MaintenanceWindows = tt.windows
+			if cluster.isInMainternanceWindow() != tt.expected {
+				t.Errorf("Expected isInMainternanceWindow to return %t", tt.expected)
+			}
+		})
+	}
+}

pkg/controller/postgresql.go

Lines changed: 0 additions & 8 deletions
@@ -384,21 +384,13 @@ func (c *Controller) warnOnDeprecatedPostgreSQLSpecParameters(spec *acidv1.Postg
 		c.logger.Warningf("parameter %q is deprecated. Consider setting %q instead", deprecated, replacement)
 	}
 
-	noeffect := func(param string, explanation string) {
-		c.logger.Warningf("parameter %q takes no effect. %s", param, explanation)
-	}
-
 	if spec.UseLoadBalancer != nil {
 		deprecate("useLoadBalancer", "enableMasterLoadBalancer")
 	}
 	if spec.ReplicaLoadBalancer != nil {
 		deprecate("replicaLoadBalancer", "enableReplicaLoadBalancer")
 	}
 
-	if len(spec.MaintenanceWindows) > 0 {
-		noeffect("maintenanceWindows", "Not implemented.")
-	}
-
 	if (spec.UseLoadBalancer != nil || spec.ReplicaLoadBalancer != nil) &&
 		(spec.EnableReplicaLoadBalancer != nil || spec.EnableMasterLoadBalancer != nil) {
 		c.logger.Warnf("both old and new load balancer parameters are present in the manifest, ignoring old ones")

ui/app/src/edit.tag.pug

Lines changed: 1 addition & 0 deletions
@@ -142,6 +142,7 @@ edit
         o.spec.enableReplicaConnectionPooler = i.spec.enableReplicaConnectionPooler || false
         o.spec.enableMasterPoolerLoadBalancer = i.spec.enableMasterPoolerLoadBalancer || false
         o.spec.enableReplicaPoolerLoadBalancer = i.spec.enableReplicaPoolerLoadBalancer || false
+        o.spec.maintenanceWindows = i.spec.maintenanceWindows || []
 
         o.spec.volume = {
           size: i.spec.volume.size,

ui/app/src/new.tag.pug

Lines changed: 12 additions & 0 deletions
@@ -594,6 +594,12 @@ new
           {{#if enableReplicaPoolerLoadBalancer}}
          enableReplicaPoolerLoadBalancer: true
          {{/if}}
+          {{#if maintenanceWindows}}
+          maintenanceWindows:
+          {{#each maintenanceWindows}}
+            - "{{ this }}"
+          {{/each}}
+          {{/if}}
           volume:
             size: "{{ volumeSize }}Gi"{{#if volumeStorageClass}}
             storageClass: "{{ volumeStorageClass }}"{{/if}}{{#if iops}}
@@ -651,6 +657,7 @@ new
         enableReplicaConnectionPooler: this.enableReplicaConnectionPooler,
         enableMasterPoolerLoadBalancer: this.enableMasterPoolerLoadBalancer,
         enableReplicaPoolerLoadBalancer: this.enableReplicaPoolerLoadBalancer,
+        maintenanceWindows: this.maintenanceWindows,
         volumeSize: this.volumeSize,
         volumeStorageClass: this.volumeStorageClass,
         iops: this.iops,
@@ -727,6 +734,10 @@ new
       this.enableReplicaPoolerLoadBalancer = !this.enableReplicaPoolerLoadBalancer
     }
 
+    this.maintenanceWindows = e => {
+      this.maintenanceWindows = e.target.value
+    }
+
     this.volumeChange = e => {
       this.volumeSize = +e.target.value
     }
@@ -1042,6 +1053,7 @@ new
     this.enableReplicaConnectionPooler = false
     this.enableMasterPoolerLoadBalancer = false
    this.enableReplicaPoolerLoadBalancer = false
+    this.maintenanceWindows = {}
 
     this.postgresqlVersion = this.postgresqlVersion = (
       this.config.postgresql_versions[0]

ui/operator_ui/main.py

Lines changed: 6 additions & 0 deletions
@@ -465,6 +465,7 @@ def get_postgresqls():
                 'status': status,
                 'num_elb': spec.get('enableMasterLoadBalancer', 0) + spec.get('enableReplicaLoadBalancer', 0) + \
                     spec.get('enableMasterPoolerLoadBalancer', 0) + spec.get('enableReplicaPoolerLoadBalancer', 0),
+                'maintenance_windows': spec.get('maintenanceWindows', []),
             }
             for cluster in these(
                 read_postgresqls(
@@ -566,6 +567,11 @@ def update_postgresql(namespace: str, cluster: str):
             return fail('allowedSourceRanges invalid')
         spec['allowedSourceRanges'] = postgresql['spec']['allowedSourceRanges']
 
+    if 'maintenanceWindows' in postgresql['spec']:
+        if not isinstance(postgresql['spec']['maintenanceWindows'], list):
+            return fail('maintenanceWindows invalid')
+        spec['maintenanceWindows'] = postgresql['spec']['maintenanceWindows']
+
     if 'numberOfInstances' in postgresql['spec']:
         if not isinstance(postgresql['spec']['numberOfInstances'], int):
             return fail('numberOfInstances invalid')
