Skip to content

Commit dad5b13

Browse files
FxKusenthilnathan
andauthored
Standby cluster promotion by changing manifest (#2472)
* Standby cluster promotion by changing manifest * Updated the documentation --------- Co-authored-by: Senthilnathan M <[email protected]>
1 parent bbba15f commit dad5b13

File tree

5 files changed

+212
-25
lines changed

5 files changed

+212
-25
lines changed

docs/user.md

Lines changed: 15 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -940,33 +940,25 @@ established between standby replica(s).
940940
One big advantage of standby clusters is that they can be promoted to a proper
941941
database cluster. This means it will stop replicating changes from the source,
942942
and start accepting writes itself. This mechanism makes it possible to move
943-
databases from one place to another with minimal downtime. Currently, the
944-
operator does not support promoting a standby cluster. It has to be done
945-
manually using `patronictl edit-config` inside the postgres container of the
946-
standby leader pod. Remove the following lines from the YAML structure and the
947-
leader promotion happens immediately. Before doing so, make sure that the
948-
standby is not behind the source database.
943+
databases from one place to another with minimal downtime.
949944

950-
```yaml
951-
standby_cluster:
952-
create_replica_methods:
953-
- bootstrap_standby_with_wale
954-
- basebackup_fast_xlog
955-
restore_command: envdir "/home/postgres/etc/wal-e.d/env-standby" /scripts/restore_command.sh
956-
"%f" "%p"
957-
```
945+
Before promoting a standby cluster, make sure that the standby is not behind
946+
the source database. You should ideally stop writes to your source cluster and
947+
then create a dummy database object that you check for being replicated in the
948+
target to verify all data has been copied.
958949

959-
Finally, remove the `standby` section from the postgres cluster manifest.
950+
To promote, remove the `standby` section from the postgres cluster manifest.
951+
A rolling update will be triggered removing the `STANDBY_*` environment
952+
variables from the pods, followed by a Patroni config update that promotes the
953+
cluster.
960954

961-
### Turn a normal cluster into a standby
955+
### Adding standby section after promotion
962956

963-
There is no way to transform a non-standby cluster to a standby cluster through
964-
the operator. Adding the `standby` section to the manifest of a running
965-
Postgres cluster will have no effect. But, as explained in the previous
966-
paragraph it can be done manually through `patronictl edit-config`. This time,
967-
by adding the `standby_cluster` section to the Patroni configuration. However,
968-
the transformed standby cluster will not be doing any streaming. It will be in
969-
standby mode and allow read-only transactions only.
957+
Turning a running cluster into a standby is not easily possible and should be
958+
avoided. The best way is to remove the cluster and resubmit the manifest
959+
after a short wait of a few minutes. Adding the `standby` section would turn
960+
the database cluster into read-only mode on the next operator SYNC cycle, but it
961+
does not sync automatically with the source cluster again.
970962

971963
## Sidecar Support
972964

pkg/cluster/cluster.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -880,6 +880,13 @@ func (c *Cluster) Update(oldSpec, newSpec *acidv1.Postgresql) error {
880880
}
881881
}()
882882

883+
// add or remove standby_cluster section from Patroni config, but only when the standby section differs between oldSpec and newSpec
884+
if !reflect.DeepEqual(oldSpec.Spec.StandbyCluster, newSpec.Spec.StandbyCluster) {
885+
if err := c.syncStandbyClusterConfiguration(); err != nil {
886+
return fmt.Errorf("could not set StandbyCluster configuration options: %v", err)
887+
}
888+
}
889+
883890
// pod disruption budget
884891
if oldSpec.Spec.NumberOfInstances != newSpec.Spec.NumberOfInstances {
885892
c.logger.Debug("syncing pod disruption budgets")

pkg/cluster/sync.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,13 @@ func (c *Cluster) Sync(newSpec *acidv1.Postgresql) error {
8484
}
8585
}
8686

87+
// add or remove standby_cluster section from Patroni config, but only when the standby section differs between oldSpec and newSpec
88+
if !reflect.DeepEqual(oldSpec.Spec.StandbyCluster, newSpec.Spec.StandbyCluster) {
89+
if err := c.syncStandbyClusterConfiguration(); err != nil {
90+
return fmt.Errorf("could not sync StandbyCluster configuration: %v", err)
91+
}
92+
}
93+
8794
c.logger.Debug("syncing pod disruption budgets")
8895
if err = c.syncPodDisruptionBudget(false); err != nil {
8996
err = fmt.Errorf("could not sync pod disruption budget: %v", err)
@@ -710,6 +717,46 @@ func (c *Cluster) checkAndSetGlobalPostgreSQLConfiguration(pod *v1.Pod, effectiv
710717
return configPatched, requiresMasterRestart, nil
711718
}
712719

720+
// syncStandbyClusterConfiguration derives the standby_cluster options from c.Spec.StandbyCluster
721+
// and sets them via the Patroni API; it does not compare them with the current Patroni config, callers decide when to run it
722+
func (c *Cluster) syncStandbyClusterConfiguration() error {
723+
var (
724+
err error
725+
pods []v1.Pod
726+
)
727+
728+
standbyOptionsToSet := make(map[string]interface{})
729+
if c.Spec.StandbyCluster != nil {
730+
c.logger.Infof("turning %q into a standby cluster", c.Name)
731+
standbyOptionsToSet["create_replica_methods"] = []string{"bootstrap_standby_with_wale", "basebackup_fast_xlog"}
732+
standbyOptionsToSet["restore_command"] = "envdir \"/run/etc/wal-e.d/env-standby\" /scripts/restore_command.sh \"%f\" \"%p\""
733+
734+
} else {
735+
c.logger.Infof("promoting standby cluster and detach from source")
736+
standbyOptionsToSet = nil // nil map is handed to Patroni for standby_cluster; presumably this removes the section — confirm against Patroni docs
737+
}
738+
739+
if pods, err = c.listPods(); err != nil {
740+
return err
741+
}
742+
if len(pods) == 0 {
743+
return fmt.Errorf("could not call Patroni API: cluster has no pods")
744+
}
745+
// try all pods until the first one that is successful, as it doesn't matter which pod
746+
// carries the request to change configuration through
747+
for _, pod := range pods {
748+
podName := util.NameFromMeta(pod.ObjectMeta)
749+
c.logger.Debugf("patching Postgres config via Patroni API on pod %s with following options: %s",
750+
podName, standbyOptionsToSet)
751+
if err = c.patroni.SetStandbyClusterParameters(&pod, standbyOptionsToSet); err == nil {
752+
return nil // one successful patch is enough, remaining pods are not contacted
753+
}
754+
c.logger.Warningf("could not patch postgres parameters within pod %s: %v", podName, err)
755+
}
756+
return fmt.Errorf("could not reach Patroni API to set Postgres options: failed on every pod (%d total)",
757+
len(pods))
758+
}
759+
713760
func (c *Cluster) syncSecrets() error {
714761
c.logger.Info("syncing secrets")
715762
c.setProcessName("syncing secrets")

pkg/cluster/sync_test.go

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package cluster
22

33
import (
44
"bytes"
5+
"fmt"
56
"io/ioutil"
67
"net/http"
78
"testing"
@@ -480,6 +481,140 @@ func TestCheckAndSetGlobalPostgreSQLConfiguration(t *testing.T) {
480481
}
481482
}
482483

484+
// TestSyncStandbyClusterConfiguration runs syncStandbyClusterConfiguration against a mocked Patroni HTTP client after adding and after resetting the standby section.
func TestSyncStandbyClusterConfiguration(t *testing.T) {
485+
client, _ := newFakeK8sSyncClient()
486+
clusterName := "acid-standby-cluster"
487+
applicationLabel := "spilo"
488+
namespace := "default"
489+
490+
ctrl := gomock.NewController(t)
491+
defer ctrl.Finish()
492+
493+
pg := acidv1.Postgresql{
494+
ObjectMeta: metav1.ObjectMeta{
495+
Name: clusterName,
496+
Namespace: namespace,
497+
},
498+
Spec: acidv1.PostgresSpec{
499+
NumberOfInstances: int32(1),
500+
Volume: acidv1.Volume{
501+
Size: "1Gi",
502+
},
503+
},
504+
}
505+
506+
var cluster = New(
507+
Config{
508+
OpConfig: config.Config{
509+
PatroniAPICheckInterval: time.Duration(1),
510+
PatroniAPICheckTimeout: time.Duration(5),
511+
PodManagementPolicy: "ordered_ready",
512+
Resources: config.Resources{
513+
ClusterLabels: map[string]string{"application": applicationLabel},
514+
ClusterNameLabel: "cluster-name",
515+
DefaultCPURequest: "300m",
516+
DefaultCPULimit: "300m",
517+
DefaultMemoryRequest: "300Mi",
518+
DefaultMemoryLimit: "300Mi",
519+
MinInstances: int32(-1),
520+
MaxInstances: int32(-1),
521+
PodRoleLabel: "spilo-role",
522+
ResourceCheckInterval: time.Duration(3),
523+
ResourceCheckTimeout: time.Duration(10),
524+
},
525+
},
526+
}, client, pg, logger, eventRecorder)
527+
528+
cluster.Name = clusterName
529+
cluster.Namespace = namespace
530+
531+
// mock the Patroni config returned for GET requests (initially without a standby_cluster section)
532+
mockClient := mocks.NewMockHTTPClient(ctrl)
533+
configJson := `{"ttl": 20}`
534+
r := ioutil.NopCloser(bytes.NewReader([]byte(configJson)))
535+
response := http.Response{
536+
StatusCode: 200,
537+
Body: r,
538+
}
539+
mockClient.EXPECT().Get(gomock.Any()).Return(&response, nil).AnyTimes()
540+
541+
// mock the response for requests issued via Do, i.e. after setConfig is called
542+
standbyJson := `{"standby_cluster":{"create_replica_methods":["bootstrap_standby_with_wale","basebackup_fast_xlog"],"restore_command":"envdir \"/run/etc/wal-e.d/env-standby\" /scripts/restore_command.sh \"%f\" \"%p\""}}`
543+
r = ioutil.NopCloser(bytes.NewReader([]byte(standbyJson)))
544+
response = http.Response{
545+
StatusCode: 200,
546+
Body: r,
547+
}
548+
mockClient.EXPECT().Do(gomock.Any()).Return(&response, nil).AnyTimes()
549+
p := patroni.New(patroniLogger, mockClient)
550+
cluster.patroni = p
551+
552+
mockPod := newMockPod("192.168.100.1")
553+
mockPod.Name = fmt.Sprintf("%s-0", clusterName)
554+
mockPod.Namespace = namespace
555+
podLabels := map[string]string{
556+
"cluster-name": clusterName,
557+
"application": applicationLabel,
558+
"spilo-role": "master",
559+
}
560+
mockPod.Labels = podLabels
561+
client.PodsGetter.Pods(namespace).Create(context.TODO(), mockPod, metav1.CreateOptions{})
562+
563+
// create a statefulset
564+
sts, err := cluster.createStatefulSet()
565+
assert.NoError(t, err)
566+
567+
// check that pods do not have a STANDBY_* environment variable
568+
assert.NotContains(t, sts.Spec.Template.Spec.Containers[0].Env, v1.EnvVar{Name: "STANDBY_METHOD", Value: "STANDBY_WITH_WALE"})
569+
570+
// add standby section
571+
cluster.Spec.StandbyCluster = &acidv1.StandbyDescription{
572+
S3WalPath: "s3://custom/path/to/bucket/",
573+
}
574+
cluster.syncStatefulSet()
575+
updatedSts := cluster.Statefulset
576+
577+
// check that pods now have the STANDBY_METHOD environment variable
578+
assert.Contains(t, updatedSts.Spec.Template.Spec.Containers[0].Env, v1.EnvVar{Name: "STANDBY_METHOD", Value: "STANDBY_WITH_WALE"})
579+
580+
// this should update the Patroni config
581+
err = cluster.syncStandbyClusterConfiguration()
582+
assert.NoError(t, err)
583+
584+
configJson = `{"standby_cluster":{"create_replica_methods":["bootstrap_standby_with_wale","basebackup_fast_xlog"],"restore_command":"envdir \"/run/etc/wal-e.d/env-standby\" /scripts/restore_command.sh \"%f\" \"%p\""}, "ttl": 20}`
585+
r = ioutil.NopCloser(bytes.NewReader([]byte(configJson)))
586+
response = http.Response{
587+
StatusCode: 200,
588+
Body: r,
589+
}
590+
mockClient.EXPECT().Get(gomock.Any()).Return(&response, nil).AnyTimes()
591+
592+
pods, err := cluster.listPods()
593+
assert.NoError(t, err)
594+
595+
_, _, err = cluster.patroni.GetConfig(&pods[0])
596+
assert.NoError(t, err)
597+
// TODO: extend GetConfig to return the standby_cluster setting so it can be compared here
598+
/*
599+
defaultStandbyParameters := map[string]interface{}{
600+
"create_replica_methods": []string{"bootstrap_standby_with_wale", "basebackup_fast_xlog"},
601+
"restore_command": "envdir \"/run/etc/wal-e.d/env-standby\" /scripts/restore_command.sh \"%f\" \"%p\"",
602+
}
603+
assert.True(t, reflect.DeepEqual(defaultStandbyParameters, standbyCluster))
604+
*/
605+
// reset the standby section to an empty StandbyDescription (note: not nil)
606+
cluster.Spec.StandbyCluster = &acidv1.StandbyDescription{}
607+
cluster.syncStatefulSet()
608+
updatedSts2 := cluster.Statefulset
609+
610+
// check that pods no longer have the STANDBY_METHOD environment variable
611+
assert.NotContains(t, updatedSts2.Spec.Template.Spec.Containers[0].Env, v1.EnvVar{Name: "STANDBY_METHOD", Value: "STANDBY_WITH_WALE"})
612+
613+
// this should update the Patroni config again; NOTE(review): StandbyCluster is still non-nil here, so the nil (promotion) branch of syncStandbyClusterConfiguration is not exercised
614+
err = cluster.syncStandbyClusterConfiguration()
615+
assert.NoError(t, err)
616+
}
617+
483618
func TestUpdateSecret(t *testing.T) {
484619
testName := "test syncing secrets"
485620
client, _ := newFakeK8sSyncSecretsClient()

pkg/util/patroni/patroni.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ type Interface interface {
3434
GetClusterMembers(master *v1.Pod) ([]ClusterMember, error)
3535
Switchover(master *v1.Pod, candidate string) error
3636
SetPostgresParameters(server *v1.Pod, options map[string]string) error
37+
SetStandbyClusterParameters(server *v1.Pod, options map[string]interface{}) error
3738
GetMemberData(server *v1.Pod) (MemberData, error)
3839
Restart(server *v1.Pod) error
3940
GetConfig(server *v1.Pod) (acidv1.Patroni, map[string]string, error)
@@ -150,7 +151,7 @@ func (p *Patroni) Switchover(master *v1.Pod, candidate string) error {
150151

151152
//TODO: add an option call /patroni to check if it is necessary to restart the server
152153

153-
//SetPostgresParameters sets Postgres options via Patroni patch API call.
154+
// SetPostgresParameters sets Postgres options via Patroni patch API call.
154155
func (p *Patroni) SetPostgresParameters(server *v1.Pod, parameters map[string]string) error {
155156
buf := &bytes.Buffer{}
156157
err := json.NewEncoder(buf).Encode(map[string]map[string]interface{}{"postgresql": {"parameters": parameters}})
@@ -164,7 +165,12 @@ func (p *Patroni) SetPostgresParameters(server *v1.Pod, parameters map[string]st
164165
return p.httpPostOrPatch(http.MethodPatch, apiURLString+configPath, buf)
165166
}
166167

167-
//SetConfig sets Patroni options via Patroni patch API call.
168+
// SetStandbyClusterParameters wraps the given options under the "standby_cluster" key and hands them to SetConfig, returning its error.
169+
func (p *Patroni) SetStandbyClusterParameters(server *v1.Pod, parameters map[string]interface{}) error {
170+
return p.SetConfig(server, map[string]interface{}{"standby_cluster": parameters})
171+
}
172+
173+
// SetConfig sets Patroni options via Patroni patch API call.
168174
func (p *Patroni) SetConfig(server *v1.Pod, config map[string]interface{}) error {
169175
buf := &bytes.Buffer{}
170176
err := json.NewEncoder(buf).Encode(config)

0 commit comments

Comments
 (0)