Skip to content

Commit f9eafa9

Browse files
Jonathan S. Katzjkatz
authored andcommitted
Move pgo failover to use updated switchover plumbing
The manual failover/switchover command, `pgo failover`, now uses the updated "switchover" plumbing that was introduced as part of the rolling update changes. This ensures a unified experience when performing an action that involves a failover. Additionally, `pgo failover` now occurs inline: it does not create a pgtask custom resource. This is due to both being a much simpler process and the transactional nature of an immediate failover. `pgo failover` can now also be executed with a --target flag. This was actually always supported, but unavailable based upon a restriction in the `pgo` client. When the `--target` flag is not used, the PostgreSQL Operator will choose the best candidate for failing over.
1 parent 955c6e8 commit f9eafa9

File tree

16 files changed

+326
-559
lines changed

16 files changed

+326
-559
lines changed

cmd/pgo/cmd/failover.go

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,12 @@ var failoverCmd = &cobra.Command{
3232
Short: "Performs a manual failover",
3333
Long: `Performs a manual failover. For example:
3434
35-
pgo failover mycluster`,
35+
# have the operator select the best target candidate
36+
pgo failover hippo
37+
# get a list of target candidates
38+
pgo failover hippo --query
39+
# failover to a specific target candidate
40+
pgo failover hippo --target=hippo-abcd`,
3641
Run: func(cmd *cobra.Command, args []string) {
3742
if Namespace == "" {
3843
Namespace = PGONamespace
@@ -44,10 +49,6 @@ var failoverCmd = &cobra.Command{
4449
if Query {
4550
queryFailover(args, Namespace)
4651
} else if util.AskForConfirmation(NoPrompt, "") {
47-
if Target == "" {
48-
fmt.Println(`Error: The --target flag is required for failover.`)
49-
return
50-
}
5152
createFailover(args, Namespace)
5253
} else {
5354
fmt.Println("Aborting...")
@@ -80,14 +81,12 @@ func createFailover(args []string, ns string) {
8081
os.Exit(2)
8182
}
8283

83-
if response.Status.Code == msgs.Ok {
84-
for k := range response.Results {
85-
fmt.Println(response.Results[k])
86-
}
87-
} else {
84+
if response.Status.Code != msgs.Ok {
8885
fmt.Println("Error: " + response.Status.Msg)
89-
os.Exit(2)
86+
os.Exit(1)
9087
}
88+
89+
fmt.Println(response.Results)
9190
}
9291

9392
// queryFailover is a helper function to return the user information about the

docs/content/pgo-client/common-tasks.md

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -816,13 +816,26 @@ pgo failover --query hacluster
816816

817817
The PostgreSQL Operator is set up with an automated failover system based on
818818
distributed consensus, but there may be times where you wish to have your
819-
cluster manually failover. If you wish to have your cluster manually failover,
820-
first, query your cluster to determine which failover targets are available.
821-
The query command also provides information that may help your decision, such as
822-
replication lag:
819+
cluster manually failover. There are two ways to issue a manual failover to
820+
your PostgreSQL cluster:
821+
822+
1. Allow for the PostgreSQL Operator to select the best replica candidate to
823+
failover to.
824+
2. Select your own replica candidate to failover to.
825+
826+
To have the PostgreSQL Operator select the best replica candidate for failover,
827+
all you need to do is execute the following command:
828+
829+
```
830+
pgo failover hacluster
831+
```
832+
833+
If you wish to have your cluster manually failover, you must first query your
834+
cluster to determine which failover targets are available. The query command
835+
also provides information that may help your decision, such as replication lag:
823836

824837
```shell
825-
pgo failover --query hacluster
838+
pgo failover hacluster --query
826839
```
827840

828841
Once you have selected the replica that is best for you to failover to, you can
@@ -833,7 +846,9 @@ pgo failover hacluster --target=hacluster-abcd
833846
```
834847

835848
where `hacluster-abcd` is the name of the PostgreSQL instance that you want to
836-
promote to become the new primary
849+
promote to become the new primary.
850+
851+
Both methods perform the failover immediately upon execution.
837852

838853
#### Destroying a Replica
839854

docs/content/pgo-client/reference/pgo_failover.md

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,12 @@ Performs a manual failover
99

1010
Performs a manual failover. For example:
1111

12-
pgo failover mycluster
12+
# have the operator select the best target candidate
13+
pgo failover hippo
14+
# get a list of target candidates
15+
pgo failover hippo --query
16+
# failover to a specific target candidate
17+
pgo failover hippo --target=hippo-abcd
1318

1419
```
1520
pgo failover [flags]
@@ -27,7 +32,7 @@ pgo failover [flags]
2732
### Options inherited from parent commands
2833

2934
```
30-
--apiserver-url string The URL for the PostgreSQL Operator apiserver that will process the request from the pgo client.
35+
--apiserver-url string The URL for the PostgreSQL Operator apiserver that will process the request from the pgo client. Note that the URL should **not** end in a '/'.
3136
--debug Enable additional output for debugging.
3237
--disable-tls Disable TLS authentication to the Postgres Operator.
3338
--exclude-os-trust Exclude CA certs from OS default trust store
@@ -41,4 +46,4 @@ pgo failover [flags]
4146

4247
* [pgo](/pgo-client/reference/pgo/) - The pgo command line interface.
4348

44-
###### Auto generated by spf13/cobra on 1-Oct-2020
49+
###### Auto generated by spf13/cobra on 1-Jan-2021

docs/content/tutorial/high-availability.md

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,28 @@ pgo scaledown hippo --target=hippo-ojnd
6262

6363
## Manual Failover
6464

65-
Each PostgreSQL cluster will manage its own availability. If you wish to manually fail over, you will need to use the [`pgo failover`]({{< relref "pgo-client/reference/pgo_failover.md">}}) command. First, determine which instance you want to fail over to:
65+
Each PostgreSQL cluster will manage its own availability. If you wish to manually fail over, you will need to use the [`pgo failover`]({{< relref "pgo-client/reference/pgo_failover.md">}}) command.
66+
67+
There are two ways to issue a manual failover to your PostgreSQL cluster:
68+
69+
1. Allow for the PostgreSQL Operator to select the best replica candidate for failover.
70+
2. Select your own replica candidate for failover.
71+
72+
Both methods are detailed below.
73+
74+
### Manual Failover - PostgreSQL Operator Candidate Selection
75+
76+
To have the PostgreSQL Operator select the best replica candidate for failover, all you need to do is execute the following command:
77+
78+
```
79+
pgo failover hippo
80+
```
81+
82+
The PostgreSQL Operator will determine which is the best replica candidate to fail over to, and take into account factors such as replication lag and current timeline.
83+
84+
### Manual Failover - Manual Selection
85+
86+
If you wish to have your cluster manually failover, you must first query your cluster to determine which instance you want to fail over to. You can do so with the following command:
6687

6788
```
6889
pgo failover hippo --query

internal/apiserver/failoverservice/failoverimpl.go

Lines changed: 23 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -18,29 +18,32 @@ limitations under the License.
1818
import (
1919
"context"
2020
"errors"
21+
"fmt"
2122

2223
"github.com/crunchydata/postgres-operator/internal/apiserver"
2324
"github.com/crunchydata/postgres-operator/internal/config"
25+
"github.com/crunchydata/postgres-operator/internal/operator"
2426
"github.com/crunchydata/postgres-operator/internal/util"
2527
crv1 "github.com/crunchydata/postgres-operator/pkg/apis/crunchydata.com/v1"
2628
msgs "github.com/crunchydata/postgres-operator/pkg/apiservermsgs"
2729
log "github.com/sirupsen/logrus"
28-
v1 "k8s.io/api/apps/v1"
2930
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3031
)
3132

32-
// CreateFailover ...
33+
// CreateFailover is the API endpoint for triggering a manual failover of a
34+
// cluster. It performs this function inline, i.e. it does not trigger any
35+
// asynchronous methods.
36+
//
3337
// pgo failover mycluster
34-
// pgo failover all
35-
// pgo failover --selector=name=mycluster
3638
func CreateFailover(request *msgs.CreateFailoverRequest, ns, pgouser string) msgs.CreateFailoverResponse {
37-
ctx := context.TODO()
39+
log.Debugf("create failover called for %s", request.ClusterName)
3840

39-
var err error
40-
resp := msgs.CreateFailoverResponse{}
41-
resp.Status.Code = msgs.Ok
42-
resp.Status.Msg = ""
43-
resp.Results = make([]string, 0)
41+
resp := msgs.CreateFailoverResponse{
42+
Results: "",
43+
Status: msgs.Status{
44+
Code: msgs.Ok,
45+
},
46+
}
4447

4548
cluster, err := validateClusterName(request.ClusterName, ns)
4649
if err != nil {
@@ -58,49 +61,21 @@ func CreateFailover(request *msgs.CreateFailoverRequest, ns, pgouser string) msg
5861
}
5962

6063
if request.Target != "" {
61-
_, err = isValidFailoverTarget(request.Target, request.ClusterName, ns)
62-
if err != nil {
64+
if err := isValidFailoverTarget(request.Target, request.ClusterName, ns); err != nil {
6365
resp.Status.Code = msgs.Error
6466
resp.Status.Msg = err.Error()
6567
return resp
6668
}
6769
}
6870

69-
log.Debugf("create failover called for %s", request.ClusterName)
70-
71-
// Create a pgtask
72-
spec := crv1.PgtaskSpec{}
73-
spec.Namespace = ns
74-
spec.Name = request.ClusterName + "-" + config.LABEL_FAILOVER
75-
76-
// previous failovers will leave a pgtask so remove it first
77-
_ = apiserver.Clientset.CrunchydataV1().Pgtasks(ns).Delete(ctx, spec.Name, metav1.DeleteOptions{})
78-
79-
spec.TaskType = crv1.PgtaskFailover
80-
spec.Parameters = make(map[string]string)
81-
spec.Parameters[request.ClusterName] = request.ClusterName
82-
83-
labels := make(map[string]string)
84-
labels["target"] = request.Target
85-
labels[config.LABEL_PG_CLUSTER] = request.ClusterName
86-
labels[config.LABEL_PGOUSER] = pgouser
87-
88-
newInstance := &crv1.Pgtask{
89-
ObjectMeta: metav1.ObjectMeta{
90-
Name: spec.Name,
91-
Labels: labels,
92-
},
93-
Spec: spec,
94-
}
95-
96-
_, err = apiserver.Clientset.CrunchydataV1().Pgtasks(ns).Create(ctx, newInstance, metav1.CreateOptions{})
97-
if err != nil {
71+
// perform the switchover
72+
if err := operator.Switchover(apiserver.Clientset, apiserver.RESTConfig, cluster, request.Target); err != nil {
9873
resp.Status.Code = msgs.Error
9974
resp.Status.Msg = err.Error()
10075
return resp
10176
}
10277

103-
resp.Results = append(resp.Results, "created Pgtask (failover) for cluster "+request.ClusterName)
78+
resp.Results = "failover success for cluster " + cluster.Name
10479

10580
return resp
10681
}
@@ -186,7 +161,7 @@ func validateClusterName(clusterName, ns string) (*crv1.Pgcluster, error) {
186161
// specified, and then ensuring the PG pod created by the deployment is not the current primary.
187162
// If the deployment is not found, or if the pod is the current primary, an error will be returned.
188163
// Otherwise nil is returned.
189-
func isValidFailoverTarget(deployName, clusterName, ns string) (*v1.Deployment, error) {
164+
func isValidFailoverTarget(deployName, clusterName, ns string) error {
190165
ctx := context.TODO()
191166

192167
// Using the following label selector, ensure the deployment specified using deployName exists in the
@@ -198,11 +173,11 @@ func isValidFailoverTarget(deployName, clusterName, ns string) (*v1.Deployment,
198173
List(ctx, metav1.ListOptions{LabelSelector: selector})
199174
if err != nil {
200175
log.Error(err)
201-
return nil, err
176+
return err
202177
} else if len(deployments.Items) == 0 {
203-
return nil, errors.New("no target found named " + deployName)
178+
return fmt.Errorf("no target found named %s", deployName)
204179
} else if len(deployments.Items) > 1 {
205-
return nil, errors.New("more than one target found named " + deployName)
180+
return fmt.Errorf("more than one target found named %s", deployName)
206181
}
207182

208183
// Using the following label selector, determine if the target specified is the current
@@ -212,8 +187,8 @@ func isValidFailoverTarget(deployName, clusterName, ns string) (*v1.Deployment,
212187
"," + config.LABEL_PGHA_ROLE + "=" + config.LABEL_PGHA_ROLE_PRIMARY
213188
pods, _ := apiserver.Clientset.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{LabelSelector: selector})
214189
if len(pods.Items) > 0 {
215-
return nil, errors.New("The primary database cannot be selected as a failover target")
190+
return fmt.Errorf("The primary database cannot be selected as a failover target")
216191
}
217192

218-
return &deployments.Items[0], nil
193+
return nil
219194
}

internal/config/labels.go

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,19 +27,14 @@ const (
2727

2828
const LABEL_PGTASK = "pg-task"
2929

30-
const (
31-
LABEL_FAILOVER = "failover"
32-
LABEL_RESTART = "restart"
33-
)
30+
const LABEL_RESTART = "restart"
3431

3532
const (
36-
LABEL_TARGET = "target"
3733
LABEL_RMDATA = "pgrmdata"
3834
)
3935

4036
const (
4137
LABEL_PGPOLICY = "pgpolicy"
42-
LABEL_INGEST = "ingest"
4338
LABEL_PGREMOVE = "pgremove"
4439
LABEL_PVCNAME = "pvcname"
4540
LABEL_EXPORTER = "crunchy-postgres-exporter"
@@ -179,8 +174,6 @@ const (
179174
LABEL_PGO_UPDATED_BY = "pgo-updated-by"
180175
)
181176

182-
const LABEL_FAILOVER_STARTED = "failover-started"
183-
184177
const GLOBAL_CUSTOM_CONFIGMAP = "pgo-custom-pg-config"
185178

186179
const (

internal/controller/pgtask/pgtaskcontroller.go

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -122,13 +122,6 @@ func (c *Controller) processNextItem() bool {
122122
case crv1.PgtaskUpgrade:
123123
log.Debug("upgrade task added")
124124
clusteroperator.AddUpgrade(c.Client, tmpTask, keyNamespace)
125-
case crv1.PgtaskFailover:
126-
log.Debug("failover task added")
127-
if !dupeFailover(c.Client, tmpTask, keyNamespace) {
128-
clusteroperator.FailoverBase(keyNamespace, c.Client, tmpTask, c.Client.Config)
129-
} else {
130-
log.Debugf("skipping duplicate onAdd failover task %s/%s", keyNamespace, keyResourceName)
131-
}
132125
case crv1.PgtaskRollingUpdate:
133126
log.Debug("rolling update task added")
134127
// first, attempt to get the pgcluster object
@@ -164,9 +157,6 @@ func (c *Controller) processNextItem() bool {
164157
case crv1.PgtaskpgRestore:
165158
log.Debug("pgDump restore task added")
166159
pgdumpoperator.Restore(keyNamespace, c.Client, tmpTask)
167-
168-
case crv1.PgtaskAutoFailover:
169-
log.Debugf("autofailover task added %s", keyResourceName)
170160
case crv1.PgtaskWorkflow:
171161
log.Debugf("workflow task added [%s] ID [%s]", keyResourceName, tmpTask.Spec.Parameters[crv1.PgtaskWorkflowID])
172162

@@ -217,24 +207,6 @@ func (c *Controller) AddPGTaskEventHandler() {
217207
log.Debugf("pgtask Controller: added event handler to informer")
218208
}
219209

220-
// de-dupe logic for a failover, if the failover started
221-
// parameter is set, it means a failover has already been
222-
// started on this
223-
func dupeFailover(clientset pgo.Interface, task *crv1.Pgtask, ns string) bool {
224-
ctx := context.TODO()
225-
tmp, err := clientset.CrunchydataV1().Pgtasks(ns).Get(ctx, task.Spec.Name, metav1.GetOptions{})
226-
if err != nil {
227-
// a big time error if this occurs
228-
return false
229-
}
230-
231-
if tmp.Spec.Parameters[config.LABEL_FAILOVER_STARTED] == "" {
232-
return false
233-
}
234-
235-
return true
236-
}
237-
238210
// de-dupe logic for a delete data, if the delete data job started
239211
// parameter is set, it means a delete data job has already been
240212
// started on this

0 commit comments

Comments
 (0)