Skip to content

Commit 4642861

Browse files
Jonathan S. Katzjkatz
authored andcommitted
Enable user-defined forced failovers
While this was the default behavior in Operator's past when using `pgo failover`, a previous commit changed the Operator internals to leverage a "controlled switchover" which is a bit nicer (i.e. it only works if there is a healthy instance to fail over to). However, there are situations where one must force a failover, and as such, we need to allow for one to do so. This adds the `--force` flag to `pgo failover` to allow for a forced failover. Note that `--target` must explicitly be set when forcing a failover.
1 parent a4dc532 commit 4642861

File tree

10 files changed

+259
-91
lines changed

10 files changed

+259
-91
lines changed

cmd/pgo/cmd/failover.go

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package cmd
1919
import (
2020
"fmt"
2121
"os"
22+
"strings"
2223

2324
"github.com/crunchydata/postgres-operator/cmd/pgo/api"
2425
"github.com/crunchydata/postgres-operator/cmd/pgo/util"
@@ -60,29 +61,38 @@ var failoverCmd = &cobra.Command{
6061
func init() {
6162
RootCmd.AddCommand(failoverCmd)
6263

63-
failoverCmd.Flags().BoolVarP(&Query, "query", "", false, "Prints the list of failover candidates.")
64+
failoverCmd.Flags().BoolVar(&Force, "force", false, "Force the failover to occur, regardless "+
65+
"of the health of the target instance. Must be used with \"--target\".")
6466
failoverCmd.Flags().BoolVar(&NoPrompt, "no-prompt", false, "No command line confirmation.")
67+
failoverCmd.Flags().BoolVar(&Query, "query", false, "Prints the list of failover candidates.")
6568
failoverCmd.Flags().StringVarP(&Target, "target", "", "", "The replica target which the failover will occur on.")
6669
}
6770

6871
// createFailover ....
6972
func createFailover(args []string, ns string) {
7073
log.Debugf("createFailover called %v", args)
7174

72-
request := new(msgs.CreateFailoverRequest)
73-
request.Namespace = ns
74-
request.ClusterName = args[0]
75-
request.Target = Target
76-
request.ClientVersion = msgs.PGO_VERSION
75+
request := &msgs.CreateFailoverRequest{
76+
ClientVersion: msgs.PGO_VERSION,
77+
ClusterName: args[0],
78+
Force: Force,
79+
Namespace: ns,
80+
Target: Target,
81+
}
7782

7883
response, err := api.CreateFailover(httpclient, &SessionCredentials, request)
7984
if err != nil {
8085
fmt.Println("Error: " + err.Error())
81-
os.Exit(2)
86+
os.Exit(1)
8287
}
8388

8489
if response.Status.Code != msgs.Ok {
85-
fmt.Println("Error: " + response.Status.Msg)
90+
fmt.Println("Error:", strings.ReplaceAll(response.Status.Msg, "Error: ", ""))
91+
92+
if strings.Contains(response.Status.Msg, "no primary") {
93+
fmt.Println(`Hint: Try using the "--force" flag`)
94+
}
95+
8696
os.Exit(1)
8797
}
8898

cmd/pgo/cmd/flags.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,16 @@ var DeleteData bool
2222
// even after a cluster is deleted. This is DEPRECATED
2323
var KeepData bool
2424

25-
var Query bool
25+
var (
26+
// Force indicates that the "force" action should be taken for that step. This
27+
// is different than NoPrompt as "Force" is for indicating that the API server
28+
// must try at all costs
29+
Force bool
30+
31+
// Query indicates that the attempted request is "querying" information
32+
// instead of taking some action
33+
Query bool
34+
)
2635

2736
var (
2837
Target string

docs/content/pgo-client/reference/pgo_failover.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ pgo failover [flags]
2323
### Options
2424

2525
```
26+
--force Force the failover to occur, regardless of the health of the target instance. Must be used with "--target".
2627
-h, --help help for failover
2728
--no-prompt No command line confirmation.
2829
--query Prints the list of failover candidates.
@@ -46,4 +47,4 @@ pgo failover [flags]
4647

4748
* [pgo](/pgo-client/reference/pgo/) - The pgo command line interface.
4849

49-
###### Auto generated by spf13/cobra on 1-Jan-2021
50+
###### Auto generated by spf13/cobra on 4-Jan-2021

internal/apiserver/failoverservice/failoverimpl.go

Lines changed: 51 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,19 @@ import (
1919
"context"
2020
"errors"
2121
"fmt"
22+
"strings"
2223

2324
"github.com/crunchydata/postgres-operator/internal/apiserver"
2425
"github.com/crunchydata/postgres-operator/internal/config"
2526
"github.com/crunchydata/postgres-operator/internal/operator"
2627
"github.com/crunchydata/postgres-operator/internal/util"
2728
crv1 "github.com/crunchydata/postgres-operator/pkg/apis/crunchydata.com/v1"
2829
msgs "github.com/crunchydata/postgres-operator/pkg/apiservermsgs"
30+
2931
log "github.com/sirupsen/logrus"
32+
v1 "k8s.io/api/core/v1"
3033
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
34+
"k8s.io/apimachinery/pkg/fields"
3135
)
3236

3337
// CreateFailover is the API endpoint for triggering a manual failover of a
@@ -60,18 +64,24 @@ func CreateFailover(request *msgs.CreateFailoverRequest, ns, pgouser string) msg
6064
return resp
6165
}
6266

63-
if request.Target != "" {
64-
if err := isValidFailoverTarget(request.Target, request.ClusterName, ns); err != nil {
65-
resp.Status.Code = msgs.Error
66-
resp.Status.Msg = err.Error()
67-
return resp
68-
}
67+
if err := isValidFailoverTarget(request); err != nil {
68+
resp.Status.Code = msgs.Error
69+
resp.Status.Msg = err.Error()
70+
return resp
6971
}
7072

71-
// perform the switchover
72-
if err := operator.Switchover(apiserver.Clientset, apiserver.RESTConfig, cluster, request.Target); err != nil {
73+
// perform the switchover or failover, depending on which flag is selected
74+
// if we are forcing the failover, we need to use "Failover", otherwise we
75+
// perform a controlled switchover
76+
if request.Force {
77+
err = operator.Failover(apiserver.Clientset, apiserver.RESTConfig, cluster, request.Target)
78+
} else {
79+
err = operator.Switchover(apiserver.Clientset, apiserver.RESTConfig, cluster, request.Target)
80+
}
81+
82+
if err != nil {
7383
resp.Status.Code = msgs.Error
74-
resp.Status.Msg = err.Error()
84+
resp.Status.Msg = strings.ReplaceAll(err.Error(), "master", "primary")
7585
return resp
7686
}
7787

@@ -161,31 +171,53 @@ func validateClusterName(clusterName, ns string) (*crv1.Pgcluster, error) {
161171
// specified, and then ensuring the PG pod created by the deployment is not the current primary.
162172
// If the deployment is not found, or if the pod is the current primary, an error will be returned.
163173
// Otherwise the deployment is returned.
164-
func isValidFailoverTarget(deployName, clusterName, ns string) error {
174+
func isValidFailoverTarget(request *msgs.CreateFailoverRequest) error {
165175
ctx := context.TODO()
166176

177+
// if we're not forcing a failover and the target is blank, we can
178+
// return here
179+
// However, if we are forcing a failover and the target is blank, then we do
180+
// have an error
181+
if request.Target == "" {
182+
if !request.Force {
183+
return nil
184+
}
185+
186+
return fmt.Errorf("target is required when forcing a failover.")
187+
}
188+
167189
// Using the following label selector, ensure the deployment specified using deployName exists in the
168190
// cluster specified using clusterName:
169191
// pg-cluster=clusterName,deployment-name=deployName
170-
selector := config.LABEL_PG_CLUSTER + "=" + clusterName + "," + config.LABEL_DEPLOYMENT_NAME + "=" + deployName
171-
deployments, err := apiserver.Clientset.
172-
AppsV1().Deployments(ns).
173-
List(ctx, metav1.ListOptions{LabelSelector: selector})
192+
options := metav1.ListOptions{
193+
LabelSelector: fields.AndSelectors(
194+
fields.OneTermEqualSelector(config.LABEL_PG_CLUSTER, request.ClusterName),
195+
fields.OneTermEqualSelector(config.LABEL_DEPLOYMENT_NAME, request.Target),
196+
).String(),
197+
}
198+
deployments, err := apiserver.Clientset.AppsV1().Deployments(request.Namespace).List(ctx, options)
199+
174200
if err != nil {
175201
log.Error(err)
176202
return err
177203
} else if len(deployments.Items) == 0 {
178-
return fmt.Errorf("no target found named %s", deployName)
204+
return fmt.Errorf("no target found named %s", request.Target)
179205
} else if len(deployments.Items) > 1 {
180-
return fmt.Errorf("more than one target found named %s", deployName)
206+
return fmt.Errorf("more than one target found named %s", request.Target)
181207
}
182208

183209
// Using the following label selector, determine if the target specified is the current
184210
// primary for the cluster and return an error if it is:
185211
// pg-cluster=clusterName,deployment-name=deployName,role=primary
186-
selector = config.LABEL_PG_CLUSTER + "=" + clusterName + "," + config.LABEL_DEPLOYMENT_NAME + "=" + deployName +
187-
"," + config.LABEL_PGHA_ROLE + "=" + config.LABEL_PGHA_ROLE_PRIMARY
188-
pods, _ := apiserver.Clientset.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{LabelSelector: selector})
212+
options.FieldSelector = fields.OneTermEqualSelector("status.phase", string(v1.PodRunning)).String()
213+
options.LabelSelector = fields.AndSelectors(
214+
fields.OneTermEqualSelector(config.LABEL_PG_CLUSTER, request.ClusterName),
215+
fields.OneTermEqualSelector(config.LABEL_DEPLOYMENT_NAME, request.Target),
216+
fields.OneTermEqualSelector(config.LABEL_PGHA_ROLE, config.LABEL_PGHA_ROLE_PRIMARY),
217+
).String()
218+
219+
pods, _ := apiserver.Clientset.CoreV1().Pods(request.Namespace).List(ctx, options)
220+
189221
if len(pods.Items) > 0 {
190222
return fmt.Errorf("The primary database cannot be selected as a failover target")
191223
}

internal/controller/job/backresthandler.go

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ import (
2626
"github.com/crunchydata/postgres-operator/internal/config"
2727
"github.com/crunchydata/postgres-operator/internal/controller"
2828
"github.com/crunchydata/postgres-operator/internal/kubeapi"
29+
"github.com/crunchydata/postgres-operator/internal/operator"
2930
"github.com/crunchydata/postgres-operator/internal/operator/backrest"
30-
clusteroperator "github.com/crunchydata/postgres-operator/internal/operator/cluster"
3131
crv1 "github.com/crunchydata/postgres-operator/pkg/apis/crunchydata.com/v1"
3232
)
3333

@@ -92,9 +92,8 @@ func (c *Controller) handleBackrestBackupUpdate(job *apiv1.Job) error {
9292
job.ObjectMeta.Namespace)
9393

9494
} else if labels[config.LABEL_PGHA_BACKUP_TYPE] == crv1.BackupTypeFailover {
95-
err := clusteroperator.RemovePrimaryOnRoleChangeTag(c.Client, c.Client.Config,
96-
labels[config.LABEL_PG_CLUSTER], job.ObjectMeta.Namespace)
97-
if err != nil {
95+
if err := operator.RemovePrimaryOnRoleChangeTag(c.Client, c.Client.Config,
96+
labels[config.LABEL_PG_CLUSTER], job.ObjectMeta.Namespace); err != nil {
9897
log.Error(err)
9998
return err
10099
}

internal/operator/common.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ import (
3232
v1 "k8s.io/api/core/v1"
3333
kerrors "k8s.io/apimachinery/pkg/api/errors"
3434
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
35+
"k8s.io/apimachinery/pkg/fields"
3536
"k8s.io/client-go/kubernetes"
3637
)
3738

@@ -302,6 +303,49 @@ func SetContainerImageOverride(containerImageName string, container *v1.Containe
302303
}
303304
}
304305

306+
// getCandidatePod tries to get the candidate Pod for a switchover or failover.
307+
// If "candidateName" is provided, it will seek out the specific PostgreSQL
308+
// instance. Otherwise, it will just attempt to find a running Pod.
309+
//
310+
// If such a Pod cannot be found, we likely cannot use the instance for a
311+
// switchover for failover candidate as it is not running.
312+
func getCandidatePod(clientset kubernetes.Interface, cluster *crv1.Pgcluster, candidateName string) (*v1.Pod, error) {
313+
ctx := context.TODO()
314+
315+
// build the label selector. we are looking for any PostgreSQL instance within
316+
// this cluster, so that part is easy
317+
labelSelector := fields.Set{
318+
config.LABEL_PG_CLUSTER: cluster.Name,
319+
config.LABEL_PG_DATABASE: config.LABEL_TRUE,
320+
}
321+
322+
// if a candidateName is supplied, use that as part of the label selector to
323+
// find the candidate Pod
324+
if candidateName != "" {
325+
labelSelector[config.LABEL_DEPLOYMENT_NAME] = candidateName
326+
}
327+
328+
// ensure the Pod is part of the cluster and is running
329+
options := metav1.ListOptions{
330+
FieldSelector: fields.OneTermEqualSelector("status.phase", string(v1.PodRunning)).String(),
331+
LabelSelector: labelSelector.String(),
332+
}
333+
334+
pods, err := clientset.CoreV1().Pods(cluster.Namespace).List(ctx, options)
335+
if err != nil {
336+
return nil, err
337+
}
338+
339+
// if no Pods are found, then also return an error as we then cannot switch
340+
// over to this instance
341+
if len(pods.Items) == 0 {
342+
return nil, fmt.Errorf("no pods found for instance %s", candidateName)
343+
}
344+
345+
// the list returns multiple Pods, so just return the first one
346+
return &pods.Items[0], nil
347+
}
348+
305349
// initializeContainerImageOverrides initializes the container image overrides
306350
// that could be set if there are any `RELATED_IMAGE_*` environmental variables
307351
func initializeContainerImageOverrides() {
Lines changed: 70 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,4 @@
1-
// Package cluster holds the cluster CRD logic and definitions
2-
// A cluster is comprised of a primary service, replica service,
3-
// primary deployment, and replica deployment
4-
package cluster
1+
package operator
52

63
/*
74
Copyright 2018 - 2021 Crunchy Data Solutions, Inc.
@@ -24,6 +21,7 @@ import (
2421

2522
"github.com/crunchydata/postgres-operator/internal/config"
2623
"github.com/crunchydata/postgres-operator/internal/kubeapi"
24+
crv1 "github.com/crunchydata/postgres-operator/pkg/apis/crunchydata.com/v1"
2725
log "github.com/sirupsen/logrus"
2826
v1 "k8s.io/api/core/v1"
2927
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -35,6 +33,56 @@ import (
3533
var roleChangeCmd = []string{"patronictl", "edit-config", "--force",
3634
"--set", "tags.primary_on_role_change=null"}
3735

36+
// Failover performs a failover to a PostgreSQL cluster, which is effectively
37+
// a "forced switchover." In other words, failover will force ensure that
38+
// there is a primary available.
39+
//
40+
// NOTE: This is reserve as the "last resort" case. If you want a controlled
41+
// failover, you want "Switchover".
42+
//
43+
// A target must be specified. The target should contain the name of the target
44+
// instances (Deployment), is not empty then we will attempt to locate that
45+
// target Pod.
46+
//
47+
// The target Pod name, called the candidate is passed into the failover
48+
// command generation function, and then is ultimately used in the failover.
49+
func Failover(clientset kubernetes.Interface, restConfig *rest.Config, cluster *crv1.Pgcluster, target string) error {
50+
// ensure target is not empty
51+
if target == "" {
52+
return fmt.Errorf("failover requires a target instance to be specified.")
53+
}
54+
55+
// When the target is specified, we will attempt to get the Pod that
56+
// represents that target.
57+
//
58+
// If it is not specified, then we will attempt to get any Pod.
59+
//
60+
// If either errors, we will return an error
61+
pod, err := getCandidatePod(clientset, cluster, target)
62+
63+
if err != nil {
64+
return err
65+
}
66+
67+
candidate := pod.Name
68+
69+
// generate the command
70+
cmd := generatePostgresFailoverCommand(cluster.Name, candidate)
71+
72+
// good to generally log which instances are being used in the failover
73+
log.Infof("failover started for cluster %q", cluster.Name)
74+
75+
if _, stderr, err := kubeapi.ExecToPodThroughAPI(restConfig, clientset,
76+
cmd, "database", pod.Name, cluster.Namespace, nil); err != nil {
77+
return fmt.Errorf(stderr)
78+
}
79+
80+
log.Infof("failover completed for cluster %q", cluster.Name)
81+
82+
// and that's all
83+
return nil
84+
}
85+
3886
// RemovePrimaryOnRoleChangeTag sets the 'primary_on_role_change' tag to null in the
3987
// Patroni DCS, effectively removing the tag. This is accomplished by exec'ing into
4088
// the primary PG pod, and sending a patch request to update the appropriate data (i.e.
@@ -77,3 +125,21 @@ func RemovePrimaryOnRoleChangeTag(clientset kubernetes.Interface, restconfig *re
77125
}
78126
return nil
79127
}
128+
129+
// generatePostgresFailoverCommand creates the command that is used to issue
130+
// a failover command (ensure that there is a promoted primary).
131+
//
132+
// There are two ways to run this command:
133+
//
134+
// 1. Pass in only a clusterName. Patroni will select the best candidate
135+
// 2. Pass in a clusterName AND a target candidate name, which has to be the
136+
// name of a Pod
137+
func generatePostgresFailoverCommand(clusterName, candidate string) []string {
138+
cmd := []string{"patronictl", "failover", "--force", clusterName}
139+
140+
if candidate != "" {
141+
cmd = append(cmd, "--candidate", candidate)
142+
}
143+
144+
return cmd
145+
}

0 commit comments

Comments
 (0)