CrunchyData
diff --git a/‎cmd/pgo/cmd/failover.go‎
Lines changed: 10 additions & 11 deletions b/‎cmd/pgo/cmd/failover.go‎
Lines changed: 10 additions & 11 deletions
diff --git a/‎docs/content/pgo-client/common-tasks.md‎
Lines changed: 21 additions & 6 deletions b/‎docs/content/pgo-client/common-tasks.md‎
Lines changed: 21 additions & 6 deletions
diff --git a/‎docs/content/pgo-client/reference/pgo_failover.md‎
Lines changed: 8 additions & 3 deletions b/‎docs/content/pgo-client/reference/pgo_failover.md‎
Lines changed: 8 additions & 3 deletions
diff --git a/‎docs/content/tutorial/high-availability.md‎
Lines changed: 22 additions & 1 deletion b/‎docs/content/tutorial/high-availability.md‎
Lines changed: 22 additions & 1 deletion
diff --git a/‎internal/apiserver/failoverservice/failoverimpl.go‎
Lines changed: 23 additions & 48 deletions b/‎internal/apiserver/failoverservice/failoverimpl.go‎
Lines changed: 23 additions & 48 deletions
diff --git a/‎internal/config/labels.go‎
Lines changed: 1 addition & 8 deletions b/‎internal/config/labels.go‎
Lines changed: 1 addition & 8 deletions
diff --git a/‎internal/controller/pgtask/pgtaskcontroller.go‎
Lines changed: 0 additions & 28 deletions b/‎internal/controller/pgtask/pgtaskcontroller.go‎
Lines changed: 0 additions & 28 deletions
@@ -32,7 +32,12 @@ var failoverCmd = &cobra.Command{
 	Short: "Performs a manual failover",
 	Long: `Performs a manual failover. For example:
 
-	pgo failover mycluster`,
+	# have the operator select the best target candidate
+	pgo failover hippo
+	# get a list of target candidates
+	pgo failover hippo --query
+	# failover to a specific target candidate
+	pgo failover hippo --target=hippo-abcd`,
 	Run: func(cmd *cobra.Command, args []string) {
 		if Namespace == "" {
 			Namespace = PGONamespace
@@ -44,10 +49,6 @@ var failoverCmd = &cobra.Command{
 			if Query {
 				queryFailover(args, Namespace)
 			} else if util.AskForConfirmation(NoPrompt, "") {
-				if Target == "" {
-					fmt.Println(`Error: The --target flag is required for failover.`)
-					return
-				}
 				createFailover(args, Namespace)
 			} else {
 				fmt.Println("Aborting...")
@@ -80,14 +81,12 @@ func createFailover(args []string, ns string) {
 		os.Exit(2)
 	}
 
-	if response.Status.Code == msgs.Ok {
-		for k := range response.Results {
-			fmt.Println(response.Results[k])
-		}
-	} else {
+	if response.Status.Code != msgs.Ok {
 		fmt.Println("Error: " + response.Status.Msg)
-		os.Exit(2)
+		os.Exit(1)
 	}
+
+	fmt.Println(response.Results)
 }
 
 // queryFailover is a helper function to return the user information about the
 
@@ -816,13 +816,26 @@ pgo failover --query hacluster
 
 The PostgreSQL Operator is set up with an automated failover system based on
 distributed consensus, but there may be times where you wish to have your
-cluster manually failover. If you wish to have your cluster manually failover,
-first, query your cluster to determine which failover targets are available.
-The query command also provides information that may help your decision, such as
-replication lag:
+cluster manually failover. There are two ways to issue a manual failover to
+your PostgreSQL cluster:
+
+1. Allow the PostgreSQL Operator to select the best replica candidate to
+fail over to.
+2. Select your own replica candidate to fail over to.
+
+To have the PostgreSQL Operator select the best replica candidate for failover,
+all you need to do is execute the following command:
+
+```
+pgo failover hacluster
+```
+
+If you wish to select the replica candidate yourself, you must first query your
+cluster to determine which failover targets are available. The query command
+also provides information that may help your decision, such as replication lag:
 
 ```shell
-pgo failover --query hacluster
+pgo failover hacluster --query
 ```
 
 Once you have selected the replica that is best for you to fail over to, you can
@@ -833,7 +846,9 @@ pgo failover hacluster --target=hacluster-abcd
 ```
 
 where `hacluster-abcd` is the name of the PostgreSQL instance that you want to
-promote to become the new primary
+promote to become the new primary.
+
+Both methods perform the failover immediately upon execution.
 
 #### Destroying a Replica
 
 
@@ -9,7 +9,12 @@ Performs a manual failover
 
 Performs a manual failover. For example:
 
-	pgo failover mycluster
+	# have the operator select the best target candidate
+	pgo failover hippo
+	# get a list of target candidates
+	pgo failover hippo --query
+	# failover to a specific target candidate
+	pgo failover hippo --target=hippo-abcd
 
 ```
 pgo failover [flags]
@@ -27,7 +32,7 @@ pgo failover [flags]
 ### Options inherited from parent commands
 
 ```
-      --apiserver-url string     The URL for the PostgreSQL Operator apiserver that will process the request from the pgo client.
+      --apiserver-url string     The URL for the PostgreSQL Operator apiserver that will process the request from the pgo client. Note that the URL should **not** end in a '/'.
       --debug                    Enable additional output for debugging.
       --disable-tls              Disable TLS authentication to the Postgres Operator.
       --exclude-os-trust         Exclude CA certs from OS default trust store
@@ -41,4 +46,4 @@ pgo failover [flags]
 
 * [pgo](/pgo-client/reference/pgo/)	 - The pgo command line interface.
 
-###### Auto generated by spf13/cobra on 1-Oct-2020
+###### Auto generated by spf13/cobra on 1-Jan-2021
@@ -62,7 +62,28 @@ pgo scaledown hippo --target=hippo-ojnd
 
 ## Manual Failover
 
-Each PostgreSQL cluster will manage its own availability. If you wish to manually fail over, you will need to use the [`pgo failover`]({{< relref "pgo-client/reference/pgo_failover.md">}}) command. First, determine which instance you want to fail over to:
+Each PostgreSQL cluster will manage its own availability. If you wish to manually fail over, you will need to use the [`pgo failover`]({{< relref "pgo-client/reference/pgo_failover.md">}}) command.
+
+There are two ways to issue a manual failover to your PostgreSQL cluster:
+
+1. Allow for the PostgreSQL Operator to select the best replica candidate for failover.
+2. Select your own replica candidate for failover.
+
+Both methods are detailed below.
+
+### Manual Failover - PostgreSQL Operator Candidate Selection
+
+To have the PostgreSQL Operator select the best replica candidate for failover, all you need to do is execute the following command:
+
+```
+pgo failover hippo
+```
+
+The PostgreSQL Operator will determine which is the best replica candidate to fail over to, and take into account factors such as replication lag and current timeline.
+
+### Manual Failover - Manual Selection
+
+If you wish to select your own replica candidate, you must first query your cluster to determine which instance you want to fail over to. You can do so with the following command:
 
 ```
 pgo failover hippo --query
 
@@ -18,29 +18,32 @@ limitations under the License.
 import (
 	"context"
 	"errors"
+	"fmt"
 
 	"github.com/crunchydata/postgres-operator/internal/apiserver"
 	"github.com/crunchydata/postgres-operator/internal/config"
+	"github.com/crunchydata/postgres-operator/internal/operator"
 	"github.com/crunchydata/postgres-operator/internal/util"
 	crv1 "github.com/crunchydata/postgres-operator/pkg/apis/crunchydata.com/v1"
 	msgs "github.com/crunchydata/postgres-operator/pkg/apiservermsgs"
 	log "github.com/sirupsen/logrus"
-	v1 "k8s.io/api/apps/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
-//  CreateFailover ...
+// CreateFailover is the API endpoint for triggering a manual failover of a
+// cluster. It performs this function inline, i.e. it does not trigger any
+// asynchronous methods.
+//
 // pgo failover mycluster
-// pgo failover all
-// pgo failover --selector=name=mycluster
 func CreateFailover(request *msgs.CreateFailoverRequest, ns, pgouser string) msgs.CreateFailoverResponse {
-	ctx := context.TODO()
+	log.Debugf("create failover called for %s", request.ClusterName)
 
-	var err error
-	resp := msgs.CreateFailoverResponse{}
-	resp.Status.Code = msgs.Ok
-	resp.Status.Msg = ""
-	resp.Results = make([]string, 0)
+	resp := msgs.CreateFailoverResponse{
+		Results: "",
+		Status: msgs.Status{
+			Code: msgs.Ok,
+		},
+	}
 
 	cluster, err := validateClusterName(request.ClusterName, ns)
 	if err != nil {
@@ -58,49 +61,21 @@ func CreateFailover(request *msgs.CreateFailoverRequest, ns, pgouser string) msg
 	}
 
 	if request.Target != "" {
-		_, err = isValidFailoverTarget(request.Target, request.ClusterName, ns)
-		if err != nil {
+		if err := isValidFailoverTarget(request.Target, request.ClusterName, ns); err != nil {
 			resp.Status.Code = msgs.Error
 			resp.Status.Msg = err.Error()
 			return resp
 		}
 	}
 
-	log.Debugf("create failover called for %s", request.ClusterName)
-
-	// Create a pgtask
-	spec := crv1.PgtaskSpec{}
-	spec.Namespace = ns
-	spec.Name = request.ClusterName + "-" + config.LABEL_FAILOVER
-
-	// previous failovers will leave a pgtask so remove it first
-	_ = apiserver.Clientset.CrunchydataV1().Pgtasks(ns).Delete(ctx, spec.Name, metav1.DeleteOptions{})
-
-	spec.TaskType = crv1.PgtaskFailover
-	spec.Parameters = make(map[string]string)
-	spec.Parameters[request.ClusterName] = request.ClusterName
-
-	labels := make(map[string]string)
-	labels["target"] = request.Target
-	labels[config.LABEL_PG_CLUSTER] = request.ClusterName
-	labels[config.LABEL_PGOUSER] = pgouser
-
-	newInstance := &crv1.Pgtask{
-		ObjectMeta: metav1.ObjectMeta{
-			Name:   spec.Name,
-			Labels: labels,
-		},
-		Spec: spec,
-	}
-
-	_, err = apiserver.Clientset.CrunchydataV1().Pgtasks(ns).Create(ctx, newInstance, metav1.CreateOptions{})
-	if err != nil {
+	// perform the switchover
+	if err := operator.Switchover(apiserver.Clientset, apiserver.RESTConfig, cluster, request.Target); err != nil {
 		resp.Status.Code = msgs.Error
 		resp.Status.Msg = err.Error()
 		return resp
 	}
 
-	resp.Results = append(resp.Results, "created Pgtask (failover) for cluster "+request.ClusterName)
+	resp.Results = "failover success for cluster " + cluster.Name
 
 	return resp
 }
@@ -186,7 +161,7 @@ func validateClusterName(clusterName, ns string) (*crv1.Pgcluster, error) {
 // specified, and then ensuring the PG pod created by the deployment is not the current primary.
 // If the deployment is not found, or if the pod is the current primary, an error will be returned.
 // Otherwise nil is returned.
-func isValidFailoverTarget(deployName, clusterName, ns string) (*v1.Deployment, error) {
+func isValidFailoverTarget(deployName, clusterName, ns string) error {
 	ctx := context.TODO()
 
 	// Using the following label selector, ensure the deployment specified using deployName exists in the
@@ -198,11 +173,11 @@ func isValidFailoverTarget(deployName, clusterName, ns string) (*v1.Deployment,
 		List(ctx, metav1.ListOptions{LabelSelector: selector})
 	if err != nil {
 		log.Error(err)
-		return nil, err
+		return err
 	} else if len(deployments.Items) == 0 {
-		return nil, errors.New("no target found named " + deployName)
+		return fmt.Errorf("no target found named %s", deployName)
 	} else if len(deployments.Items) > 1 {
-		return nil, errors.New("more than one target found named " + deployName)
+		return fmt.Errorf("more than one target found named %s", deployName)
 	}
 
 	// Using the following label selector, determine if the target specified is the current
@@ -212,8 +187,8 @@ func isValidFailoverTarget(deployName, clusterName, ns string) (*v1.Deployment,
 		"," + config.LABEL_PGHA_ROLE + "=" + config.LABEL_PGHA_ROLE_PRIMARY
 	pods, _ := apiserver.Clientset.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{LabelSelector: selector})
 	if len(pods.Items) > 0 {
-		return nil, errors.New("The primary database cannot be selected as a failover target")
+		return fmt.Errorf("The primary database cannot be selected as a failover target")
 	}
 
-	return &deployments.Items[0], nil
+	return nil
 }
@@ -27,19 +27,14 @@ const (
 
 const LABEL_PGTASK = "pg-task"
 
-const (
-	LABEL_FAILOVER = "failover"
-	LABEL_RESTART  = "restart"
-)
+const LABEL_RESTART = "restart"
 
 const (
-	LABEL_TARGET = "target"
 	LABEL_RMDATA = "pgrmdata"
 )
 
 const (
 	LABEL_PGPOLICY           = "pgpolicy"
-	LABEL_INGEST             = "ingest"
 	LABEL_PGREMOVE           = "pgremove"
 	LABEL_PVCNAME            = "pvcname"
 	LABEL_EXPORTER           = "crunchy-postgres-exporter"
@@ -179,8 +174,6 @@ const (
 	LABEL_PGO_UPDATED_BY        = "pgo-updated-by"
 )
 
-const LABEL_FAILOVER_STARTED = "failover-started"
-
 const GLOBAL_CUSTOM_CONFIGMAP = "pgo-custom-pg-config"
 
 const (
 
@@ -122,13 +122,6 @@ func (c *Controller) processNextItem() bool {
 	case crv1.PgtaskUpgrade:
 		log.Debug("upgrade task added")
 		clusteroperator.AddUpgrade(c.Client, tmpTask, keyNamespace)
-	case crv1.PgtaskFailover:
-		log.Debug("failover task added")
-		if !dupeFailover(c.Client, tmpTask, keyNamespace) {
-			clusteroperator.FailoverBase(keyNamespace, c.Client, tmpTask, c.Client.Config)
-		} else {
-			log.Debugf("skipping duplicate onAdd failover task %s/%s", keyNamespace, keyResourceName)
-		}
 	case crv1.PgtaskRollingUpdate:
 		log.Debug("rolling update task added")
 		// first, attempt to get the pgcluster object
@@ -164,9 +157,6 @@ func (c *Controller) processNextItem() bool {
 	case crv1.PgtaskpgRestore:
 		log.Debug("pgDump restore task added")
 		pgdumpoperator.Restore(keyNamespace, c.Client, tmpTask)
-
-	case crv1.PgtaskAutoFailover:
-		log.Debugf("autofailover task added %s", keyResourceName)
 	case crv1.PgtaskWorkflow:
 		log.Debugf("workflow task added [%s] ID [%s]", keyResourceName, tmpTask.Spec.Parameters[crv1.PgtaskWorkflowID])
 
@@ -217,24 +207,6 @@ func (c *Controller) AddPGTaskEventHandler() {
 	log.Debugf("pgtask Controller: added event handler to informer")
 }
 
-// de-dupe logic for a failover, if the failover started
-// parameter is set, it means a failover has already been
-// started on this
-func dupeFailover(clientset pgo.Interface, task *crv1.Pgtask, ns string) bool {
-	ctx := context.TODO()
-	tmp, err := clientset.CrunchydataV1().Pgtasks(ns).Get(ctx, task.Spec.Name, metav1.GetOptions{})
-	if err != nil {
-		// a big time error if this occurs
-		return false
-	}
-
-	if tmp.Spec.Parameters[config.LABEL_FAILOVER_STARTED] == "" {
-		return false
-	}
-
-	return true
-}
-
 // de-dupe logic for a delete data, if the delete data job started
 // parameter is set, it means a delete data job has already been
 // started on this