Skip to content

Commit 19de963

Browse files
jkatzJonathan S. Katz
authored and committed
Existence check for primary Deployment before scaling
During the cluster initialization process, there is a step where the Deployment object representing the primary is explicitly scaled from 0 to 1 Pods. However, there is a case that may trigger the Deployment scaling before the Deployment is created. As such, we can add a guard to check that the Deployment is actually created before proceeding with the scaling operation. This patch also modifies a debug line around the cluster Deployment creation, as the debug output came after the point at which the code may have exited due to error.
1 parent 611ce30 commit 19de963

File tree

2 files changed

+49
-6
lines changed

2 files changed

+49
-6
lines changed

internal/controller/pod/inithandler.go

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ limitations under the License.
1818
import (
1919
"fmt"
2020
"strconv"
21+
"time"
2122

2223
"github.com/crunchydata/postgres-operator/internal/config"
2324
"github.com/crunchydata/postgres-operator/internal/controller"
@@ -28,12 +29,14 @@ import (
2829
taskoperator "github.com/crunchydata/postgres-operator/internal/operator/task"
2930
"github.com/crunchydata/postgres-operator/internal/util"
3031
crv1 "github.com/crunchydata/postgres-operator/pkg/apis/crunchydata.com/v1"
32+
33+
log "github.com/sirupsen/logrus"
3134
apiv1 "k8s.io/api/core/v1"
3235
kerrors "k8s.io/apimachinery/pkg/api/errors"
3336
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
37+
"k8s.io/apimachinery/pkg/fields"
3438
"k8s.io/apimachinery/pkg/types"
35-
36-
log "github.com/sirupsen/logrus"
39+
"k8s.io/apimachinery/pkg/util/wait"
3740
)
3841

3942
// handleClusterInit is responsible for proceeding with initialization of the PG cluster once the
@@ -85,6 +88,16 @@ func (c *Controller) handleBackRestRepoInit(newPod *apiv1.Pod, cluster *crv1.Pgc
8588
return nil
8689
}
8790

91+
// first: a sanity check that there exists a primary deployment to scale. this
92+
// is to attempt to avoid any silent failures in the deployment scaling
93+
// function.
94+
//
95+
// If we do encounter an error, we will proceed in case the deployment becomes
96+
// available after.
97+
if err := c.waitForPrimaryDeployment(cluster); err != nil {
98+
log.Warn(err)
99+
}
100+
88101
clusterInfo, err := clusteroperator.ScaleClusterDeployments(c.Client, *cluster, 1,
89102
true, false, false, false)
90103
if err != nil {
@@ -288,3 +301,33 @@ func (c *Controller) labelPostgresPodAndDeployment(newpod *apiv1.Pod) {
288301
}
289302

290303
}
304+
305+
// waitForPrimaryDeployment checks to see that a primary deployment is
306+
// available. It does not check readiness, only that the deployment exists. This
307+
// is used before scaling to ensure scaling does not fail silently.
//
// The deployment is looked up in the cluster's namespace by a selector built
// from the cluster name, the database label, and the deployment name read from
// the cluster's config.ANNOTATION_CURRENT_PRIMARY annotation. The lookup is
// polled every 5 seconds for up to 60 seconds. It returns nil once the List
// call succeeds with at least one item, and otherwise an error that names the
// primary deployment and reports that the lookup timeout was reached.
308+
func (c *Controller) waitForPrimaryDeployment(cluster *crv1.Pgcluster) error {
309+
primaryDeploymentName := cluster.Annotations[config.ANNOTATION_CURRENT_PRIMARY]
310+
options := metav1.ListOptions{
311+
LabelSelector: fields.AndSelectors(
312+
fields.OneTermEqualSelector(config.LABEL_PG_CLUSTER, cluster.Name),
313+
fields.OneTermEqualSelector(config.LABEL_PG_DATABASE, config.LABEL_TRUE),
314+
fields.OneTermEqualSelector(config.LABEL_DEPLOYMENT_NAME, primaryDeploymentName),
315+
).String(),
316+
}
317+
318+
// start polling to see if the primary deployment is created
319+
if err := wait.PollImmediate(5*time.Second, 60*time.Second, func() (bool, error) {
320+
// check to see if the deployment exists
321+
d, err := c.Client.AppsV1().Deployments(cluster.Namespace).List(options)
322+
323+
if err != nil {
324+
log.Warnf("could not find primary deployment for scaling: %s", err)
325+
}
326+
327+
// a List error is only logged above and is never returned from this
// condition function (the second result is always nil), so a failed List
// counts as "not found yet" rather than ending the poll. err is tested first
// so d.Items is only read after a successful List.
return err == nil && len(d.Items) > 0, nil
328+
}); err != nil {
329+
return fmt.Errorf("primary deployment lookup timeout reached for %q", primaryDeploymentName)
330+
}
331+
332+
return nil
333+
}

internal/operator/cluster/clusterlogic.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -141,15 +141,15 @@ func addClusterDeployments(clientset kubeapi.Interface,
141141
deploymentFields := getClusterDeploymentFields(clientset, cl,
142142
dataVolume, walVolume, tablespaceVolumes)
143143

144+
if operator.CRUNCHY_DEBUG {
145+
_ = config.DeploymentTemplate.Execute(os.Stdout, deploymentFields)
146+
}
147+
144148
var primaryDoc bytes.Buffer
145149
if err := config.DeploymentTemplate.Execute(&primaryDoc, deploymentFields); err != nil {
146150
return err
147151
}
148152

149-
if operator.CRUNCHY_DEBUG {
150-
config.DeploymentTemplate.Execute(os.Stdout, deploymentFields)
151-
}
152-
153153
deployment := &appsv1.Deployment{}
154154
if err := json.Unmarshal(primaryDoc.Bytes(), deployment); err != nil {
155155
return err

0 commit comments

Comments
 (0)