Skip to content

Commit 131a9b2

Browse files
gkechegegunesgithub-actions[bot]hors
authored
K8SPSMDB 1472 fixes on sts updates and df execution flows when pvc resizing (#2220)
* K8SPSMDB-1472 sts updates forbidden, logs fixes * validate that pod is running + retry the df command on pod if fails * fixes * fix go imports Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * simply logic * imports * re-read the sts for checking the annotation fixing both pvc resize flows * add some logs for debugging * add more checks on the e2e test * additional fixes * make node not found error debug * use less resources on the e2e test --------- Co-authored-by: Ege Güneş <ege.gunes@percona.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Viacheslav Sarzhan <slava.sarzhan@percona.com>
1 parent de1f714 commit 131a9b2

File tree

9 files changed

+102
-40
lines changed

9 files changed

+102
-40
lines changed

e2e-tests/pvc-auto-resize/conf/some-name.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ spec:
1313
enableVolumeScaling: true
1414
autoscaling:
1515
enabled: true
16-
triggerThresholdPercent: 80
16+
triggerThresholdPercent: 50
1717
growthStep: "2Gi"
1818
maxSize: "10Gi"
1919
backup:

e2e-tests/pvc-auto-resize/run

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -137,10 +137,10 @@ if [[ ${initial_size} != "1Gi" ]]; then
137137
exit 1
138138
fi
139139

140-
desc 'fill disk to trigger autoscaling threshold 80 percent'
141-
# 1Gi = 1024MB, we want to fill to ~85% to trigger the 80% threshold
142-
# Fill with 850MB to exceed threshold
143-
fill_disk_to_threshold "${cluster}-rs0-0" 80 850
140+
desc 'fill disk to trigger autoscaling threshold 50 percent'
141+
# 1Gi = 1024MB, we want to fill to ~60% to trigger the 50% threshold
142+
# Fill with 600MB to exceed threshold
143+
fill_disk_to_threshold "${cluster}-rs0-0" 50 600
144144

145145
desc 'wait for auto-resize to trigger'
146146
wait_for_auto_resize "mongod-data-${cluster}-rs0-0" "3Gi" 60 10
@@ -161,7 +161,7 @@ desc 'test namespace resource quota enforcement'
161161
apply_resourcequota 9Gi
162162

163163
echo "Filling disk again to trigger autoscaling with quota limit"
164-
fill_disk_to_threshold "${cluster}-rs0-0" 80 2400
164+
fill_disk_to_threshold "${cluster}-rs0-0" 50 1700
165165

166166
echo "Waiting to verify PVC does not resize beyond quota (should stay at 3Gi)"
167167
sleep 30
@@ -184,13 +184,17 @@ desc 'test maxSize limit'
184184
# Increase quota to allow maxSize testing (3 replicas * 10Gi maxSize = 30Gi)
185185
apply_resourcequota 30Gi
186186
echo "Continuing to fill disk to test maxSize enforcement"
187-
fill_disk_to_threshold "${cluster}-rs0-0" 80 4000
187+
fill_disk_to_threshold "${cluster}-rs0-0" 50 2900
188188
wait_for_auto_resize "mongod-data-${cluster}-rs0-0" "7Gi" 60 10
189+
wait_for_auto_resize "mongod-data-${cluster}-rs0-1" "7Gi" 60 10
190+
wait_for_auto_resize "mongod-data-${cluster}-rs0-2" "7Gi" 60 10
189191

190-
fill_disk_to_threshold "${cluster}-rs0-0" 80 5600
192+
fill_disk_to_threshold "${cluster}-rs0-0" 50 4000
191193
wait_for_auto_resize "mongod-data-${cluster}-rs0-0" "9Gi" 60 10
194+
wait_for_auto_resize "mongod-data-${cluster}-rs0-1" "9Gi" 60 10
195+
wait_for_auto_resize "mongod-data-${cluster}-rs0-2" "9Gi" 60 10
192196

193-
fill_disk_to_threshold "${cluster}-rs0-0" 80 7200
197+
fill_disk_to_threshold "${cluster}-rs0-0" 50 5200
194198

195199
final_size=$(get_pvc_size "mongod-data-${cluster}-rs0-0")
196200
if [[ ${final_size} == "10Gi" ]]; then

pkg/controller/perconaservermongodb/metrics_client.go

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,14 @@ import (
55
"context"
66
"strconv"
77
"strings"
8+
"time"
89

910
"github.com/pkg/errors"
1011
corev1 "k8s.io/api/core/v1"
12+
"k8s.io/apimachinery/pkg/util/wait"
13+
"k8s.io/client-go/util/retry"
1114

15+
"github.com/percona/percona-server-mongodb-operator/pkg/naming"
1216
"github.com/percona/percona-server-mongodb-operator/pkg/psmdb/config"
1317
)
1418

@@ -25,6 +29,16 @@ func (r *ReconcilePerconaServerMongoDB) getPVCUsageFromMetrics(
2529
pod *corev1.Pod,
2630
pvcName string,
2731
) (*PVCUsage, error) {
32+
if pod == nil {
33+
return nil, errors.New("pod is nil")
34+
}
35+
36+
backoff := wait.Backoff{
37+
Steps: 5,
38+
Duration: 5 * time.Second,
39+
Factor: 2.0,
40+
}
41+
2842
// Execute df command in the mongod container to get disk usage
2943
// df -B1 /data/db outputs in bytes
3044
// Example output:
@@ -33,9 +47,18 @@ func (r *ReconcilePerconaServerMongoDB) getPVCUsageFromMetrics(
3347
var stdout, stderr bytes.Buffer
3448
command := []string{"df", "-B1", config.MongodContainerDataDir}
3549

36-
err := r.clientcmd.Exec(ctx, pod, "mongod", command, nil, &stdout, &stderr, false)
50+
err := retry.OnError(backoff, func(err error) bool { return true }, func() error {
51+
stdout.Reset()
52+
stderr.Reset()
53+
54+
err := r.clientcmd.Exec(ctx, pod, naming.ComponentMongod, command, nil, &stdout, &stderr, false)
55+
if err != nil {
56+
return errors.Wrapf(err, "failed to execute df in pod %s: %s", pod.Name, stderr.String())
57+
}
58+
return nil
59+
})
3760
if err != nil {
38-
return nil, errors.Wrapf(err, "failed to execute df in pod %s: %s", pod.Name, stderr.String())
61+
return nil, errors.Wrap(err, "wait for df execution")
3962
}
4063

4164
lines := strings.Split(strings.TrimSpace(stdout.String()), "\n")

pkg/controller/perconaservermongodb/metrics_client_test.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,24 @@ func TestGetPVCUsageFromMetrics(t *testing.T) {
145145
Name: "test-pod-0",
146146
Namespace: "test-namespace",
147147
},
148+
Spec: corev1.PodSpec{
149+
Containers: []corev1.Container{
150+
{
151+
Name: "mongod",
152+
},
153+
},
154+
},
155+
Status: corev1.PodStatus{
156+
Phase: corev1.PodRunning,
157+
ContainerStatuses: []corev1.ContainerStatus{
158+
{
159+
Name: "mongod",
160+
State: corev1.ContainerState{
161+
Running: &corev1.ContainerStateRunning{},
162+
},
163+
},
164+
},
165+
},
148166
}
149167

150168
result, err := r.getPVCUsageFromMetrics(ctx, pod, tt.pvcName)

pkg/controller/perconaservermongodb/mgo.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
"go.mongodb.org/mongo-driver/x/mongo/driver/topology"
1616
appsv1 "k8s.io/api/apps/v1"
1717
corev1 "k8s.io/api/core/v1"
18+
k8serrors "k8s.io/apimachinery/pkg/api/errors"
1819
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1920
"k8s.io/apimachinery/pkg/types"
2021
"k8s.io/apimachinery/pkg/util/wait"
@@ -448,6 +449,10 @@ func (r *ReconcilePerconaServerMongoDB) updateConfigMembers(ctx context.Context,
448449

449450
err = cli.WriteConfig(ctx, cnf, false)
450451
if err != nil {
452+
if strings.Contains(err.Error(), "NodeNotFound") {
453+
log.V(1).Info("NodeNotFound error during replset reconfig after removing old members, will retry on next reconcile", "replset", rs.Name)
454+
return rsMembers, 0, nil
455+
}
451456
return rsMembers, 0, errors.Wrap(err, "delete: write mongo config")
452457
}
453458
}
@@ -1063,6 +1068,9 @@ func (r *ReconcilePerconaServerMongoDB) restoreInProgress(ctx context.Context, c
10631068
stsName := cr.Name + "-" + replset.Name
10641069
nn := types.NamespacedName{Name: stsName, Namespace: cr.Namespace}
10651070
if err := r.client.Get(ctx, nn, &sts); err != nil {
1071+
if k8serrors.IsNotFound(err) {
1072+
return false, nil
1073+
}
10661074
return false, errors.Wrapf(err, "get statefulset %s", stsName)
10671075
}
10681076
_, ok := sts.Annotations[api.AnnotationRestoreInProgress]

pkg/controller/perconaservermongodb/statefulset.go

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -54,18 +54,23 @@ func (r *ReconcilePerconaServerMongoDB) reconcileStatefulSet(ctx context.Context
5454
return sfs, nil
5555
}
5656

57-
if err := r.reconcilePVCs(ctx, cr, sfs, ls, volumeSpec); err != nil {
58-
return nil, errors.Wrapf(err, "reconcile PVCs for %s", sfs.Name)
59-
}
60-
61-
// (non-blocking)
6257
if err := r.reconcileStorageAutoscaling(ctx, cr, sfs, volumeSpec, ls); err != nil {
6358
log.Error(err, "failed to reconcile storage autoscaling", "statefulset", sfs.Name)
6459
}
6560

66-
if _, ok := sfs.Annotations[api.AnnotationPVCResizeInProgress]; ok {
67-
log.V(1).Info("PVC resize in progress, skipping reconciliation of statefulset", "name", sfs.Name)
68-
return sfs, nil
61+
if err := r.reconcilePVCs(ctx, cr, sfs, ls, volumeSpec); err != nil {
62+
return nil, errors.Wrapf(err, "reconcile PVCs for %s", sfs.Name)
63+
}
64+
65+
// Re-read the StatefulSet from the cluster to check for the PVC resize
66+
// annotation, as it may have been set during reconcilePVCs and the local
67+
// sfs object would be stale.
68+
currentSts := new(appsv1.StatefulSet)
69+
if err := r.client.Get(ctx, types.NamespacedName{Name: sfs.Name, Namespace: sfs.Namespace}, currentSts); err == nil {
70+
if _, ok := currentSts.Annotations[api.AnnotationPVCResizeInProgress]; ok {
71+
log.V(1).Info("PVC resize in progress, skipping reconciliation of statefulset", "name", sfs.Name)
72+
return sfs, nil
73+
}
6974
}
7075

7176
err = r.createOrUpdate(ctx, sfs)

pkg/controller/perconaservermongodb/volume_autoscaling.go

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
logf "sigs.k8s.io/controller-runtime/pkg/log"
1616

1717
api "github.com/percona/percona-server-mongodb-operator/pkg/apis/psmdb/v1"
18+
"github.com/percona/percona-server-mongodb-operator/pkg/naming"
1819
"github.com/percona/percona-server-mongodb-operator/pkg/psmdb/config"
1920
)
2021

@@ -85,7 +86,7 @@ func (r *ReconcilePerconaServerMongoDB) reconcileStorageAutoscaling(
8586

8687
if err := r.checkAndResizePVC(ctx, cr, &pvc, pod, volumeSpec); err != nil {
8788
log.Error(err, "failed to check/resize PVC", "pvc", pvc.Name)
88-
r.updateAutoscalingStatus(cr, pvc.Name, nil, err)
89+
r.updateAutoscalingStatus(ctx, cr, pvc.Name, nil, err)
8990
}
9091
}
9192

@@ -102,8 +103,8 @@ func (r *ReconcilePerconaServerMongoDB) checkAndResizePVC(
102103
) error {
103104
log := logf.FromContext(ctx).WithName("StorageAutoscaling").WithValues("pvc", pvc.Name)
104105

105-
if pod.Status.Phase != corev1.PodRunning {
106-
log.V(1).Info("skipping PVC check: pod not running", "phase", pod.Status.Phase)
106+
if !isContainerAndPodRunning(*pod, naming.ComponentMongod) {
107+
log.V(1).Info("skipping PVC metrics check: container and pod not running", "phase", pod.Status.Phase)
107108
return nil
108109
}
109110

@@ -112,7 +113,7 @@ func (r *ReconcilePerconaServerMongoDB) checkAndResizePVC(
112113
return errors.Wrap(err, "get PVC usage from metrics")
113114
}
114115

115-
r.updateAutoscalingStatus(cr, pvc.Name, usage, nil)
116+
r.updateAutoscalingStatus(ctx, cr, pvc.Name, usage, nil)
116117

117118
if !r.shouldTriggerResize(ctx, cr, pvc, usage) {
118119
return nil
@@ -140,9 +141,6 @@ func (r *ReconcilePerconaServerMongoDB) shouldTriggerResize(
140141
config := cr.Spec.StorageAutoscaling()
141142

142143
if usage.UsagePercent < config.TriggerThresholdPercent {
143-
log.V(1).Info("usage below threshold",
144-
"usage", usage.UsagePercent,
145-
"threshold", config.TriggerThresholdPercent)
146144
return false
147145
}
148146

@@ -201,12 +199,11 @@ func (r *ReconcilePerconaServerMongoDB) triggerResize(
201199
) error {
202200
log := logf.FromContext(ctx).WithName("StorageAutoscaling").WithValues("pvc", pvc.Name)
203201

204-
patch := client.MergeFrom(cr.DeepCopy())
202+
orig := cr.DeepCopy()
205203

206-
// We are modifying cr directly through the pointer. So the original cr object does get the storage size updated.
207204
volumeSpec.PersistentVolumeClaim.Resources.Requests[corev1.ResourceStorage] = newSize
208205

209-
if err := r.client.Patch(ctx, cr.DeepCopy(), patch); err != nil {
206+
if err := r.client.Patch(ctx, cr.DeepCopy(), client.MergeFrom(orig)); err != nil {
210207
return errors.Wrap(err, "patch CR with new storage size")
211208
}
212209

@@ -219,11 +216,19 @@ func (r *ReconcilePerconaServerMongoDB) triggerResize(
219216

220217
// updateAutoscalingStatus updates the status for a specific PVC
221218
func (r *ReconcilePerconaServerMongoDB) updateAutoscalingStatus(
219+
ctx context.Context,
222220
cr *api.PerconaServerMongoDB,
223221
pvcName string,
224222
usage *PVCUsage,
225223
err error,
226224
) {
225+
log := logf.FromContext(ctx).WithName("StorageAutoscaling")
226+
227+
if pvcName == "" {
228+
log.V(1).Info("no pvc name specified")
229+
return
230+
}
231+
227232
if cr.Status.StorageAutoscaling == nil {
228233
cr.Status.StorageAutoscaling = make(map[string]api.StorageAutoscalingStatus)
229234
}
@@ -239,7 +244,6 @@ func (r *ReconcilePerconaServerMongoDB) updateAutoscalingStatus(
239244
status.ResizeCount++
240245
}
241246
}
242-
243247
status.CurrentSize = newSize.String()
244248
status.LastError = ""
245249
}

pkg/controller/perconaservermongodb/volume_autoscaling_test.go

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
package perconaservermongodb
22

33
import (
4-
"context"
54
"errors"
65
"testing"
76

@@ -19,7 +18,6 @@ import (
1918

2019
func TestShouldTriggerResize(t *testing.T) {
2120
r := &ReconcilePerconaServerMongoDB{}
22-
ctx := context.Background()
2321

2422
tests := []struct {
2523
name string
@@ -139,7 +137,7 @@ func TestShouldTriggerResize(t *testing.T) {
139137

140138
for _, tt := range tests {
141139
t.Run(tt.name, func(t *testing.T) {
142-
result := r.shouldTriggerResize(ctx, tt.cr, tt.pvc, tt.usage)
140+
result := r.shouldTriggerResize(t.Context(), tt.cr, tt.pvc, tt.usage)
143141
assert.Equal(t, tt.expected, result)
144142
})
145143
}
@@ -410,7 +408,6 @@ func TestUpdateAutoscalingStatus(t *testing.T) {
410408
TotalBytes: 10 * 1024 * 1024 * 1024,
411409
UsagePercent: 50,
412410
},
413-
err: nil,
414411
expectedStatus: api.StorageAutoscalingStatus{
415412
CurrentSize: "10Gi",
416413
LastError: "",
@@ -429,7 +426,6 @@ func TestUpdateAutoscalingStatus(t *testing.T) {
429426
},
430427
},
431428
pvcName: "mongod-data-test-rs0-0",
432-
usage: nil,
433429
err: errors.New("connection refused"),
434430
expectedStatus: api.StorageAutoscalingStatus{
435431
CurrentSize: "10Gi",
@@ -453,7 +449,6 @@ func TestUpdateAutoscalingStatus(t *testing.T) {
453449
TotalBytes: 20 * 1024 * 1024 * 1024,
454450
UsagePercent: 40,
455451
},
456-
err: nil,
457452
expectedStatus: api.StorageAutoscalingStatus{
458453
CurrentSize: "20Gi",
459454
LastError: "",
@@ -464,7 +459,7 @@ func TestUpdateAutoscalingStatus(t *testing.T) {
464459

465460
for name, tt := range tests {
466461
t.Run(name, func(t *testing.T) {
467-
r.updateAutoscalingStatus(tt.cr, tt.pvcName, tt.usage, tt.err)
462+
r.updateAutoscalingStatus(t.Context(), tt.cr, tt.pvcName, tt.usage, tt.err)
468463

469464
require.NotNil(t, tt.cr.Status.StorageAutoscaling)
470465
status, ok := tt.cr.Status.StorageAutoscaling[tt.pvcName]
@@ -482,8 +477,6 @@ func TestUpdateAutoscalingStatus(t *testing.T) {
482477
}
483478

484479
func TestTriggerResize(t *testing.T) {
485-
ctx := context.Background()
486-
487480
tests := map[string]struct {
488481
cr *api.PerconaServerMongoDB
489482
pvc *corev1.PersistentVolumeClaim
@@ -695,7 +688,7 @@ func TestTriggerResize(t *testing.T) {
695688

696689
originalSize := volumeSpec.PersistentVolumeClaim.Resources.Requests[corev1.ResourceStorage]
697690

698-
err = r.triggerResize(ctx, tt.cr, tt.pvc, tt.newSize, volumeSpec)
691+
err = r.triggerResize(t.Context(), tt.cr, tt.pvc, tt.newSize, volumeSpec)
699692
require.NoError(t, err)
700693

701694
updatedSize := volumeSpec.PersistentVolumeClaim.Resources.Requests[corev1.ResourceStorage]

pkg/controller/perconaservermongodb/volumes.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,13 @@ func (r *ReconcilePerconaServerMongoDB) resizeVolumesIfNeeded(ctx context.Contex
251251
continue
252252
}
253253

254+
// Re-read the PVC to get the latest resourceVersion before updating,
255+
// as it may have been modified since the initial list (e.g. by fixVolumeLabels
256+
// or by the Kubernetes PVC controller).
257+
if err := r.client.Get(ctx, client.ObjectKeyFromObject(&pvc), &pvc); err != nil {
258+
return errors.Wrapf(err, "get persistentvolumeclaim/%s", pvc.Name)
259+
}
260+
254261
if pvc.Status.Capacity.Storage().Cmp(requested) == 0 {
255262
log.Info("PVC already resized", "name", pvc.Name, "actual", pvc.Status.Capacity.Storage(), "requested", requested)
256263
continue

0 commit comments

Comments
 (0)