Skip to content

Commit 2bdc585

Browse files
committed
Do not hibernate if backup is in progress
1 parent 0f081bc commit 2bdc585

File tree

6 files changed

+181
-2
lines changed

6 files changed

+181
-2
lines changed

doc/examples/cluster-example.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ metadata:
3838
namespace: default
3939
spec:
4040
schedule: "0 0 2 * * *" # Daily at 2 AM (seconds, minutes, hours, day of month, month, day of week)
41+
immediate: true
4142
cluster:
4243
name: cluster-example
4344
backupOwnerReference: self

internal/sidecar/cluster_client.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,19 @@ func (r *cnpgClusterClient) updateClusterScheduledBackup(ctx context.Context, sc
138138
return r.client.Update(ctx, scheduledBackup)
139139
}
140140

141+
func (r *cnpgClusterClient) getClusterBackups(ctx context.Context) ([]cnpgv1.Backup, error) {
142+
// Use label selector to filter backups for this cluster
143+
listOptions := []client.ListOption{
144+
client.InNamespace(r.clusterKey.Namespace),
145+
client.MatchingLabels{"cnpg.io/cluster": r.clusterKey.Name},
146+
}
147+
var backupList cnpgv1.BackupList
148+
if err := r.client.List(ctx, &backupList, listOptions...); err != nil {
149+
return nil, err
150+
}
151+
return backupList.Items, nil
152+
}
153+
141154
func (p *postgreSQLCredentials) connString() string {
142155
return fmt.Sprintf("host=%s port=%s user=%s password=%s dbname=%s sslmode=require",
143156
p.host, p.port, p.username, p.password, p.database)

internal/sidecar/helper_test.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ type mockClusterClient struct {
1313
getClusterCredentialsFunc func(ctx context.Context) (*postgreSQLCredentials, error)
1414
getClusterScheduledBackupFunc func(ctx context.Context) (*cnpgv1.ScheduledBackup, error)
1515
updateClusterScheduledBackupFunc func(ctx context.Context, scheduledBackup *cnpgv1.ScheduledBackup) error
16+
getClusterBackupsFunc func(ctx context.Context, i uint) ([]cnpgv1.Backup, error)
17+
getClusterBackupsCalls uint
1618
}
1719

1820
func (m *mockClusterClient) getCluster(ctx context.Context, forceUpdate bool) (*cnpgv1.Cluster, error) {
@@ -50,6 +52,14 @@ func (m *mockClusterClient) updateClusterScheduledBackup(ctx context.Context, sc
5052
return nil
5153
}
5254

55+
func (m *mockClusterClient) getClusterBackups(ctx context.Context) ([]cnpgv1.Backup, error) {
56+
m.getClusterBackupsCalls++
57+
if m.getClusterBackupsFunc != nil {
58+
return m.getClusterBackupsFunc(ctx, m.getClusterBackupsCalls)
59+
}
60+
return []cnpgv1.Backup{}, nil
61+
}
62+
5363
type mockQuerier struct {
5464
queryFunc func(ctx context.Context, query string, args ...any) (postgres.Row, error)
5565
}

internal/sidecar/scale_to_zero.go

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ type clusterClient interface {
3838
getClusterCredentials(ctx context.Context) (*postgreSQLCredentials, error)
3939
getClusterScheduledBackup(ctx context.Context) (*cnpgv1.ScheduledBackup, error)
4040
updateClusterScheduledBackup(ctx context.Context, scheduledBackup *cnpgv1.ScheduledBackup) error
41+
getClusterBackups(ctx context.Context) ([]cnpgv1.Backup, error)
4142
}
4243

4344
type config struct {
@@ -122,7 +123,14 @@ func (s *scaleToZero) Start(ctx context.Context) error {
122123
continue
123124
}
124125

125-
if !isActive {
126+
// if we can't determine if there's a backup in progress, we don't
127+
// error, and assume there aren't any
128+
isBackupInProgress, err := s.isBackupInProgress(ctx)
129+
if err != nil {
130+
contextLogger.Error(err, "failed to check backup status")
131+
}
132+
133+
if !isActive && !isBackupInProgress {
126134
if err := s.hibernate(ctx); err != nil {
127135
contextLogger.Error(err, "hibernation failed")
128136
// we stop the scale to zero sidecar if this is not the primary instance
@@ -302,3 +310,22 @@ func (s *scaleToZero) pauseScheduledBackup(ctx context.Context) error {
302310

303311
return nil
304312
}
313+
314+
func (s *scaleToZero) isBackupInProgress(ctx context.Context) (bool, error) {
315+
backups, err := s.client.getClusterBackups(ctx)
316+
if err != nil {
317+
return false, fmt.Errorf("failed to get cluster backups: %w", err)
318+
}
319+
320+
log.FromContext(ctx).Debug("checking backups for in-progress status", "count", len(backups))
321+
322+
for _, backup := range backups {
323+
log.FromContext(ctx).Debug("backup status", "cluster", s.clusterName, "backup", backup.Name, "status", backup.Status.Phase)
324+
if backup.Status.Phase == cnpgv1.BackupPhaseRunning {
325+
log.FromContext(ctx).Info("backup is in progress", "cluster", s.clusterName, "backup", backup.Name)
326+
return true, nil
327+
}
328+
}
329+
330+
return false, nil
331+
}

internal/sidecar/scale_to_zero_test.go

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,133 @@ func TestScaleToZero_Start(t *testing.T) {
286286

287287
wantErr: nil,
288288
},
289+
{
290+
name: "cluster with scale to zero enabled and inactive cluster with ongoing backup, no hibernation triggered until complete",
291+
client: func(done chan struct{}) *mockClusterClient {
292+
return &mockClusterClient{
293+
getClusterFunc: func(ctx context.Context, forceUpdate bool) (*cnpgv1.Cluster, error) {
294+
return &cnpgv1.Cluster{
295+
Status: cnpgv1.ClusterStatus{
296+
Phase: healthyClusterStatus,
297+
CurrentPrimary: "test-pod-1",
298+
},
299+
ObjectMeta: metav1.ObjectMeta{
300+
Annotations: map[string]string{
301+
scaleToZeroEnabledAnnotation: "true",
302+
inactivityMinutesAnnotation: "5",
303+
},
304+
},
305+
}, nil
306+
},
307+
updateClusterFunc: func(ctx context.Context, cluster *cnpgv1.Cluster) error {
308+
defer func() { done <- struct{}{} }()
309+
require.NotNil(t, cluster)
310+
require.Equal(t, "on", cluster.Annotations[hibernationAnnotation])
311+
return nil
312+
},
313+
getClusterScheduledBackupFunc: func(ctx context.Context) (*cnpgv1.ScheduledBackup, error) {
314+
return nil, fmt.Errorf("scheduledbackups.postgresql.cnpg.io \"test-cluster\" not found")
315+
},
316+
getClusterBackupsFunc: func(ctx context.Context, i uint) ([]cnpgv1.Backup, error) {
317+
backup := func(status cnpgv1.BackupPhase) cnpgv1.Backup {
318+
return cnpgv1.Backup{
319+
Spec: cnpgv1.BackupSpec{
320+
Cluster: cnpgv1.LocalObjectReference{
321+
Name: "test-cluster",
322+
},
323+
},
324+
Status: cnpgv1.BackupStatus{
325+
Phase: status,
326+
},
327+
}
328+
}
329+
switch i {
330+
case 1:
331+
return []cnpgv1.Backup{
332+
backup(cnpgv1.BackupPhaseRunning),
333+
}, nil
334+
case 2:
335+
return []cnpgv1.Backup{
336+
backup(cnpgv1.BackupPhaseCompleted),
337+
}, nil
338+
default:
339+
return nil, fmt.Errorf("unexpected call to getClusterBackups with index %d", i)
340+
}
341+
},
342+
}
343+
},
344+
345+
querier: func(_ chan struct{}) *mockQuerier {
346+
return &mockQuerier{
347+
queryFunc: func(ctx context.Context, query string, args ...any) (postgres.Row, error) {
348+
return &mockRow{
349+
scanFn: func(dest ...any) error {
350+
require.Len(t, dest, 1)
351+
count, ok := dest[0].(*int)
352+
require.True(t, ok)
353+
*count = 0 // Simulate an inactive cluster
354+
return nil
355+
},
356+
}, nil
357+
},
358+
}
359+
},
360+
lastActive: time.Now().Add(-time.Minute * 10), // Simulate inactivity
361+
362+
wantErr: nil,
363+
},
364+
{
365+
name: "cluster with scale to zero enabled and inactive cluster with unknown backups, hibernation triggered",
366+
client: func(done chan struct{}) *mockClusterClient {
367+
return &mockClusterClient{
368+
getClusterFunc: func(ctx context.Context, forceUpdate bool) (*cnpgv1.Cluster, error) {
369+
return &cnpgv1.Cluster{
370+
Status: cnpgv1.ClusterStatus{
371+
Phase: healthyClusterStatus,
372+
CurrentPrimary: "test-pod-1",
373+
},
374+
ObjectMeta: metav1.ObjectMeta{
375+
Annotations: map[string]string{
376+
scaleToZeroEnabledAnnotation: "true",
377+
inactivityMinutesAnnotation: "5",
378+
},
379+
},
380+
}, nil
381+
},
382+
updateClusterFunc: func(ctx context.Context, cluster *cnpgv1.Cluster) error {
383+
defer func() { done <- struct{}{} }()
384+
require.NotNil(t, cluster)
385+
require.Equal(t, "on", cluster.Annotations[hibernationAnnotation])
386+
return nil
387+
},
388+
getClusterScheduledBackupFunc: func(ctx context.Context) (*cnpgv1.ScheduledBackup, error) {
389+
return nil, fmt.Errorf("scheduledbackups.postgresql.cnpg.io \"test-cluster\" not found")
390+
},
391+
getClusterBackupsFunc: func(ctx context.Context, i uint) ([]cnpgv1.Backup, error) {
392+
return nil, errTest
393+
},
394+
}
395+
},
396+
397+
querier: func(_ chan struct{}) *mockQuerier {
398+
return &mockQuerier{
399+
queryFunc: func(ctx context.Context, query string, args ...any) (postgres.Row, error) {
400+
return &mockRow{
401+
scanFn: func(dest ...any) error {
402+
require.Len(t, dest, 1)
403+
count, ok := dest[0].(*int)
404+
require.True(t, ok)
405+
*count = 0 // Simulate an inactive cluster
406+
return nil
407+
},
408+
}, nil
409+
},
410+
}
411+
},
412+
lastActive: time.Now().Add(-time.Minute * 10), // Simulate inactivity
413+
414+
wantErr: nil,
415+
},
289416
{
290417
name: "cluster with scale to zero enabled and inactive cluster, hibernation triggered, scheduled backup get error ignored",
291418
client: func(done chan struct{}) *mockClusterClient {

internal/sidecar/sidecar.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,8 @@ func generateScheme(ctx context.Context) *runtime.Scheme {
8484
schemeBuilder := &scheme.Builder{GroupVersion: schemeGroupVersion}
8585
schemeBuilder.Register(
8686
&cnpgv1.Cluster{}, &cnpgv1.ClusterList{},
87-
&cnpgv1.ScheduledBackup{}, &cnpgv1.ScheduledBackupList{})
87+
&cnpgv1.ScheduledBackup{}, &cnpgv1.ScheduledBackupList{},
88+
&cnpgv1.Backup{}, &cnpgv1.BackupList{})
8889
utilruntime.Must(schemeBuilder.AddToScheme(result))
8990

9091
schemeLog := log.FromContext(ctx)

0 commit comments

Comments
 (0)