Skip to content

Commit 9a0e186

Browse files
mnencia and armru
authored and committed
fix(fencing): allow fencing when WAL disk is full (cloudnative-pg#10302)
Move the WAL disk space check from the instance manager startup path into the PostgreSQL lifecycle loop, after the fencing check. This ensures the controller-runtime manager always starts, allowing fencing annotations to be processed even when WAL disk is full.

Signed-off-by: Marco Nenciarini <marco.nenciarini@enterprisedb.com>
Signed-off-by: Armando Ruocco <armando.ruocco@enterprisedb.com>
Co-authored-by: Armando Ruocco <armando.ruocco@enterprisedb.com>

(cherry picked from commit f6d03dd)
(cherry picked from commit 72998cf)
1 parent 037f556 commit 9a0e186

File tree

3 files changed

+22
-16
lines changed

3 files changed

+22
-16
lines changed

internal/cmd/manager/instance/run/cmd.go

Lines changed: 11 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -68,10 +68,6 @@ import (
6868
var (
6969
scheme = runtime.NewScheme()
7070

71-
// errNoFreeWALSpace is returned when there isn't enough disk space
72-
// available to store at least two WAL files.
73-
errNoFreeWALSpace = fmt.Errorf("no free disk space for WALs")
74-
7571
// errWALArchivePluginNotAvailable is returned when the configured
7672
// WAL archiving plugin is not available or cannot be found.
7773
errWALArchivePluginNotAvailable = fmt.Errorf("WAL archive plugin not available")
@@ -127,7 +123,7 @@ func NewCmd() *cobra.Command {
127123
return runSubCommand(ctx, instance, pprofHTTPServer, skipNameValidation)
128124
})
129125

130-
if errors.Is(err, errNoFreeWALSpace) {
126+
if errors.Is(err, postgres.ErrNoFreeWALSpace) {
131127
os.Exit(apiv1.MissingWALDiskSpaceExitCode)
132128
}
133129
if errors.Is(err, errWALArchivePluginNotAvailable) {
@@ -180,15 +176,6 @@ func runSubCommand(
180176
"build", versions.Info,
181177
"skipNameValidation", skipNameValidation)
182178

183-
contextLogger.Info("Checking for free disk space for WALs before starting PostgreSQL")
184-
hasDiskSpaceForWals, err := instance.CheckHasDiskSpaceForWAL(ctx)
185-
if err != nil {
186-
contextLogger.Error(err, "Error while checking if there is enough disk space for WALs, skipping")
187-
} else if !hasDiskSpaceForWals {
188-
contextLogger.Info("Detected low-disk space condition, avoid starting the instance")
189-
return errNoFreeWALSpace
190-
}
191-
192179
mgr, err := ctrl.NewManager(config.GetConfigOrDie(), ctrl.Options{
193180
Scheme: scheme,
194181
Cache: cache.Options{
@@ -409,16 +396,24 @@ func runSubCommand(
409396
contextLogger.Info("starting controller-runtime manager")
410397
if err := mgr.Start(onlineUpgradeCtx); err != nil {
411398
contextLogger.Error(err, "unable to run controller-runtime manager")
399+
if errors.Is(err, postgres.ErrNoFreeWALSpace) {
400+
return makeUnretryableError(postgres.ErrNoFreeWALSpace)
401+
}
402+
if hasSpace, checkErr := instance.CheckHasDiskSpaceForWAL(ctx); checkErr == nil && !hasSpace {
403+
contextLogger.Warning("Detected low WAL disk space, but the manager error is not WAL-space related",
404+
"originalError", err)
405+
return makeUnretryableError(fmt.Errorf("%w: %w", postgres.ErrNoFreeWALSpace, err))
406+
}
412407
return makeUnretryableError(err)
413408
}
414409

415410
contextLogger.Info("Checking for free disk space for WALs after PostgreSQL finished")
416-
hasDiskSpaceForWals, err = instance.CheckHasDiskSpaceForWAL(ctx)
411+
hasDiskSpaceForWals, err := instance.CheckHasDiskSpaceForWAL(ctx)
417412
if err != nil {
418413
contextLogger.Error(err, "Error while checking if there is enough disk space for WALs, skipping")
419414
} else if !hasDiskSpaceForWals {
420415
contextLogger.Info("Detected low-disk space condition")
421-
return makeUnretryableError(errNoFreeWALSpace)
416+
return makeUnretryableError(postgres.ErrNoFreeWALSpace)
422417
}
423418

424419
if instance.Cluster != nil {

internal/cmd/manager/instance/run/lifecycle/run.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,13 @@ func (i *PostgresLifecycle) runPostgresAndWait(ctx context.Context) <-chan error
9090
return nil
9191
}
9292

93+
if hasDiskSpace, err := i.instance.CheckHasDiskSpaceForWAL(postgresContext); err != nil {
94+
contextLogger.Error(err, "Error checking WAL disk space, skipping")
95+
} else if !hasDiskSpace {
96+
contextLogger.Info("Not enough WAL disk space, avoid starting PostgreSQL")
97+
return postgres.ErrNoFreeWALSpace
98+
}
99+
93100
i.instance.LogPgControldata(postgresContext, "postmaster start up")
94101
defer i.instance.LogPgControldata(postgresContext, "postmaster has exited")
95102

pkg/management/postgres/instance.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,10 @@ var (
133133

134134
// ErrNoConnectionEstablished postgres is alive, but rejecting connections
135135
ErrNoConnectionEstablished = fmt.Errorf("could not establish connection")
136+
137+
// ErrNoFreeWALSpace is returned when there isn't enough disk space
138+
// available to store at least two WAL files.
139+
ErrNoFreeWALSpace = errors.New("no free disk space for WALs")
136140
)
137141

138142
// Instance represent a PostgreSQL instance to be executed

0 commit comments

Comments
 (0)