Skip to content

Commit e3124aa

Browse files
authored
Classify wal segment removed as notify user (#4107)
In production logs, this error either shows up once and then recovers, or persists until the user resyncs. There is nothing we can do on our side to help, so classify it as notify_user.
1 parent 8cb8f77 commit e3124aa

File tree

2 files changed

+31
-18
lines changed

2 files changed

+31
-18
lines changed

flow/alerting/classifier.go

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,9 @@ var (
221221
ErrorNotifyPostgresLogicalMessageProcessing = ErrorClass{
222222
Class: "NOTIFY_POSTGRES_LOGICAL_MESSAGE_PROCESSING_ERROR", action: NotifyUser,
223223
}
224+
ErrorNotifyWalSegmentRemoved = ErrorClass{
225+
Class: "NOTIFY_WAL_SEGMENT_REMOVED", action: NotifyUser,
226+
}
224227
ErrorNotifyClickHouseSupportIsDisabledError = ErrorClass{
225228
Class: "NOTIFY_CLICKHOUSE_SUPPORT_IS_DISABLED_ERROR", action: NotifyUser,
226229
}
@@ -527,9 +530,12 @@ func GetErrorClass(ctx context.Context, err error) (ErrorClass, ErrorInfo) {
527530

528531
case pgerrcode.UndefinedFile:
529532
// Handle WAL segment removed errors
530-
// There is a quirk in some PG installs where replication can try read a segment that hasn't been created yet but will show up
533+
// It either shows up once then disappears
534+
// (quirk in some PG installs where replication can try to read a segment that hasn't been created yet)
535+
// or shows up and persists
536+
// NotifyUser with repeat threshold accommodates both
531537
if PostgresWalSegmentRemovedRe.MatchString(pgErr.Message) {
532-
return ErrorRetryRecoverable, pgErrorInfo
538+
return ErrorNotifyWalSegmentRemoved, pgErrorInfo
533539
}
534540
// Handles missing spill-to-disk file during logical decoding (transient error)
535541
if PostgresSpillFileMissingRe.MatchString(pgErr.Message) {
@@ -557,9 +563,12 @@ func GetErrorClass(ctx context.Context, err error) (ErrorClass, ErrorInfo) {
557563
}
558564

559565
// Handle WAL segment removed errors
560-
// There is a quirk in some PG installs where replication can try read a segment that hasn't been created yet but will show up
566+
// It either shows up once then disappears
567+
// (quirk in some PG installs where replication can try to read a segment that hasn't been created yet)
568+
// or shows up and persists
569+
// NotifyUser with repeat threshold accommodates both
561570
if PostgresWalSegmentRemovedRe.MatchString(pgErr.Message) {
562-
return ErrorRetryRecoverable, pgErrorInfo
571+
return ErrorNotifyWalSegmentRemoved, pgErrorInfo
563572
}
564573

565574
// Handle Neon quota exceeded errors

flow/alerting/classifier_test.go

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -107,21 +107,25 @@ func TestClickHouseSelectFromDestinationDuringQrepAsMvError(t *testing.T) {
107107
}, errInfo, "Unexpected error info")
108108
}
109109

110-
func TestPostgresWalRemovedErrorShouldBeRecoverable(t *testing.T) {
111-
// Simulate a WAL removed error
112-
err := &exceptions.PostgresWalError{
113-
Msg: &pgproto3.ErrorResponse{
114-
Severity: "ERROR",
115-
Code: pgerrcode.InternalError,
116-
Message: "requested WAL segment 000000010001337F0000002E has already been removed",
117-
},
110+
func TestPostgresWalRemovedErrorShouldBeNotifyUser(t *testing.T) {
111+
for _, code := range []string{pgerrcode.InternalError, pgerrcode.UndefinedFile} {
112+
t.Run(code, func(t *testing.T) {
113+
// Simulate a WAL removed error
114+
err := &exceptions.PostgresWalError{
115+
Msg: &pgproto3.ErrorResponse{
116+
Severity: "ERROR",
117+
Code: code,
118+
Message: "requested WAL segment 000000010001337F0000002E has already been removed",
119+
},
120+
}
121+
errorClass, errInfo := GetErrorClass(t.Context(), fmt.Errorf("error in WAL: %w", err))
122+
assert.Equal(t, ErrorNotifyWalSegmentRemoved, errorClass, "Unexpected error class")
123+
assert.Equal(t, ErrorInfo{
124+
Source: ErrorSourcePostgres,
125+
Code: code,
126+
}, errInfo, "Unexpected error info")
127+
})
118128
}
119-
errorClass, errInfo := GetErrorClass(t.Context(), fmt.Errorf("error in WAL: %w", err))
120-
assert.Equal(t, ErrorRetryRecoverable, errorClass, "Unexpected error class")
121-
assert.Equal(t, ErrorInfo{
122-
Source: ErrorSourcePostgres,
123-
Code: pgerrcode.InternalError,
124-
}, errInfo, "Unexpected error info")
125129
}
126130

127131
func TestAuroraInternalWALErrorShouldBeRecoverable(t *testing.T) {

0 commit comments

Comments
 (0)