Skip to content

Commit 5c09eb1

Browse files
committed
sql: retry all "rpc error" distributed errors as local
This commit includes all errors that contain `rpc error` substring to be retried-as-local. In particular, this allows us to avoid problems with DistSQL using no-longer-live SQL pod after that pod is shutdown. (This usage of the downed pod is currently expected given that the cache of live instances isn't updated when the pod is shutdown.) Release note: None
1 parent 847722e commit 5c09eb1

File tree

4 files changed

+27
-23
lines changed

4 files changed

+27
-23
lines changed

pkg/jobs/joberror/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ go_library(
88
deps = [
99
"//pkg/kv/kvclient/kvcoord",
1010
"//pkg/sql/flowinfra",
11+
"//pkg/sql/sqlerrors",
1112
"//pkg/util/circuit",
1213
"//pkg/util/grpcutil",
1314
"//pkg/util/sysutil",

pkg/jobs/joberror/errors.go

Lines changed: 2 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -11,36 +11,16 @@
1111
package joberror
1212

1313
import (
14-
"strings"
15-
1614
circuitbreaker "github.com/cockroachdb/circuitbreaker"
1715
"github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord"
1816
"github.com/cockroachdb/cockroach/pkg/sql/flowinfra"
17+
"github.com/cockroachdb/cockroach/pkg/sql/sqlerrors"
1918
"github.com/cockroachdb/cockroach/pkg/util/circuit"
2019
"github.com/cockroachdb/cockroach/pkg/util/grpcutil"
2120
"github.com/cockroachdb/cockroach/pkg/util/sysutil"
2221
"github.com/cockroachdb/errors"
2322
)
2423

25-
// IsDistSQLRetryableError returns true if the supplied error, or any of its parent
26-
// causes is an rpc error.
27-
// This is an unfortunate implementation that should be looking for a more
28-
// specific error.
29-
func IsDistSQLRetryableError(err error) bool {
30-
if err == nil {
31-
return false
32-
}
33-
34-
// TODO(knz): this is a bad implementation. Make it go away
35-
// by avoiding string comparisons.
36-
37-
errStr := err.Error()
38-
// When a crdb node dies, any DistSQL flows with processors scheduled on
39-
// it get an error with "rpc error" in the message from the call to
40-
// `(*DistSQLPlanner).Run`.
41-
return strings.Contains(errStr, `rpc error`)
42-
}
43-
4424
// isBreakerOpenError returns true if err is a circuit.ErrBreakerOpen.
4525
//
4626
// NB: Two packages have ErrBreakerOpen error types. The cicruitbreaker package
@@ -57,7 +37,7 @@ func IsPermanentBulkJobError(err error) bool {
5737
if err == nil {
5838
return false
5939
}
60-
return !IsDistSQLRetryableError(err) &&
40+
return !sqlerrors.IsDistSQLRetryableError(err) &&
6141
!grpcutil.IsClosedConnection(err) &&
6242
!flowinfra.IsFlowRetryableError(err) &&
6343
!flowinfra.IsNoInboundStreamConnectionError(err) &&

pkg/sql/distsql_running.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ import (
4444
"github.com/cockroachdb/cockroach/pkg/sql/rowexec"
4545
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
4646
"github.com/cockroachdb/cockroach/pkg/sql/sessiondatapb"
47+
"github.com/cockroachdb/cockroach/pkg/sql/sqlerrors"
4748
"github.com/cockroachdb/cockroach/pkg/sql/sqltelemetry"
4849
"github.com/cockroachdb/cockroach/pkg/sql/types"
4950
"github.com/cockroachdb/cockroach/pkg/util/buildutil"
@@ -1998,7 +1999,8 @@ func (dsp *DistSQLPlanner) PlanAndRun(
19981999
// cancellation has already occurred.
19992000
return
20002001
}
2001-
if !pgerror.IsSQLRetryableError(distributedErr) &&
2002+
if !sqlerrors.IsDistSQLRetryableError(distributedErr) &&
2003+
!pgerror.IsSQLRetryableError(distributedErr) &&
20022004
!flowinfra.IsFlowRetryableError(distributedErr) &&
20032005
!isDialErr(distributedErr) {
20042006
// Only re-run the query if we think there is a high chance of a

pkg/sql/sqlerrors/errors.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
package sqlerrors
1313

1414
import (
15+
"strings"
16+
1517
"github.com/cockroachdb/cockroach/pkg/roachpb"
1618
"github.com/cockroachdb/cockroach/pkg/security/username"
1719
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb"
@@ -411,3 +413,22 @@ func errHasCode(err error, code ...pgcode.Code) bool {
411413
}
412414
return false
413415
}
416+
417+
// IsDistSQLRetryableError returns true if the supplied error, or any of its parent
418+
// causes is an rpc error.
419+
// This is an unfortunate implementation that should be looking for a more
420+
// specific error.
421+
func IsDistSQLRetryableError(err error) bool {
422+
if err == nil {
423+
return false
424+
}
425+
426+
// TODO(knz): this is a bad implementation. Make it go away
427+
// by avoiding string comparisons.
428+
429+
errStr := err.Error()
430+
// When a crdb node dies, any DistSQL flows with processors scheduled on
431+
// it get an error with "rpc error" in the message from the call to
432+
// `(*DistSQLPlanner).Run`.
433+
return strings.Contains(errStr, `rpc error`)
434+
}

0 commit comments

Comments
 (0)