Skip to content

Commit e9a0b1c

Browse files
craig[bot]spilchenyuzefovich
committed
154573: sql/inspect: filter SHOW INSPECT ERRORS by job payload r=spilchen a=spilchen Previously, `SHOW INSPECT ERRORS FOR TABLE` would select the most recent completed INSPECT job that had errors for that table by joining with the `inspect_errors` table. This approach had a flaw: if a more recent job inspected the table but found no errors, it would be ignored in favor of an older job with errors. This changes that by querying the INSPECT job payload to determine which jobs actually inspected a given table. Informs #148287 Release note: none Epic: CRDB-30356 154706: kvcoord: remove metrics for deprecated dist sender errors r=yuzefovich a=yuzefovich Epic: None Release note: None Co-authored-by: Matt Spilchen <[email protected]> Co-authored-by: Yahor Yuzefovich <[email protected]>
3 parents e942b71 + 1f429d7 + 1f81f98 commit e9a0b1c

File tree

4 files changed

+175
-164
lines changed

4 files changed

+175
-164
lines changed

docs/generated/metrics/metrics.yaml

Lines changed: 0 additions & 140 deletions
Original file line numberDiff line numberDiff line change
@@ -2743,146 +2743,6 @@ layers:
27432743
unit: COUNT
27442744
aggregation: AVG
27452745
derivative: NON_NEGATIVE_DERIVATIVE
2746-
- name: distsender.rpc.err.errordetailtype(0)
2747-
exported_name: distsender_rpc_err_errordetailtype_0_
2748-
description: |
2749-
Number of ErrorDetailType(0) errors received replica-bound RPCs
2750-
2751-
This counts how often error of the specified type was received back from replicas
2752-
as part of executing possibly range-spanning requests. Failures to reach the target
2753-
replica will be accounted for as 'roachpb.CommunicationErrType' and unclassified
2754-
errors as 'roachpb.InternalErrType'.
2755-
y_axis_label: Errors
2756-
type: COUNTER
2757-
unit: COUNT
2758-
aggregation: AVG
2759-
derivative: NON_NEGATIVE_DERIVATIVE
2760-
- name: distsender.rpc.err.errordetailtype(15)
2761-
exported_name: distsender_rpc_err_errordetailtype_15_
2762-
description: |
2763-
Number of ErrorDetailType(15) errors received replica-bound RPCs
2764-
2765-
This counts how often error of the specified type was received back from replicas
2766-
as part of executing possibly range-spanning requests. Failures to reach the target
2767-
replica will be accounted for as 'roachpb.CommunicationErrType' and unclassified
2768-
errors as 'roachpb.InternalErrType'.
2769-
y_axis_label: Errors
2770-
type: COUNTER
2771-
unit: COUNT
2772-
aggregation: AVG
2773-
derivative: NON_NEGATIVE_DERIVATIVE
2774-
- name: distsender.rpc.err.errordetailtype(19)
2775-
exported_name: distsender_rpc_err_errordetailtype_19_
2776-
description: |
2777-
Number of ErrorDetailType(19) errors received replica-bound RPCs
2778-
2779-
This counts how often error of the specified type was received back from replicas
2780-
as part of executing possibly range-spanning requests. Failures to reach the target
2781-
replica will be accounted for as 'roachpb.CommunicationErrType' and unclassified
2782-
errors as 'roachpb.InternalErrType'.
2783-
y_axis_label: Errors
2784-
type: COUNTER
2785-
unit: COUNT
2786-
aggregation: AVG
2787-
derivative: NON_NEGATIVE_DERIVATIVE
2788-
- name: distsender.rpc.err.errordetailtype(20)
2789-
exported_name: distsender_rpc_err_errordetailtype_20_
2790-
description: |
2791-
Number of ErrorDetailType(20) errors received replica-bound RPCs
2792-
2793-
This counts how often error of the specified type was received back from replicas
2794-
as part of executing possibly range-spanning requests. Failures to reach the target
2795-
replica will be accounted for as 'roachpb.CommunicationErrType' and unclassified
2796-
errors as 'roachpb.InternalErrType'.
2797-
y_axis_label: Errors
2798-
type: COUNTER
2799-
unit: COUNT
2800-
aggregation: AVG
2801-
derivative: NON_NEGATIVE_DERIVATIVE
2802-
- name: distsender.rpc.err.errordetailtype(21)
2803-
exported_name: distsender_rpc_err_errordetailtype_21_
2804-
description: |
2805-
Number of ErrorDetailType(21) errors received replica-bound RPCs
2806-
2807-
This counts how often error of the specified type was received back from replicas
2808-
as part of executing possibly range-spanning requests. Failures to reach the target
2809-
replica will be accounted for as 'roachpb.CommunicationErrType' and unclassified
2810-
errors as 'roachpb.InternalErrType'.
2811-
y_axis_label: Errors
2812-
type: COUNTER
2813-
unit: COUNT
2814-
aggregation: AVG
2815-
derivative: NON_NEGATIVE_DERIVATIVE
2816-
- name: distsender.rpc.err.errordetailtype(23)
2817-
exported_name: distsender_rpc_err_errordetailtype_23_
2818-
description: |
2819-
Number of ErrorDetailType(23) errors received replica-bound RPCs
2820-
2821-
This counts how often error of the specified type was received back from replicas
2822-
as part of executing possibly range-spanning requests. Failures to reach the target
2823-
replica will be accounted for as 'roachpb.CommunicationErrType' and unclassified
2824-
errors as 'roachpb.InternalErrType'.
2825-
y_axis_label: Errors
2826-
type: COUNTER
2827-
unit: COUNT
2828-
aggregation: AVG
2829-
derivative: NON_NEGATIVE_DERIVATIVE
2830-
- name: distsender.rpc.err.errordetailtype(24)
2831-
exported_name: distsender_rpc_err_errordetailtype_24_
2832-
description: |
2833-
Number of ErrorDetailType(24) errors received replica-bound RPCs
2834-
2835-
This counts how often error of the specified type was received back from replicas
2836-
as part of executing possibly range-spanning requests. Failures to reach the target
2837-
replica will be accounted for as 'roachpb.CommunicationErrType' and unclassified
2838-
errors as 'roachpb.InternalErrType'.
2839-
y_axis_label: Errors
2840-
type: COUNTER
2841-
unit: COUNT
2842-
aggregation: AVG
2843-
derivative: NON_NEGATIVE_DERIVATIVE
2844-
- name: distsender.rpc.err.errordetailtype(29)
2845-
exported_name: distsender_rpc_err_errordetailtype_29_
2846-
description: |
2847-
Number of ErrorDetailType(29) errors received replica-bound RPCs
2848-
2849-
This counts how often error of the specified type was received back from replicas
2850-
as part of executing possibly range-spanning requests. Failures to reach the target
2851-
replica will be accounted for as 'roachpb.CommunicationErrType' and unclassified
2852-
errors as 'roachpb.InternalErrType'.
2853-
y_axis_label: Errors
2854-
type: COUNTER
2855-
unit: COUNT
2856-
aggregation: AVG
2857-
derivative: NON_NEGATIVE_DERIVATIVE
2858-
- name: distsender.rpc.err.errordetailtype(30)
2859-
exported_name: distsender_rpc_err_errordetailtype_30_
2860-
description: |
2861-
Number of ErrorDetailType(30) errors received replica-bound RPCs
2862-
2863-
This counts how often error of the specified type was received back from replicas
2864-
as part of executing possibly range-spanning requests. Failures to reach the target
2865-
replica will be accounted for as 'roachpb.CommunicationErrType' and unclassified
2866-
errors as 'roachpb.InternalErrType'.
2867-
y_axis_label: Errors
2868-
type: COUNTER
2869-
unit: COUNT
2870-
aggregation: AVG
2871-
derivative: NON_NEGATIVE_DERIVATIVE
2872-
- name: distsender.rpc.err.errordetailtype(33)
2873-
exported_name: distsender_rpc_err_errordetailtype_33_
2874-
description: |
2875-
Number of ErrorDetailType(33) errors received replica-bound RPCs
2876-
2877-
This counts how often error of the specified type was received back from replicas
2878-
as part of executing possibly range-spanning requests. Failures to reach the target
2879-
replica will be accounted for as 'roachpb.CommunicationErrType' and unclassified
2880-
errors as 'roachpb.InternalErrType'.
2881-
y_axis_label: Errors
2882-
type: COUNTER
2883-
unit: COUNT
2884-
aggregation: AVG
2885-
derivative: NON_NEGATIVE_DERIVATIVE
28862746
- name: distsender.rpc.err.exclusionviolationerrtype
28872747
exported_name: distsender_rpc_err_exclusionviolationerrtype
28882748
description: |

pkg/kv/kvclient/kvcoord/dist_sender.go

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -460,8 +460,9 @@ type DistSenderMetrics struct {
460460
ProxyForwardSentCount *metric.Counter
461461
ProxyForwardErrCount *metric.Counter
462462
MethodCounts [kvpb.NumMethods]*metric.Counter
463-
ErrCounts [kvpb.NumErrors]*metric.Counter
464-
CircuitBreaker DistSenderCircuitBreakerMetrics
463+
// ErrCounts[i] can be nil if i'th error has been deprecated.
464+
ErrCounts [kvpb.NumErrors]*metric.Counter
465+
CircuitBreaker DistSenderCircuitBreakerMetrics
465466
DistSenderRangeFeedMetrics
466467
}
467468

@@ -526,6 +527,10 @@ func MakeDistSenderMetrics(locality roachpb.Locality) DistSenderMetrics {
526527
}
527528
for i := range m.ErrCounts {
528529
errType := kvpb.ErrorDetailType(i).String()
530+
if strings.HasPrefix(errType, "ErrorDetailType") {
531+
// This error index has been deprecated.
532+
continue
533+
}
529534
meta := metaDistSenderErrCountTmpl
530535
meta.Name = fmt.Sprintf(meta.Name, strings.ToLower(errType))
531536
meta.Help = fmt.Sprintf(meta.Help, errType)
@@ -3187,14 +3192,18 @@ func (ds *DistSender) maybeIncrementErrCounters(br *kvpb.BatchResponse, err erro
31873192
if err == nil && br.Error == nil {
31883193
return
31893194
}
3195+
var counter *metric.Counter
31903196
if err != nil {
3191-
ds.metrics.ErrCounts[kvpb.CommunicationErrType].Inc(1)
3197+
counter = ds.metrics.ErrCounts[kvpb.CommunicationErrType]
31923198
} else {
31933199
typ := kvpb.InternalErrType
31943200
if detail := br.Error.GetDetail(); detail != nil {
31953201
typ = detail.Type()
31963202
}
3197-
ds.metrics.ErrCounts[typ].Inc(1)
3203+
counter = ds.metrics.ErrCounts[typ]
3204+
}
3205+
if counter != nil {
3206+
counter.Inc(1)
31983207
}
31993208
}
32003209

pkg/sql/delegate/show_inspect_errors.go

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -57,25 +57,37 @@ func (d *delegator) delegateShowInspectErrors(n *tree.ShowInspectErrors) (tree.S
5757
query.WriteString(fmt.Sprintf(" AND id = %d", *n.JobID))
5858
}
5959

60-
// TODO(148287): query the inspect job payload to figure out if a job touches a particular table or database ID
61-
// If a table was specified, only consider jobs that reported errors on it.
62-
// If a job ID was specified, only consider that job. The records from the
63-
// most recent completed job that satisfies those criteria is used.
6460
query.WriteString(`),
65-
job_id AS (
66-
SELECT max(inspect_jobs.id) as id
61+
filtered_jobs AS (
62+
SELECT inspect_jobs.id
6763
FROM inspect_jobs
68-
JOIN crdb_internal.cluster_inspect_errors ie ON inspect_jobs.id = ie.job_id
69-
WHERE 1=1
64+
JOIN crdb_internal.system_jobs sj ON inspect_jobs.id = sj.id
7065
`)
66+
// If a table is specified, limit results to jobs that include checks
67+
// on that table. The table is matched by inspecting the job payload.
7168
if tableID != catid.InvalidDescID {
72-
query.WriteString(fmt.Sprintf(" AND ie.id = %d", tableID))
73-
}
74-
if n.JobID != nil {
75-
query.WriteString(fmt.Sprintf(" AND ie.job_id = %d", *n.JobID))
69+
query.WriteString(fmt.Sprintf(`
70+
WHERE EXISTS (
71+
SELECT 1
72+
FROM jsonb_array_elements(
73+
COALESCE(
74+
crdb_internal.pb_to_json('cockroach.sql.jobs.jobspb.Payload', sj.payload)
75+
-> 'inspectDetails' -> 'checks',
76+
'[]'::JSONB
77+
)
78+
) AS c
79+
WHERE (c ->> 'tableId')::INT = %d
80+
)`, tableID))
7681
}
82+
// Reports on a single job. If multiple match, use the most recent one.
83+
query.WriteString(`),
84+
job_id AS (
85+
SELECT max(id) AS id
86+
FROM filtered_jobs
87+
)
88+
`)
7789

78-
query.WriteString(`)
90+
query.WriteString(`
7991
SELECT
8092
ie.error_type,
8193
COALESCE(t.database_name, '<unknown>') AS database_name,

0 commit comments

Comments
 (0)