Skip to content

Commit 07bf7bf

Browse files
committed
crosscluster: clean up some metrics
This patch labels a few PCR/LDR metrics as essential and applies a static label to several LDR event metrics. Epic: CRDB-52339. Release note: none.
1 parent fccbd7e commit 07bf7bf

File tree

3 files changed

+134
-60
lines changed

3 files changed

+134
-60
lines changed

docs/generated/metrics/metrics.yaml

Lines changed: 83 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,28 @@ layers:
8585
derivative: NONE
8686
how_to_use: Changefeeds use protected timestamps to protect the data from being garbage collected. Ensure the protected timestamp age does not significantly exceed the GC TTL zone configuration. Alert on this metric if the protected timestamp age is greater than 3 times the GC TTL.
8787
essential: true
88+
- name: CROSS_CLUSTER_REPLICATION
89+
metrics:
90+
- name: physical_replication.logical_bytes
91+
exported_name: physical_replication_logical_bytes
92+
description: Logical bytes (sum of keys + values) ingested by all replication jobs
93+
y_axis_label: Bytes
94+
type: COUNTER
95+
unit: BYTES
96+
aggregation: AVG
97+
derivative: NON_NEGATIVE_DERIVATIVE
98+
how_to_use: Track PCR throughput
99+
essential: true
100+
- name: physical_replication.replicated_time_seconds
101+
exported_name: physical_replication_replicated_time_seconds
102+
description: The replicated time of the physical replication stream in seconds since the unix epoch.
103+
y_axis_label: Seconds
104+
type: GAUGE
105+
unit: SECONDS
106+
aggregation: AVG
107+
derivative: NONE
108+
how_to_use: Track replication lag via current time - physical_replication.replicated_time_seconds
109+
essential: true
88110
- name: DISTRIBUTED
89111
metrics:
90112
- name: distsender.errors.notleaseholder
@@ -107,6 +129,58 @@ layers:
107129
derivative: NON_NEGATIVE_DERIVATIVE
108130
how_to_use: RPC errors do not necessarily indicate a problem. This metric tracks remote procedure calls that return a status value other than "success". A non-success status of an RPC should not be misconstrued as a network transport issue. It is database code logic executed on another cluster node. The non-success status is a result of an orderly execution of an RPC that reports a specific logical condition.
109131
essential: true
132+
- name: LOGICAL_DATA_REPLICATION
133+
metrics:
134+
- name: logical_replication.commit_latency
135+
exported_name: logical_replication_commit_latency
136+
description: 'Event commit latency: a difference between event MVCC timestamp and the time it was flushed into disk. If we batch events, then the difference between the oldest event in the batch and flush is recorded'
137+
y_axis_label: Nanoseconds
138+
type: HISTOGRAM
139+
unit: NANOSECONDS
140+
aggregation: AVG
141+
derivative: NONE
142+
how_to_use: track the latency of applying events from source to destination
143+
essential: true
144+
- name: logical_replication.events_dlqed
145+
exported_name: logical_replication_events_dlqed
146+
description: Row update events sent to DLQ
147+
y_axis_label: Failures
148+
type: COUNTER
149+
unit: COUNT
150+
aggregation: AVG
151+
derivative: NON_NEGATIVE_DERIVATIVE
152+
how_to_use: track events sent to the dead letter queue
153+
essential: true
154+
- name: logical_replication.events_ingested
155+
exported_name: logical_replication_events_ingested
156+
description: Events ingested by all replication jobs
157+
y_axis_label: Events
158+
type: COUNTER
159+
unit: COUNT
160+
aggregation: AVG
161+
derivative: NON_NEGATIVE_DERIVATIVE
162+
how_to_use: track events (e.g. updates, deletes, inserts) ingested
163+
essential: true
164+
- name: logical_replication.logical_bytes
165+
exported_name: logical_replication_logical_bytes
166+
description: Logical bytes (sum of keys + values) received by all replication jobs
167+
y_axis_label: Bytes
168+
type: COUNTER
169+
unit: BYTES
170+
aggregation: AVG
171+
derivative: NON_NEGATIVE_DERIVATIVE
172+
how_to_use: track logical data replication throughput
173+
essential: true
174+
- name: logical_replication.replicated_time_seconds
175+
exported_name: logical_replication_replicated_time_seconds
176+
description: The replicated time of the logical replication stream in seconds since the unix epoch.
177+
y_axis_label: Seconds
178+
type: GAUGE
179+
unit: SECONDS
180+
aggregation: AVG
181+
derivative: NONE
182+
how_to_use: Track replication lag via current time - logical_replication.replicated_time_seconds
183+
essential: true
110184
- name: NETWORKING
111185
metrics:
112186
- name: clock-offset.meannanos
@@ -7026,24 +7100,9 @@ layers:
70267100
unit: COUNT
70277101
aggregation: AVG
70287102
derivative: NON_NEGATIVE_DERIVATIVE
7029-
- name: logical_replication.commit_latency
7030-
exported_name: logical_replication_commit_latency
7031-
description: 'Event commit latency: a difference between event MVCC timestamp and the time it was flushed into disk. If we batch events, then the difference between the oldest event in the batch and flush is recorded'
7032-
y_axis_label: Nanoseconds
7033-
type: HISTOGRAM
7034-
unit: NANOSECONDS
7035-
aggregation: AVG
7036-
derivative: NONE
7037-
- name: logical_replication.events_dlqed
7038-
exported_name: logical_replication_events_dlqed
7039-
description: Row update events sent to DLQ
7040-
y_axis_label: Failures
7041-
type: COUNTER
7042-
unit: COUNT
7043-
aggregation: AVG
7044-
derivative: NON_NEGATIVE_DERIVATIVE
70457103
- name: logical_replication.events_dlqed_age
70467104
exported_name: logical_replication_events_dlqed_age
7105+
labeled_name: 'logical_replication.events{type: dlqed_age}'
70477106
description: Row update events sent to DLQ due to reaching the maximum time allowed in the retry queue
70487107
y_axis_label: Failures
70497108
type: COUNTER
@@ -7060,6 +7119,7 @@ layers:
70607119
derivative: NON_NEGATIVE_DERIVATIVE
70617120
- name: logical_replication.events_dlqed_errtype
70627121
exported_name: logical_replication_events_dlqed_errtype
7122+
labeled_name: 'logical_replication.events{type: dlqed_errtype}'
70637123
description: Row update events sent to DLQ due to an error not considered retryable
70647124
y_axis_label: Failures
70657125
type: COUNTER
@@ -7068,20 +7128,13 @@ layers:
70687128
derivative: NON_NEGATIVE_DERIVATIVE
70697129
- name: logical_replication.events_dlqed_space
70707130
exported_name: logical_replication_events_dlqed_space
7131+
labeled_name: 'logical_replication.events{type: dlqed_space}'
70717132
description: Row update events sent to DLQ due to capacity of the retry queue
70727133
y_axis_label: Failures
70737134
type: COUNTER
70747135
unit: COUNT
70757136
aggregation: AVG
70767137
derivative: NON_NEGATIVE_DERIVATIVE
7077-
- name: logical_replication.events_ingested
7078-
exported_name: logical_replication_events_ingested
7079-
description: Events ingested by all replication jobs
7080-
y_axis_label: Events
7081-
type: COUNTER
7082-
unit: COUNT
7083-
aggregation: AVG
7084-
derivative: NON_NEGATIVE_DERIVATIVE
70857138
- name: logical_replication.events_ingested_by_label
70867139
exported_name: logical_replication_events_ingested_by_label
70877140
description: Events ingested by all replication jobs by label
@@ -7092,6 +7145,7 @@ layers:
70927145
derivative: NON_NEGATIVE_DERIVATIVE
70937146
- name: logical_replication.events_initial_failure
70947147
exported_name: logical_replication_events_initial_failure
7148+
labeled_name: 'logical_replication.events{type: initial_failure}'
70957149
description: Failed attempts to apply an incoming row update
70967150
y_axis_label: Failures
70977151
type: COUNTER
@@ -7100,14 +7154,16 @@ layers:
71007154
derivative: NON_NEGATIVE_DERIVATIVE
71017155
- name: logical_replication.events_initial_success
71027156
exported_name: logical_replication_events_initial_success
7157+
labeled_name: 'logical_replication.events{type: initial_success}'
71037158
description: Successful applications of an incoming row update
7104-
y_axis_label: Failures
7159+
y_axis_label: Successes
71057160
type: COUNTER
71067161
unit: COUNT
71077162
aggregation: AVG
71087163
derivative: NON_NEGATIVE_DERIVATIVE
71097164
- name: logical_replication.events_retry_failure
71107165
exported_name: logical_replication_events_retry_failure
7166+
labeled_name: 'logical_replication.events{type: retry_failure}'
71117167
description: Failed re-attempts to apply a row update
71127168
y_axis_label: Failures
71137169
type: COUNTER
@@ -7116,8 +7172,9 @@ layers:
71167172
derivative: NON_NEGATIVE_DERIVATIVE
71177173
- name: logical_replication.events_retry_success
71187174
exported_name: logical_replication_events_retry_success
7175+
labeled_name: 'logical_replication.events{type: retry_success}'
71197176
description: Row update events applied after one or more retries
7120-
y_axis_label: Failures
7177+
y_axis_label: Successes
71217178
type: COUNTER
71227179
unit: COUNT
71237180
aggregation: AVG
@@ -7138,14 +7195,6 @@ layers:
71387195
unit: COUNT
71397196
aggregation: AVG
71407197
derivative: NON_NEGATIVE_DERIVATIVE
7141-
- name: logical_replication.logical_bytes
7142-
exported_name: logical_replication_logical_bytes
7143-
description: Logical bytes (sum of keys + values) received by all replication jobs
7144-
y_axis_label: Bytes
7145-
type: COUNTER
7146-
unit: BYTES
7147-
aggregation: AVG
7148-
derivative: NON_NEGATIVE_DERIVATIVE
71497198
- name: logical_replication.replan_count
71507199
exported_name: logical_replication_replan_count
71517200
description: Total number of dist sql replanning events
@@ -7162,14 +7211,6 @@ layers:
71627211
unit: SECONDS
71637212
aggregation: AVG
71647213
derivative: NONE
7165-
- name: logical_replication.replicated_time_seconds
7166-
exported_name: logical_replication_replicated_time_seconds
7167-
description: The replicated time of the logical replication stream in seconds since the unix epoch.
7168-
y_axis_label: Seconds
7169-
type: GAUGE
7170-
unit: SECONDS
7171-
aggregation: AVG
7172-
derivative: NONE
71737214
- name: logical_replication.retry_queue_bytes
71747215
exported_name: logical_replication_retry_queue_bytes
71757216
description: Logical bytes (sum of keys+values) in the retry queue
@@ -7290,22 +7331,6 @@ layers:
72907331
unit: COUNT
72917332
aggregation: AVG
72927333
derivative: NON_NEGATIVE_DERIVATIVE
7293-
- name: physical_replication.logical_bytes
7294-
exported_name: physical_replication_logical_bytes
7295-
description: Logical bytes (sum of keys + values) ingested by all replication jobs
7296-
y_axis_label: Bytes
7297-
type: COUNTER
7298-
unit: BYTES
7299-
aggregation: AVG
7300-
derivative: NON_NEGATIVE_DERIVATIVE
7301-
- name: physical_replication.replicated_time_seconds
7302-
exported_name: physical_replication_replicated_time_seconds
7303-
description: The replicated time of the physical replication stream in seconds since the unix epoch.
7304-
y_axis_label: Seconds
7305-
type: GAUGE
7306-
unit: SECONDS
7307-
aggregation: AVG
7308-
derivative: NONE
73097334
- name: physical_replication.resolved_events_ingested
73107335
exported_name: physical_replication_resolved_events_ingested
73117336
description: Resolved events ingested by all replication jobs

pkg/crosscluster/logical/metrics.go

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,33 +17,48 @@ var (
1717
Name: "logical_replication.events_ingested",
1818
Help: "Events ingested by all replication jobs",
1919
Measurement: "Events",
20+
Essential: true,
21+
Category: metric.Metadata_LOGICAL_DATA_REPLICATION,
2022
Unit: metric.Unit_COUNT,
23+
HowToUse: "track events (e.g. updates, deletes, inserts) ingested",
2124
}
2225
metaDLQedRowUpdates = metric.Metadata{
2326
Name: "logical_replication.events_dlqed",
2427
Help: "Row update events sent to DLQ",
2528
Measurement: "Failures",
29+
Essential: true,
30+
Category: metric.Metadata_LOGICAL_DATA_REPLICATION,
2631
Unit: metric.Unit_COUNT,
32+
HowToUse: "track events sent to the dead letter queue",
2733
}
2834
metaReceivedLogicalBytes = metric.Metadata{
2935
Name: "logical_replication.logical_bytes",
3036
Help: "Logical bytes (sum of keys + values) received by all replication jobs",
37+
Essential: true,
38+
Category: metric.Metadata_LOGICAL_DATA_REPLICATION,
3139
Measurement: "Bytes",
3240
Unit: metric.Unit_BYTES,
41+
HowToUse: "track logical data replication throughput",
3342
}
3443
metaCommitToCommitLatency = metric.Metadata{
3544
Name: "logical_replication.commit_latency",
3645
Help: "Event commit latency: a difference between event MVCC timestamp " +
3746
"and the time it was flushed into disk. If we batch events, then the difference " +
3847
"between the oldest event in the batch and flush is recorded",
3948
Measurement: "Nanoseconds",
49+
Essential: true,
50+
Category: metric.Metadata_LOGICAL_DATA_REPLICATION,
4051
Unit: metric.Unit_NANOSECONDS,
52+
HowToUse: "track the latency of applying events from source to destination",
4153
}
4254
metaReplicatedTimeSeconds = metric.Metadata{
4355
Name: "logical_replication.replicated_time_seconds",
4456
Help: "The replicated time of the logical replication stream in seconds since the unix epoch.",
4557
Measurement: "Seconds",
58+
Essential: true,
59+
Category: metric.Metadata_LOGICAL_DATA_REPLICATION,
4660
Unit: metric.Unit_SECONDS,
61+
HowToUse: "Track replication lag via current time - logical_replication.replicated_time_seconds",
4762
}
4863

4964
// User-visible health and ops metrics.
@@ -68,45 +83,73 @@ var (
6883
metaInitialApplySuccess = metric.Metadata{
6984
Name: "logical_replication.events_initial_success",
7085
Help: "Successful applications of an incoming row update",
71-
Measurement: "Failures",
86+
Measurement: "Successes",
7287
Unit: metric.Unit_COUNT,
88+
LabeledName: "logical_replication.events",
89+
StaticLabels: metric.MakeLabelPairs(
90+
metric.LabelType, "initial_success",
91+
),
7392
}
7493
metaInitialApplyFailures = metric.Metadata{
7594
Name: "logical_replication.events_initial_failure",
7695
Help: "Failed attempts to apply an incoming row update",
7796
Measurement: "Failures",
7897
Unit: metric.Unit_COUNT,
98+
LabeledName: "logical_replication.events",
99+
StaticLabels: metric.MakeLabelPairs(
100+
metric.LabelType, "initial_failure",
101+
),
79102
}
80103
metaRetriedApplySuccesses = metric.Metadata{
81104
Name: "logical_replication.events_retry_success",
82105
Help: "Row update events applied after one or more retries",
83-
Measurement: "Failures",
106+
Measurement: "Successes",
84107
Unit: metric.Unit_COUNT,
108+
LabeledName: "logical_replication.events",
109+
StaticLabels: metric.MakeLabelPairs(
110+
metric.LabelType, "retry_success",
111+
),
85112
}
86113
metaRetriedApplyFailures = metric.Metadata{
87114
Name: "logical_replication.events_retry_failure",
88115
Help: "Failed re-attempts to apply a row update",
89116
Measurement: "Failures",
90117
Unit: metric.Unit_COUNT,
118+
LabeledName: "logical_replication.events",
119+
StaticLabels: metric.MakeLabelPairs(
120+
metric.LabelType, "retry_failure",
121+
),
91122
}
92123

93124
metaDLQedDueToAge = metric.Metadata{
94125
Name: "logical_replication.events_dlqed_age",
95126
Help: "Row update events sent to DLQ due to reaching the maximum time allowed in the retry queue",
96127
Measurement: "Failures",
97128
Unit: metric.Unit_COUNT,
129+
LabeledName: "logical_replication.events",
130+
StaticLabels: metric.MakeLabelPairs(
131+
metric.LabelType, "dlqed_age",
132+
),
98133
}
99134
metaDLQedDueToQueueSpace = metric.Metadata{
100135
Name: "logical_replication.events_dlqed_space",
101136
Help: "Row update events sent to DLQ due to capacity of the retry queue",
102137
Measurement: "Failures",
103138
Unit: metric.Unit_COUNT,
139+
LabeledName: "logical_replication.events",
140+
StaticLabels: metric.MakeLabelPairs(
141+
metric.LabelType, "dlqed_space",
142+
),
104143
}
105144
metaDLQedDueToErrType = metric.Metadata{
106145
Name: "logical_replication.events_dlqed_errtype",
107146
Help: "Row update events sent to DLQ due to an error not considered retryable",
108147
Measurement: "Failures",
109148
Unit: metric.Unit_COUNT,
149+
LabeledName: "logical_replication.events",
150+
StaticLabels: metric.MakeLabelPairs(
151+
metric.LabelType, "dlqed_errtype",
152+
),
110153
}
111154

112155
// Internal metrics.

pkg/crosscluster/physical/metrics.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,10 @@ var (
3535
Name: "physical_replication.logical_bytes",
3636
Help: "Logical bytes (sum of keys + values) ingested by all replication jobs",
3737
Measurement: "Bytes",
38+
Essential: true,
39+
Category: metric.Metadata_CROSS_CLUSTER_REPLICATION,
3840
Unit: metric.Unit_BYTES,
41+
HowToUse: "Track PCR throughput",
3942
}
4043
metaReplicationFlushes = metric.Metadata{
4144
Name: "physical_replication.flushes",
@@ -76,7 +79,10 @@ var (
7679
Name: "physical_replication.replicated_time_seconds",
7780
Help: "The replicated time of the physical replication stream in seconds since the unix epoch.",
7881
Measurement: "Seconds",
82+
Essential: true,
83+
Category: metric.Metadata_CROSS_CLUSTER_REPLICATION,
7984
Unit: metric.Unit_SECONDS,
85+
HowToUse: "Track replication lag via current time - physical_replication.replicated_time_seconds",
8086
}
8187
// This metric would be 0 until cutover begins, and then it will be updated to
8288
// the total number of ranges that need to be reverted, and then gradually go

0 commit comments

Comments
 (0)