Skip to content

Commit cc1bcb9

Browse files
nipunn1313 authored and Convex, Inc. committed
Tag snapshot export/import failure metrics with error type (#39135)
Then we can alert more aggressively on non-operational errors. Stuff like TableSummaryBootstrapping is operational.

GitOrigin-RevId: 37f4375a801641f398ee04cc3096c877a5d2ffdc
1 parent a20d4e9 commit cc1bcb9

File tree

4 files changed

+25
-10
lines changed

4 files changed

+25
-10
lines changed

crates/application/src/exports/metrics.rs

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
1+
use errors::ErrorMetadataAnyhowExt;
12
use metrics::{
2-
log_counter,
3+
log_counter_with_labels,
34
register_convex_counter,
45
register_convex_histogram,
56
StaticMetricLabel,
@@ -23,7 +24,13 @@ pub fn export_timer(instance_name: &str) -> StatusTimer {
2324
register_convex_counter!(
2425
SNAPSHOT_EXPORT_FAILED_TOTAL,
2526
"Number of snapshot export attempts that failed",
27+
&["status"]
2628
);
27-
pub fn log_export_failed() {
28-
log_counter(&SNAPSHOT_EXPORT_FAILED_TOTAL, 1);
29+
pub fn log_export_failed(e: &anyhow::Error) {
30+
let status = e.metric_status_label_value();
31+
log_counter_with_labels(
32+
&SNAPSHOT_EXPORT_FAILED_TOTAL,
33+
1,
34+
vec![StaticMetricLabel::new("status", status)],
35+
);
2936
}

crates/application/src/exports/worker.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -183,7 +183,7 @@ impl<RT: Runtime> ExportWorker<RT> {
183183
tracing::info!("Export {} canceled", export.id());
184184
return Ok(());
185185
}
186-
log_export_failed();
186+
log_export_failed(&e);
187187
report_error(&mut e).await;
188188
let delay = self.backoff.fail(&mut self.runtime.rng());
189189
tracing::error!("Export failed, retrying in {delay:?}");

crates/application/src/snapshot_import/metrics.rs

Lines changed: 12 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,12 @@
11
use std::time::Duration;
22

3+
use errors::ErrorMetadataAnyhowExt;
34
use metrics::{
4-
log_counter,
5+
log_counter_with_labels,
56
log_distribution,
67
register_convex_counter,
78
register_convex_histogram,
9+
StaticMetricLabel,
810
StatusTimer,
911
STATUS_LABEL,
1012
};
@@ -27,9 +29,15 @@ pub fn log_snapshot_import_age(age: Duration) {
2729
}
2830

2931
register_convex_counter!(
30-
SNAPSHOT_IMPORT_WORKER_DIED_TOTAL,
32+
SNAPSHOT_IMPORT_FAILED_TOTAL,
3133
"Number of times the snapshot import worker died",
34+
&["status"]
3235
);
33-
pub fn log_snapshot_import_worker_died() {
34-
log_counter(&SNAPSHOT_IMPORT_WORKER_DIED_TOTAL, 1);
36+
pub fn log_snapshot_import_failed(e: &anyhow::Error) {
37+
let status = e.metric_status_label_value();
38+
log_counter_with_labels(
39+
&SNAPSHOT_IMPORT_FAILED_TOTAL,
40+
1,
41+
vec![StaticMetricLabel::new("status", status)],
42+
);
3543
}

crates/application/src/snapshot_import/worker.rs

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@ use crate::{
2323
metrics::log_worker_starting,
2424
snapshot_import::{
2525
metrics::{
26-
log_snapshot_import_worker_died,
26+
log_snapshot_import_failed,
2727
snapshot_import_timer,
2828
},
2929
SnapshotImportExecutor,
@@ -54,7 +54,7 @@ impl SnapshotImportWorker {
5454
async move {
5555
loop {
5656
if let Err(e) = Self::run_once(&mut worker).await {
57-
log_snapshot_import_worker_died();
57+
log_snapshot_import_failed(&e);
5858
report_error(&mut e.context("SnapshotImportWorker died")).await;
5959
let delay = worker.backoff.fail(&mut worker.runtime.rng());
6060
worker.runtime.wait(delay).await;

0 commit comments

Comments
 (0)