Skip to content

Commit 50f3c98

Browse files
authored
feat(meta-service): add metrics for snapshot building (#18212)
Add two snapshot metrics:

- whether a snapshot is currently being built (0 or 1), and
- the total number of entries written to the snapshot.

```
# 0 or 1
metasrv_raft_storage_snapshot_building 0
# total number of entries written to snapshot
metasrv_raft_storage_snapshot_written_entries_total 123
```
1 parent e366f7e commit 50f3c98

File tree

4 files changed

+91
-31
lines changed

4 files changed

+91
-31
lines changed

src/meta/service/src/metrics/meta_metrics.rs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,7 @@ pub mod raft_metrics {
377377
sent_bytes: Family<ToLabels, Counter>,
378378
recv_bytes: Family<FromLabels, Counter>,
379379
sent_failures: Family<ToLabels, Counter>,
380+
380381
append_sent_seconds: Family<ToLabels, Histogram>,
381382
snapshot_send_success: Family<ToLabels, Counter>,
382383
snapshot_send_failure: Family<ToLabels, Counter>,
@@ -601,6 +602,7 @@ pub mod raft_metrics {
601602
use prometheus_client::encoding::EncodeLabelSet;
602603
use prometheus_client::metrics::counter::Counter;
603604
use prometheus_client::metrics::family::Family;
605+
use prometheus_client::metrics::gauge::Gauge;
604606

605607
use crate::metrics::registry::load_global_registry;
606608

@@ -618,13 +620,24 @@ pub mod raft_metrics {
618620
struct StorageMetrics {
619621
raft_store_write_failed: Family<FuncLabels, Counter>,
620622
raft_store_read_failed: Family<FuncLabels, Counter>,
623+
624+
/// The number of tasks that are building a snapshot.
625+
///
626+
/// It should be 0 or 1.
627+
snapshot_building: Gauge,
628+
629+
/// The number of entries written to the snapshot file.
630+
snapshot_written_entries: Counter,
621631
}
622632

623633
impl StorageMetrics {
624634
fn init() -> Self {
625635
let metrics = Self {
626636
raft_store_write_failed: Family::default(),
627637
raft_store_read_failed: Family::default(),
638+
639+
snapshot_building: Gauge::default(),
640+
snapshot_written_entries: Counter::default(),
628641
};
629642

630643
let mut registry = load_global_registry();
@@ -638,6 +651,18 @@ pub mod raft_metrics {
638651
"raft store read failed",
639652
metrics.raft_store_read_failed.clone(),
640653
);
654+
655+
registry.register(
656+
key!("snapshot_building"),
657+
"The number of tasks that are building a snapshot. It should be 0 or 1.",
658+
metrics.snapshot_building.clone(),
659+
);
660+
registry.register(
661+
key!("snapshot_written_entries"),
662+
"The number of entries written to the snapshot file.",
663+
metrics.snapshot_written_entries.clone(),
664+
);
665+
641666
metrics
642667
}
643668
}
@@ -682,6 +707,14 @@ pub mod raft_metrics {
682707
.inc();
683708
}
684709
}
710+
711+
pub fn incr_snapshot_building_by(cnt: i64) {
712+
STORAGE_METRICS.snapshot_building.inc_by(cnt);
713+
}
714+
715+
pub fn incr_snapshot_written_entries() {
716+
STORAGE_METRICS.snapshot_written_entries.inc();
717+
}
685718
}
686719
}
687720

@@ -918,3 +951,12 @@ pub fn meta_metrics_to_prometheus_string() -> String {
918951
prometheus_encode(&mut text, &registry).unwrap();
919952
text
920953
}
954+
955+
#[derive(Default)]
956+
pub(crate) struct SnapshotBuilding;
957+
958+
impl count::Count for SnapshotBuilding {
959+
fn incr_count(&mut self, n: i64) {
960+
raft_metrics::storage::incr_snapshot_building_by(n);
961+
}
962+
}

src/meta/service/src/metrics/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,4 @@ pub use meta_metrics::raft_metrics;
2121
pub use meta_metrics::server_metrics;
2222
pub(crate) use meta_metrics::ProposalPending;
2323
pub(crate) use meta_metrics::RequestInFlight;
24+
pub(crate) use meta_metrics::SnapshotBuilding;

src/meta/service/src/store/store_inner.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,13 +48,17 @@ use databend_common_meta_types::snapshot_db::DB;
4848
use databend_common_meta_types::Endpoint;
4949
use databend_common_meta_types::MetaNetworkError;
5050
use databend_common_meta_types::MetaStartupError;
51+
use databend_common_metrics::count::Count;
5152
use futures::TryStreamExt;
5253
use log::debug;
5354
use log::error;
5455
use log::info;
5556
use raft_log::api::raft_log_writer::RaftLogWriter;
5657
use tokio::time::sleep;
5758

59+
use crate::metrics::raft_metrics;
60+
use crate::metrics::SnapshotBuilding;
61+
5862
/// This is the inner store that implements the raft log storage API.
5963
pub struct RaftStoreInner {
6064
/// The ID of the Raft node for which this storage instances is configured.
@@ -187,6 +191,8 @@ impl RaftStoreInner {
187191

188192
info!(id = self.id; "do_build_snapshot start");
189193

194+
let _guard = SnapshotBuilding::guard();
195+
190196
let mut compactor = {
191197
let mut w = self.state_machine.write().await;
192198
w.freeze_writable();
@@ -238,6 +244,8 @@ impl RaftStoreInner {
238244
tx.send(WriteEntry::Data(ent))
239245
.await
240246
.map_err(|e| StorageError::write_snapshot(Some(signature.clone()), &e))?;
247+
248+
raft_metrics::storage::incr_snapshot_written_entries();
241249
}
242250

243251
tx.send(WriteEntry::Finish(sys_data))

src/meta/service/tests/it/api/http/metrics.rs

Lines changed: 40 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -102,65 +102,72 @@ async fn test_metrics() -> anyhow::Result<()> {
102102
// metasrv_meta_network_rpc_delay_seconds_count 0
103103
// metasrv_meta_network_rpc_delay_seconds_sum 0.0
104104
// metasrv_meta_network_sent_bytes_total 0
105+
// metasrv_meta_network_stream_get_item_sent_total 0
106+
// metasrv_meta_network_stream_list_item_sent_total 0
107+
// metasrv_meta_network_stream_mget_item_sent_total 0
108+
// metasrv_meta_network_watch_change_total 0
109+
// metasrv_meta_network_watch_initialization_total 0
105110
// metasrv_raft_network_active_peers{id="1",addr="127.0.0.1:29003"} 1
106111
// metasrv_raft_network_active_peers{id="2",addr="127.0.0.1:29006"} 1
107-
// metasrv_raft_network_append_sent_seconds_bucket{le="+Inf",to="1"} 9
112+
// metasrv_raft_network_append_sent_seconds_bucket{le="+Inf",to="1"} 8
108113
// metasrv_raft_network_append_sent_seconds_bucket{le="+Inf",to="2"} 6
109-
// metasrv_raft_network_append_sent_seconds_bucket{le="0.001",to="1"} 9
114+
// metasrv_raft_network_append_sent_seconds_bucket{le="0.001",to="1"} 8
110115
// metasrv_raft_network_append_sent_seconds_bucket{le="0.001",to="2"} 6
111-
// metasrv_raft_network_append_sent_seconds_bucket{le="0.002",to="1"} 9
116+
// metasrv_raft_network_append_sent_seconds_bucket{le="0.002",to="1"} 8
112117
// metasrv_raft_network_append_sent_seconds_bucket{le="0.002",to="2"} 6
113-
// metasrv_raft_network_append_sent_seconds_bucket{le="0.004",to="1"} 9
118+
// metasrv_raft_network_append_sent_seconds_bucket{le="0.004",to="1"} 8
114119
// metasrv_raft_network_append_sent_seconds_bucket{le="0.004",to="2"} 6
115-
// metasrv_raft_network_append_sent_seconds_bucket{le="0.008",to="1"} 9
120+
// metasrv_raft_network_append_sent_seconds_bucket{le="0.008",to="1"} 8
116121
// metasrv_raft_network_append_sent_seconds_bucket{le="0.008",to="2"} 6
117-
// metasrv_raft_network_append_sent_seconds_bucket{le="0.016",to="1"} 9
122+
// metasrv_raft_network_append_sent_seconds_bucket{le="0.016",to="1"} 8
118123
// metasrv_raft_network_append_sent_seconds_bucket{le="0.016",to="2"} 6
119-
// metasrv_raft_network_append_sent_seconds_bucket{le="0.032",to="1"} 9
124+
// metasrv_raft_network_append_sent_seconds_bucket{le="0.032",to="1"} 8
120125
// metasrv_raft_network_append_sent_seconds_bucket{le="0.032",to="2"} 6
121-
// metasrv_raft_network_append_sent_seconds_bucket{le="0.064",to="1"} 9
126+
// metasrv_raft_network_append_sent_seconds_bucket{le="0.064",to="1"} 8
122127
// metasrv_raft_network_append_sent_seconds_bucket{le="0.064",to="2"} 6
123-
// metasrv_raft_network_append_sent_seconds_bucket{le="0.128",to="1"} 9
128+
// metasrv_raft_network_append_sent_seconds_bucket{le="0.128",to="1"} 8
124129
// metasrv_raft_network_append_sent_seconds_bucket{le="0.128",to="2"} 6
125-
// metasrv_raft_network_append_sent_seconds_bucket{le="0.256",to="1"} 9
130+
// metasrv_raft_network_append_sent_seconds_bucket{le="0.256",to="1"} 8
126131
// metasrv_raft_network_append_sent_seconds_bucket{le="0.256",to="2"} 6
127-
// metasrv_raft_network_append_sent_seconds_bucket{le="0.512",to="1"} 9
132+
// metasrv_raft_network_append_sent_seconds_bucket{le="0.512",to="1"} 8
128133
// metasrv_raft_network_append_sent_seconds_bucket{le="0.512",to="2"} 6
129-
// metasrv_raft_network_append_sent_seconds_bucket{le="1.024",to="1"} 9
134+
// metasrv_raft_network_append_sent_seconds_bucket{le="1.024",to="1"} 8
130135
// metasrv_raft_network_append_sent_seconds_bucket{le="1.024",to="2"} 6
131-
// metasrv_raft_network_append_sent_seconds_bucket{le="131.072",to="1"} 9
136+
// metasrv_raft_network_append_sent_seconds_bucket{le="131.072",to="1"} 8
132137
// metasrv_raft_network_append_sent_seconds_bucket{le="131.072",to="2"} 6
133-
// metasrv_raft_network_append_sent_seconds_bucket{le="16.384",to="1"} 9
138+
// metasrv_raft_network_append_sent_seconds_bucket{le="16.384",to="1"} 8
134139
// metasrv_raft_network_append_sent_seconds_bucket{le="16.384",to="2"} 6
135-
// metasrv_raft_network_append_sent_seconds_bucket{le="2.048",to="1"} 9
140+
// metasrv_raft_network_append_sent_seconds_bucket{le="2.048",to="1"} 8
136141
// metasrv_raft_network_append_sent_seconds_bucket{le="2.048",to="2"} 6
137-
// metasrv_raft_network_append_sent_seconds_bucket{le="262.144",to="1"} 9
142+
// metasrv_raft_network_append_sent_seconds_bucket{le="262.144",to="1"} 8
138143
// metasrv_raft_network_append_sent_seconds_bucket{le="262.144",to="2"} 6
139-
// metasrv_raft_network_append_sent_seconds_bucket{le="32.768",to="1"} 9
144+
// metasrv_raft_network_append_sent_seconds_bucket{le="32.768",to="1"} 8
140145
// metasrv_raft_network_append_sent_seconds_bucket{le="32.768",to="2"} 6
141-
// metasrv_raft_network_append_sent_seconds_bucket{le="4.096",to="1"} 9
146+
// metasrv_raft_network_append_sent_seconds_bucket{le="4.096",to="1"} 8
142147
// metasrv_raft_network_append_sent_seconds_bucket{le="4.096",to="2"} 6
143-
// metasrv_raft_network_append_sent_seconds_bucket{le="524.288",to="1"} 9
148+
// metasrv_raft_network_append_sent_seconds_bucket{le="524.288",to="1"} 8
144149
// metasrv_raft_network_append_sent_seconds_bucket{le="524.288",to="2"} 6
145-
// metasrv_raft_network_append_sent_seconds_bucket{le="65.536",to="1"} 9
150+
// metasrv_raft_network_append_sent_seconds_bucket{le="65.536",to="1"} 8
146151
// metasrv_raft_network_append_sent_seconds_bucket{le="65.536",to="2"} 6
147-
// metasrv_raft_network_append_sent_seconds_bucket{le="8.192",to="1"} 9
152+
// metasrv_raft_network_append_sent_seconds_bucket{le="8.192",to="1"} 8
148153
// metasrv_raft_network_append_sent_seconds_bucket{le="8.192",to="2"} 6
149-
// metasrv_raft_network_append_sent_seconds_count{to="1"} 9
154+
// metasrv_raft_network_append_sent_seconds_count{to="1"} 8
150155
// metasrv_raft_network_append_sent_seconds_count{to="2"} 6
151156
// metasrv_raft_network_append_sent_seconds_sum{to="1"} 0.0
152157
// metasrv_raft_network_append_sent_seconds_sum{to="2"} 0.0
153-
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:59841"} 1830
154-
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:59843"} 1764
155-
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:59844"} 809
156-
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:59846"} 537
157-
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:59847"} 537
158-
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:59848"} 533
159-
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:59849"} 673
158+
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:57382"} 1830
159+
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:57384"} 809
160+
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:57385"} 1764
161+
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:57387"} 537
162+
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:57388"} 338
163+
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:57389"} 533
164+
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:57390"} 673
160165
// metasrv_raft_network_recv_bytes_total{from="addr"} 1
161-
// metasrv_raft_network_sent_bytes_total{to="1"} 3709
162-
// metasrv_raft_network_sent_bytes_total{to="2"} 2974
166+
// metasrv_raft_network_sent_bytes_total{to="1"} 3650
167+
// metasrv_raft_network_sent_bytes_total{to="2"} 2834
163168
// metasrv_raft_storage_raft_store_write_failed_total{func="fun"} 1
169+
// metasrv_raft_storage_snapshot_building 0
170+
// metasrv_raft_storage_snapshot_written_entries_total 0
164171
// metasrv_server_applying_snapshot 0
165172
// metasrv_server_current_leader_id 0
166173
// metasrv_server_current_term 1
@@ -280,6 +287,8 @@ async fn test_metrics() -> anyhow::Result<()> {
280287

281288
// Raft storage metrics
282289
assert!(metric_keys.contains("metasrv_raft_storage_raft_store_write_failed_total"));
290+
assert!(metric_keys.contains("metasrv_raft_storage_snapshot_building"));
291+
assert!(metric_keys.contains("metasrv_raft_storage_snapshot_written_entries_total"));
283292

284293
// Watch
285294
assert!(metric_keys.contains("metasrv_meta_network_watch_initialization_total"));

0 commit comments

Comments
 (0)