Skip to content

Commit 2c99447

Browse files
authored
feat(meta-service): Add comprehensive snapshot database metrics (#18422)
Add detailed snapshot database metrics to monitor storage performance and statistics:

- Block count, data size, index size metrics
- Average block size and keys per block calculations
- Block read statistics (total, from cache, from disk)
- Upgrade rotbl dependency to 0.2.7 for enhanced access statistics

These metrics provide visibility into snapshot storage efficiency and cache performance.

All metrics are `gauge` type:

```config
# number of keys in the last snapshot.
metasrv_server_snapshot_key_count 0
# number of primary keys in the last snapshot.
metasrv_server_snapshot_primary_index_count 0
# number of expire index keys in the last snapshot.
metasrv_server_snapshot_expire_index_count 0
# number of blocks in the last snapshot.
metasrv_server_snapshot_block_count 0
# size of data section in the last snapshot.
metasrv_server_snapshot_data_size 0
# size of index section in the last snapshot.
metasrv_server_snapshot_index_size 0
# average size of a block in the last snapshot.
metasrv_server_snapshot_avg_block_size 0
# average number of keys per block in the last snapshot.
metasrv_server_snapshot_avg_keys_per_block 0
# total number of read block from cache or from disk.
metasrv_server_snapshot_read_block 0
# total number of read block from cache.
metasrv_server_snapshot_read_block_from_cache 0
# total number of read block from disk.
metasrv_server_snapshot_read_block_from_disk 0
```
1 parent ea37a09 commit 2c99447

File tree

8 files changed

+205
-39
lines changed

8 files changed

+205
-39
lines changed

Cargo.lock

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -466,7 +466,7 @@ reqwest-hickory-resolver = "0.2"
466466
ringbuffer = "0.14.2"
467467
rmp-serde = "1.1.1"
468468
roaring = { version = "^0.10", features = ["serde"] }
469-
rotbl = { version = "0.2.6", features = [] }
469+
rotbl = { version = "0.2.7", features = [] }
470470
rust_decimal = "1.26"
471471
rustix = "0.38.37"
472472
rustls = { version = "0.23.27", features = ["ring", "tls12"], default-features = false }

src/meta/service/src/api/grpc/grpc_service.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -577,7 +577,8 @@ impl MetaService for MetaServiceImpl {
577577
wal_closed_chunk_sizes: status.raft_log.wal_closed_chunk_sizes,
578578
}),
579579

580-
snapshot_key_count: status.snapshot_key_count as u64,
580+
snapshot_key_count: status.snapshot_key_count,
581+
581582
state: status.state,
582583
is_leader: status.is_leader,
583584
current_term: status.current_term,

src/meta/service/src/meta_service/meta_node.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ use databend_common_meta_types::raft_types::MembershipNode;
5353
use databend_common_meta_types::raft_types::NodeId;
5454
use databend_common_meta_types::raft_types::RaftMetrics;
5555
use databend_common_meta_types::raft_types::TypeConfig;
56+
use databend_common_meta_types::snapshot_db::DBStat;
5657
use databend_common_meta_types::AppliedState;
5758
use databend_common_meta_types::Cmd;
5859
use databend_common_meta_types::Endpoint;
@@ -487,6 +488,19 @@ impl MetaNode {
487488
)
488489
}
489490

491+
{
492+
let db_stat = meta_node.get_snapshot_db_stat().await;
493+
let snapshot = server_metrics::snapshot();
494+
snapshot.block_count.set(db_stat.block_num as i64);
495+
snapshot.data_size.set(db_stat.data_size as i64);
496+
snapshot.index_size.set(db_stat.index_size as i64);
497+
snapshot.avg_block_size.set(db_stat.avg_block_size as i64);
498+
snapshot
499+
.avg_keys_per_block
500+
.set(db_stat.avg_keys_per_block as i64);
501+
snapshot.read_block.set(db_stat.read_block as i64);
502+
}
503+
490504
last_leader = mm.current_leader;
491505
}
492506

@@ -898,6 +912,10 @@ impl MetaNode {
898912
self.raft_store.get_snapshot_key_space_stat().await
899913
}
900914

915+
async fn get_snapshot_db_stat(&self) -> DBStat {
916+
self.raft_store.get_snapshot_db_stat().await
917+
}
918+
901919
pub async fn get_status(&self) -> Result<MetaNodeStatus, MetaError> {
902920
let voters = self
903921
.raft_store

src/meta/service/src/metrics/meta_metrics.rs

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,26 @@ pub mod server_metrics {
4747
};
4848
}
4949

50+
#[derive(Default, Debug, Clone)]
51+
pub struct SnapshotStat {
52+
/// The total number of blocks in the snapshot.
53+
pub block_count: Gauge,
54+
/// The total size in bytes of the block data section in the snapshot.
55+
pub data_size: Gauge,
56+
/// The total size in bytes of the block index section in the snapshot.
57+
pub index_size: Gauge,
58+
/// The average size in bytes of a block.
59+
pub avg_block_size: Gauge,
60+
/// The average number of keys per block.
61+
pub avg_keys_per_block: Gauge,
62+
/// The total number of read block from cache or from disk.
63+
pub read_block: Gauge,
64+
/// The total number of read block from cache.
65+
pub read_block_from_cache: Gauge,
66+
/// The total number of read block from disk.
67+
pub read_block_from_disk: Gauge,
68+
}
69+
5070
struct ServerMetrics {
5171
current_leader_id: Gauge,
5272
is_leader: Gauge,
@@ -65,6 +85,8 @@ pub mod server_metrics {
6585
/// `snapshot_key_count = snapshot_primary_index_count + snapshot_expire_index_count`
6686
snapshot_expire_index_count: Gauge,
6787

88+
snapshot_stat: SnapshotStat,
89+
6890
raft_log_cache_items: Gauge,
6991
raft_log_cache_used_size: Gauge,
7092
raft_log_wal_open_chunk_size: Gauge,
@@ -92,10 +114,13 @@ pub mod server_metrics {
92114
node_is_health: Gauge::default(),
93115
leader_changes: Counter::default(),
94116
applying_snapshot: Gauge::default(),
117+
95118
snapshot_key_count: Gauge::default(),
96119
snapshot_primary_index_count: Gauge::default(),
97120
snapshot_expire_index_count: Gauge::default(),
98121

122+
snapshot_stat: Default::default(),
123+
99124
raft_log_cache_items: Gauge::default(),
100125
raft_log_cache_used_size: Gauge::default(),
101126
raft_log_wal_open_chunk_size: Gauge::default(),
@@ -151,6 +176,46 @@ pub mod server_metrics {
151176
"number of expire index keys in the last snapshot",
152177
metrics.snapshot_expire_index_count.clone(),
153178
);
179+
registry.register(
180+
key!("snapshot_block_count"),
181+
"number of blocks in the last snapshot",
182+
metrics.snapshot_stat.block_count.clone(),
183+
);
184+
registry.register(
185+
key!("snapshot_data_size"),
186+
"size of data section in the last snapshot",
187+
metrics.snapshot_stat.data_size.clone(),
188+
);
189+
registry.register(
190+
key!("snapshot_index_size"),
191+
"size of index section in the last snapshot",
192+
metrics.snapshot_stat.index_size.clone(),
193+
);
194+
registry.register(
195+
key!("snapshot_avg_block_size"),
196+
"average size of a block in the last snapshot",
197+
metrics.snapshot_stat.avg_block_size.clone(),
198+
);
199+
registry.register(
200+
key!("snapshot_avg_keys_per_block"),
201+
"average number of keys per block in the last snapshot",
202+
metrics.snapshot_stat.avg_keys_per_block.clone(),
203+
);
204+
registry.register(
205+
key!("snapshot_read_block"),
206+
"total number of read block from cache or from disk",
207+
metrics.snapshot_stat.read_block.clone(),
208+
);
209+
registry.register(
210+
key!("snapshot_read_block_from_cache"),
211+
"total number of read block from cache",
212+
metrics.snapshot_stat.read_block_from_cache.clone(),
213+
);
214+
registry.register(
215+
key!("snapshot_read_block_from_disk"),
216+
"total number of read block from disk",
217+
metrics.snapshot_stat.read_block_from_disk.clone(),
218+
);
154219

155220
registry.register(
156221
key!("raft_log_cache_items"),
@@ -258,6 +323,10 @@ pub mod server_metrics {
258323
SERVER_METRICS.snapshot_expire_index_count.set(n as i64);
259324
}
260325

326+
pub fn snapshot() -> &'static SnapshotStat {
327+
&SERVER_METRICS.snapshot_stat
328+
}
329+
261330
pub fn set_raft_log_stat(st: RaftLogStat) {
262331
SERVER_METRICS
263332
.raft_log_cache_items

src/meta/service/src/store/store_inner.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ use databend_common_meta_types::raft_types::NodeId;
4343
use databend_common_meta_types::raft_types::Snapshot;
4444
use databend_common_meta_types::raft_types::SnapshotMeta;
4545
use databend_common_meta_types::raft_types::StorageError;
46+
use databend_common_meta_types::snapshot_db::DBStat;
4647
use databend_common_meta_types::snapshot_db::DB;
4748
use databend_common_meta_types::Endpoint;
4849
use databend_common_meta_types::MetaNetworkError;
@@ -310,6 +311,15 @@ impl RaftStoreInner {
310311
db.sys_data().key_counts().clone()
311312
}
312313

314+
/// Get the statistics of the snapshot database.
315+
pub(crate) async fn get_snapshot_db_stat(&self) -> DBStat {
316+
let sm = self.state_machine.read().await;
317+
let Some(db) = sm.levels().persisted() else {
318+
return Default::default();
319+
};
320+
db.db_stat()
321+
}
322+
313323
/// Install a snapshot to build a state machine from it and replace the old state machine with the new one.
314324
#[fastrace::trace]
315325
pub async fn do_install_snapshot(&self, db: DB) -> Result<(), MetaStorageError> {

src/meta/service/tests/it/api/http/metrics.rs

Lines changed: 53 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -109,62 +109,62 @@ async fn test_metrics() -> anyhow::Result<()> {
109109
// metasrv_meta_network_watch_initialization_total 0
110110
// metasrv_raft_network_active_peers{id="1",addr="127.0.0.1:29003"} 1
111111
// metasrv_raft_network_active_peers{id="2",addr="127.0.0.1:29006"} 1
112-
// metasrv_raft_network_append_sent_seconds_bucket{le="+Inf",to="1"} 8
112+
// metasrv_raft_network_append_sent_seconds_bucket{le="+Inf",to="1"} 9
113113
// metasrv_raft_network_append_sent_seconds_bucket{le="+Inf",to="2"} 6
114-
// metasrv_raft_network_append_sent_seconds_bucket{le="0.001",to="1"} 8
114+
// metasrv_raft_network_append_sent_seconds_bucket{le="0.001",to="1"} 9
115115
// metasrv_raft_network_append_sent_seconds_bucket{le="0.001",to="2"} 6
116-
// metasrv_raft_network_append_sent_seconds_bucket{le="0.002",to="1"} 8
116+
// metasrv_raft_network_append_sent_seconds_bucket{le="0.002",to="1"} 9
117117
// metasrv_raft_network_append_sent_seconds_bucket{le="0.002",to="2"} 6
118-
// metasrv_raft_network_append_sent_seconds_bucket{le="0.004",to="1"} 8
118+
// metasrv_raft_network_append_sent_seconds_bucket{le="0.004",to="1"} 9
119119
// metasrv_raft_network_append_sent_seconds_bucket{le="0.004",to="2"} 6
120-
// metasrv_raft_network_append_sent_seconds_bucket{le="0.008",to="1"} 8
120+
// metasrv_raft_network_append_sent_seconds_bucket{le="0.008",to="1"} 9
121121
// metasrv_raft_network_append_sent_seconds_bucket{le="0.008",to="2"} 6
122-
// metasrv_raft_network_append_sent_seconds_bucket{le="0.016",to="1"} 8
122+
// metasrv_raft_network_append_sent_seconds_bucket{le="0.016",to="1"} 9
123123
// metasrv_raft_network_append_sent_seconds_bucket{le="0.016",to="2"} 6
124-
// metasrv_raft_network_append_sent_seconds_bucket{le="0.032",to="1"} 8
124+
// metasrv_raft_network_append_sent_seconds_bucket{le="0.032",to="1"} 9
125125
// metasrv_raft_network_append_sent_seconds_bucket{le="0.032",to="2"} 6
126-
// metasrv_raft_network_append_sent_seconds_bucket{le="0.064",to="1"} 8
126+
// metasrv_raft_network_append_sent_seconds_bucket{le="0.064",to="1"} 9
127127
// metasrv_raft_network_append_sent_seconds_bucket{le="0.064",to="2"} 6
128-
// metasrv_raft_network_append_sent_seconds_bucket{le="0.128",to="1"} 8
128+
// metasrv_raft_network_append_sent_seconds_bucket{le="0.128",to="1"} 9
129129
// metasrv_raft_network_append_sent_seconds_bucket{le="0.128",to="2"} 6
130-
// metasrv_raft_network_append_sent_seconds_bucket{le="0.256",to="1"} 8
130+
// metasrv_raft_network_append_sent_seconds_bucket{le="0.256",to="1"} 9
131131
// metasrv_raft_network_append_sent_seconds_bucket{le="0.256",to="2"} 6
132-
// metasrv_raft_network_append_sent_seconds_bucket{le="0.512",to="1"} 8
132+
// metasrv_raft_network_append_sent_seconds_bucket{le="0.512",to="1"} 9
133133
// metasrv_raft_network_append_sent_seconds_bucket{le="0.512",to="2"} 6
134-
// metasrv_raft_network_append_sent_seconds_bucket{le="1.024",to="1"} 8
134+
// metasrv_raft_network_append_sent_seconds_bucket{le="1.024",to="1"} 9
135135
// metasrv_raft_network_append_sent_seconds_bucket{le="1.024",to="2"} 6
136-
// metasrv_raft_network_append_sent_seconds_bucket{le="131.072",to="1"} 8
136+
// metasrv_raft_network_append_sent_seconds_bucket{le="131.072",to="1"} 9
137137
// metasrv_raft_network_append_sent_seconds_bucket{le="131.072",to="2"} 6
138-
// metasrv_raft_network_append_sent_seconds_bucket{le="16.384",to="1"} 8
138+
// metasrv_raft_network_append_sent_seconds_bucket{le="16.384",to="1"} 9
139139
// metasrv_raft_network_append_sent_seconds_bucket{le="16.384",to="2"} 6
140-
// metasrv_raft_network_append_sent_seconds_bucket{le="2.048",to="1"} 8
140+
// metasrv_raft_network_append_sent_seconds_bucket{le="2.048",to="1"} 9
141141
// metasrv_raft_network_append_sent_seconds_bucket{le="2.048",to="2"} 6
142-
// metasrv_raft_network_append_sent_seconds_bucket{le="262.144",to="1"} 8
142+
// metasrv_raft_network_append_sent_seconds_bucket{le="262.144",to="1"} 9
143143
// metasrv_raft_network_append_sent_seconds_bucket{le="262.144",to="2"} 6
144-
// metasrv_raft_network_append_sent_seconds_bucket{le="32.768",to="1"} 8
144+
// metasrv_raft_network_append_sent_seconds_bucket{le="32.768",to="1"} 9
145145
// metasrv_raft_network_append_sent_seconds_bucket{le="32.768",to="2"} 6
146-
// metasrv_raft_network_append_sent_seconds_bucket{le="4.096",to="1"} 8
146+
// metasrv_raft_network_append_sent_seconds_bucket{le="4.096",to="1"} 9
147147
// metasrv_raft_network_append_sent_seconds_bucket{le="4.096",to="2"} 6
148-
// metasrv_raft_network_append_sent_seconds_bucket{le="524.288",to="1"} 8
148+
// metasrv_raft_network_append_sent_seconds_bucket{le="524.288",to="1"} 9
149149
// metasrv_raft_network_append_sent_seconds_bucket{le="524.288",to="2"} 6
150-
// metasrv_raft_network_append_sent_seconds_bucket{le="65.536",to="1"} 8
150+
// metasrv_raft_network_append_sent_seconds_bucket{le="65.536",to="1"} 9
151151
// metasrv_raft_network_append_sent_seconds_bucket{le="65.536",to="2"} 6
152-
// metasrv_raft_network_append_sent_seconds_bucket{le="8.192",to="1"} 8
152+
// metasrv_raft_network_append_sent_seconds_bucket{le="8.192",to="1"} 9
153153
// metasrv_raft_network_append_sent_seconds_bucket{le="8.192",to="2"} 6
154-
// metasrv_raft_network_append_sent_seconds_count{to="1"} 8
154+
// metasrv_raft_network_append_sent_seconds_count{to="1"} 9
155155
// metasrv_raft_network_append_sent_seconds_count{to="2"} 6
156156
// metasrv_raft_network_append_sent_seconds_sum{to="1"} 0.0
157157
// metasrv_raft_network_append_sent_seconds_sum{to="2"} 0.0
158-
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:57382"} 1830
159-
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:57384"} 809
160-
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:57385"} 1764
161-
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:57387"} 537
162-
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:57388"} 338
163-
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:57389"} 533
164-
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:57390"} 673
158+
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:62962"} 1794
159+
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:62964"} 797
160+
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:62965"} 1728
161+
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:62967"} 537
162+
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:62968"} 537
163+
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:62969"} 533
164+
// metasrv_raft_network_recv_bytes_total{from="127.0.0.1:62970"} 673
165165
// metasrv_raft_network_recv_bytes_total{from="addr"} 1
166-
// metasrv_raft_network_sent_bytes_total{to="1"} 3650
167-
// metasrv_raft_network_sent_bytes_total{to="2"} 2834
166+
// metasrv_raft_network_sent_bytes_total{to="1"} 3661
167+
// metasrv_raft_network_sent_bytes_total{to="2"} 2938
168168
// metasrv_raft_storage_raft_store_write_failed_total{func="fun"} 1
169169
// metasrv_raft_storage_snapshot_building 0
170170
// metasrv_raft_storage_snapshot_written_entries_total 0
@@ -180,16 +180,24 @@ async fn test_metrics() -> anyhow::Result<()> {
180180
// metasrv_server_proposals_failed_total 0
181181
// metasrv_server_proposals_pending 0
182182
// metasrv_server_raft_log_cache_items 10
183-
// metasrv_server_raft_log_cache_used_size 867
184-
// metasrv_server_raft_log_size 1712
183+
// metasrv_server_raft_log_cache_used_size 771
184+
// metasrv_server_raft_log_size 1694
185185
// metasrv_server_raft_log_wal_closed_chunk_count 0
186186
// metasrv_server_raft_log_wal_closed_chunk_total_size 0
187-
// metasrv_server_raft_log_wal_offset 1712
188-
// metasrv_server_raft_log_wal_open_chunk_size 1712
187+
// metasrv_server_raft_log_wal_offset 1694
188+
// metasrv_server_raft_log_wal_open_chunk_size 1694
189189
// metasrv_server_read_failed_total 0
190+
// metasrv_server_snapshot_avg_block_size 0
191+
// metasrv_server_snapshot_avg_keys_per_block 0
192+
// metasrv_server_snapshot_block_count 0
193+
// metasrv_server_snapshot_data_size 0
190194
// metasrv_server_snapshot_expire_index_count 0
195+
// metasrv_server_snapshot_index_size 0
191196
// metasrv_server_snapshot_key_count 0
192197
// metasrv_server_snapshot_primary_index_count 0
198+
// metasrv_server_snapshot_read_block 0
199+
// metasrv_server_snapshot_read_block_from_cache 0
200+
// metasrv_server_snapshot_read_block_from_disk 0
193201
// metasrv_server_watchers 0
194202

195203
let b = response.take_body();
@@ -258,6 +266,16 @@ async fn test_metrics() -> anyhow::Result<()> {
258266
assert!(metric_keys.contains("metasrv_server_snapshot_primary_index_count"));
259267
assert!(metric_keys.contains("metasrv_server_snapshot_expire_index_count"));
260268

269+
// Server snapshot internal metrics
270+
assert!(metric_keys.contains("metasrv_server_snapshot_block_count"));
271+
assert!(metric_keys.contains("metasrv_server_snapshot_data_size"));
272+
assert!(metric_keys.contains("metasrv_server_snapshot_index_size"));
273+
assert!(metric_keys.contains("metasrv_server_snapshot_avg_block_size"));
274+
assert!(metric_keys.contains("metasrv_server_snapshot_avg_keys_per_block"));
275+
assert!(metric_keys.contains("metasrv_server_snapshot_read_block"));
276+
assert!(metric_keys.contains("metasrv_server_snapshot_read_block_from_cache"));
277+
assert!(metric_keys.contains("metasrv_server_snapshot_read_block_from_disk"));
278+
261279
// Meta network metrics
262280
assert!(metric_keys.contains("metasrv_meta_network_recv_bytes_total"));
263281
assert!(metric_keys.contains("metasrv_meta_network_req_failed_total"));

0 commit comments

Comments
 (0)