Skip to content

Commit 9451238

Browse files
authored
docs(raft-store): enhance documentation across all modules (#18721)
Add comprehensive documentation for modules, structs, and configuration fields including units, defaults, and architectural overviews.
1 parent 636d4bb commit 9451238

File tree

10 files changed

+177
-39
lines changed

10 files changed

+177
-39
lines changed

src/meta/raft-store/src/applier/mod.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,11 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15+
//! Raft log entry application to state machine.
16+
//!
17+
//! Applies committed raft log entries to the state machine including membership changes,
18+
//! KV operations, transactions, and TTL cleanup.
19+
1520
use std::future::ready;
1621
use std::io;
1722
use std::time::Duration;

src/meta/raft-store/src/config.rs

Lines changed: 59 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15+
//! Raft cluster configuration including networking, storage, and performance tuning.
16+
1517
use std::net::Ipv4Addr;
1618
use std::path::Path;
1719

@@ -24,6 +26,10 @@ use databend_common_meta_types::MetaStartupError;
2426
use crate::ondisk::DATA_VERSION;
2527
use crate::raft_log_v004;
2628

29+
/// Configuration for a Raft node including networking, storage paths, and performance tuning.
30+
///
31+
/// Controls cluster behavior, network endpoints, storage persistence, and operational parameters
32+
/// like heartbeat intervals and snapshot thresholds.
2733
#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize)]
2834
pub struct RaftConfig {
2935
/// Identify a config.
@@ -52,49 +58,78 @@ pub struct RaftConfig {
5258
/// You should only use this in a testing environment, unless YOU KNOW WHAT YOU ARE DOING.
5359
pub no_sync: bool,
5460

55-
/// The maximum number of log entries for log entries cache.
61+
/// Maximum log entries to cache in memory.
62+
///
63+
/// Higher values improve read performance but use more memory.
64+
/// Default: 1,000,000 entries.
5665
pub log_cache_max_items: u64,
5766

58-
/// The maximum memory in bytes for the log entries cache.
67+
/// Maximum memory for log cache in bytes.
68+
///
69+
/// Total memory limit for cached log entries.
70+
/// Default: 1GB (1,073,741,824 bytes).
5971
pub log_cache_capacity: u64,
6072

61-
/// Maximum number of records in a chunk of raft-log WAL.
73+
/// Maximum number of log records per WAL chunk.
6274
pub log_wal_chunk_max_records: u64,
6375

64-
/// Maximum size in bytes for a chunk of raft-log WAL.
76+
/// Maximum WAL chunk size in bytes.
6577
pub log_wal_chunk_max_size: u64,
6678

67-
/// The number of logs since the last snapshot to trigger next snapshot.
79+
/// Trigger snapshot after this many logs since last snapshot.
80+
///
81+
/// Lower values create more frequent snapshots but increase I/O.
82+
/// Default: 1024 log entries.
6883
pub snapshot_logs_since_last: u64,
6984

70-
/// The interval in milli seconds at which a leader send heartbeat message to followers.
71-
/// Different value of this setting on leader and followers may cause unexpected behavior.
85+
/// Leader heartbeat interval in milliseconds.
86+
///
87+
/// Must be > 0. Typical values: 500-2000ms.
88+
/// Affects leader election timeout (2x-3x this value).
89+
/// Default: 1000ms.
7290
pub heartbeat_interval: u64,
7391

74-
/// The max time in milli seconds that a leader wait for install-snapshot ack from a follower or non-voter.
92+
/// Install snapshot timeout in milliseconds.
93+
///
94+
/// Time to wait for snapshot installation to complete.
95+
/// Default: 4000ms.
7596
pub install_snapshot_timeout: u64,
7697

77-
/// The maximum number of applied logs to keep before purging
98+
/// Maximum applied logs to keep before purging.
99+
///
100+
/// Controls disk usage vs recovery time tradeoff.
101+
/// Default: 1000 log entries.
78102
pub max_applied_log_to_keep: u64,
79103

80-
/// The size of chunk for transmitting snapshot. The default is 64MB
104+
/// Snapshot transmission chunk size in bytes.
105+
///
106+
/// Larger chunks reduce network overhead but increase memory usage.
107+
/// Default: 4MB (4,194,304 bytes).
81108
pub snapshot_chunk_size: u64,
82109

83110
/// Whether to check that the keys fed to a snapshot are sorted.
111+
///
112+
/// Enable for debugging snapshot corruption issues.
113+
/// Adds performance overhead in debug builds.
114+
/// Default: true.
84115
pub snapshot_db_debug_check: bool,
85116

86-
/// The maximum number of keys allowed in a block within a snapshot db.
117+
/// Maximum keys per snapshot database block.
87118
///
88-
/// A block serves as the caching unit in a snapshot database.
89-
/// Smaller blocks enable more granular cache control but may increase the index size.
119+
/// Higher values improve compression but increase memory usage.
120+
/// Default: 8000 keys per block.
90121
pub snapshot_db_block_keys: u64,
91122

92-
/// The total block to cache.
123+
/// Number of blocks to cache in snapshot database.
124+
///
125+
/// Higher values improve read performance but use more memory.
126+
/// Default: 1024 blocks.
93127
pub snapshot_db_block_cache_item: u64,
94128

95-
/// The total cache size for snapshot blocks.
129+
/// Total cache size for snapshot blocks in bytes.
96130
///
97-
/// By default it is 1GB.
131+
/// Controls memory usage for snapshot block caching.
132+
/// Default: 1GB (1,073,741,824 bytes).
98133
pub snapshot_db_block_cache_size: u64,
99134

100135
/// Single-node metasrv. It creates a single-node cluster if metadata is not initialized.
@@ -234,7 +269,7 @@ impl RaftConfig {
234269
Endpoint::new(&self.raft_advertise_host, self.raft_api_port)
235270
}
236271

237-
/// Support ip address and hostname
272+
/// Resolves the advertise host to an endpoint, supporting both IP addresses and hostnames.
238273
pub async fn raft_api_addr(&self) -> Result<Endpoint> {
239274
let ipv4_addr = self.raft_advertise_host.as_str().parse::<Ipv4Addr>();
240275
match ipv4_addr {
@@ -260,6 +295,13 @@ impl RaftConfig {
260295
(self.heartbeat_interval * 2, self.heartbeat_interval * 3)
261296
}
262297

298+
/// Validates configuration parameters for consistency and correctness.
299+
///
300+
/// # Errors
301+
/// Returns `MetaStartupError::InvalidConfig` if:
302+
/// - Neither `single` nor `join` is specified
303+
/// - Both `single` and `join` are specified
304+
/// - Node tries to join itself (self-reference in join addresses)
263305
pub fn check(&self) -> std::result::Result<(), MetaStartupError> {
264306
// If the node is just leaving, there is no need to check the other config.
265307
if !self.leave_via.is_empty() {

src/meta/raft-store/src/key_spaces.rs

Lines changed: 66 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,19 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15-
//! Defines application key spaces that are defined by raft-store.
16-
//! All the key spaces stores key-value pairs in the underlying sled db.
15+
//! Storage partitioning by data type with unique prefixes.
16+
//!
17+
//! Each key space stores a specific data type:
18+
//! - **Logs** (V003): Raft log entries by index
19+
//! - **Nodes**: Cluster membership by node ID
20+
//! - **StateMachineMeta**: Last applied log and metadata
21+
//! - **RaftStateKV** (V003): Node state (ID, vote, committed log)
22+
//! - **Expire**: TTL expiration index by time
23+
//! - **GenericKV**: User data with sequence numbers
24+
//! - **Sequences**: Monotonic sequence generators
25+
//! - **LogMeta** (V003): Log metadata for purging
26+
//!
27+
//! V003 uses sled storage (legacy); V004 uses a separate raft log with a leveled state machine.
1728
1829
use databend_common_meta_sled_store::sled;
1930
use databend_common_meta_sled_store::SledKeySpace;
@@ -42,7 +53,12 @@ use crate::state_machine::LogMetaValue;
4253
use crate::state_machine::StateMachineMetaKey;
4354
use crate::state_machine::StateMachineMetaValue;
4455

45-
/// Types for raft log in SledTree
56+
/// Raft log entries storage key space (V003 only).
57+
///
58+
/// Maps log index to raft entries. In V004, logs are stored separately
59+
/// in WAL format for better performance.
60+
/// - Key: [`LogIndex`] (u64) - Sequential log entry index
61+
/// - Value: [`Entry`] - Complete raft log entry with payload
4662
pub struct Logs {}
4763
impl SledKeySpace for Logs {
4864
const PREFIX: u8 = 1;
@@ -51,7 +67,12 @@ impl SledKeySpace for Logs {
5167
type V = Entry;
5268
}
5369

54-
/// Types for raft log meta data in SledTree
70+
/// Log metadata storage key space (V003 only).
71+
///
72+
/// Stores metadata about raft logs for purging and management.
73+
/// Used to track log ranges and cleanup operations.
74+
/// - Key: [`LogMetaKey`] - Metadata type identifier
75+
/// - Value: [`LogMetaValue`] - Log metadata information
5576
pub struct LogMeta {}
5677
impl SledKeySpace for LogMeta {
5778
const PREFIX: u8 = 13;
@@ -60,7 +81,12 @@ impl SledKeySpace for LogMeta {
6081
type V = LogMetaValue;
6182
}
6283

63-
/// Types for Node in SledTree
84+
/// Cluster node information storage key space.
85+
///
86+
/// Maps node IDs to node configuration and endpoint information.
87+
/// Used for cluster membership management and node discovery.
88+
/// - Key: [`NodeId`] (u64) - Unique node identifier
89+
/// - Value: [`Node`] - Node configuration with endpoint and metadata
6490
pub struct Nodes {}
6591
impl SledKeySpace for Nodes {
6692
const PREFIX: u8 = 2;
@@ -69,7 +95,12 @@ impl SledKeySpace for Nodes {
6995
type V = Node;
7096
}
7197

72-
/// Key-Value Types for storing meta data of a raft state machine in sled::Tree, e.g. the last applied log id.
98+
/// State machine metadata storage key space.
99+
///
100+
/// Stores critical state machine metadata like last applied log ID
101+
/// and other operational state required for consistency.
102+
/// - Key: [`StateMachineMetaKey`] - Metadata type identifier
103+
/// - Value: [`StateMachineMetaValue`] - State machine metadata
73104
pub struct StateMachineMeta {}
74105
impl SledKeySpace for StateMachineMeta {
75106
const PREFIX: u8 = 3;
@@ -78,8 +109,12 @@ impl SledKeySpace for StateMachineMeta {
78109
type V = StateMachineMetaValue;
79110
}
80111

81-
/// Key-Value Types for storing meta data of a raft in sled::Tree:
82-
/// node_id, vote
112+
/// Raft consensus state storage key space (V003 only).
113+
///
114+
/// Stores core raft state including node ID, vote information,
115+
/// and committed log index. Critical for raft consensus protocol.
116+
/// - Key: [`RaftStateKey`] - State type identifier
117+
/// - Value: [`RaftStateValue`] - Raft state data
83118
pub struct RaftStateKV {}
84119
impl SledKeySpace for RaftStateKV {
85120
const PREFIX: u8 = 4;
@@ -88,9 +123,12 @@ impl SledKeySpace for RaftStateKV {
88123
type V = RaftStateValue;
89124
}
90125

91-
/// Stores a index for kv records with expire time.
126+
/// TTL expiration index storage key space.
92127
///
93-
/// It stores them in expire time order.
128+
/// Secondary index for records with expiration times, ordered by expire time.
129+
/// Enables efficient cleanup of expired entries during log application.
130+
/// - Key: [`ExpireKey`] - Expiration time and sequence
131+
/// - Value: [`ExpireValue`] - Original key that expires
94132
pub struct Expire {}
95133
impl SledKeySpace for Expire {
96134
const PREFIX: u8 = 5;
@@ -99,7 +137,12 @@ impl SledKeySpace for Expire {
99137
type V = ExpireValue;
100138
}
101139

102-
/// Key-Value Types for storing general purpose kv in sled::Tree:
140+
/// User data storage key space.
141+
///
142+
/// Primary storage for application key-value data with sequence numbers
143+
/// for versioning and consistency. Supports TTL and conditional operations.
144+
/// - Key: [`String`] - User-defined key
145+
/// - Value: [`SeqV<Vec<u8>>`] - Sequenced value with metadata
103146
pub struct GenericKV {}
104147
impl SledKeySpace for GenericKV {
105148
const PREFIX: u8 = 6;
@@ -108,7 +151,12 @@ impl SledKeySpace for GenericKV {
108151
type V = SeqV<Vec<u8>>;
109152
}
110153

111-
/// Key-Value Types for sequence number generator in sled::Tree:
154+
/// Monotonic sequence generator storage key space.
155+
///
156+
/// Provides atomic sequence number generation for various purposes
157+
/// like auto-incrementing IDs and sequential key generation.
158+
/// - Key: [`String`] - Sequence name identifier
159+
/// - Value: [`SeqNum`] - Current sequence number
112160
pub struct Sequences {}
113161
impl SledKeySpace for Sequences {
114162
const PREFIX: u8 = 7;
@@ -119,6 +167,12 @@ impl SledKeySpace for Sequences {
119167

120168
// Reserved: removed: `pub struct ClientLastResps {}`, PREFIX = 10;
121169

170+
/// Storage metadata header key space.
171+
///
172+
/// Stores version information and metadata about the storage format
173+
/// for compatibility checking and data migration purposes.
174+
/// - Key: [`String`] - Header type identifier
175+
/// - Value: [`Header`] - Storage format metadata
122176
pub struct DataHeader {}
123177
impl SledKeySpace for DataHeader {
124178
const PREFIX: u8 = 11;

src/meta/raft-store/src/leveled_store/leveled_map/mod.rs

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -55,23 +55,28 @@ pub mod leveled_map_data;
5555
#[cfg(test)]
5656
mod leveled_map_test;
5757

58-
/// Similar to leveldb.
58+
/// Multi-level storage similar to LevelDB with single-writer concurrency control.
5959
///
60-
/// The top level is the newest and writable.
61-
/// Others are immutable.
60+
/// ## Concurrency Model
61+
/// - **Single writer**: Only one writer allowed at a time via write_semaphore
62+
/// - **Multiple readers**: Concurrent read access across all levels
63+
/// - **Short-lock compaction**: Compactor clones data out before processing, so the mutex is never held for long
64+
/// - **At most one candidate writer**: Top level is exclusively writable
6265
///
63-
/// - A writer must acquire a permit to write_semaphore.
64-
/// - A compactor must:
65-
/// - acquire the compaction_semaphore first,
66-
/// - then acquire `write_semaphore` to move `writeable` to `immutable_levels`,
67-
///
68-
/// The top level is the newest and writable and there is **at most one** candidate writer.
66+
/// ## Performance Characteristics
67+
/// - **Read latency**: O(log n) access across levels, newest data first
68+
/// - **Write performance**: Single writer to top level, no contention
69+
/// - **Memory usage**: Grows with number of levels and cached data
70+
/// - **Compaction performance**: Non-blocking, processes cloned data independently
6971
///
72+
/// ## Level Organization
73+
/// ```text
7074
/// | | write_semaphore | compaction_semaphore |
7175
/// | :-- | :-- | :-- |
7276
/// | writable | RW | |
7377
/// | immutable_levels | R | RW |
7478
/// | persisted | R | RW |
79+
/// ```
7580
#[derive(Debug, Clone)]
7681
pub struct LeveledMap {
7782
pub data: Arc<Mutex<LeveledMapData>>,

src/meta/raft-store/src/lib.rs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,27 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15+
//! Raft-based distributed metadata storage for Databend.
16+
//!
17+
//! This crate implements a distributed metadata store using the Raft consensus algorithm.
18+
//! It provides transactional KV operations, cluster membership management, and state machine
19+
//! replication with support for both legacy (V003) and current (V004) storage formats.
20+
//!
21+
//! ## Core Components
22+
//!
23+
//! - **`config`**: Raft cluster configuration and tuning parameters
24+
//! - **`key_spaces`**: Storage partitioning by data type with unique prefixes
25+
//! - **`leveled_store`**: Multi-level storage engine for efficient data organization
26+
//! - **`sm_v003`**: Legacy sled-based state machine implementation
27+
//! - **`raft_log_v004`**: Current WAL-based raft log storage
28+
//! - **`state_machine`**: Core state machine API and metadata management
29+
//! - **`applier`**: Log entry application and state transitions
30+
//!
31+
//! ## Version Compatibility
32+
//!
33+
//! - **V003**: Legacy format using sled for both logs and state machine
34+
//! - **V004**: Current format with separate WAL for logs and leveled storage for state machine
35+
1536
#![allow(clippy::uninlined_format_args)]
1637
#![feature(coroutines)]
1738
#![feature(impl_trait_in_assoc_type)]

src/meta/raft-store/src/snapshot_config.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15+
//! Snapshot storage path configuration and management.
16+
1517
use std::fs;
1618
use std::io;
1719
use std::time::SystemTime;

src/meta/raft-store/src/state_machine/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15+
//! State machine metadata types and snapshot management.
16+
1517
pub use log_meta::LogMetaKey;
1618
pub use log_meta::LogMetaValue;
1719
pub use snapshot_id::MetaSnapshotId;

src/meta/raft-store/src/state_machine_api_ext.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,11 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15+
//! Extension trait providing high-level state machine operations.
16+
//!
17+
//! Extends [`StateMachineApi`] with conditional KV updates, prefix-based listing,
18+
//! TTL expiration management, and secondary index maintenance.
19+
1520
use std::future;
1621
use std::io;
1722
use std::ops::RangeBounds;

0 commit comments

Comments
 (0)