13 changes: 12 additions & 1 deletion consensus/src/consensus/factory.rs
@@ -9,7 +9,8 @@ use kaspa_consensusmanager::{ConsensusFactory, ConsensusInstance, DynConsensusCt
use kaspa_core::{debug, time::unix_now, warn};
use kaspa_database::{
prelude::{
BatchDbWriter, CachePolicy, CachedDbAccess, CachedDbItem, DirectDbWriter, StoreError, StoreResult, StoreResultExtensions, DB,
BatchDbWriter, CachePolicy, CachedDbAccess, CachedDbItem, DirectDbWriter, RocksDbPreset, StoreError, StoreResult,
StoreResultExtensions, DB,
},
registry::DatabaseStorePrefixes,
};
@@ -255,6 +256,8 @@ pub struct Factory {
tx_script_cache_counters: Arc<TxScriptCacheCounters>,
fd_budget: i32,
mining_rules: Arc<MiningRules>,
rocksdb_preset: RocksDbPreset,
wal_dir: Option<PathBuf>,
}

impl Factory {
@@ -268,6 +271,8 @@ impl Factory {
tx_script_cache_counters: Arc<TxScriptCacheCounters>,
fd_budget: i32,
mining_rules: Arc<MiningRules>,
rocksdb_preset: RocksDbPreset,
wal_dir: Option<PathBuf>,
) -> Self {
assert!(fd_budget > 0, "fd_budget has to be positive");
let mut config = config.clone();
@@ -286,6 +291,8 @@
tx_script_cache_counters,
fd_budget,
mining_rules,
rocksdb_preset,
wal_dir,
};
factory.delete_inactive_consensus_entries();
factory
@@ -316,6 +323,8 @@ impl ConsensusFactory for Factory {
.with_db_path(dir)
.with_parallelism(self.db_parallelism)
.with_files_limit(self.fd_budget / 2) // active and staging consensuses should have equal budgets
.with_preset(self.rocksdb_preset)
.with_wal_dir(self.wal_dir.clone())
.build()
.unwrap();

@@ -351,6 +360,8 @@
.with_db_path(dir)
.with_parallelism(self.db_parallelism)
.with_files_limit(self.fd_budget / 2) // active and staging consensuses should have equal budgets
.with_preset(self.rocksdb_preset)
.with_wal_dir(self.wal_dir.clone())
.build()
.unwrap();

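To make the data flow concrete: the caller now supplies the preset and optional WAL base directory as the two new trailing arguments of Factory::new, and both the active and staging consensus paths forward them into their ConnBuilder. A small sketch of how those two values might be derived at startup; the argument names and selection policy below are assumptions for illustration, not part of this diff:

use kaspa_database::prelude::RocksDbPreset;
use std::path::PathBuf;

// Hypothetical startup helper: choose the preset and WAL base from node settings.
// The returned pair corresponds to the rocksdb_preset and wal_dir fields added above.
fn select_db_tuning(is_archival_hdd: bool, wal_dir_arg: Option<String>) -> (RocksDbPreset, Option<PathBuf>) {
    let preset = if is_archival_hdd { RocksDbPreset::Archive } else { RocksDbPreset::Default };
    let wal_dir = wal_dir_arg.map(PathBuf::from); // e.g. a tmpfs mount; None leaves the WAL beside each DB
    (preset, wal_dir)
}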
2 changes: 2 additions & 0 deletions database/src/db.rs
@@ -4,8 +4,10 @@ use std::path::PathBuf;

pub use conn_builder::ConnBuilder;
use kaspa_utils::fd_budget::FDGuard;
pub use rocksdb_preset::RocksDbPreset;

mod conn_builder;
mod rocksdb_preset;

/// The DB type used for Kaspad stores
pub struct DB {
46 changes: 43 additions & 3 deletions database/src/db/conn_builder.rs
@@ -1,3 +1,4 @@
use super::rocksdb_preset::RocksDbPreset;
use crate::db::DB;
use rocksdb::{DBWithThreadMode, MultiThreaded};
use std::{path::PathBuf, sync::Arc};
@@ -13,6 +14,8 @@ pub struct ConnBuilder<Path, const STATS_ENABLED: bool, StatsPeriod, FDLimit> {
files_limit: FDLimit,
mem_budget: usize,
stats_period: StatsPeriod,
preset: RocksDbPreset,
wal_dir: Option<PathBuf>,
}

impl Default for ConnBuilder<Unspecified, false, Unspecified, Unspecified> {
@@ -24,6 +27,8 @@ impl Default for ConnBuilder<Unspecified, false, Unspecified, Unspecified> {
mem_budget: 64 * 1024 * 1024,
stats_period: Unspecified,
files_limit: Unspecified,
preset: RocksDbPreset::Default,
wal_dir: None,
}
}
}
@@ -37,6 +42,8 @@ impl<Path, const STATS_ENABLED: bool, StatsPeriod, FDLimit> ConnBuilder<Path, ST
parallelism: self.parallelism,
mem_budget: self.mem_budget,
stats_period: self.stats_period,
preset: self.preset,
wal_dir: self.wal_dir,
}
}
pub fn with_create_if_missing(self, create_if_missing: bool) -> ConnBuilder<Path, STATS_ENABLED, StatsPeriod, FDLimit> {
@@ -56,8 +63,16 @@ impl<Path, const STATS_ENABLED: bool, StatsPeriod, FDLimit> ConnBuilder<Path, ST
parallelism: self.parallelism,
mem_budget: self.mem_budget,
stats_period: self.stats_period,
preset: self.preset,
wal_dir: self.wal_dir,
}
}
pub fn with_preset(self, preset: RocksDbPreset) -> ConnBuilder<Path, STATS_ENABLED, StatsPeriod, FDLimit> {
ConnBuilder { preset, ..self }
}
pub fn with_wal_dir(self, wal_dir: Option<PathBuf>) -> ConnBuilder<Path, STATS_ENABLED, StatsPeriod, FDLimit> {
ConnBuilder { wal_dir, ..self }
}
}

impl<Path, FDLimit> ConnBuilder<Path, false, Unspecified, FDLimit> {
@@ -69,6 +84,8 @@ impl<Path, FDLimit> ConnBuilder<Path, false, Unspecified, FDLimit> {
files_limit: self.files_limit,
mem_budget: self.mem_budget,
stats_period: self.stats_period,
preset: self.preset,
wal_dir: self.wal_dir,
}
}
}
@@ -82,6 +99,8 @@ impl<Path, StatsPeriod, FDLimit> ConnBuilder<Path, true, StatsPeriod, FDLimit> {
files_limit: self.files_limit,
mem_budget: self.mem_budget,
stats_period: Unspecified,
preset: self.preset,
wal_dir: self.wal_dir,
}
}
pub fn with_stats_period(self, stats_period: impl Into<u32>) -> ConnBuilder<Path, true, u32, FDLimit> {
@@ -92,18 +111,39 @@ impl<Path, StatsPeriod, FDLimit> ConnBuilder<Path, true, StatsPeriod, FDLimit> {
files_limit: self.files_limit,
mem_budget: self.mem_budget,
stats_period: stats_period.into(),
preset: self.preset,
wal_dir: self.wal_dir,
}
}
}

macro_rules! default_opts {
($self: expr) => {{
let mut opts = rocksdb::Options::default();
if $self.parallelism > 1 {
opts.increase_parallelism($self.parallelism as i32);

// Apply the preset configuration (includes parallelism and compaction settings)
$self.preset.apply_to_options(&mut opts, $self.parallelism, $self.mem_budget);

// Configure WAL directory if specified (for RAM cache / tmpfs)
// Auto-generate unique subdirectory from database path to avoid conflicts
if let Some(ref wal_base) = $self.wal_dir {
let db_name = $self
.db_path
.file_name()
.and_then(|n| n.to_str())
.expect(&format!("Invalid database path: {}", $self.db_path.display()));
let wal_subdir = wal_base.join(db_name);

// Create subdirectory if needed (each DB gets its own WAL space)
std::fs::create_dir_all(&wal_subdir)
.unwrap_or_else(|e| panic!("Failed to create WAL subdirectory {}: {}", wal_subdir.display(), e));

opts.set_wal_dir(&wal_subdir);
}

opts.optimize_level_style_compaction($self.mem_budget);
let guard = kaspa_utils::fd_budget::acquire_guard($self.files_limit)?;
opts.set_max_open_files($self.files_limit);
opts.create_if_missing($self.create_if_missing);
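For reference, the two new hooks slot into the existing type-state chain exactly as the factory uses them. A minimal usage sketch, assuming ConnBuilder, DB, and RocksDbPreset are re-exported from kaspa_database::prelude, with placeholder paths and limits:

use kaspa_database::prelude::{ConnBuilder, RocksDbPreset, DB};
use std::{path::PathBuf, sync::Arc};

fn open_archive_db() -> Arc<DB> {
    ConnBuilder::default()
        .with_db_path(PathBuf::from("/data/kaspa/consensus-db"))    // placeholder path
        .with_parallelism(4usize)
        .with_files_limit(256)                                      // placeholder fd budget for this connection
        .with_preset(RocksDbPreset::Archive)                        // HDD-oriented preset added by this PR
        .with_wal_dir(Some(PathBuf::from("/mnt/ramdisk/kaspa-wal"))) // WAL on tmpfs; None keeps the WAL beside the DB
        .build()
        .unwrap()
}

Note that the builder creates a per-database subdirectory under the given WAL base, so several consensus databases can share one tmpfs mount without conflicting.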
223 changes: 223 additions & 0 deletions database/src/db/rocksdb_preset.rs
@@ -0,0 +1,223 @@
//! RocksDB configuration presets for different use cases
//!
//! This module provides pre-configured RocksDB option sets optimized for different
//! deployment scenarios. Based on Issue #681 and community testing.

use rocksdb::Options;
use std::str::FromStr;

/// Available RocksDB configuration presets
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum RocksDbPreset {
/// Default configuration - balanced for general use on SSD/NVMe
/// - 64MB write buffer
/// - Standard compression
/// - Optimized for fast storage
#[default]
Default,

/// Archive configuration - optimized for HDD storage
/// - 256MB write buffer (4x default)
/// - Aggressive compression (LZ4 + ZSTD)
/// - BlobDB enabled for large values
/// - Rate limiting to prevent I/O spikes
/// - Optimized for sequential writes and reduced write amplification
///
/// Based on Callidon's configuration from Issue #681.
/// Recommended for archival nodes on HDD storage.
Archive,
}

impl FromStr for RocksDbPreset {
type Err = String;

fn from_str(s: &str) -> Result<Self, Self::Err> {
match s.to_lowercase().as_str() {
"default" => Ok(Self::Default),
"archive" => Ok(Self::Archive),
_ => Err(format!("Unknown RocksDB preset: '{}'. Valid options: default, archive", s)),
}
}
}

impl std::fmt::Display for RocksDbPreset {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Default => write!(f, "default"),
Self::Archive => write!(f, "archive"),
}
}
}

impl RocksDbPreset {
/// Apply the preset configuration to RocksDB options
///
/// # Arguments
/// * `opts` - RocksDB options to configure
/// * `parallelism` - Number of background threads
/// * `mem_budget` - Memory budget (only used for Default preset, Archive uses fixed 256MB)
pub fn apply_to_options(&self, opts: &mut Options, parallelism: usize, mem_budget: usize) {
match self {
Self::Default => self.apply_default(opts, parallelism, mem_budget),
Self::Archive => self.apply_archive(opts, parallelism),
}
}

/// Apply default preset configuration
fn apply_default(&self, opts: &mut Options, parallelism: usize, mem_budget: usize) {
if parallelism > 1 {
opts.increase_parallelism(parallelism as i32);
}

// Use the provided memory budget (typically 64MB)
opts.optimize_level_style_compaction(mem_budget);
}

/// Apply archive preset configuration (Callidon's HDD-optimized settings)
fn apply_archive(&self, opts: &mut Options, parallelism: usize) {
if parallelism > 1 {
opts.increase_parallelism(parallelism as i32);
}

// Memory and write buffer settings (256MB for better batching on HDD)
let write_buffer_size = 256 * 1024 * 1024; // 256MB

// Optimize for level-style compaction with archive-appropriate memory
// This sets up LSM tree parameters
opts.optimize_level_style_compaction(write_buffer_size);

// Re-set write_buffer_size after optimize_level_style_compaction()
// because optimize_level_style_compaction() internally overrides it to size/4
opts.set_write_buffer_size(write_buffer_size);

// LSM Tree Structure - Optimized for large (4TB+) archives
// 256 MB SST files reduce file count dramatically (500K → 16K files for 4TB)
opts.set_target_file_size_base(256 * 1024 * 1024); // 256 MB SST files
opts.set_target_file_size_multiplier(1); // Same size across all levels
opts.set_max_bytes_for_level_base(1024 * 1024 * 1024); // 1 GB L1 base
opts.set_level_compaction_dynamic_level_bytes(true); // Minimize space amplification

// Compaction settings
// Trigger compaction when L0 has just 1 file (minimize write amplification)
opts.set_level_zero_file_num_compaction_trigger(1);

// Prioritize compacting older/smaller files first
use rocksdb::CompactionPri;
opts.set_compaction_pri(CompactionPri::OldestSmallestSeqFirst);

// Read-ahead for compactions (4MB - good for sequential HDD reads)
opts.set_compaction_readahead_size(4 * 1024 * 1024);

// Compression strategy: LZ4 for all levels, ZSTD for bottommost
use rocksdb::DBCompressionType;

// Set default compression to LZ4 (fast)
opts.set_compression_type(DBCompressionType::Lz4);

// Enable bottommost level compression with maximum ZSTD level
opts.set_bottommost_compression_type(DBCompressionType::Zstd);

// ZSTD compression options for bottommost level
// Larger dictionaries (64 KB) improve compression on large archives
opts.set_compression_options(
-1, // window_bits (let ZSTD choose optimal)
22, // level (maximum compression)
0, // strategy (default)
64 * 1024, // dict_bytes (64 KB dictionary)
);

// Train ZSTD dictionaries on 8 MB of sample data (128x the 64 KB dictionary size)
opts.set_zstd_max_train_bytes(8 * 1024 * 1024);

// Block-based table options for better caching
use rocksdb::{BlockBasedOptions, Cache};
let mut block_opts = BlockBasedOptions::default();

// Partitioned Bloom filters (18 bits per key for better false-positive rate)
block_opts.set_bloom_filter(18.0, false); // 18 bits per key
block_opts.set_partition_filters(true); // Partition for large databases
block_opts.set_format_version(5); // Latest format with optimizations
block_opts.set_index_type(rocksdb::BlockBasedIndexType::TwoLevelIndexSearch);

// Cache index and filter blocks in block cache for faster queries
block_opts.set_cache_index_and_filter_blocks(true);

// Block cache (2GB LRU cache for frequently accessed blocks)
let cache = Cache::new_lru_cache(2 * 1024 * 1024 * 1024); // 2GB
block_opts.set_block_cache(&cache);

// Set block size (256KB - better for sequential HDD reads)
block_opts.set_block_size(256 * 1024);

opts.set_block_based_table_factory(&block_opts);

// Rate limiting: prevent I/O spikes on HDD
// 12 MB/s rate limit for background writes
opts.set_ratelimiter(12 * 1024 * 1024, 100_000, 10);

// Enable BlobDB for large values (reduces write amplification)
opts.set_enable_blob_files(true);
opts.set_min_blob_size(512); // Only values >512 bytes go to blob files
opts.set_blob_file_size(256 * 1024 * 1024); // 256MB blob files
opts.set_blob_compression_type(DBCompressionType::Zstd); // Compress blobs
opts.set_enable_blob_gc(true); // Enable garbage collection
opts.set_blob_gc_age_cutoff(0.9); // GC blobs when 90% old
opts.set_blob_gc_force_threshold(0.1); // Force GC at 10% garbage
opts.set_blob_compaction_readahead_size(8 * 1024 * 1024); // 8 MB blob readahead
}

/// Get a human-readable description of the preset
pub fn description(&self) -> &'static str {
match self {
Self::Default => "Default preset - balanced for SSD/NVMe (64MB write buffer, standard compression)",
Self::Archive => "Archive preset - optimized for HDD (256MB write buffer, BlobDB, aggressive compression, rate limiting)",
}
}

/// Get the recommended use case for this preset
pub fn use_case(&self) -> &'static str {
match self {
Self::Default => "General purpose nodes on SSD/NVMe storage",
Self::Archive => "Archival nodes on HDD storage (--archival flag recommended)",
}
}

/// Get memory requirements for this preset
pub fn memory_requirements(&self) -> &'static str {
match self {
Self::Default => "~4GB minimum, scales with --ram-scale",
Self::Archive => "~8GB minimum (256MB write buffer + 2GB cache + overhead), 16GB+ recommended",
}
}
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_preset_from_str() {
assert_eq!(RocksDbPreset::from_str("default").unwrap(), RocksDbPreset::Default);
assert_eq!(RocksDbPreset::from_str("Default").unwrap(), RocksDbPreset::Default);
assert_eq!(RocksDbPreset::from_str("archive").unwrap(), RocksDbPreset::Archive);
assert_eq!(RocksDbPreset::from_str("ARCHIVE").unwrap(), RocksDbPreset::Archive);
assert!(RocksDbPreset::from_str("unknown").is_err());
}

#[test]
fn test_preset_display() {
assert_eq!(RocksDbPreset::Default.to_string(), "default");
assert_eq!(RocksDbPreset::Archive.to_string(), "archive");
}

#[test]
fn test_apply_presets() {
let mut opts = Options::default();

// Test default preset
RocksDbPreset::Default.apply_to_options(&mut opts, 4, 64 * 1024 * 1024);

// Test archive preset
RocksDbPreset::Archive.apply_to_options(&mut opts, 4, 64 * 1024 * 1024);
}
}
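Since RocksDbPreset implements FromStr, Display, and Default, wiring it to a user-supplied string is straightforward. A hedged sketch of how a caller might parse a configuration value; the helper name and fallback policy are assumptions, not part of this diff:

use kaspa_database::prelude::RocksDbPreset;

// Hypothetical helper: accept a user-supplied preset name and fall back to Default on error.
fn preset_from_arg(raw: &str) -> RocksDbPreset {
    raw.parse::<RocksDbPreset>().unwrap_or_else(|err| {
        eprintln!("{err}; falling back to the default preset");
        RocksDbPreset::Default
    })
}

fn main() {
    let preset = preset_from_arg("archive");
    println!("Selected '{}': {}", preset, preset.description());
}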