Skip to content

Commit a85ca4e

Browse files
authored
Optimize RocksDB prefix searches (#4487)
## Motivation

Optimize RocksDB performance for prefix scans.

## Proposal

Enhance the RocksDB backend with several performance optimizations:

- Add optimized `ReadOptions` for prefix scans with async I/O enabled
- Set precise upper bounds for iterators to minimize key traversal
- Improve iterator validity checking with a more robust loop structure
- Configure bloom filters for prefix iteration optimization
- Increase block size from 4KB to 32KB to reduce iterator seeks
- Set up prefix extraction for bloom filter optimization
- Enable memory-mapped files for faster reads
- Configure memtable bloom filters and other performance settings

## Test Plan

Tested this with the benchmarks and saw a performance improvement: not a step-change improvement, but significant enough to warrant a PR.

## Release Plan

- Nothing to do / These changes follow the usual release cycle.

## Links

- [reviewer checklist](https://github.com/linera-io/linera-protocol/blob/main/CONTRIBUTING.md#reviewer-checklist)
1 parent 9e85d1d commit a85ca4e

File tree

1 file changed

+51
-21
lines changed

1 file changed

+51
-21
lines changed

linera-views/src/backends/rocks_db.rs

Lines changed: 51 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ use std::{
1414
};
1515

1616
use linera_base::ensure;
17-
use rocksdb::{BlockBasedOptions, Cache, DBCompactionStyle};
17+
use rocksdb::{BlockBasedOptions, Cache, DBCompactionStyle, SliceTransform};
1818
use serde::{Deserialize, Serialize};
1919
use sysinfo::{CpuRefreshKind, MemoryRefreshKind, RefreshKind, System};
2020
use tempfile::TempDir;
@@ -166,25 +166,38 @@ impl RocksDbStoreExecutor {
166166
Ok(entries.into_iter().collect::<Result<_, _>>()?)
167167
}
168168

169+
fn get_find_prefix_iterator(&self, prefix: &[u8]) -> rocksdb::DBRawIteratorWithThreadMode<DB> {
170+
// Configure ReadOptions optimized for SSDs and iterator performance
171+
let mut read_opts = rocksdb::ReadOptions::default();
172+
// Enable async I/O for better concurrency
173+
read_opts.set_async_io(true);
174+
175+
// Set precise upper bound to minimize key traversal
176+
let upper_bound = get_upper_bound_option(prefix);
177+
if let Some(upper_bound) = upper_bound {
178+
read_opts.set_iterate_upper_bound(upper_bound);
179+
}
180+
181+
let mut iter = self.db.raw_iterator_opt(read_opts);
182+
iter.seek(prefix);
183+
iter
184+
}
185+
169186
fn find_keys_by_prefix_internal(
170187
&self,
171188
key_prefix: Vec<u8>,
172189
) -> Result<Vec<Vec<u8>>, RocksDbStoreInternalError> {
173190
check_key_size(&key_prefix)?;
191+
174192
let mut prefix = self.start_key.clone();
175193
prefix.extend(key_prefix);
176194
let len = prefix.len();
177-
let mut iter = self.db.raw_iterator();
195+
196+
let mut iter = self.get_find_prefix_iterator(&prefix);
178197
let mut keys = Vec::new();
179-
iter.seek(&prefix);
180-
let mut next_key = iter.key();
181-
while let Some(key) = next_key {
182-
if !key.starts_with(&prefix) {
183-
break;
184-
}
198+
while let Some(key) = iter.key() {
185199
keys.push(key[len..].to_vec());
186200
iter.next();
187-
next_key = iter.key();
188201
}
189202
Ok(keys)
190203
}
@@ -198,20 +211,13 @@ impl RocksDbStoreExecutor {
198211
let mut prefix = self.start_key.clone();
199212
prefix.extend(key_prefix);
200213
let len = prefix.len();
201-
let mut iter = self.db.raw_iterator();
214+
215+
let mut iter = self.get_find_prefix_iterator(&prefix);
202216
let mut key_values = Vec::new();
203-
iter.seek(&prefix);
204-
let mut next_key = iter.key();
205-
while let Some(key) = next_key {
206-
if !key.starts_with(&prefix) {
207-
break;
208-
}
209-
if let Some(value) = iter.value() {
210-
let key_value = (key[len..].to_vec(), value.to_vec());
211-
key_values.push(key_value);
212-
}
217+
while let Some((key, value)) = iter.item() {
218+
let key_value = (key[len..].to_vec(), value.to_vec());
219+
key_values.push(key_value);
213220
iter.next();
214-
next_key = iter.key();
215221
}
216222
Ok(key_values)
217223
}
@@ -373,8 +379,32 @@ impl RocksDbStoreInternal {
373379
total_ram / 4,
374380
HYPER_CLOCK_CACHE_BLOCK_SIZE,
375381
));
382+
383+
// Configure bloom filters for prefix iteration optimization
384+
block_options.set_bloom_filter(10.0, false);
385+
block_options.set_whole_key_filtering(false);
386+
387+
// 32KB blocks instead of default 4KB - reduces iterator seeks
388+
block_options.set_block_size(32 * 1024);
389+
// Use table format version 5, which enables the newer, faster and more accurate Bloom filter implementation
390+
block_options.set_format_version(5);
391+
376392
options.set_block_based_table_factory(&block_options);
377393

394+
// Configure prefix extraction for bloom filter optimization
395+
// Use 8 bytes: ROOT_KEY_DOMAIN (1 byte) + BCS variant (1-2 bytes) + identifier start (4-5 bytes)
396+
let prefix_extractor = SliceTransform::create_fixed_prefix(8);
397+
options.set_prefix_extractor(prefix_extractor);
398+
399+
// 12.5% of memtable size for bloom filter
400+
options.set_memtable_prefix_bloom_ratio(0.125);
401+
// Optimize filters for workloads where lookups mostly hit: skip building bloom filters for the last LSM level
402+
options.set_optimize_filters_for_hits(true);
403+
// Use memory-mapped files for faster reads
404+
options.set_allow_mmap_reads(true);
405+
// Don't use random access pattern since we do prefix scans
406+
options.set_advise_random_on_open(false);
407+
378408
let db = DB::open(&options, path_buf)?;
379409
let executor = RocksDbStoreExecutor {
380410
db: Arc::new(db),

0 commit comments

Comments
 (0)