diff --git a/Cargo.lock b/Cargo.lock index 76c6bf76..abe3061d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,17 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "ahash" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" +dependencies = [ + "cfg-if", + "once_cell", + "version_check", +] + [[package]] name = "aho-corasick" version = "1.0.5" @@ -26,6 +37,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "allocator-api2" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" + [[package]] name = "anstream" version = "0.5.0" @@ -750,6 +767,7 @@ dependencies = [ "indoc", "itertools", "lazy_static", + "lru", "mutants", "nix", "nutmeg", @@ -1188,6 +1206,16 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +[[package]] +name = "hashbrown" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" +dependencies = [ + "ahash", + "allocator-api2", +] + [[package]] name = "heck" version = "0.4.1" @@ -1332,7 +1360,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" dependencies = [ "autocfg", - "hashbrown", + "hashbrown 0.12.3", ] [[package]] @@ -1422,6 +1450,15 @@ version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +[[package]] +name = "lru" +version = "0.11.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a83fb7698b3643a0e34f9ae6f2e8f0178c0fd42f8b59d493aa271ff3a5bf21" +dependencies = [ + "hashbrown 0.14.0", +] + [[package]] name = "matchers" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index b6df963d..61ee96be 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,6 +33,7 @@ globset = "0.4.5" hex = "0.4.2" itertools = "0.10" lazy_static = "1.4.0" +lru = "0.11" mutants = "0.0.3" rayon = "1.3.0" readahead-iterator = "0.1.1" diff --git a/NEWS.md b/NEWS.md index 3b387333..e24790e6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,8 @@ - S3 support! Enable it with `cargo install --features s3`, then e.g. `cargo backup s3://mybucket.example/`. +- Performance: A simple cache of retrieved decompressed blocks now speeds up restores, especially on relatively slow storage like S3. + - `--debug` now shows on stderr only debug messages from Conserve itself and not from dependencies. All the messages are still recorded to the `--log-json` file if that is given. diff --git a/src/blockdir.rs b/src/blockdir.rs index 93d830a9..0c259763 100644 --- a/src/blockdir.rs +++ b/src/blockdir.rs @@ -25,14 +25,16 @@ use std::collections::{HashMap, HashSet}; use std::convert::TryInto; use std::sync::atomic::Ordering::Relaxed; use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; -use std::sync::Arc; +use std::sync::{Arc, RwLock}; use std::time::Instant; use bytes::Bytes; +use lru::LruCache; use rayon::prelude::*; use serde::{Deserialize, Serialize}; #[allow(unused_imports)] use tracing::{debug, error, info, warn}; +use tracing::{instrument, trace}; use crate::backup::BackupStats; use crate::blockhash::BlockHash; @@ -46,6 +48,9 @@ const BLOCKDIR_FILE_NAME_LEN: usize = crate::BLAKE_HASH_SIZE_BYTES * 2; /// Take this many characters from the block hash to form the subdirectory name. const SUBDIR_NAME_CHARS: usize = 3; +/// Cache this many blocks in memory, of up to 1MB each. 
+const CACHE_SIZE: usize = 1000; + /// Points to some compressed data inside the block dir. /// /// Identifiers are: which file contains it, at what (pre-compression) offset, @@ -69,6 +74,8 @@ pub struct Address { pub struct BlockDir { transport: Arc<dyn Transport>, pub stats: BlockDirStats, + // TODO: There are fancier caches and they might help, but this one works, and Stretto did not work for me. + cache: RwLock<LruCache<BlockHash, Bytes>>, } /// Returns the transport-relative subdirectory name. @@ -87,6 +94,7 @@ impl BlockDir { BlockDir { transport, stats: BlockDirStats::default(), + cache: RwLock::new(LruCache::new(CACHE_SIZE.try_into().unwrap())), } } @@ -111,6 +119,10 @@ impl BlockDir { return Ok(hash); } let compressed = Compressor::new().compress(&block_data)?; + self.cache + .write() + .expect("Lock cache") + .put(hash.clone(), block_data); let comp_len: u64 = compressed.len().try_into().unwrap(); let hex_hash = hash.to_string(); let relpath = block_relpath(&hash); @@ -131,6 +143,10 @@ impl BlockDir { /// So, these are specifically treated as missing, so there's a chance to heal /// them later. pub fn contains(&self, hash: &BlockHash) -> Result<bool> { + if self.cache.read().expect("Lock cache").contains(hash) { + self.stats.cache_hit.fetch_add(1, Relaxed); + return Ok(true); + } match self.transport.metadata(&block_relpath(hash)) { Err(err) if err.is_not_found() => Ok(false), Err(err) => { @@ -165,10 +181,13 @@ impl BlockDir { /// Return the entire contents of the block. /// /// Checks that the hash is correct with the contents. + #[instrument(skip(self))] pub fn get_block_content(&self, hash: &BlockHash) -> Result<Bytes> { - // TODO: Reuse decompressor buffer. - // TODO: Most importantly, cache decompressed blocks! - // TODO: Stats for block reads, maybe in the blockdir? 
+ if let Some(hit) = self.cache.write().expect("Lock cache").get(hash) { + self.stats.cache_hit.fetch_add(1, Relaxed); + trace!("Block cache hit"); + return Ok(hit.clone()); + } let mut decompressor = Decompressor::new(); let block_relpath = block_relpath(hash); let compressed_bytes = self.transport.read_file(&block_relpath)?; @@ -178,6 +197,10 @@ impl BlockDir { error!(%hash, %actual_hash, %block_relpath, "Block file has wrong hash"); return Err(Error::BlockCorrupt { hash: hash.clone() }); } + self.cache + .write() + .expect("Lock cache") + .put(hash.clone(), decompressed_bytes.clone()); self.stats.read_blocks.fetch_add(1, Relaxed); self.stats .read_block_compressed_bytes @@ -189,6 +212,7 @@ impl BlockDir { } pub fn delete_block(&self, hash: &BlockHash) -> Result<()> { + self.cache.write().expect("Lock cache").pop(hash); self.transport .remove_file(&block_relpath(hash)) .map_err(Error::from) @@ -290,6 +314,7 @@ pub struct BlockDirStats { pub read_blocks: AtomicUsize, pub read_block_compressed_bytes: AtomicUsize, pub read_block_uncompressed_bytes: AtomicUsize, + pub cache_hit: AtomicUsize, } #[cfg(test)] @@ -309,6 +334,9 @@ mod test { .store_or_deduplicate(Bytes::from("stuff"), &mut stats) .unwrap(); assert!(blockdir.contains(&hash).unwrap()); + + // Open again to get a fresh cache + let blockdir = BlockDir::open(open_local_transport(tempdir.path()).unwrap()); OpenOptions::new() .write(true) .truncate(true) @@ -317,4 +345,27 @@ mod test { .expect("Truncate block"); assert!(!blockdir.contains(&hash).unwrap()); } + + #[test] + fn cache_hit() { + let tempdir = TempDir::new().unwrap(); + let blockdir = BlockDir::open(open_local_transport(tempdir.path()).unwrap()); + let mut stats = BackupStats::default(); + let content = Bytes::from("stuff"); + let hash = blockdir + .store_or_deduplicate(content.clone(), &mut stats) + .unwrap(); + assert_eq!(blockdir.stats.cache_hit.load(Relaxed), 0); + + assert!(blockdir.contains(&hash).unwrap()); + 
assert_eq!(blockdir.stats.cache_hit.load(Relaxed), 1); + + let retrieved = blockdir.get_block_content(&hash).unwrap(); + assert_eq!(content, retrieved); + assert_eq!(blockdir.stats.cache_hit.load(Relaxed), 2); // hit against the value written + + let retrieved = blockdir.get_block_content(&hash).unwrap(); + assert_eq!(content, retrieved); + assert_eq!(blockdir.stats.cache_hit.load(Relaxed), 3); // hit again + } } diff --git a/src/restore.rs b/src/restore.rs index 042687b5..cb9a10fd 100644 --- a/src/restore.rs +++ b/src/restore.rs @@ -16,13 +16,14 @@ use std::fs::File; use std::io; use std::io::Write; use std::path::{Path, PathBuf}; +use std::sync::atomic::Ordering::Relaxed; use std::{fs, time::Instant}; use filetime::set_file_handle_times; #[cfg(unix)] use filetime::set_symlink_file_times; use time::OffsetDateTime; -use tracing::{error, instrument, warn}; +use tracing::{error, instrument, trace, warn}; use crate::band::BandSelectionPolicy; use crate::io::{directory_is_empty, ensure_dir_exists}; @@ -148,6 +149,7 @@ pub fn restore( } stats += apply_deferrals(&deferrals)?; stats.elapsed = start.elapsed(); + stats.block_cache_hits = block_dir.stats.cache_hit.load(Relaxed); // TODO: Merge in stats from the tree iter and maybe the source tree? Ok(stats) } @@ -244,6 +246,7 @@ fn restore_file( stats.errors += 1; } // TODO: Accumulate more stats. 
+ trace!("Restored file"); Ok(stats) } diff --git a/src/stats.rs b/src/stats.rs index 8ac5fdca..b044382c 100644 --- a/src/stats.rs +++ b/src/stats.rs @@ -110,6 +110,8 @@ pub struct RestoreStats { pub uncompressed_file_bytes: u64, pub elapsed: Duration, + + pub block_cache_hits: usize, } impl fmt::Display for RestoreStats { @@ -122,6 +124,9 @@ impl fmt::Display for RestoreStats { write_count(w, "unsupported file kind", self.unknown_kind); writeln!(w).unwrap(); + write_count(w, "block cache hits", self.block_cache_hits); + writeln!(w).unwrap(); + write_count(w, "errors", self.errors); write_duration(w, "elapsed", self.elapsed)?; diff --git a/tests/damage/main.rs b/tests/damage/main.rs index 508f0cad..de249770 100644 --- a/tests/damage/main.rs +++ b/tests/damage/main.rs @@ -81,8 +81,14 @@ fn backup_after_damage( let backup_options = BackupOptions::default(); backup(&archive, source_dir.path(), &backup_options).expect("initial backup"); + drop(archive); action.damage(&location.to_path(&archive_dir)); + // Open the archive again to avoid cache effects. + let archive = + Archive::open(conserve::transport::open_local_transport(archive_dir.path()).unwrap()) + .expect("open archive"); + // A second backup should succeed. changes.apply(&source_dir); let backup_stats = backup(&archive, source_dir.path(), &backup_options)