Skip to content

Commit 75d7e27

Browse files
authored
Merge pull request #5137 from jbencin/fix/improve-deadlock-handling
fix: Improve DB deadlock detection/logging
2 parents 1f44a69 + 292cd89 commit 75d7e27

File tree

4 files changed

+101
-53
lines changed

4 files changed

+101
-53
lines changed

clarity/src/vm/database/sqlite.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ use rusqlite::{
2121
};
2222
use stacks_common::types::chainstate::{BlockHeaderHash, StacksBlockId};
2323
use stacks_common::types::sqlite::NO_PARAMS;
24-
use stacks_common::util::db_common::tx_busy_handler;
24+
use stacks_common::util::db::tx_busy_handler;
2525
use stacks_common::util::hash::Sha512Trunc256Sum;
2626

2727
use super::clarity_store::{make_contract_hash_key, ContractCommitment};

stacks-common/src/util/db.rs

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
// Copyright (C) 2013-2020 Blockstack PBC, a public benefit corporation
2+
// Copyright (C) 2020 Stacks Open Internet Foundation
3+
//
4+
// This program is free software: you can redistribute it and/or modify
5+
// it under the terms of the GNU General Public License as published by
6+
// the Free Software Foundation, either version 3 of the License, or
7+
// (at your option) any later version.
8+
//
9+
// This program is distributed in the hope that it will be useful,
10+
// but WITHOUT ANY WARRANTY; without even the implied warranty of
11+
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12+
// GNU General Public License for more details.
13+
//
14+
// You should have received a copy of the GNU General Public License
15+
// along with this program. If not, see <http://www.gnu.org/licenses/>.
16+
17+
use std::backtrace::Backtrace;
18+
use std::sync::{LazyLock, Mutex};
19+
use std::thread;
20+
use std::time::{Duration, Instant};
21+
22+
use hashbrown::HashMap;
23+
use rand::{thread_rng, Rng};
24+
use rusqlite::Connection;
25+
26+
use crate::util::sleep_ms;
27+
28+
/// Keep track of DB locks, for deadlock debugging
29+
/// - **key:** `rusqlite::Connection` debug print
30+
/// - **value:** Lock holder (thread name + timestamp)
31+
///
32+
/// This uses a `Mutex` inside of `LazyLock` because:
33+
/// - Using `Mutex` alone, it can't be statically initialized because `HashMap::new()` isn't `const`
34+
/// - Using `LazyLock` alone doesn't allow interior mutability
35+
static LOCK_TABLE: LazyLock<Mutex<HashMap<String, String>>> =
36+
LazyLock::new(|| Mutex::new(HashMap::new()));
37+
/// Generate timestanps for use in `LOCK_TABLE`
38+
/// `Instant` is preferable to `SystemTime` because it uses `CLOCK_MONOTONIC` and is not affected by NTP adjustments
39+
static LOCK_TABLE_TIMER: LazyLock<Instant> = LazyLock::new(Instant::now);
40+
41+
/// Call when using an operation which locks a database
42+
/// Updates `LOCK_TABLE`
43+
pub fn update_lock_table(conn: &Connection) {
44+
let timestamp = LOCK_TABLE_TIMER.elapsed().as_millis();
45+
// The debug format for `Connection` includes the path
46+
let k = format!("{conn:?}");
47+
let v = format!("{:?}@{timestamp}", thread::current().name());
48+
LOCK_TABLE.lock().unwrap().insert(k, v);
49+
}
50+
51+
/// Called by `rusqlite` if we are waiting too long on a database lock
52+
/// If called too many times, will assume a deadlock and panic
53+
pub fn tx_busy_handler(run_count: i32) -> bool {
54+
const TIMEOUT: Duration = Duration::from_secs(300);
55+
const AVG_SLEEP_TIME_MS: u64 = 100;
56+
57+
// First, check if this is taking unreasonably long. If so, it's probably a deadlock
58+
let run_count = run_count.unsigned_abs();
59+
let approx_time_elapsed =
60+
Duration::from_millis(AVG_SLEEP_TIME_MS.saturating_mul(u64::from(run_count)));
61+
if approx_time_elapsed > TIMEOUT {
62+
error!("Deadlock detected. Waited {} seconds (estimated) for database lock. Giving up", approx_time_elapsed.as_secs();
63+
"run_count" => run_count,
64+
"backtrace" => ?Backtrace::capture()
65+
);
66+
for (k, v) in LOCK_TABLE.lock().unwrap().iter() {
67+
error!("Database '{k}' last locked by {v}");
68+
}
69+
panic!("Deadlock in thread {:?}", thread::current().name());
70+
}
71+
72+
let mut sleep_time_ms = 2u64.saturating_pow(run_count);
73+
74+
sleep_time_ms = sleep_time_ms.saturating_add(thread_rng().gen_range(0..sleep_time_ms));
75+
76+
if sleep_time_ms > AVG_SLEEP_TIME_MS {
77+
let jitter = 10;
78+
sleep_time_ms =
79+
thread_rng().gen_range((AVG_SLEEP_TIME_MS - jitter)..(AVG_SLEEP_TIME_MS + jitter));
80+
}
81+
82+
let msg = format!("Database is locked; sleeping {sleep_time_ms}ms and trying again");
83+
if run_count > 10 && run_count % 10 == 0 {
84+
warn!("{msg}";
85+
"run_count" => run_count,
86+
"backtrace" => ?Backtrace::capture()
87+
);
88+
} else {
89+
debug!("{msg}");
90+
}
91+
92+
sleep_ms(sleep_time_ms);
93+
true
94+
}

stacks-common/src/util/mod.rs

Lines changed: 1 addition & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ pub mod log;
1919
#[macro_use]
2020
pub mod macros;
2121
pub mod chunked_encoding;
22+
pub mod db;
2223
pub mod hash;
2324
pub mod pair;
2425
pub mod pipe;
@@ -85,32 +86,6 @@ impl error::Error for HexError {
8586
}
8687
}
8788

88-
pub mod db_common {
89-
use std::{thread, time};
90-
91-
use rand::{thread_rng, Rng};
92-
93-
pub fn tx_busy_handler(run_count: i32) -> bool {
94-
let mut sleep_count = 10;
95-
if run_count > 0 {
96-
sleep_count = 2u64.saturating_pow(run_count as u32);
97-
}
98-
sleep_count = sleep_count.saturating_add(thread_rng().gen::<u64>() % sleep_count);
99-
100-
if sleep_count > 5000 {
101-
sleep_count = 5000;
102-
}
103-
104-
debug!(
105-
"Database is locked; sleeping {}ms and trying again",
106-
&sleep_count
107-
);
108-
109-
thread::sleep(time::Duration::from_millis(sleep_count));
110-
true
111-
}
112-
}
113-
11489
/// Write any `serde_json` object directly to a file
11590
pub fn serialize_json_to_file<J, P>(json: &J, path: P) -> Result<(), std::io::Error>
11691
where

stackslib/src/util_lib/db.rs

Lines changed: 5 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ use std::backtrace::Backtrace;
1818
use std::io::Error as IOError;
1919
use std::ops::{Deref, DerefMut};
2020
use std::path::{Path, PathBuf};
21-
use std::time::Duration;
21+
use std::time::{Duration, SystemTime};
2222
use std::{error, fmt, fs, io};
2323

2424
use clarity::vm::types::QualifiedContractIdentifier;
@@ -32,6 +32,7 @@ use serde_json::Error as serde_error;
3232
use stacks_common::types::chainstate::{SortitionId, StacksAddress, StacksBlockId, TrieHash};
3333
use stacks_common::types::sqlite::NO_PARAMS;
3434
use stacks_common::types::Address;
35+
use stacks_common::util::db::update_lock_table;
3536
use stacks_common::util::hash::to_hex;
3637
use stacks_common::util::secp256k1::{Secp256k1PrivateKey, Secp256k1PublicKey};
3738
use stacks_common::util::sleep_ms;
@@ -655,32 +656,9 @@ impl<'a, C: Clone, T: MarfTrieId> DerefMut for IndexDBTx<'a, C, T> {
655656
}
656657
}
657658

659+
/// Called by `rusqlite` if we are waiting too long on a database lock
658660
pub fn tx_busy_handler(run_count: i32) -> bool {
659-
let mut sleep_count = 2;
660-
if run_count > 0 {
661-
sleep_count = 2u64.saturating_pow(run_count as u32);
662-
}
663-
sleep_count = sleep_count.saturating_add(thread_rng().gen::<u64>() % sleep_count);
664-
665-
if sleep_count > 100 {
666-
let jitter = thread_rng().gen::<u64>() % 20;
667-
sleep_count = 100 - jitter;
668-
}
669-
670-
debug!(
671-
"Database is locked; sleeping {}ms and trying again",
672-
&sleep_count;
673-
"backtrace" => ?{
674-
if run_count > 10 && run_count % 10 == 0 {
675-
Some(Backtrace::capture())
676-
} else {
677-
None
678-
}
679-
},
680-
);
681-
682-
sleep_ms(sleep_count);
683-
true
661+
stacks_common::util::db::tx_busy_handler(run_count)
684662
}
685663

686664
/// Begin an immediate-mode transaction, and handle busy errors with exponential backoff.
@@ -697,6 +675,7 @@ pub fn tx_begin_immediate<'a>(conn: &'a mut Connection) -> Result<DBTx<'a>, Erro
697675
pub fn tx_begin_immediate_sqlite<'a>(conn: &'a mut Connection) -> Result<DBTx<'a>, sqlite_error> {
698676
conn.busy_handler(Some(tx_busy_handler))?;
699677
let tx = Transaction::new(conn, TransactionBehavior::Immediate)?;
678+
update_lock_table(tx.deref());
700679
Ok(tx)
701680
}
702681

0 commit comments

Comments
 (0)