Skip to content

Commit 176c021

Browse files
sdaftuar authored and sipa committed
[qa] Test non-atomic chainstate writes
Adds new functional test, dbcrash.py, which uses -dbcrashratio to exercise the logic for recovering from a crash during chainstate flush. dbcrash.py is added to the extended tests, as it may take ~10 minutes to run Use _Exit() instead of exit() for crash simulation This eliminates stderr output such as: terminate called without an active exception or Assertion failed: (!pthread_mutex_destroy(&m)), function ~recursive_mutex, file /usr/local/include/boost/thread/pthread/recursive_mutex.hpp, line 104. Eliminating the stderr output on crash simulation allows testing with test_runner.py, which reports a test as failed if stderr is produced.
1 parent d6af06d commit 176c021

File tree

4 files changed

+272
-1
lines changed

4 files changed

+272
-1
lines changed

src/txdb.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ bool CCoinsViewDB::BatchWrite(CCoinsMap &mapCoins, const uint256 &hashBlock) {
124124
static FastRandomContext rng;
125125
if (rng.randrange(crash_simulate) == 0) {
126126
LogPrintf("Simulating a crash. Goodbye.\n");
127-
exit(0);
127+
_Exit(0);
128128
}
129129
}
130130
}

test/functional/dbcrash.py

Lines changed: 268 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,268 @@
1+
#!/usr/bin/env python3
2+
# Copyright (c) 2017 The Bitcoin Core developers
3+
# Distributed under the MIT software license, see the accompanying
4+
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
5+
"""Test recovery from a crash during chainstate writing."""
6+
7+
from test_framework.test_framework import BitcoinTestFramework
8+
from test_framework.util import *
9+
from test_framework.script import *
10+
from test_framework.mininode import *
11+
import random
12+
try:
13+
import http.client as httplib
14+
except ImportError:
15+
import httplib
16+
import errno
17+
18+
'''
19+
Test structure:
20+
21+
- 4 nodes
22+
* node0, node1, and node2 will have different dbcrash ratios, and different
23+
dbcache sizes
24+
* node3 will be a regular node, with no crashing.
25+
* The nodes will not connect to each other.
26+
27+
- use default test framework starting chain. initialize starting_tip_height to
28+
tip height.
29+
30+
- Main loop:
31+
* generate lots of transactions on node3, enough to fill up a block.
32+
* uniformly randomly pick a tip height from starting_tip_height to
33+
tip_height; with probability 1/(height_difference+4), invalidate this block.
34+
* mine enough blocks to overtake tip_height at start of loop.
35+
* for each node in [node0,node1,node2]:
36+
- for each mined block:
37+
* submit block to node
38+
* if node crashed on/after submitting:
39+
- restart until recovery succeeds
40+
- check that utxo matches node3 using gettxoutsetinfo
41+
'''
42+
43+
class ChainstateWriteCrashTest(BitcoinTestFramework):
    """Exercise crash-recovery of the chainstate via -dbcrashratio.

    Nodes 0-2 run with different -dbcrashratio/-dbcache settings so they
    randomly _Exit() during chainstate flushes; node3 is a normal node used
    as the source of truth. Blocks are delivered with submitblock (the nodes
    are never connected), and after every crash/restart the recovering
    node's gettxoutsetinfo hash is compared against node3's.
    """

    def __init__(self):
        super().__init__()
        self.num_nodes = 4
        self.setup_clean_chain = False

        # Set -maxmempool=0 to turn off mempool memory sharing with dbcache
        # Set -rpcservertimeout=900 to reduce socket disconnects in this
        # long-running test
        self.base_args = ["-limitdescendantsize=0", "-maxmempool=0", "-rpcservertimeout=900"]

        # Set different crash ratios and cache sizes. Note that not all of
        # -dbcache goes to pcoinsTip.
        self.node0_args = ["-dbcrashratio=8", "-dbcache=4", "-dbbatchsize=200000"] + self.base_args
        self.node1_args = ["-dbcrashratio=16", "-dbcache=8", "-dbbatchsize=200000"] + self.base_args
        self.node2_args = ["-dbcrashratio=24", "-dbcache=16", "-dbbatchsize=200000"] + self.base_args

        # Node3 is a normal node with default args, except will mine full blocks
        self.node3_args = ["-blockmaxweight=4000000"]
        self.extra_args = [self.node0_args, self.node1_args, self.node2_args, self.node3_args]

        # We'll track some test coverage statistics
        self.restart_counts = [0, 0, 0]  # Track the restarts for nodes 0-2
        self.crashed_on_restart = 0  # Track count of crashes during recovery

    def setup_network(self):
        self.setup_nodes()
        # Leave them unconnected, we'll use submitblock directly in this test

    def restart_node(self, node_index, expected_tip):
        """Start up a given node, wait for the tip to reach expected_tip,
        and return the node's utxo hash.

        Exceptions on startup should indicate node crash (due to
        -dbcrashratio), in which case we try again. Give up after 60
        seconds; raise AssertionError if the node never comes back.
        """
        time_start = time.time()
        while time.time() - time_start < 60:
            try:
                # Any of these RPC calls could throw due to node crash
                self.nodes[node_index] = self.start_node(node_index, self.options.tmpdir, self.extra_args[node_index])
                self.nodes[node_index].waitforblock(expected_tip)
                utxo_hash = self.nodes[node_index].gettxoutsetinfo()['hash_serialized_2']
                return utxo_hash
            except Exception:
                # Fixed: was a bare `except:`, which would also swallow
                # KeyboardInterrupt/SystemExit.
                # An exception here should mean the node is about to crash.
                # If bitcoind exits, then try again. wait_for_node_exit()
                # should raise an exception if bitcoind doesn't exit.
                wait_for_node_exit(node_index, timeout=10)
                self.crashed_on_restart += 1
                time.sleep(1)

        # If we got here, bitcoind isn't coming back up on restart. Could be a
        # bug in bitcoind, or we've gotten unlucky with our dbcrash ratio --
        # perhaps we generated a test case that blew up our cache?
        # TODO: If this happens a lot, we should try to restart without -dbcrashratio
        # and make sure that recovery happens.
        # Fixed: the message is now actually %-formatted; previously
        # node_index was passed as a stray second AssertionError argument.
        raise AssertionError("Unable to successfully restart node %d in allotted time" % node_index)

    def submit_block_catch_error(self, node_index, block):
        """Try submitting a block to the given node.

        Return True if the block was submitted successfully; False if an
        exception indicates the node has crashed. Unexpected OSErrors are
        re-raised.
        """
        try:
            self.nodes[node_index].submitblock(block)
            return True
        except (httplib.CannotSendRequest, httplib.RemoteDisconnected) as e:
            self.log.debug("node %d submitblock raised exception: %s", node_index, e)
            return False
        except OSError as e:
            self.log.debug("node %d submitblock raised OSError exception: errno=%s", node_index, e.errno)
            if e.errno in [errno.EPIPE, errno.ECONNREFUSED, errno.ECONNRESET]:
                # The node has likely crashed
                return False
            else:
                # Unexpected exception, raise
                raise

    def sync_node3blocks(self, block_hashes):
        """Use submitblock to sync node3's chain with the other nodes.

        If submitblock fails, restart the node and check the new utxo hash
        against node3's.
        """
        # If any nodes crash while updating, we'll compare utxo hashes to
        # ensure recovery was successful.
        node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized_2']

        # Retrieve all the blocks from node3
        blocks = []
        for block_hash in block_hashes:
            blocks.append([block_hash, self.nodes[3].getblock(block_hash, 0)])

        # Deliver each block to each other node
        for i in range(3):
            nodei_utxo_hash = None
            self.log.debug("Syncing blocks to node %d", i)
            for (block_hash, block) in blocks:
                # Get the block from node3, and submit to node_i
                self.log.debug("submitting block %s", block_hash)
                if not self.submit_block_catch_error(i, block):
                    # TODO: more carefully check that the crash is due to -dbcrashratio
                    # (change the exit code perhaps, and check that here?)
                    wait_for_node_exit(i, timeout=30)
                    self.log.debug("Restarting node %d after block hash %s", i, block_hash)
                    nodei_utxo_hash = self.restart_node(i, block_hash)
                    assert nodei_utxo_hash is not None
                    self.restart_counts[i] += 1
                else:
                    # Clear it out after successful submitblock calls -- the cached
                    # utxo hash will no longer be correct
                    nodei_utxo_hash = None

            # Check that the utxo hash matches node3's utxo set
            # NOTE: we only check the utxo set if we had to restart the node
            # after the last block submitted:
            # - checking the utxo hash causes a cache flush, which we don't
            # want to do every time; so
            # - we only update the utxo cache after a node restart, since flushing
            # the cache is a no-op at that point
            if nodei_utxo_hash is not None:
                self.log.debug("Checking txoutsetinfo matches for node %d", i)
                assert_equal(nodei_utxo_hash, node3_utxo_hash)

    def verify_utxo_hash(self):
        """Verify that the utxo hash of each node matches node3's.

        Restart any nodes that crash while querying.
        """
        node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized_2']
        self.log.info("Verifying utxo hash matches for all nodes")

        for i in range(3):
            try:
                nodei_utxo_hash = self.nodes[i].gettxoutsetinfo()['hash_serialized_2']
            except OSError:
                # probably a crash on db flushing
                nodei_utxo_hash = self.restart_node(i, self.nodes[3].getbestblockhash())
            assert_equal(nodei_utxo_hash, node3_utxo_hash)

    def generate_small_transactions(self, node, count, utxo_list):
        """Create up to `count` 2-input/3-output transactions on `node`,
        consuming utxos from `utxo_list` (which is shuffled and mutated)."""
        FEE = 1000  # TODO: replace this with node relay fee based calculation
        num_transactions = 0
        random.shuffle(utxo_list)
        while len(utxo_list) >= 2 and num_transactions < count:
            tx = CTransaction()
            input_amount = 0
            for i in range(2):
                utxo = utxo_list.pop()
                tx.vin.append(CTxIn(COutPoint(int(utxo['txid'], 16), utxo['vout'])))
                input_amount += int(utxo['amount'] * COIN)
            output_amount = (input_amount - FEE) // 3

            if output_amount <= 0:
                # Sanity check -- if we chose inputs that are too small, skip
                continue

            for i in range(3):
                tx.vout.append(CTxOut(output_amount, hex_str_to_bytes(utxo['scriptPubKey'])))

            # Sign and send the transaction to get into the mempool
            tx_signed_hex = node.signrawtransaction(ToHex(tx))['hex']
            node.sendrawtransaction(tx_signed_hex)
            num_transactions += 1

    def run_test(self):
        # Start by creating a lot of utxos on node3
        initial_height = self.nodes[3].getblockcount()
        utxo_list = create_confirmed_utxos(self.nodes[3].getnetworkinfo()['relayfee'], self.nodes[3], 5000)
        self.log.info("Prepped %d utxo entries", len(utxo_list))

        # Sync these blocks with the other nodes
        block_hashes_to_sync = []
        for height in range(initial_height + 1, self.nodes[3].getblockcount() + 1):
            block_hashes_to_sync.append(self.nodes[3].getblockhash(height))

        self.log.debug("Syncing %d blocks with other nodes", len(block_hashes_to_sync))
        # Syncing the blocks could cause nodes to crash, so the test begins here.
        self.sync_node3blocks(block_hashes_to_sync)

        starting_tip_height = self.nodes[3].getblockcount()

        # Main test loop:
        # each time through the loop, generate a bunch of transactions,
        # and then either mine a single new block on the tip, or some-sized reorg.
        for i in range(40):
            self.log.info("Iteration %d, generating 2500 transactions %s", i, self.restart_counts)
            # Generate a bunch of small-ish transactions
            self.generate_small_transactions(self.nodes[3], 2500, utxo_list)
            # Pick a random block between current tip, and starting tip
            current_height = self.nodes[3].getblockcount()
            random_height = random.randint(starting_tip_height, current_height)
            self.log.debug("At height %d, considering height %d", current_height, random_height)
            if random_height > starting_tip_height:
                # Randomly reorg from this point with some probability (1/4 for
                # tip, 1/5 for tip-1, ...)
                if random.random() < 1.0 / (current_height + 4 - random_height):
                    self.log.debug("Invalidating block at height %d", random_height)
                    self.nodes[3].invalidateblock(self.nodes[3].getblockhash(random_height))

            # Now generate new blocks until we pass the old tip height
            self.log.debug("Mining longer tip")
            block_hashes = self.nodes[3].generate(current_height + 1 - self.nodes[3].getblockcount())
            self.log.debug("Syncing %d new blocks...", len(block_hashes))
            self.sync_node3blocks(block_hashes)
            utxo_list = self.nodes[3].listunspent()
            self.log.debug("Node3 utxo count: %d", len(utxo_list))

        # Check that the utxo hashes agree with node3
        # Useful side effect: each utxo cache gets flushed here, so that we
        # won't get crashes on shutdown at the end of the test.
        self.verify_utxo_hash()

        # Check the test coverage
        self.log.info("Restarted nodes: %s; crashes on restart: %d", self.restart_counts, self.crashed_on_restart)

        # If no nodes were restarted, we didn't test anything.
        assert self.restart_counts != [0, 0, 0]

        # Make sure we tested the case of crash-during-recovery.
        assert self.crashed_on_restart > 0

        # Warn if any of the nodes escaped restart.
        for i in range(3):
            if self.restart_counts[i] == 0:
                # Fixed: log.warn is the deprecated alias of log.warning
                self.log.warning("Node %d never crashed during utxo flush!", i)
267+
# Run the test directly when invoked as a script.
if __name__ == "__main__":
    ChainstateWriteCrashTest().main()

test/functional/test_framework/util.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,8 @@ def wait_for_bitcoind_start(process, datadir, i, rpchost=None):
249249
raise
250250
time.sleep(0.25)
251251

252+
def wait_for_node_exit(node_index, timeout):
    """Block until the bitcoind process for node_index exits.

    Delegates to Popen.wait(), which raises if the process is still
    running after `timeout` seconds.
    """
    bitcoind_processes[node_index].wait(timeout)
252254

253255
def _start_node(i, dirname, extra_args=None, rpchost=None, timewait=None, binary=None, stderr=None):
254256
"""Start a bitcoind and return RPC connection to it

test/functional/test_runner.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@
124124
# vv Tests less than 5m vv
125125
'maxuploadtarget.py',
126126
'mempool_packages.py',
127+
'dbcrash.py',
127128
# vv Tests less than 2m vv
128129
'bip68-sequence.py',
129130
'getblocktemplate_longpoll.py',

0 commit comments

Comments
 (0)