#!/usr/bin/env python3
# Copyright (c) 2017 The Bitcoin Core developers
# Distributed under the MIT software license, see the accompanying
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
"""Test recovery from a crash during chainstate writing."""

from test_framework.test_framework import BitcoinTestFramework
from test_framework.util import *
from test_framework.script import *
from test_framework.mininode import *
import random
import time
try:
    import http.client as httplib
except ImportError:
    import httplib
import errno

'''
Test structure:

- 4 nodes
  * node0, node1, and node2 will have different dbcrash ratios, and different
    dbcache sizes
  * node3 will be a regular node, with no crashing.
  * The nodes will not connect to each other.

- Use the default test framework starting chain; initialize starting_tip_height
  to the tip height.

- Main loop:
  * generate lots of transactions on node3, enough to fill up a block.
  * uniformly randomly pick a tip height from starting_tip_height to
    tip_height; with probability 1/(height_difference+4), invalidate this
    block (worked probabilities are sketched in the comment below).
  * mine enough blocks to overtake tip_height at start of loop.
  * for each node in [node0,node1,node2]:
     - for each mined block:
        * submit block to node
        * if node crashed on/after submitting:
           - restart until recovery succeeds
           - check that utxo matches node3 using gettxoutsetinfo
'''
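
# For reference, the invalidation probability above works out to 1/(k+4) for
# a block k behind the tip: 25% at the tip, 20% one block back, and roughly
# 7% ten blocks back -- so shallow reorgs are common and deep ones stay rare.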

class ChainstateWriteCrashTest(BitcoinTestFramework):

    def __init__(self):
        super().__init__()
        self.num_nodes = 4
        self.setup_clean_chain = False

        # Set -maxmempool=0 to turn off mempool memory sharing with dbcache
        # Set -rpcservertimeout=900 to reduce socket disconnects in this
        # long-running test
        self.base_args = ["-limitdescendantsize=0", "-maxmempool=0", "-rpcservertimeout=900"]

        # Set different crash ratios and cache sizes. Note that not all of
        # -dbcache goes to pcoinsTip.
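        # (-dbcrashratio=<n> is expected to make bitcoind simulate a crash
        # with probability roughly 1/n on each chainstate batch write, so
        # node0 should crash the most often and node2 the least often.)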
        self.node0_args = ["-dbcrashratio=8", "-dbcache=4", "-dbbatchsize=200000"] + self.base_args
        self.node1_args = ["-dbcrashratio=16", "-dbcache=8", "-dbbatchsize=200000"] + self.base_args
        self.node2_args = ["-dbcrashratio=24", "-dbcache=16", "-dbbatchsize=200000"] + self.base_args

        # Node3 is a normal node with default args, except it will mine full blocks
        self.node3_args = ["-blockmaxweight=4000000"]
        self.extra_args = [self.node0_args, self.node1_args, self.node2_args, self.node3_args]

        # We'll track some test coverage statistics
        self.restart_counts = [0, 0, 0]  # Track the restarts for nodes 0-2
        self.crashed_on_restart = 0  # Track count of crashes during recovery

    def setup_network(self):
        self.setup_nodes()
        # Leave them unconnected; we'll use submitblock directly in this test

    # Starts up a given node id, waits for the tip to advance to the given
    # block hash, and calculates the utxo hash. Exceptions on startup should
    # indicate a node crash (due to -dbcrashratio), in which case we try again.
    # Gives up after 60 seconds.
    # Returns the utxo hash of the given node.
    def restart_node(self, node_index, expected_tip):
        time_start = time.time()
        while time.time() - time_start < 60:
            try:
                # Any of these RPC calls could throw due to node crash
                self.nodes[node_index] = self.start_node(node_index, self.options.tmpdir, self.extra_args[node_index])
                self.nodes[node_index].waitforblock(expected_tip)
                utxo_hash = self.nodes[node_index].gettxoutsetinfo()['hash_serialized_2']
                return utxo_hash
            except Exception:
                # An exception here should mean the node is about to crash.
                # If bitcoind exits, then try again. wait_for_node_exit()
                # should raise an exception if bitcoind doesn't exit.
                wait_for_node_exit(node_index, timeout=10)
                self.crashed_on_restart += 1
                time.sleep(1)

        # If we got here, bitcoind isn't coming back up on restart. Could be a
        # bug in bitcoind, or we've gotten unlucky with our dbcrash ratio --
        # perhaps we generated a test case that blew up our cache?
        # TODO: If this happens a lot, we should try to restart without -dbcrashratio
        # and make sure that recovery happens.
        raise AssertionError("Unable to successfully restart node %d in allotted time" % node_index)

    # Try submitting a block to the given node.
    # Catch any exceptions that indicate the node has crashed.
    # Returns True if the block was submitted successfully; False otherwise.
    def submit_block_catch_error(self, node_index, block):
        try:
            self.nodes[node_index].submitblock(block)
            return True
        except (httplib.CannotSendRequest, httplib.RemoteDisconnected) as e:
            self.log.debug("node %d submitblock raised exception: %s", node_index, e)
            return False
        except OSError as e:
            self.log.debug("node %d submitblock raised OSError exception: errno=%s", node_index, e.errno)
            if e.errno in [errno.EPIPE, errno.ECONNREFUSED, errno.ECONNRESET]:
                # The node has likely crashed
                return False
            else:
                # Unexpected exception, raise
                raise

    # Use submitblock to sync node3's chain with the other nodes.
    # If submitblock fails, restart the node and get the new utxo hash.
    def sync_node3blocks(self, block_hashes):
        # If any nodes crash while updating, we'll compare utxo hashes to
        # ensure recovery was successful.
        node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized_2']

        # Retrieve all the blocks from node3
        blocks = []
        for block_hash in block_hashes:
            blocks.append([block_hash, self.nodes[3].getblock(block_hash, 0)])

        # Deliver each block to each other node
        for i in range(3):
            nodei_utxo_hash = None
            self.log.debug("Syncing blocks to node %d", i)
            for (block_hash, block) in blocks:
                # Get the block from node3, and submit to node_i
                self.log.debug("submitting block %s", block_hash)
                if not self.submit_block_catch_error(i, block):
                    # TODO: more carefully check that the crash is due to -dbcrashratio
                    # (change the exit code perhaps, and check that here?)
                    wait_for_node_exit(i, timeout=30)
                    self.log.debug("Restarting node %d after block hash %s", i, block_hash)
                    nodei_utxo_hash = self.restart_node(i, block_hash)
                    assert nodei_utxo_hash is not None
                    self.restart_counts[i] += 1
                else:
                    # Clear it out after successful submitblock calls -- the cached
                    # utxo hash will no longer be correct
                    nodei_utxo_hash = None

            # Check that the utxo hash matches node3's utxo set
            # NOTE: we only check the utxo set if we had to restart the node
            # after the last block submitted:
            # - checking the utxo hash causes a cache flush, which we don't
            #   want to do every time; so
            # - we only recompute the utxo hash after a node restart, when the
            #   cache is empty and the flush is a no-op.
            if nodei_utxo_hash is not None:
                self.log.debug("Checking txoutsetinfo matches for node %d", i)
                assert_equal(nodei_utxo_hash, node3_utxo_hash)

    # Verify that the utxo hash of each node matches node3.
    # Restart any nodes that crash while querying.
    def verify_utxo_hash(self):
        node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized_2']
        self.log.info("Verifying utxo hash matches for all nodes")

        for i in range(3):
            try:
                nodei_utxo_hash = self.nodes[i].gettxoutsetinfo()['hash_serialized_2']
            except OSError:
                # probably a crash on db flushing
                nodei_utxo_hash = self.restart_node(i, self.nodes[3].getbestblockhash())
            assert_equal(nodei_utxo_hash, node3_utxo_hash)

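    # Generate up to 'count' small transactions on the given node, each
    # spending two utxos from utxo_list and creating three equal-value
    # outputs back to one of the spent scriptPubKeys.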
    def generate_small_transactions(self, node, count, utxo_list):
        FEE = 1000  # TODO: replace this with node relay fee based calculation
        num_transactions = 0
        random.shuffle(utxo_list)
        while len(utxo_list) >= 2 and num_transactions < count:
            tx = CTransaction()
            input_amount = 0
            for i in range(2):
                utxo = utxo_list.pop()
                tx.vin.append(CTxIn(COutPoint(int(utxo['txid'], 16), utxo['vout'])))
                input_amount += int(utxo['amount'] * COIN)
            output_amount = (input_amount - FEE) // 3
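            # e.g. two 0.01 BTC inputs: (2000000 - 1000) // 3 = 666333
            # satoshis per output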

            if output_amount <= 0:
                # Sanity check -- if we chose inputs that are too small, skip
                continue

            for i in range(3):
                tx.vout.append(CTxOut(output_amount, hex_str_to_bytes(utxo['scriptPubKey'])))

            # Sign and send the transaction to get into the mempool
            tx_signed_hex = node.signrawtransaction(ToHex(tx))['hex']
            node.sendrawtransaction(tx_signed_hex)
            num_transactions += 1

    def run_test(self):
        # Start by creating a lot of utxos on node3
        initial_height = self.nodes[3].getblockcount()
        utxo_list = create_confirmed_utxos(self.nodes[3].getnetworkinfo()['relayfee'], self.nodes[3], 5000)
        self.log.info("Prepped %d utxo entries", len(utxo_list))

        # Sync these blocks with the other nodes
        block_hashes_to_sync = []
        for height in range(initial_height + 1, self.nodes[3].getblockcount() + 1):
            block_hashes_to_sync.append(self.nodes[3].getblockhash(height))

        self.log.debug("Syncing %d blocks with other nodes", len(block_hashes_to_sync))
        # Syncing the blocks could cause nodes to crash, so the test begins here.
        self.sync_node3blocks(block_hashes_to_sync)

        starting_tip_height = self.nodes[3].getblockcount()

        # Main test loop:
        # each time through the loop, generate a bunch of transactions,
        # and then either mine a single new block on the tip, or do a reorg
        # of some size.
        for i in range(40):
            self.log.info("Iteration %d, generating 2500 transactions (restart counts: %s)", i, self.restart_counts)
            # Generate a bunch of small-ish transactions
            self.generate_small_transactions(self.nodes[3], 2500, utxo_list)
            # Pick a random block between the current tip and the starting tip
            current_height = self.nodes[3].getblockcount()
            random_height = random.randint(starting_tip_height, current_height)
            self.log.debug("At height %d, considering height %d", current_height, random_height)
            if random_height > starting_tip_height:
                # Randomly reorg from this point with some probability (1/4 for
                # tip, 1/5 for tip-1, ...)
                if random.random() < 1.0 / (current_height + 4 - random_height):
                    self.log.debug("Invalidating block at height %d", random_height)
                    self.nodes[3].invalidateblock(self.nodes[3].getblockhash(random_height))

            # Now generate new blocks until we pass the old tip height
            self.log.debug("Mining longer tip")
            block_hashes = self.nodes[3].generate(current_height + 1 - self.nodes[3].getblockcount())
            self.log.debug("Syncing %d new blocks...", len(block_hashes))
            self.sync_node3blocks(block_hashes)
            utxo_list = self.nodes[3].listunspent()
            self.log.debug("Node3 utxo count: %d", len(utxo_list))

        # Check that the utxo hashes agree with node3
        # Useful side effect: each utxo cache gets flushed here, so that we
        # won't get crashes on shutdown at the end of the test.
        self.verify_utxo_hash()

        # Check the test coverage
        self.log.info("Restarted nodes: %s; crashes on restart: %d", self.restart_counts, self.crashed_on_restart)

        # If no nodes were restarted, we didn't test anything.
        assert self.restart_counts != [0, 0, 0]

        # Make sure we tested the case of crash-during-recovery.
        assert self.crashed_on_restart > 0

        # Warn if any of the nodes escaped restart.
        for i in range(3):
            if self.restart_counts[i] == 0:
                self.log.warning("Node %d never crashed during utxo flush!", i)

if __name__ == "__main__":
    ChainstateWriteCrashTest().main()