Commit 349632e

Merge bitcoin/bitcoin#30807: Fix peers abruptly disconnecting from AssumeUTXO nodes during IBD
992f83b test: add coverage for assumeUTXO honest peers disconnection (furszy)
6d5812e assumeUTXO: fix peers disconnection during sync (furszy)

Pull request description:

  Because AssumeUTXO nodes prioritize tip synchronization, they relay their local
  address through the network before completing the background chain sync. This,
  combined with advertising full-node service (`NODE_NETWORK`), can result in an
  honest peer in IBD connecting to the AssumeUTXO node while it is still syncing
  and requesting a historical block the node does not have. That request leads to
  an abrupt disconnection due to perceived unresponsiveness from the AssumeUTXO
  node: nodes ignore `getdata` requests when they do not have the block data
  available (further discussion can be found in #30385).

  Fix this by refraining from signaling full-node service support while the
  background chain is being synced. During this period, the node signals only
  `NODE_NETWORK_LIMITED` support. Full-node (`NODE_NETWORK`) support is
  re-enabled once the background chain sync completes.

  Thanks mzumsande for a post-#30385 conversation too.

  Testing notes:
  Just cherry-pick the second commit (bb08c22) on master. It will fail there
  because the IBD node requests historical blocks from the snapshot node, which
  is bad: the snapshot node ignores the requests, and the IBD node stalls, then
  disconnects after some time.

ACKs for top commit:
  achow101: ACK 992f83b
  naumenkogs: ACK 992f83b
  mzumsande: ACK 992f83b

Tree-SHA512: fef525d1cf3200c2dd89a346be9c82d77f2e28ddaaea1f490a435e180d1a47a371cadea508349777d740ab56e94be536ad8f7d61cc81f6550c58b609b3779ed3
2 parents f6298a8 + 992f83b commit 349632e
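
For readers less familiar with P2P service bits: the fix amounts to clearing and later restoring one bit in the advertised service-flag bitmask. Below is a minimal, self-contained sketch of that flag arithmetic. The enum values match the protocol (NODE_NETWORK is bit 0, NODE_NETWORK_LIMITED is bit 10 per BIP 159), but the helper functions are illustrative stand-ins, not Bitcoin Core's actual declarations.

    #include <cstdint>
    #include <iostream>

    // Illustrative stand-ins for the protocol's service bits (values per BIP 159).
    enum ServiceFlags : uint64_t {
        NODE_NONE = 0,
        NODE_NETWORK = (1 << 0),          // can serve the full historical chain
        NODE_NETWORK_LIMITED = (1 << 10), // can serve only the last 288 blocks
    };

    // Hypothetical helpers mirroring what AddLocalServices/RemoveLocalServices do.
    ServiceFlags Add(ServiceFlags cur, ServiceFlags s)    { return ServiceFlags(cur | s); }
    ServiceFlags Remove(ServiceFlags cur, ServiceFlags s) { return ServiceFlags(cur & ~s); }

    int main()
    {
        ServiceFlags local{Add(NODE_NETWORK, NODE_NETWORK_LIMITED)};

        // While the background chain sync runs, stop claiming full-node service...
        local = Remove(local, NODE_NETWORK);
        std::cout << "full=" << bool(local & NODE_NETWORK)
                  << " limited=" << bool(local & NODE_NETWORK_LIMITED) << '\n'; // full=0 limited=1

        // ...and restore it once the background sync completes.
        local = Add(local, NODE_NETWORK);
        std::cout << "full=" << bool(local & NODE_NETWORK) << '\n'; // full=1
    }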

File tree

7 files changed (+147, -11 lines)


src/init.cpp

Lines changed: 13 additions & 3 deletions
@@ -1579,7 +1579,12 @@ bool AppInitMain(NodeContext& node, interfaces::BlockAndHeaderTipInfo* tip_info)
     // This is defined and set here instead of inline in validation.h to avoid a hard
     // dependency between validation and index/base, since the latter is not in
     // libbitcoinkernel.
-    chainman.restart_indexes = [&node]() {
+    chainman.snapshot_download_completed = [&node]() {
+        if (!node.chainman->m_blockman.IsPruneMode()) {
+            LogPrintf("[snapshot] re-enabling NODE_NETWORK services\n");
+            node.connman->AddLocalServices(NODE_NETWORK);
+        }
+
         LogPrintf("[snapshot] restarting indexes\n");

         // Drain the validation interface queue to ensure that the old indexes
@@ -1716,8 +1721,13 @@ bool AppInitMain(NodeContext& node, interfaces::BlockAndHeaderTipInfo* tip_info)
            }
        }
    } else {
-        LogPrintf("Setting NODE_NETWORK on non-prune mode\n");
-        nLocalServices = ServiceFlags(nLocalServices | NODE_NETWORK);
+        // Prior to setting NODE_NETWORK, check if we can provide historical blocks.
+        if (!WITH_LOCK(chainman.GetMutex(), return chainman.BackgroundSyncInProgress())) {
+            LogPrintf("Setting NODE_NETWORK on non-prune mode\n");
+            nLocalServices = ServiceFlags(nLocalServices | NODE_NETWORK);
+        } else {
+            LogPrintf("Running node in NODE_NETWORK_LIMITED mode until snapshot background sync completes\n");
+        }
    }

    // ********************************************************* Step 11: import blocks
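
One idiom worth noting in the hunk above: WITH_LOCK evaluates a single expression while holding the given mutex, so the BackgroundSyncInProgress() check cannot race with chainstate updates. Bitcoin Core implements WITH_LOCK as a macro around an immediately invoked lambda; the template below is only a simplified illustration of the same pattern, not the project's definition.

    #include <iostream>
    #include <mutex>

    // Simplified WITH_LOCK-style helper: run one callable under a mutex and
    // return its result, releasing the lock as soon as the expression is done.
    template <typename Mutex, typename Fn>
    decltype(auto) WithLock(Mutex& mutex, Fn&& fn)
    {
        std::lock_guard<Mutex> lock(mutex);
        return fn();
    }

    int main()
    {
        std::mutex cs_main_like;                 // stand-in for chainman.GetMutex()
        bool background_sync_in_progress = true; // stand-in for chainman state

        // Mirrors: WITH_LOCK(chainman.GetMutex(), return chainman.BackgroundSyncInProgress())
        const bool syncing = WithLock(cs_main_like, [&] { return background_sync_in_progress; });
        std::cout << (syncing ? "signal NODE_NETWORK_LIMITED only\n" : "signal NODE_NETWORK\n");
    }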

src/net.cpp

Lines changed: 3 additions & 2 deletions
@@ -1791,7 +1791,8 @@ void CConnman::CreateNodeFromAcceptedSocket(std::unique_ptr<Sock>&& sock,
     const bool inbound_onion = std::find(m_onion_binds.begin(), m_onion_binds.end(), addr_bind) != m_onion_binds.end();
     // The V2Transport transparently falls back to V1 behavior when an incoming V1 connection is
     // detected, so use it whenever we signal NODE_P2P_V2.
-    const bool use_v2transport(nLocalServices & NODE_P2P_V2);
+    ServiceFlags local_services = GetLocalServices();
+    const bool use_v2transport(local_services & NODE_P2P_V2);

     CNode* pnode = new CNode(id,
                              std::move(sock),
@@ -1809,7 +1810,7 @@ void CConnman::CreateNodeFromAcceptedSocket(std::unique_ptr<Sock>&& sock,
         .use_v2transport = use_v2transport,
     });
     pnode->AddRef();
-    m_msgproc->InitializeNode(*pnode, nLocalServices);
+    m_msgproc->InitializeNode(*pnode, local_services);
     {
         LOCK(m_nodes_mutex);
         m_nodes.push_back(pnode);
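
A small but deliberate detail in this hunk: GetLocalServices() is called once and the result is reused for both the transport choice and InitializeNode(). Now that nLocalServices is an atomic that can change at runtime, two separate reads could observe different values if another thread toggles the flags in between; a single read gives each new peer one consistent snapshot. The sketch below illustrates the hazard with hypothetical names (it is not Bitcoin Core code).

    #include <atomic>
    #include <cstdint>

    enum ServiceFlags : uint64_t { NODE_NETWORK = (1 << 0), NODE_P2P_V2 = (1 << 11) };

    std::atomic<uint64_t> g_local_services{NODE_NETWORK | NODE_P2P_V2};

    void handle_connection_racy()
    {
        // Two loads: the flags may change in between, so the transport decision
        // and the advertised services handed to the peer could disagree.
        const bool use_v2 = g_local_services.load() & NODE_P2P_V2;
        const uint64_t advertised = g_local_services.load();
        (void)use_v2; (void)advertised;
    }

    void handle_connection_consistent()
    {
        // One load: both decisions derive from the same snapshot, as in the patch.
        const uint64_t services = g_local_services.load();
        const bool use_v2 = services & NODE_P2P_V2;
        (void)use_v2;
    }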

src/net.h

Lines changed: 8 additions & 2 deletions
@@ -1221,6 +1221,11 @@ class CConnman
     //! that peer during `net_processing.cpp:PushNodeVersion()`.
     ServiceFlags GetLocalServices() const;

+    //! Updates the local services that this node advertises to other peers
+    //! during the connection handshake.
+    void AddLocalServices(ServiceFlags services) { nLocalServices = ServiceFlags(nLocalServices | services); };
+    void RemoveLocalServices(ServiceFlags services) { nLocalServices = ServiceFlags(nLocalServices & ~services); }
+
     uint64_t GetMaxOutboundTarget() const EXCLUSIVE_LOCKS_REQUIRED(!m_total_bytes_sent_mutex);
     std::chrono::seconds GetMaxOutboundTimeframe() const;
@@ -1460,11 +1465,12 @@ class CConnman
     * This data is replicated in each Peer instance we create.
     *
     * This data is not marked const, but after being set it should not
-    * change.
+    * change, unless AssumeUTXO is in use, in which case the services stay
+    * limited until the background chain sync finishes.
     *
     * \sa Peer::our_services
     */
-    ServiceFlags nLocalServices;
+    std::atomic<ServiceFlags> nLocalServices;

    std::unique_ptr<CSemaphore> semOutbound;
    std::unique_ptr<CSemaphore> semAddnode;
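
Note that with std::atomic<ServiceFlags>, an expression like nLocalServices = ServiceFlags(nLocalServices | services) is an atomic load followed by an atomic store, not one indivisible read-modify-write. That is presumably acceptable here, since the flags only change on rare, coarse events (snapshot load, background sync completion). If concurrent updates had to be lossless, the standard alternative is a compare-exchange loop; the sketch below is general C++, not part of this patch.

    #include <atomic>
    #include <cstdint>

    enum ServiceFlags : uint64_t { NODE_NETWORK = (1 << 0), NODE_NETWORK_LIMITED = (1 << 10) };

    std::atomic<ServiceFlags> local_services{NODE_NETWORK_LIMITED};

    // Lossless OR: retry until the update is applied on top of the latest value.
    void AddServicesAtomically(ServiceFlags services)
    {
        ServiceFlags expected = local_services.load();
        while (!local_services.compare_exchange_weak(expected, ServiceFlags(expected | services))) {
            // On failure, `expected` is refreshed with the current value; retry.
        }
    }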

src/rpc/blockchain.cpp

Lines changed: 7 additions & 0 deletions
@@ -3042,6 +3042,13 @@ static RPCHelpMan loadtxoutset()
         throw JSONRPCError(RPC_INTERNAL_ERROR, strprintf("Unable to load UTXO snapshot: %s. (%s)", util::ErrorString(activation_result).original, path.utf8string()));
     }

+    // Because we can't provide historical blocks during tip or background sync,
+    // update local services to reflect that we are a limited peer until we are
+    // fully synced.
+    node.connman->RemoveLocalServices(NODE_NETWORK);
+    // Setting the limited state is usually redundant because the node can always
+    // provide the last 288 blocks, but it doesn't hurt to set it.
+    node.connman->AddLocalServices(NODE_NETWORK_LIMITED);
+
     CBlockIndex& snapshot_index{*CHECK_NONFATAL(*activation_result)};

     UniValue result(UniValue::VOBJ);
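
The "last 288 blocks" in the comment is the NODE_NETWORK_LIMITED service window defined by BIP 159: a limited peer commits to serving blocks within the last 288 of its tip, roughly two days of blocks. A hedged sketch of that depth check follows; the helper name is hypothetical, not the code path this PR touches.

    // BIP 159: limited peers serve only blocks within this depth of their tip.
    constexpr int NODE_NETWORK_LIMITED_MIN_BLOCKS = 288;

    // Hypothetical helper: can a NODE_NETWORK_LIMITED peer whose tip is at
    // `tip_height` serve the block at `block_height`?
    bool CanServeAsLimitedPeer(int tip_height, int block_height)
    {
        return block_height > tip_height - NODE_NETWORK_LIMITED_MIN_BLOCKS;
    }

    // Example: a limited peer at height 1000 can serve block 900,
    // but not block 500: CanServeAsLimitedPeer(1000, 900) == true,
    // CanServeAsLimitedPeer(1000, 500) == false.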

src/validation.cpp

Lines changed: 2 additions & 2 deletions
@@ -3575,8 +3575,8 @@ bool Chainstate::ActivateBestChain(BlockValidationState& state, std::shared_ptr<
             //
             // This cannot be done while holding cs_main (within
             // MaybeCompleteSnapshotValidation) or a cs_main deadlock will occur.
-            if (m_chainman.restart_indexes) {
-                m_chainman.restart_indexes();
+            if (m_chainman.snapshot_download_completed) {
+                m_chainman.snapshot_download_completed();
             }
             break;
         }

src/validation.h

Lines changed: 1 addition & 1 deletion
@@ -977,7 +977,7 @@ class ChainstateManager

     //! Function to restart active indexes; set dynamically to avoid a circular
     //! dependency on `base/index.cpp`.
-    std::function<void()> restart_indexes = std::function<void()>();
+    std::function<void()> snapshot_download_completed = std::function<void()>();

     const CChainParams& GetParams() const { return m_options.chainparams; }
     const Consensus::Params& GetConsensus() const { return m_options.chainparams.GetConsensus(); }
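
The rename keeps the member's original purpose as a decoupling point: ChainstateManager lives in libbitcoinkernel and cannot depend on index code, so the work to do when the snapshot download completes is injected as a std::function at startup (the init.cpp hunk above) and null-checked before invocation (the validation.cpp hunk above). A minimal, self-contained illustration of that pattern follows; the names are illustrative, not the project's types.

    #include <functional>
    #include <iostream>

    class ChainstateManagerLike {
    public:
        // Set dynamically by a higher layer; empty until someone wires it up.
        std::function<void()> snapshot_download_completed;

        void OnSnapshotDownloadCompleted()
        {
            // Guard against the callback never being registered (e.g. in tests).
            if (snapshot_download_completed) snapshot_download_completed();
        }
    };

    int main()
    {
        ChainstateManagerLike chainman;
        chainman.OnSnapshotDownloadCompleted(); // no-op: nothing registered yet

        chainman.snapshot_download_completed = [] {
            std::cout << "re-enable NODE_NETWORK, restart indexes\n";
        };
        chainman.OnSnapshotDownloadCompleted(); // runs the injected action
    }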

test/functional/feature_assumeutxo.py

Lines changed: 113 additions & 1 deletion
@@ -9,20 +9,30 @@
 The assumeutxo value generated and used here is committed to in
 `CRegTestParams::m_assumeutxo_data` in `src/kernel/chainparams.cpp`.
 """
+import time
 from shutil import rmtree

 from dataclasses import dataclass
 from test_framework.blocktools import (
     create_block,
     create_coinbase
 )
-from test_framework.messages import tx_from_hex
+from test_framework.messages import (
+    CBlockHeader,
+    from_hex,
+    msg_headers,
+    tx_from_hex
+)
+from test_framework.p2p import (
+    P2PInterface,
+)
 from test_framework.test_framework import BitcoinTestFramework
 from test_framework.util import (
     assert_approx,
     assert_equal,
     assert_raises_rpc_error,
     sha256sum_file,
+    try_rpc,
 )
 from test_framework.wallet import (
     getnewdestination,
@@ -248,6 +258,74 @@ def test_snapshot_not_on_most_work_chain(self, dump_output_path):
         node1.submitheader(main_block1)
         node1.submitheader(main_block2)

+    def test_sync_from_assumeutxo_node(self, snapshot):
+        """
+        This test verifies that:
+        1. An IBD node can sync headers from an AssumeUTXO node at any time.
+        2. IBD nodes do not request historical blocks from AssumeUTXO nodes while they are syncing the background chain.
+        3. The AssumeUTXO node dynamically adjusts the network services it offers according to its state.
+        4. IBD nodes can fully sync from AssumeUTXO nodes after they finish the background chain sync.
+        """
+        self.log.info("Testing IBD-sync from assumeUTXO node")
+        # Node2 starts clean and loads the snapshot.
+        # Node3 starts clean and seeks to sync up from snapshot_node.
+        miner = self.nodes[0]
+        snapshot_node = self.nodes[2]
+        ibd_node = self.nodes[3]
+
+        # Start the test fresh by cleaning up the node directories
+        for node in (snapshot_node, ibd_node):
+            self.stop_node(node.index)
+            rmtree(node.chain_path)
+            self.start_node(node.index, extra_args=self.extra_args[node.index])
+
+        # Sync up the headers chain on snapshot_node so the snapshot can be loaded
+        headers_provider_conn = snapshot_node.add_p2p_connection(P2PInterface())
+        headers_provider_conn.wait_for_getheaders()
+        msg = msg_headers()
+        for block_num in range(1, miner.getblockcount()+1):
+            msg.headers.append(from_hex(CBlockHeader(), miner.getblockheader(miner.getblockhash(block_num), verbose=False)))
+        headers_provider_conn.send_message(msg)
+
+        # Ensure the headers arrived
+        default_value = {'status': ''}  # No status
+        headers_tip_hash = miner.getbestblockhash()
+        self.wait_until(lambda: next(filter(lambda x: x['hash'] == headers_tip_hash, snapshot_node.getchaintips()), default_value)['status'] == "headers-only")
+        snapshot_node.disconnect_p2ps()
+
+        # Load the snapshot
+        snapshot_node.loadtxoutset(snapshot['path'])
+
+        # Connect the nodes and verify that ibd_node can sync up the headers chain from snapshot_node
+        self.connect_nodes(ibd_node.index, snapshot_node.index)
+        snapshot_block_hash = snapshot['base_hash']
+        self.wait_until(lambda: next(filter(lambda x: x['hash'] == snapshot_block_hash, ibd_node.getchaintips()), default_value)['status'] == "headers-only")
+
+        # Once the headers chain is synced, the ibd_node must avoid requesting historical blocks from the snapshot_node.
+        # If it does request such blocks, the snapshot_node will ignore requests it cannot fulfill, causing the ibd_node
+        # to stall. This stall could last for up to 10 minutes, ultimately resulting in an abrupt disconnection due to
+        # the snapshot_node's perceived unresponsiveness.
+        time.sleep(3)  # Sleep here because we can't detect when a node avoids requesting blocks from another peer.
+        assert_equal(len(ibd_node.getpeerinfo()[0]['inflight']), 0)

+        # Now disconnect the nodes and finish the background chain sync
+        self.disconnect_nodes(ibd_node.index, snapshot_node.index)
+        self.connect_nodes(snapshot_node.index, miner.index)
+        self.sync_blocks(nodes=(miner, snapshot_node))
+        # Check that the base snapshot block was stored and ensure the node signals full-node service support
+        self.wait_until(lambda: not try_rpc(-1, "Block not found", snapshot_node.getblock, snapshot_block_hash))
+        assert 'NETWORK' in snapshot_node.getnetworkinfo()['localservicesnames']
+
+        # Now that the snapshot_node is synced, verify that the ibd_node can sync from it
+        self.connect_nodes(snapshot_node.index, ibd_node.index)
+        assert 'NETWORK' in ibd_node.getpeerinfo()[0]['servicesnames']
+        self.sync_blocks(nodes=(ibd_node, snapshot_node))
+
+    def assert_only_network_limited_service(self, node):
+        node_services = node.getnetworkinfo()['localservicesnames']
+        assert 'NETWORK' not in node_services
+        assert 'NETWORK_LIMITED' in node_services
+
     def run_test(self):
         """
         Bring up two (disconnected) nodes, mine some new blocks on the first,
@@ -381,13 +459,20 @@ def check_dump_output(output):
         self.test_snapshot_block_invalidated(dump_output['path'])
         self.test_snapshot_not_on_most_work_chain(dump_output['path'])

+        # Pruned-node sanity check
+        assert 'NETWORK' not in n1.getnetworkinfo()['localservicesnames']
+
         self.log.info(f"Loading snapshot into second node from {dump_output['path']}")
         # This node's tip is on an ancestor block of the snapshot, which should
         # be the normal case
         loaded = n1.loadtxoutset(dump_output['path'])
         assert_equal(loaded['coins_loaded'], SNAPSHOT_BASE_HEIGHT)
         assert_equal(loaded['base_height'], SNAPSHOT_BASE_HEIGHT)

+        self.log.info("Confirm that local services remain unchanged")
+        # Since n1 is a pruned node, the 'NETWORK' service flag must always be unset.
+        self.assert_only_network_limited_service(n1)
+
         self.log.info("Check that UTXO-querying RPCs operate on snapshot chainstate")
         snapshot_hash = loaded['tip_hash']
         snapshot_num_coins = loaded['coins_loaded']
@@ -491,6 +576,9 @@ def check_tx_counts(final: bool) -> None:
         self.restart_node(1, extra_args=[
             f"-stopatheight={PAUSE_HEIGHT}", *self.extra_args[1]])

+        # Upon restart during the snapshot tip sync, the node must remain in 'limited' mode.
+        self.assert_only_network_limited_service(n1)
+
         # Finally connect the nodes and let them sync.
         #
         # Set `wait_for_connect=False` to avoid a race between performing connection
@@ -507,6 +595,9 @@ def check_tx_counts(final: bool) -> None:
         self.log.info("Restarted node before snapshot validation completed, reloading...")
         self.restart_node(1, extra_args=self.extra_args[1])

+        # Upon restart, the node must remain in 'limited' mode
+        self.assert_only_network_limited_service(n1)
+
         # Send snapshot block to n1 out of order. This makes the test less
         # realistic because normally the snapshot block is one of the last
         # blocks downloaded, but it's useful to test because it triggers more
@@ -525,6 +616,10 @@ def check_tx_counts(final: bool) -> None:
         self.log.info("Ensuring background validation completes")
         self.wait_until(lambda: len(n1.getchainstates()['chainstates']) == 1)

+        # Since n1 is a pruned node, it will not signal NODE_NETWORK after
+        # completing the background sync.
+        self.assert_only_network_limited_service(n1)
+
         # Ensure indexes have synced.
         completed_idx_state = {
             'basic block filter index': COMPLETE_IDX,
@@ -555,12 +650,18 @@ def check_tx_counts(final: bool) -> None:

         self.log.info("-- Testing all indexes + reindex")
         assert_equal(n2.getblockcount(), START_HEIGHT)
+        assert 'NETWORK' in n2.getnetworkinfo()['localservicesnames']  # sanity check

         self.log.info(f"Loading snapshot into third node from {dump_output['path']}")
         loaded = n2.loadtxoutset(dump_output['path'])
         assert_equal(loaded['coins_loaded'], SNAPSHOT_BASE_HEIGHT)
         assert_equal(loaded['base_height'], SNAPSHOT_BASE_HEIGHT)

+        # Even though n2 is a full node, it unsets the 'NETWORK' service flag during snapshot loading.
+        # This indicates to other peers that the node will temporarily not provide historical blocks.
+        self.log.info("Check node2 updated the local services during snapshot load")
+        self.assert_only_network_limited_service(n2)
+
         for reindex_arg in ['-reindex=1', '-reindex-chainstate=1']:
             self.log.info(f"Check that restarting with {reindex_arg} will delete the snapshot chainstate")
             self.restart_node(2, extra_args=[reindex_arg, *self.extra_args[2]])
@@ -584,13 +685,21 @@ def check_tx_counts(final: bool) -> None:
         msg = "Unable to load UTXO snapshot: Can't activate a snapshot-based chainstate more than once"
         assert_raises_rpc_error(-32603, msg, n2.loadtxoutset, dump_output['path'])

+        # Upon restart, the node must stay in 'limited' mode until the background
+        # chain sync completes.
+        self.restart_node(2, extra_args=self.extra_args[2])
+        self.assert_only_network_limited_service(n2)
+
         self.connect_nodes(0, 2)
         self.wait_until(lambda: n2.getchainstates()['chainstates'][-1]['blocks'] == FINAL_HEIGHT)
         self.sync_blocks(nodes=(n0, n2))

         self.log.info("Ensuring background validation completes")
         self.wait_until(lambda: len(n2.getchainstates()['chainstates']) == 1)

+        # Once the background chain sync completes, the full node must start offering historical blocks again.
+        assert {'NETWORK', 'NETWORK_LIMITED'}.issubset(n2.getnetworkinfo()['localservicesnames'])
+
         completed_idx_state = {
             'basic block filter index': COMPLETE_IDX,
             'coinstatsindex': COMPLETE_IDX,
@@ -625,6 +734,9 @@ def check_tx_counts(final: bool) -> None:

         self.test_snapshot_in_a_divergent_chain(dump_output['path'])

+        # The following test cleans node2's and node3's chain directories.
+        self.test_sync_from_assumeutxo_node(snapshot=dump_output)
+
     @dataclass
     class Block:
         hash: str
