Skip to content

Commit 6d5812e

Browse files
committed
assumeUTXO: fix peers disconnection during sync
Because AssumeUTXO nodes prioritize tip synchronization, they relay their local address through the network before completing the background chain sync. This, combined with advertising the full-node service (NODE_NETWORK), can result in an honest peer in IBD connecting to the AssumeUTXO node (while it is still syncing) and requesting a historical block the node does not have. This behavior leads to an abrupt disconnection due to perceived unresponsiveness (lack of response) from the AssumeUTXO node. This lack of response occurs because nodes ignore getdata requests when they do not have the block data available (further discussion can be found in PR 30385). Fix this by refraining from signaling full-node service support while the background chain is being synced. During this period, the node will only signal 'NODE_NETWORK_LIMITED' support. Then, full-node ('NODE_NETWORK') support will be re-enabled once the background chain sync is completed.
1 parent 93e4824 commit 6d5812e

File tree

7 files changed

+70
-10
lines changed

7 files changed

+70
-10
lines changed

src/init.cpp

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1558,7 +1558,12 @@ bool AppInitMain(NodeContext& node, interfaces::BlockAndHeaderTipInfo* tip_info)
15581558
// This is defined and set here instead of inline in validation.h to avoid a hard
15591559
// dependency between validation and index/base, since the latter is not in
15601560
// libbitcoinkernel.
1561-
chainman.restart_indexes = [&node]() {
1561+
chainman.snapshot_download_completed = [&node]() {
1562+
if (!node.chainman->m_blockman.IsPruneMode()) {
1563+
LogPrintf("[snapshot] re-enabling NODE_NETWORK services\n");
1564+
node.connman->AddLocalServices(NODE_NETWORK);
1565+
}
1566+
15621567
LogPrintf("[snapshot] restarting indexes\n");
15631568

15641569
// Drain the validation interface queue to ensure that the old indexes
@@ -1695,8 +1700,13 @@ bool AppInitMain(NodeContext& node, interfaces::BlockAndHeaderTipInfo* tip_info)
16951700
}
16961701
}
16971702
} else {
1698-
LogPrintf("Setting NODE_NETWORK on non-prune mode\n");
1699-
nLocalServices = ServiceFlags(nLocalServices | NODE_NETWORK);
1703+
// Prior to setting NODE_NETWORK, check if we can provide historical blocks.
1704+
if (!WITH_LOCK(chainman.GetMutex(), return chainman.BackgroundSyncInProgress())) {
1705+
LogPrintf("Setting NODE_NETWORK on non-prune mode\n");
1706+
nLocalServices = ServiceFlags(nLocalServices | NODE_NETWORK);
1707+
} else {
1708+
LogPrintf("Running node in NODE_NETWORK_LIMITED mode until snapshot background sync completes\n");
1709+
}
17001710
}
17011711

17021712
// ********************************************************* Step 11: import blocks

src/net.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1791,7 +1791,8 @@ void CConnman::CreateNodeFromAcceptedSocket(std::unique_ptr<Sock>&& sock,
17911791
const bool inbound_onion = std::find(m_onion_binds.begin(), m_onion_binds.end(), addr_bind) != m_onion_binds.end();
17921792
// The V2Transport transparently falls back to V1 behavior when an incoming V1 connection is
17931793
// detected, so use it whenever we signal NODE_P2P_V2.
1794-
const bool use_v2transport(nLocalServices & NODE_P2P_V2);
1794+
ServiceFlags local_services = GetLocalServices();
1795+
const bool use_v2transport(local_services & NODE_P2P_V2);
17951796

17961797
CNode* pnode = new CNode(id,
17971798
std::move(sock),
@@ -1809,7 +1810,7 @@ void CConnman::CreateNodeFromAcceptedSocket(std::unique_ptr<Sock>&& sock,
18091810
.use_v2transport = use_v2transport,
18101811
});
18111812
pnode->AddRef();
1812-
m_msgproc->InitializeNode(*pnode, nLocalServices);
1813+
m_msgproc->InitializeNode(*pnode, local_services);
18131814
{
18141815
LOCK(m_nodes_mutex);
18151816
m_nodes.push_back(pnode);

src/net.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1221,6 +1221,11 @@ class CConnman
12211221
//! that peer during `net_processing.cpp:PushNodeVersion()`.
12221222
ServiceFlags GetLocalServices() const;
12231223

1224+
//! Updates the local services that this node advertises to other peers
1225+
//! during connection handshake.
1226+
void AddLocalServices(ServiceFlags services) { nLocalServices = ServiceFlags(nLocalServices | services); };
1227+
void RemoveLocalServices(ServiceFlags services) { nLocalServices = ServiceFlags(nLocalServices & ~services); }
1228+
12241229
uint64_t GetMaxOutboundTarget() const EXCLUSIVE_LOCKS_REQUIRED(!m_total_bytes_sent_mutex);
12251230
std::chrono::seconds GetMaxOutboundTimeframe() const;
12261231

@@ -1460,11 +1465,12 @@ class CConnman
14601465
* This data is replicated in each Peer instance we create.
14611466
*
14621467
* This data is not marked const, but after being set it should not
1463-
* change.
1468+
* change — unless an AssumeUTXO snapshot is loaded, in which case the
1469+
* node's advertised services will be limited until the background chain sync finishes.
14641470
*
14651471
* \sa Peer::our_services
14661472
*/
1467-
ServiceFlags nLocalServices;
1473+
std::atomic<ServiceFlags> nLocalServices;
14681474

14691475
std::unique_ptr<CSemaphore> semOutbound;
14701476
std::unique_ptr<CSemaphore> semAddnode;

src/rpc/blockchain.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3043,6 +3043,13 @@ static RPCHelpMan loadtxoutset()
30433043
throw JSONRPCError(RPC_INTERNAL_ERROR, strprintf("Unable to load UTXO snapshot: %s. (%s)", util::ErrorString(activation_result).original, path.utf8string()));
30443044
}
30453045

3046+
// Because we can't provide historical blocks during tip or background sync,
3047+
// update the local services to reflect that we are a limited peer until we are fully synced.
3048+
node.connman->RemoveLocalServices(NODE_NETWORK);
3049+
// Setting the limited state is usually redundant because the node can always
3050+
// provide the last 288 blocks, but it doesn't hurt to set it.
3051+
node.connman->AddLocalServices(NODE_NETWORK_LIMITED);
3052+
30463053
CBlockIndex& snapshot_index{*CHECK_NONFATAL(*activation_result)};
30473054

30483055
UniValue result(UniValue::VOBJ);

src/validation.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3575,8 +3575,8 @@ bool Chainstate::ActivateBestChain(BlockValidationState& state, std::shared_ptr<
35753575
//
35763576
// This cannot be done while holding cs_main (within
35773577
// MaybeCompleteSnapshotValidation) or a cs_main deadlock will occur.
3578-
if (m_chainman.restart_indexes) {
3579-
m_chainman.restart_indexes();
3578+
if (m_chainman.snapshot_download_completed) {
3579+
m_chainman.snapshot_download_completed();
35803580
}
35813581
break;
35823582
}

src/validation.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -977,7 +977,7 @@ class ChainstateManager
977977

978978
//! Function to restart active indexes; set dynamically to avoid a circular
979979
//! dependency on `base/index.cpp`.
980-
std::function<void()> restart_indexes = std::function<void()>();
980+
std::function<void()> snapshot_download_completed = std::function<void()>();
981981

982982
const CChainParams& GetParams() const { return m_options.chainparams; }
983983
const Consensus::Params& GetConsensus() const { return m_options.chainparams.GetConsensus(); }

test/functional/feature_assumeutxo.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,11 @@ def test_snapshot_not_on_most_work_chain(self, dump_output_path):
248248
node1.submitheader(main_block1)
249249
node1.submitheader(main_block2)
250250

251+
def assert_only_network_limited_service(self, node):
252+
node_services = node.getnetworkinfo()['localservicesnames']
253+
assert 'NETWORK' not in node_services
254+
assert 'NETWORK_LIMITED' in node_services
255+
251256
def run_test(self):
252257
"""
253258
Bring up two (disconnected) nodes, mine some new blocks on the first,
@@ -381,13 +386,20 @@ def check_dump_output(output):
381386
self.test_snapshot_block_invalidated(dump_output['path'])
382387
self.test_snapshot_not_on_most_work_chain(dump_output['path'])
383388

389+
# Prune-node sanity check
390+
assert 'NETWORK' not in n1.getnetworkinfo()['localservicesnames']
391+
384392
self.log.info(f"Loading snapshot into second node from {dump_output['path']}")
385393
# This node's tip is on an ancestor block of the snapshot, which should
386394
# be the normal case
387395
loaded = n1.loadtxoutset(dump_output['path'])
388396
assert_equal(loaded['coins_loaded'], SNAPSHOT_BASE_HEIGHT)
389397
assert_equal(loaded['base_height'], SNAPSHOT_BASE_HEIGHT)
390398

399+
self.log.info("Confirm that local services remain unchanged")
400+
# Since n1 is a pruned node, the 'NETWORK' service flag must always be unset.
401+
self.assert_only_network_limited_service(n1)
402+
391403
self.log.info("Check that UTXO-querying RPCs operate on snapshot chainstate")
392404
snapshot_hash = loaded['tip_hash']
393405
snapshot_num_coins = loaded['coins_loaded']
@@ -491,6 +503,9 @@ def check_tx_counts(final: bool) -> None:
491503
self.restart_node(1, extra_args=[
492504
f"-stopatheight={PAUSE_HEIGHT}", *self.extra_args[1]])
493505

506+
# Upon restart during snapshot tip sync, the node must remain in 'limited' mode.
507+
self.assert_only_network_limited_service(n1)
508+
494509
# Finally connect the nodes and let them sync.
495510
#
496511
# Set `wait_for_connect=False` to avoid a race between performing connection
@@ -507,6 +522,9 @@ def check_tx_counts(final: bool) -> None:
507522
self.log.info("Restarted node before snapshot validation completed, reloading...")
508523
self.restart_node(1, extra_args=self.extra_args[1])
509524

525+
# Upon restart, the node must remain in 'limited' mode
526+
self.assert_only_network_limited_service(n1)
527+
510528
# Send snapshot block to n1 out of order. This makes the test less
511529
# realistic because normally the snapshot block is one of the last
512530
# blocks downloaded, but it's useful to test because it triggers more
@@ -525,6 +543,10 @@ def check_tx_counts(final: bool) -> None:
525543
self.log.info("Ensuring background validation completes")
526544
self.wait_until(lambda: len(n1.getchainstates()['chainstates']) == 1)
527545

546+
# Since n1 is a pruned node, it will not signal NODE_NETWORK after
547+
# completing the background sync.
548+
self.assert_only_network_limited_service(n1)
549+
528550
# Ensure indexes have synced.
529551
completed_idx_state = {
530552
'basic block filter index': COMPLETE_IDX,
@@ -555,12 +577,18 @@ def check_tx_counts(final: bool) -> None:
555577

556578
self.log.info("-- Testing all indexes + reindex")
557579
assert_equal(n2.getblockcount(), START_HEIGHT)
580+
assert 'NETWORK' in n2.getnetworkinfo()['localservicesnames'] # sanity check
558581

559582
self.log.info(f"Loading snapshot into third node from {dump_output['path']}")
560583
loaded = n2.loadtxoutset(dump_output['path'])
561584
assert_equal(loaded['coins_loaded'], SNAPSHOT_BASE_HEIGHT)
562585
assert_equal(loaded['base_height'], SNAPSHOT_BASE_HEIGHT)
563586

587+
# Even though n2 is a full node, it will unset the 'NETWORK' service flag during snapshot loading.
588+
# This indicates to other peers that the node will temporarily not provide historical blocks.
589+
self.log.info("Check node2 updated the local services during snapshot load")
590+
self.assert_only_network_limited_service(n2)
591+
564592
for reindex_arg in ['-reindex=1', '-reindex-chainstate=1']:
565593
self.log.info(f"Check that restarting with {reindex_arg} will delete the snapshot chainstate")
566594
self.restart_node(2, extra_args=[reindex_arg, *self.extra_args[2]])
@@ -584,13 +612,21 @@ def check_tx_counts(final: bool) -> None:
584612
msg = "Unable to load UTXO snapshot: Can't activate a snapshot-based chainstate more than once"
585613
assert_raises_rpc_error(-32603, msg, n2.loadtxoutset, dump_output['path'])
586614

615+
# Upon restart, the node must stay in 'limited' mode until the background
616+
# chain sync completes.
617+
self.restart_node(2, extra_args=self.extra_args[2])
618+
self.assert_only_network_limited_service(n2)
619+
587620
self.connect_nodes(0, 2)
588621
self.wait_until(lambda: n2.getchainstates()['chainstates'][-1]['blocks'] == FINAL_HEIGHT)
589622
self.sync_blocks(nodes=(n0, n2))
590623

591624
self.log.info("Ensuring background validation completes")
592625
self.wait_until(lambda: len(n2.getchainstates()['chainstates']) == 1)
593626

627+
# Once background chain sync completes, the full node must start offering historical blocks again.
628+
assert {'NETWORK', 'NETWORK_LIMITED'}.issubset(n2.getnetworkinfo()['localservicesnames'])
629+
594630
completed_idx_state = {
595631
'basic block filter index': COMPLETE_IDX,
596632
'coinstatsindex': COMPLETE_IDX,

0 commit comments

Comments
 (0)