Skip to content

Commit c6b5db1

Browse files
furszy authored and achow101 committed
assumeUTXO: fix peers disconnection during sync
Because AssumeUTXO nodes prioritize tip synchronization, they relay their local address through the network before completing the background chain sync. This, combined with the advertising of full-node service (NODE_NETWORK), can result in an honest peer in IBD connecting to the AssumeUTXO node (while syncing) and requesting an historical block the node does not have. This behavior leads to an abrupt disconnection due to perceived unresponsiveness (lack of response) from the AssumeUTXO node. This lack of response occurs because nodes ignore getdata requests when they do not have the block data available (further discussion can be found in PR 30385). Fix this by refraining from signaling full-node service support while the background chain is being synced. During this period, the node will only signal 'NODE_NETWORK_LIMITED' support. Then, full-node ('NODE_NETWORK') support will be re-enabled once the background chain sync is completed. Github-Pull: bitcoin#30807 Rebased-From: 6d5812e
1 parent 598415b commit c6b5db1

File tree

7 files changed

+70
-10
lines changed

7 files changed

+70
-10
lines changed

src/init.cpp

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1558,7 +1558,12 @@ bool AppInitMain(NodeContext& node, interfaces::BlockAndHeaderTipInfo* tip_info)
15581558
// This is defined and set here instead of inline in validation.h to avoid a hard
15591559
// dependency between validation and index/base, since the latter is not in
15601560
// libbitcoinkernel.
1561-
chainman.restart_indexes = [&node]() {
1561+
chainman.snapshot_download_completed = [&node]() {
1562+
if (!node.chainman->m_blockman.IsPruneMode()) {
1563+
LogPrintf("[snapshot] re-enabling NODE_NETWORK services\n");
1564+
node.connman->AddLocalServices(NODE_NETWORK);
1565+
}
1566+
15621567
LogPrintf("[snapshot] restarting indexes\n");
15631568

15641569
// Drain the validation interface queue to ensure that the old indexes
@@ -1695,8 +1700,13 @@ bool AppInitMain(NodeContext& node, interfaces::BlockAndHeaderTipInfo* tip_info)
16951700
}
16961701
}
16971702
} else {
1698-
LogPrintf("Setting NODE_NETWORK on non-prune mode\n");
1699-
nLocalServices = ServiceFlags(nLocalServices | NODE_NETWORK);
1703+
// Prior to setting NODE_NETWORK, check if we can provide historical blocks.
1704+
if (!WITH_LOCK(chainman.GetMutex(), return chainman.BackgroundSyncInProgress())) {
1705+
LogPrintf("Setting NODE_NETWORK on non-prune mode\n");
1706+
nLocalServices = ServiceFlags(nLocalServices | NODE_NETWORK);
1707+
} else {
1708+
LogPrintf("Running node in NODE_NETWORK_LIMITED mode until snapshot background sync completes\n");
1709+
}
17001710
}
17011711

17021712
// ********************************************************* Step 11: import blocks

src/net.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1789,7 +1789,8 @@ void CConnman::CreateNodeFromAcceptedSocket(std::unique_ptr<Sock>&& sock,
17891789
const bool inbound_onion = std::find(m_onion_binds.begin(), m_onion_binds.end(), addr_bind) != m_onion_binds.end();
17901790
// The V2Transport transparently falls back to V1 behavior when an incoming V1 connection is
17911791
// detected, so use it whenever we signal NODE_P2P_V2.
1792-
const bool use_v2transport(nLocalServices & NODE_P2P_V2);
1792+
ServiceFlags local_services = GetLocalServices();
1793+
const bool use_v2transport(local_services & NODE_P2P_V2);
17931794

17941795
CNode* pnode = new CNode(id,
17951796
std::move(sock),
@@ -1807,7 +1808,7 @@ void CConnman::CreateNodeFromAcceptedSocket(std::unique_ptr<Sock>&& sock,
18071808
.use_v2transport = use_v2transport,
18081809
});
18091810
pnode->AddRef();
1810-
m_msgproc->InitializeNode(*pnode, nLocalServices);
1811+
m_msgproc->InitializeNode(*pnode, local_services);
18111812
{
18121813
LOCK(m_nodes_mutex);
18131814
m_nodes.push_back(pnode);

src/net.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1221,6 +1221,11 @@ class CConnman
12211221
//! that peer during `net_processing.cpp:PushNodeVersion()`.
12221222
ServiceFlags GetLocalServices() const;
12231223

1224+
//! Updates the local services that this node advertises to other peers
1225+
//! during connection handshake.
1226+
void AddLocalServices(ServiceFlags services) { nLocalServices = ServiceFlags(nLocalServices | services); };
1227+
void RemoveLocalServices(ServiceFlags services) { nLocalServices = ServiceFlags(nLocalServices & ~services); }
1228+
12241229
uint64_t GetMaxOutboundTarget() const EXCLUSIVE_LOCKS_REQUIRED(!m_total_bytes_sent_mutex);
12251230
std::chrono::seconds GetMaxOutboundTimeframe() const;
12261231

@@ -1460,11 +1465,12 @@ class CConnman
14601465
* This data is replicated in each Peer instance we create.
14611466
*
14621467
* This data is not marked const, but after being set it should not
1463-
* change.
1468+
* change. Unless AssumeUTXO is started, in which case, the peer
1469+
* will be limited until the background chain sync finishes.
14641470
*
14651471
* \sa Peer::our_services
14661472
*/
1467-
ServiceFlags nLocalServices;
1473+
std::atomic<ServiceFlags> nLocalServices;
14681474

14691475
std::unique_ptr<CSemaphore> semOutbound;
14701476
std::unique_ptr<CSemaphore> semAddnode;

src/rpc/blockchain.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2866,6 +2866,13 @@ static RPCHelpMan loadtxoutset()
28662866
throw JSONRPCError(RPC_INTERNAL_ERROR, strprintf("Unable to load UTXO snapshot: %s. (%s)", util::ErrorString(activation_result).original, path.utf8string()));
28672867
}
28682868

2869+
// Because we can't provide historical blocks during tip or background sync.
2870+
// Update local services to reflect we are a limited peer until we are fully sync.
2871+
node.connman->RemoveLocalServices(NODE_NETWORK);
2872+
// Setting the limited state is usually redundant because the node can always
2873+
// provide the last 288 blocks, but it doesn't hurt to set it.
2874+
node.connman->AddLocalServices(NODE_NETWORK_LIMITED);
2875+
28692876
CBlockIndex& snapshot_index{*CHECK_NONFATAL(*activation_result)};
28702877

28712878
UniValue result(UniValue::VOBJ);

src/validation.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3574,8 +3574,8 @@ bool Chainstate::ActivateBestChain(BlockValidationState& state, std::shared_ptr<
35743574
//
35753575
// This cannot be done while holding cs_main (within
35763576
// MaybeCompleteSnapshotValidation) or a cs_main deadlock will occur.
3577-
if (m_chainman.restart_indexes) {
3578-
m_chainman.restart_indexes();
3577+
if (m_chainman.snapshot_download_completed) {
3578+
m_chainman.snapshot_download_completed();
35793579
}
35803580
break;
35813581
}

src/validation.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -976,7 +976,7 @@ class ChainstateManager
976976

977977
//! Function to restart active indexes; set dynamically to avoid a circular
978978
//! dependency on `base/index.cpp`.
979-
std::function<void()> restart_indexes = std::function<void()>();
979+
std::function<void()> snapshot_download_completed = std::function<void()>();
980980

981981
const CChainParams& GetParams() const { return m_options.chainparams; }
982982
const Consensus::Params& GetConsensus() const { return m_options.chainparams.GetConsensus(); }

test/functional/feature_assumeutxo.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,11 @@ def test_snapshot_not_on_most_work_chain(self, dump_output_path):
247247
node1.submitheader(main_block1)
248248
node1.submitheader(main_block2)
249249

250+
def assert_only_network_limited_service(self, node):
251+
node_services = node.getnetworkinfo()['localservicesnames']
252+
assert 'NETWORK' not in node_services
253+
assert 'NETWORK_LIMITED' in node_services
254+
250255
def run_test(self):
251256
"""
252257
Bring up two (disconnected) nodes, mine some new blocks on the first,
@@ -343,13 +348,20 @@ def run_test(self):
343348
self.test_snapshot_block_invalidated(dump_output['path'])
344349
self.test_snapshot_not_on_most_work_chain(dump_output['path'])
345350

351+
# Prune-node sanity check
352+
assert 'NETWORK' not in n1.getnetworkinfo()['localservicesnames']
353+
346354
self.log.info(f"Loading snapshot into second node from {dump_output['path']}")
347355
# This node's tip is on an ancestor block of the snapshot, which should
348356
# be the normal case
349357
loaded = n1.loadtxoutset(dump_output['path'])
350358
assert_equal(loaded['coins_loaded'], SNAPSHOT_BASE_HEIGHT)
351359
assert_equal(loaded['base_height'], SNAPSHOT_BASE_HEIGHT)
352360

361+
self.log.info("Confirm that local services remain unchanged")
362+
# Since n1 is a pruned node, the 'NETWORK' service flag must always be unset.
363+
self.assert_only_network_limited_service(n1)
364+
353365
self.log.info("Check that UTXO-querying RPCs operate on snapshot chainstate")
354366
snapshot_hash = loaded['tip_hash']
355367
snapshot_num_coins = loaded['coins_loaded']
@@ -453,6 +465,9 @@ def check_tx_counts(final: bool) -> None:
453465
self.restart_node(1, extra_args=[
454466
f"-stopatheight={PAUSE_HEIGHT}", *self.extra_args[1]])
455467

468+
# Upon restart during snapshot tip sync, the node must remain in 'limited' mode.
469+
self.assert_only_network_limited_service(n1)
470+
456471
# Finally connect the nodes and let them sync.
457472
#
458473
# Set `wait_for_connect=False` to avoid a race between performing connection
@@ -469,6 +484,9 @@ def check_tx_counts(final: bool) -> None:
469484
self.log.info("Restarted node before snapshot validation completed, reloading...")
470485
self.restart_node(1, extra_args=self.extra_args[1])
471486

487+
# Upon restart, the node must remain in 'limited' mode
488+
self.assert_only_network_limited_service(n1)
489+
472490
# Send snapshot block to n1 out of order. This makes the test less
473491
# realistic because normally the snapshot block is one of the last
474492
# blocks downloaded, but its useful to test because it triggers more
@@ -487,6 +505,10 @@ def check_tx_counts(final: bool) -> None:
487505
self.log.info("Ensuring background validation completes")
488506
self.wait_until(lambda: len(n1.getchainstates()['chainstates']) == 1)
489507

508+
# Since n1 is a pruned node, it will not signal NODE_NETWORK after
509+
# completing the background sync.
510+
self.assert_only_network_limited_service(n1)
511+
490512
# Ensure indexes have synced.
491513
completed_idx_state = {
492514
'basic block filter index': COMPLETE_IDX,
@@ -517,12 +539,18 @@ def check_tx_counts(final: bool) -> None:
517539

518540
self.log.info("-- Testing all indexes + reindex")
519541
assert_equal(n2.getblockcount(), START_HEIGHT)
542+
assert 'NETWORK' in n2.getnetworkinfo()['localservicesnames'] # sanity check
520543

521544
self.log.info(f"Loading snapshot into third node from {dump_output['path']}")
522545
loaded = n2.loadtxoutset(dump_output['path'])
523546
assert_equal(loaded['coins_loaded'], SNAPSHOT_BASE_HEIGHT)
524547
assert_equal(loaded['base_height'], SNAPSHOT_BASE_HEIGHT)
525548

549+
# Even though n2 is a full node, it will unset the 'NETWORK' service flag during snapshot loading.
550+
# This indicates other peers that the node will temporarily not provide historical blocks.
551+
self.log.info("Check node2 updated the local services during snapshot load")
552+
self.assert_only_network_limited_service(n2)
553+
526554
for reindex_arg in ['-reindex=1', '-reindex-chainstate=1']:
527555
self.log.info(f"Check that restarting with {reindex_arg} will delete the snapshot chainstate")
528556
self.restart_node(2, extra_args=[reindex_arg, *self.extra_args[2]])
@@ -546,13 +574,21 @@ def check_tx_counts(final: bool) -> None:
546574
msg = "Unable to load UTXO snapshot: Can't activate a snapshot-based chainstate more than once"
547575
assert_raises_rpc_error(-32603, msg, n2.loadtxoutset, dump_output['path'])
548576

577+
# Upon restart, the node must stay in 'limited' mode until the background
578+
# chain sync completes.
579+
self.restart_node(2, extra_args=self.extra_args[2])
580+
self.assert_only_network_limited_service(n2)
581+
549582
self.connect_nodes(0, 2)
550583
self.wait_until(lambda: n2.getchainstates()['chainstates'][-1]['blocks'] == FINAL_HEIGHT)
551584
self.sync_blocks(nodes=(n0, n2))
552585

553586
self.log.info("Ensuring background validation completes")
554587
self.wait_until(lambda: len(n2.getchainstates()['chainstates']) == 1)
555588

589+
# Once background chain sync completes, the full node must start offering historical blocks again.
590+
assert {'NETWORK', 'NETWORK_LIMITED'}.issubset(n2.getnetworkinfo()['localservicesnames'])
591+
556592
completed_idx_state = {
557593
'basic block filter index': COMPLETE_IDX,
558594
'coinstatsindex': COMPLETE_IDX,

0 commit comments

Comments (0)