Skip to content

Commit d93fa26

Browse files
committed
Merge #11490: Disconnect from outbound peers with bad headers chains
e065249 Add unit test for outbound peer eviction (Suhas Daftuar) 5a6d00c Permit disconnection of outbound peers on bad/slow chains (Suhas Daftuar) c60fd71 Disconnecting from bad outbound peers in IBD (Suhas Daftuar) Pull request description: The first commit will disconnect an outbound peer that serves us a headers chain with insufficient work while we're in IBD. The second commit introduces a way to disconnect outbound peers whose chains fall out of sync with ours: For a given outbound peer, we check whether their best known block (which is known from the blocks they announce to us) has at least as much work as our tip. If it doesn't, we set a 20 minute timeout, and if we still haven't heard about a block with as much work as our tip had when we set the timeout, then we send a single getheaders message, and wait 2 more minutes. If after two minutes their best known block has insufficient work, we disconnect that peer. We protect 4 of our outbound peers (who provide some "good" headers chains, ie a chain with at least as much work as our tip at some point) from being subject to this logic, to prevent excessive network topology changes as a result of this algorithm, while still ensuring that we have a reasonable number of nodes not known to be on bogus chains. We also don't require our peers to be on the same chain as us, to prevent accidental partitioning of the network in the event of a chain split. Note that if our peers are ever on a more work chain than our tip, then we will download and validate it, and then either reorg to it, or learn of a consensus incompatibility with that peer and disconnect. This PR is designed to protect against peers that are on a less work chain which we may never try to download and validate. Tree-SHA512: 2e0169a1dd8a7fb95980573ac4a201924bffdd724c19afcab5efcef076fdbe1f2cec7dc5f5d7e0a6327216f56d3828884f73642e00c8534b56ec2bb4c854a656
2 parents cf8c4a7 + e065249 commit d93fa26

File tree

4 files changed

+200
-0
lines changed

4 files changed

+200
-0
lines changed

src/net_processing.cpp

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,9 @@ namespace {
124124
/** Number of peers from which we're downloading blocks. */
125125
int nPeersWithValidatedDownloads = 0;
126126

127+
/** Number of outbound peers with m_chain_sync.m_protect. */
128+
int g_outbound_peers_with_protect_from_disconnect = 0;
129+
127130
/** Relay map, protected by cs_main. */
128131
typedef std::map<uint256, CTransactionRef> MapRelay;
129132
MapRelay mapRelay;
@@ -201,6 +204,33 @@ struct CNodeState {
201204
*/
202205
bool fSupportsDesiredCmpctVersion;
203206

207+
/** State used to enforce CHAIN_SYNC_TIMEOUT
208+
* Only in effect for outbound, non-manual connections, with
209+
* m_protect == false
210+
* Algorithm: if a peer's best known block has less work than our tip,
211+
* set a timeout CHAIN_SYNC_TIMEOUT seconds in the future:
212+
* - If at timeout their best known block now has more work than our tip
213+
* when the timeout was set, then either reset the timeout or clear it
214+
* (after comparing against our current tip's work)
215+
* - If at timeout their best known block still has less work than our
216+
* tip did when the timeout was set, then send a getheaders message,
217+
* and set a shorter timeout, HEADERS_RESPONSE_TIME seconds in future.
218+
* If their best known block is still behind when that new timeout is
219+
* reached, disconnect.
220+
*/
221+
struct ChainSyncTimeoutState {
222+
//! A timeout used for checking whether our peer has sufficiently synced
223+
int64_t m_timeout;
224+
//! A header with the work we require on our peer's chain
225+
const CBlockIndex * m_work_header;
226+
//! After timeout is reached, set to true after sending getheaders
227+
bool m_sent_getheaders;
228+
//! Whether this peer is protected from disconnection due to a bad/slow chain
229+
bool m_protect;
230+
};
231+
232+
ChainSyncTimeoutState m_chain_sync;
233+
204234
CNodeState(CAddress addrIn, std::string addrNameIn) : address(addrIn), name(addrNameIn) {
205235
fCurrentlyConnected = false;
206236
nMisbehavior = 0;
@@ -223,6 +253,7 @@ struct CNodeState {
223253
fHaveWitness = false;
224254
fWantsCmpctWitness = false;
225255
fSupportsDesiredCmpctVersion = false;
256+
m_chain_sync = { 0, nullptr, false, false };
226257
}
227258
};
228259

@@ -502,6 +533,13 @@ void FindNextBlocksToDownload(NodeId nodeid, unsigned int count, std::vector<con
502533

503534
} // namespace
504535

536+
// Returns true for outbound peers, excluding manual connections, feelers, and
537+
// one-shots
538+
bool IsOutboundDisconnectionCandidate(const CNode *node)
539+
{
540+
return !(node->fInbound || node->m_manual_connection || node->fFeeler || node->fOneShot);
541+
}
542+
505543
void PeerLogicValidation::InitializeNode(CNode *pnode) {
506544
CAddress addr = pnode->addr;
507545
std::string addrName = pnode->GetAddrName();
@@ -534,6 +572,8 @@ void PeerLogicValidation::FinalizeNode(NodeId nodeid, bool& fUpdateConnectionTim
534572
nPreferredDownload -= state->fPreferredDownload;
535573
nPeersWithValidatedDownloads -= (state->nBlocksInFlightValidHeaders != 0);
536574
assert(nPeersWithValidatedDownloads >= 0);
575+
g_outbound_peers_with_protect_from_disconnect -= state->m_chain_sync.m_protect;
576+
assert(g_outbound_peers_with_protect_from_disconnect >= 0);
537577

538578
mapNodeState.erase(nodeid);
539579

@@ -542,6 +582,7 @@ void PeerLogicValidation::FinalizeNode(NodeId nodeid, bool& fUpdateConnectionTim
542582
assert(mapBlocksInFlight.empty());
543583
assert(nPreferredDownload == 0);
544584
assert(nPeersWithValidatedDownloads == 0);
585+
assert(g_outbound_peers_with_protect_from_disconnect == 0);
545586
}
546587
LogPrint(BCLog::NET, "Cleared nodestate for peer=%d\n", nodeid);
547588
}
@@ -2337,6 +2378,10 @@ bool static ProcessMessage(CNode* pfrom, const std::string& strCommand, CDataStr
23372378
assert(pindexLast);
23382379
UpdateBlockAvailability(pfrom->GetId(), pindexLast->GetBlockHash());
23392380

2381+
// From here, pindexBestKnownBlock should be guaranteed to be non-null,
2382+
// because it is set in UpdateBlockAvailability. Some nullptr checks
2383+
// are still present, however, as belt-and-suspenders.
2384+
23402385
if (nCount == MAX_HEADERS_RESULTS) {
23412386
// Headers message had its maximum size; the peer may have more headers.
23422387
// TODO: optimize: if pindexLast is an ancestor of chainActive.Tip or pindexBestHeader, continue
@@ -2396,6 +2441,35 @@ bool static ProcessMessage(CNode* pfrom, const std::string& strCommand, CDataStr
23962441
}
23972442
}
23982443
}
2444+
// If we're in IBD, we want outbound peers that will serve us a useful
2445+
// chain. Disconnect peers that are on chains with insufficient work.
2446+
if (IsInitialBlockDownload() && nCount != MAX_HEADERS_RESULTS) {
2447+
// When nCount < MAX_HEADERS_RESULTS, we know we have no more
2448+
// headers to fetch from this peer.
2449+
if (nodestate->pindexBestKnownBlock && nodestate->pindexBestKnownBlock->nChainWork < nMinimumChainWork) {
2450+
// This peer has too little work on their headers chain to help
2451+
// us sync -- disconnect if using an outbound slot (unless
2452+
// whitelisted or addnode).
2453+
// Note: We compare their tip to nMinimumChainWork (rather than
2454+
// chainActive.Tip()) because we won't start block download
2455+
// until we have a headers chain that has at least
2456+
// nMinimumChainWork, even if a peer has a chain past our tip,
2457+
// as an anti-DoS measure.
2458+
if (IsOutboundDisconnectionCandidate(pfrom)) {
2459+
LogPrintf("Disconnecting outbound peer %d -- headers chain has insufficient work\n", pfrom->GetId());
2460+
pfrom->fDisconnect = true;
2461+
}
2462+
}
2463+
}
2464+
2465+
if (!pfrom->fDisconnect && IsOutboundDisconnectionCandidate(pfrom) && nodestate->pindexBestKnownBlock != nullptr) {
2466+
// If this is an outbound peer, check to see if we should protect
2467+
// it from the bad/lagging chain logic.
2468+
if (g_outbound_peers_with_protect_from_disconnect < MAX_OUTBOUND_PEERS_TO_PROTECT_FROM_DISCONNECT && nodestate->pindexBestKnownBlock->nChainWork >= chainActive.Tip()->nChainWork && !nodestate->m_chain_sync.m_protect) {
2469+
nodestate->m_chain_sync.m_protect = true;
2470+
++g_outbound_peers_with_protect_from_disconnect;
2471+
}
2472+
}
23992473
}
24002474
}
24012475

@@ -2794,6 +2868,58 @@ bool PeerLogicValidation::ProcessMessages(CNode* pfrom, std::atomic<bool>& inter
27942868
return fMoreWork;
27952869
}
27962870

2871+
void PeerLogicValidation::ConsiderEviction(CNode *pto, int64_t time_in_seconds)
2872+
{
2873+
AssertLockHeld(cs_main);
2874+
2875+
CNodeState &state = *State(pto->GetId());
2876+
const CNetMsgMaker msgMaker(pto->GetSendVersion());
2877+
2878+
if (!state.m_chain_sync.m_protect && IsOutboundDisconnectionCandidate(pto) && state.fSyncStarted) {
2879+
// This is an outbound peer subject to disconnection if they don't
2880+
// announce a block with as much work as the current tip within
2881+
// CHAIN_SYNC_TIMEOUT + HEADERS_RESPONSE_TIME seconds (note: if
2882+
// their chain has more work than ours, we should sync to it,
2883+
// unless it's invalid, in which case we should find that out and
2884+
// disconnect from them elsewhere).
2885+
if (state.pindexBestKnownBlock != nullptr && state.pindexBestKnownBlock->nChainWork >= chainActive.Tip()->nChainWork) {
2886+
if (state.m_chain_sync.m_timeout != 0) {
2887+
state.m_chain_sync.m_timeout = 0;
2888+
state.m_chain_sync.m_work_header = nullptr;
2889+
state.m_chain_sync.m_sent_getheaders = false;
2890+
}
2891+
} else if (state.m_chain_sync.m_timeout == 0 || (state.m_chain_sync.m_work_header != nullptr && state.pindexBestKnownBlock != nullptr && state.pindexBestKnownBlock->nChainWork >= state.m_chain_sync.m_work_header->nChainWork)) {
2892+
// Our best block known by this peer is behind our tip, and we're either noticing
2893+
// that for the first time, OR this peer was able to catch up to some earlier point
2894+
// where we checked against our tip.
2895+
// Either way, set a new timeout based on current tip.
2896+
state.m_chain_sync.m_timeout = time_in_seconds + CHAIN_SYNC_TIMEOUT;
2897+
state.m_chain_sync.m_work_header = chainActive.Tip();
2898+
state.m_chain_sync.m_sent_getheaders = false;
2899+
} else if (state.m_chain_sync.m_timeout > 0 && time_in_seconds > state.m_chain_sync.m_timeout) {
2900+
// No evidence yet that our peer has synced to a chain with work equal to that
2901+
// of our tip, when we first detected it was behind. Send a single getheaders
2902+
// message to give the peer a chance to update us.
2903+
if (state.m_chain_sync.m_sent_getheaders) {
2904+
// They've run out of time to catch up!
2905+
LogPrintf("Disconnecting outbound peer %d for old chain, best known block = %s\n", pto->GetId(), state.pindexBestKnownBlock != nullptr ? state.pindexBestKnownBlock->GetBlockHash().ToString() : "<none>");
2906+
pto->fDisconnect = true;
2907+
} else {
2908+
LogPrint(BCLog::NET, "sending getheaders to outbound peer=%d to verify chain work (current best known block:%s, benchmark blockhash: %s)\n", pto->GetId(), state.pindexBestKnownBlock != nullptr ? state.pindexBestKnownBlock->GetBlockHash().ToString() : "<none>", state.m_chain_sync.m_work_header->GetBlockHash().ToString());
2909+
connman->PushMessage(pto, msgMaker.Make(NetMsgType::GETHEADERS, chainActive.GetLocator(state.m_chain_sync.m_work_header->pprev), uint256()));
2910+
state.m_chain_sync.m_sent_getheaders = true;
2911+
constexpr int64_t HEADERS_RESPONSE_TIME = 120; // 2 minutes
2912+
// Bump the timeout to allow a response, which could clear the timeout
2913+
// (if the response shows the peer has synced), reset the timeout (if
2914+
// the peer syncs to the required work but not to our tip), or result
2915+
// in disconnect (if we advance to the timeout and pindexBestKnownBlock
2916+
// has not sufficiently progressed)
2917+
state.m_chain_sync.m_timeout = time_in_seconds + HEADERS_RESPONSE_TIME;
2918+
}
2919+
}
2920+
}
2921+
}
2922+
27972923
class CompareInvMempoolOrder
27982924
{
27992925
CTxMemPool *mp;
@@ -3260,6 +3386,9 @@ bool PeerLogicValidation::SendMessages(CNode* pto, std::atomic<bool>& interruptM
32603386
}
32613387
}
32623388

3389+
// Check that outbound peers have reasonable chains
3390+
// GetTime() is used by this anti-DoS logic so we can test this using mocktime
3391+
ConsiderEviction(pto, GetTime());
32633392

32643393
//
32653394
// Message: getdata (blocks)

src/net_processing.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,12 @@ static const unsigned int DEFAULT_BLOCK_RECONSTRUCTION_EXTRA_TXN = 100;
2121
* Timeout = base + per_header * (expected number of headers) */
2222
static constexpr int64_t HEADERS_DOWNLOAD_TIMEOUT_BASE = 15 * 60 * 1000000; // 15 minutes
2323
static constexpr int64_t HEADERS_DOWNLOAD_TIMEOUT_PER_HEADER = 1000; // 1ms/header
24+
/** Protect at least this many outbound peers from disconnection due to slow/
25+
* behind headers chain.
26+
*/
27+
static constexpr int32_t MAX_OUTBOUND_PEERS_TO_PROTECT_FROM_DISCONNECT = 4;
28+
/** Timeout for (unprotected) outbound peers to sync to our chainwork, in seconds */
29+
static constexpr int64_t CHAIN_SYNC_TIMEOUT = 20 * 60; // 20 minutes
2430

2531
class PeerLogicValidation : public CValidationInterface, public NetEventsInterface {
2632
private:
@@ -47,6 +53,8 @@ class PeerLogicValidation : public CValidationInterface, public NetEventsInterfa
4753
* @return True if there is more work to be done
4854
*/
4955
bool SendMessages(CNode* pto, std::atomic<bool>& interrupt) override;
56+
57+
void ConsiderEviction(CNode *pto, int64_t time_in_seconds);
5058
};
5159

5260
struct CNodeStateStats {

src/test/DoS_tests.cpp

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,51 @@ static NodeId id = 0;
4242

4343
BOOST_FIXTURE_TEST_SUITE(DoS_tests, TestingSetup)
4444

45+
// Test eviction of an outbound peer whose chain never advances
46+
// Mock a node connection, and use mocktime to simulate a peer
47+
// which never sends any headers messages. PeerLogic should
48+
// decide to evict that outbound peer, after the appropriate timeouts.
49+
// Note that we protect 4 outbound nodes from being subject to
50+
// this logic; this test takes advantage of that protection only
51+
// being applied to nodes which send headers with sufficient
52+
// work.
53+
BOOST_AUTO_TEST_CASE(outbound_slow_chain_eviction)
54+
{
55+
std::atomic<bool> interruptDummy(false);
56+
57+
// Mock an outbound peer
58+
CAddress addr1(ip(0xa0b0c001), NODE_NONE);
59+
CNode dummyNode1(id++, ServiceFlags(NODE_NETWORK|NODE_WITNESS), 0, INVALID_SOCKET, addr1, 0, 0, CAddress(), "", /*fInboundIn=*/ false);
60+
dummyNode1.SetSendVersion(PROTOCOL_VERSION);
61+
62+
peerLogic->InitializeNode(&dummyNode1);
63+
dummyNode1.nVersion = 1;
64+
dummyNode1.fSuccessfullyConnected = true;
65+
66+
// This test requires that we have a chain with non-zero work.
67+
BOOST_CHECK(chainActive.Tip() != nullptr);
68+
BOOST_CHECK(chainActive.Tip()->nChainWork > 0);
69+
70+
// Test starts here
71+
peerLogic->SendMessages(&dummyNode1, interruptDummy); // should result in getheaders
72+
BOOST_CHECK(dummyNode1.vSendMsg.size() > 0);
73+
dummyNode1.vSendMsg.clear();
74+
75+
int64_t nStartTime = GetTime();
76+
// Wait 21 minutes
77+
SetMockTime(nStartTime+21*60);
78+
peerLogic->SendMessages(&dummyNode1, interruptDummy); // should result in getheaders
79+
BOOST_CHECK(dummyNode1.vSendMsg.size() > 0);
80+
// Wait 3 more minutes
81+
SetMockTime(nStartTime+24*60);
82+
peerLogic->SendMessages(&dummyNode1, interruptDummy); // should result in disconnect
83+
BOOST_CHECK(dummyNode1.fDisconnect == true);
84+
SetMockTime(0);
85+
86+
bool dummy;
87+
peerLogic->FinalizeNode(dummyNode1.GetId(), dummy);
88+
}
89+
4590
BOOST_AUTO_TEST_CASE(DoS_banning)
4691
{
4792
std::atomic<bool> interruptDummy(false);
@@ -71,6 +116,10 @@ BOOST_AUTO_TEST_CASE(DoS_banning)
71116
Misbehaving(dummyNode2.GetId(), 50);
72117
peerLogic->SendMessages(&dummyNode2, interruptDummy);
73118
BOOST_CHECK(connman->IsBanned(addr2));
119+
120+
bool dummy;
121+
peerLogic->FinalizeNode(dummyNode1.GetId(), dummy);
122+
peerLogic->FinalizeNode(dummyNode2.GetId(), dummy);
74123
}
75124

76125
BOOST_AUTO_TEST_CASE(DoS_banscore)
@@ -95,6 +144,9 @@ BOOST_AUTO_TEST_CASE(DoS_banscore)
95144
peerLogic->SendMessages(&dummyNode1, interruptDummy);
96145
BOOST_CHECK(connman->IsBanned(addr1));
97146
gArgs.ForceSetArg("-banscore", std::to_string(DEFAULT_BANSCORE_THRESHOLD));
147+
148+
bool dummy;
149+
peerLogic->FinalizeNode(dummyNode1.GetId(), dummy);
98150
}
99151

100152
BOOST_AUTO_TEST_CASE(DoS_bantime)
@@ -121,6 +173,9 @@ BOOST_AUTO_TEST_CASE(DoS_bantime)
121173

122174
SetMockTime(nStartTime+60*60*24+1);
123175
BOOST_CHECK(!connman->IsBanned(addr));
176+
177+
bool dummy;
178+
peerLogic->FinalizeNode(dummyNode.GetId(), dummy);
124179
}
125180

126181
CTransactionRef RandomOrphan()

test/functional/minchainwork.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ class MinimumChainWorkTest(BitcoinTestFramework):
2727
def set_test_params(self):
2828
self.setup_clean_chain = True
2929
self.num_nodes = 3
30+
3031
self.extra_args = [[], ["-minimumchainwork=0x65"], ["-minimumchainwork=0x65"]]
3132
self.node_min_work = [0, 101, 101]
3233

@@ -74,6 +75,13 @@ def run_test(self):
7475
self.nodes[0].generate(1)
7576

7677
self.log.info("Verifying nodes are all synced")
78+
79+
# Because nodes in regtest are all manual connections (eg using
80+
# addnode), node1 should not have disconnected node0. If not for that,
81+
# we'd expect node1 to have disconnected node0 for serving an
82+
# insufficient work chain, in which case we'd need to reconnect them to
83+
# continue the test.
84+
7785
self.sync_all()
7886
self.log.info("Blockcounts: %s", [n.getblockcount() for n in self.nodes])
7987

0 commit comments

Comments
 (0)