Skip to content

Commit 5a6d00c

Browse files
committed
Permit disconnection of outbound peers on bad/slow chains
Currently we have no rotation of outbound peers. If an outbound peer stops serving us blocks, or is on a consensus-incompatible chain with less work than our tip (but otherwise valid headers), then we will never disconnect that peer, even though that peer is using one of our 8 outbound connection slots. Because we rely on our outbound peers to find an honest node in order to reach consensus, allowing an incompatible peer to occupy one of those slots is undesirable, particularly if it is possible for all such slots to be occupied by such peers. Protect against this by always checking to see if a peer's best known block has less work than our tip, and if so, set a 20 minute timeout -- if the peer is still not known to have caught up to a chain with as much work as ours after 20 minutes, then send a single getheaders message, wait 2 more minutes, and if a better header hasn't been received by then, disconnect that peer. Note: - we do not require that our peer sync to the same tip as ours, just an equal or greater work tip. (Doing otherwise would risk partitioning the network in the event of a chain split, and is also unnecessary.) - we pick 4 of our outbound peers and do not subject them to this logic, to be more conservative. We don't wish to permit temporary network issues (or an attacker) to excessively disrupt network topology.
1 parent c60fd71 commit 5a6d00c

File tree

2 files changed

+120
-1
lines changed

2 files changed

+120
-1
lines changed

src/net_processing.cpp

Lines changed: 112 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,9 @@ namespace {
124124
/** Number of peers from which we're downloading blocks. */
125125
int nPeersWithValidatedDownloads = 0;
126126

127+
/** Number of outbound peers with m_chain_sync.m_protect. */
128+
int g_outbound_peers_with_protect_from_disconnect = 0;
129+
127130
/** Relay map, protected by cs_main. */
128131
typedef std::map<uint256, CTransactionRef> MapRelay;
129132
MapRelay mapRelay;
@@ -201,6 +204,33 @@ struct CNodeState {
201204
*/
202205
bool fSupportsDesiredCmpctVersion;
203206

207+
/** State used to enforce CHAIN_SYNC_TIMEOUT
208+
* Only in effect for outbound, non-manual connections, with
209+
* m_protect == false
210+
* Algorithm: if a peer's best known block has less work than our tip,
211+
* set a timeout CHAIN_SYNC_TIMEOUT seconds in the future:
212+
* - If at timeout their best known block now has more work than our tip
213+
* when the timeout was set, then either reset the timeout or clear it
214+
* (after comparing against our current tip's work)
215+
* - If at timeout their best known block still has less work than our
216+
* tip did when the timeout was set, then send a getheaders message,
217+
* and set a shorter timeout, HEADERS_RESPONSE_TIME seconds in future.
218+
* If their best known block is still behind when that new timeout is
219+
* reached, disconnect.
220+
*/
221+
struct ChainSyncTimeoutState {
222+
//! A timeout used for checking whether our peer has sufficiently synced
223+
int64_t m_timeout;
224+
//! A header with the work we require on our peer's chain
225+
const CBlockIndex * m_work_header;
226+
//! After timeout is reached, set to true after sending getheaders
227+
bool m_sent_getheaders;
228+
//! Whether this peer is protected from disconnection due to a bad/slow chain
229+
bool m_protect;
230+
};
231+
232+
ChainSyncTimeoutState m_chain_sync;
233+
204234
CNodeState(CAddress addrIn, std::string addrNameIn) : address(addrIn), name(addrNameIn) {
205235
fCurrentlyConnected = false;
206236
nMisbehavior = 0;
@@ -223,6 +253,7 @@ struct CNodeState {
223253
fHaveWitness = false;
224254
fWantsCmpctWitness = false;
225255
fSupportsDesiredCmpctVersion = false;
256+
m_chain_sync = { 0, nullptr, false, false };
226257
}
227258
};
228259

@@ -502,6 +533,13 @@ void FindNextBlocksToDownload(NodeId nodeid, unsigned int count, std::vector<con
502533

503534
} // namespace
504535

536+
// Returns true for outbound peers, excluding manual connections, feelers, and
537+
// one-shots
538+
bool IsOutboundDisconnectionCandidate(const CNode *node)
539+
{
540+
return !(node->fInbound || node->m_manual_connection || node->fFeeler || node->fOneShot);
541+
}
542+
505543
void PeerLogicValidation::InitializeNode(CNode *pnode) {
506544
CAddress addr = pnode->addr;
507545
std::string addrName = pnode->GetAddrName();
@@ -534,6 +572,8 @@ void PeerLogicValidation::FinalizeNode(NodeId nodeid, bool& fUpdateConnectionTim
534572
nPreferredDownload -= state->fPreferredDownload;
535573
nPeersWithValidatedDownloads -= (state->nBlocksInFlightValidHeaders != 0);
536574
assert(nPeersWithValidatedDownloads >= 0);
575+
g_outbound_peers_with_protect_from_disconnect -= state->m_chain_sync.m_protect;
576+
assert(g_outbound_peers_with_protect_from_disconnect >= 0);
537577

538578
mapNodeState.erase(nodeid);
539579

@@ -542,6 +582,7 @@ void PeerLogicValidation::FinalizeNode(NodeId nodeid, bool& fUpdateConnectionTim
542582
assert(mapBlocksInFlight.empty());
543583
assert(nPreferredDownload == 0);
544584
assert(nPeersWithValidatedDownloads == 0);
585+
assert(g_outbound_peers_with_protect_from_disconnect == 0);
545586
}
546587
LogPrint(BCLog::NET, "Cleared nodestate for peer=%d\n", nodeid);
547588
}
@@ -2324,6 +2365,10 @@ bool static ProcessMessage(CNode* pfrom, const std::string& strCommand, CDataStr
23242365
assert(pindexLast);
23252366
UpdateBlockAvailability(pfrom->GetId(), pindexLast->GetBlockHash());
23262367

2368+
// From here, pindexBestKnownBlock should be guaranteed to be non-null,
2369+
// because it is set in UpdateBlockAvailability. Some nullptr checks
2370+
// are still present, however, as belt-and-suspenders.
2371+
23272372
if (nCount == MAX_HEADERS_RESULTS) {
23282373
// Headers message had its maximum size; the peer may have more headers.
23292374
// TODO: optimize: if pindexLast is an ancestor of chainActive.Tip or pindexBestHeader, continue
@@ -2396,11 +2441,22 @@ bool static ProcessMessage(CNode* pfrom, const std::string& strCommand, CDataStr
23962441
// chainActive.Tip()) because we won't start block download
23972442
// until we have a headers chain that has at least
23982443
// nMinimumChainWork, even if a peer has a chain past our tip,
2399-
if (!(pfrom->fInbound || pfrom->fWhitelisted || pfrom->m_manual_connection)) {
2444+
// as an anti-DoS measure.
2445+
if (IsOutboundDisconnectionCandidate(pfrom)) {
2446+
LogPrintf("Disconnecting outbound peer %d -- headers chain has insufficient work\n", pfrom->GetId());
24002447
pfrom->fDisconnect = true;
24012448
}
24022449
}
24032450
}
2451+
2452+
if (!pfrom->fDisconnect && IsOutboundDisconnectionCandidate(pfrom) && nodestate->pindexBestKnownBlock != nullptr) {
2453+
// If this is an outbound peer, check to see if we should protect
2454+
// it from the bad/lagging chain logic.
2455+
if (g_outbound_peers_with_protect_from_disconnect < MAX_OUTBOUND_PEERS_TO_PROTECT_FROM_DISCONNECT && nodestate->pindexBestKnownBlock->nChainWork >= chainActive.Tip()->nChainWork && !nodestate->m_chain_sync.m_protect) {
2456+
nodestate->m_chain_sync.m_protect = true;
2457+
++g_outbound_peers_with_protect_from_disconnect;
2458+
}
2459+
}
24042460
}
24052461
}
24062462

@@ -2799,6 +2855,58 @@ bool PeerLogicValidation::ProcessMessages(CNode* pfrom, std::atomic<bool>& inter
27992855
return fMoreWork;
28002856
}
28012857

2858+
void PeerLogicValidation::ConsiderEviction(CNode *pto, int64_t time_in_seconds)
2859+
{
2860+
AssertLockHeld(cs_main);
2861+
2862+
CNodeState &state = *State(pto->GetId());
2863+
const CNetMsgMaker msgMaker(pto->GetSendVersion());
2864+
2865+
if (!state.m_chain_sync.m_protect && IsOutboundDisconnectionCandidate(pto) && state.fSyncStarted) {
2866+
// This is an outbound peer subject to disconnection if they don't
2867+
// announce a block with as much work as the current tip within
2868+
// CHAIN_SYNC_TIMEOUT + HEADERS_RESPONSE_TIME seconds (note: if
2869+
// their chain has more work than ours, we should sync to it,
2870+
// unless it's invalid, in which case we should find that out and
2871+
// disconnect from them elsewhere).
2872+
if (state.pindexBestKnownBlock != nullptr && state.pindexBestKnownBlock->nChainWork >= chainActive.Tip()->nChainWork) {
2873+
if (state.m_chain_sync.m_timeout != 0) {
2874+
state.m_chain_sync.m_timeout = 0;
2875+
state.m_chain_sync.m_work_header = nullptr;
2876+
state.m_chain_sync.m_sent_getheaders = false;
2877+
}
2878+
} else if (state.m_chain_sync.m_timeout == 0 || (state.m_chain_sync.m_work_header != nullptr && state.pindexBestKnownBlock != nullptr && state.pindexBestKnownBlock->nChainWork >= state.m_chain_sync.m_work_header->nChainWork)) {
2879+
// Our best block known by this peer is behind our tip, and we're either noticing
2880+
// that for the first time, OR this peer was able to catch up to some earlier point
2881+
// where we checked against our tip.
2882+
// Either way, set a new timeout based on current tip.
2883+
state.m_chain_sync.m_timeout = time_in_seconds + CHAIN_SYNC_TIMEOUT;
2884+
state.m_chain_sync.m_work_header = chainActive.Tip();
2885+
state.m_chain_sync.m_sent_getheaders = false;
2886+
} else if (state.m_chain_sync.m_timeout > 0 && time_in_seconds > state.m_chain_sync.m_timeout) {
2887+
// No evidence yet that our peer has synced to a chain with work equal to that
2888+
// of our tip, when we first detected it was behind. Send a single getheaders
2889+
// message to give the peer a chance to update us.
2890+
if (state.m_chain_sync.m_sent_getheaders) {
2891+
// They've run out of time to catch up!
2892+
LogPrintf("Disconnecting outbound peer %d for old chain, best known block = %s\n", pto->GetId(), state.pindexBestKnownBlock != nullptr ? state.pindexBestKnownBlock->GetBlockHash().ToString() : "<none>");
2893+
pto->fDisconnect = true;
2894+
} else {
2895+
LogPrint(BCLog::NET, "sending getheaders to outbound peer=%d to verify chain work (current best known block:%s, benchmark blockhash: %s)\n", pto->GetId(), state.pindexBestKnownBlock != nullptr ? state.pindexBestKnownBlock->GetBlockHash().ToString() : "<none>", state.m_chain_sync.m_work_header->GetBlockHash().ToString());
2896+
connman->PushMessage(pto, msgMaker.Make(NetMsgType::GETHEADERS, chainActive.GetLocator(state.m_chain_sync.m_work_header->pprev), uint256()));
2897+
state.m_chain_sync.m_sent_getheaders = true;
2898+
constexpr int64_t HEADERS_RESPONSE_TIME = 120; // 2 minutes
2899+
// Bump the timeout to allow a response, which could clear the timeout
2900+
// (if the response shows the peer has synced), reset the timeout (if
2901+
// the peer syncs to the required work but not to our tip), or result
2902+
// in disconnect (if we advance to the timeout and pindexBestKnownBlock
2903+
// has not sufficiently progressed)
2904+
state.m_chain_sync.m_timeout = time_in_seconds + HEADERS_RESPONSE_TIME;
2905+
}
2906+
}
2907+
}
2908+
}
2909+
28022910
class CompareInvMempoolOrder
28032911
{
28042912
CTxMemPool *mp;
@@ -3265,6 +3373,9 @@ bool PeerLogicValidation::SendMessages(CNode* pto, std::atomic<bool>& interruptM
32653373
}
32663374
}
32673375

3376+
// Check that outbound peers have reasonable chains
3377+
// GetTime() is used by this anti-DoS logic so we can test this using mocktime
3378+
ConsiderEviction(pto, GetTime());
32683379

32693380
//
32703381
// Message: getdata (blocks)

src/net_processing.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,12 @@ static const unsigned int DEFAULT_BLOCK_RECONSTRUCTION_EXTRA_TXN = 100;
2121
* Timeout = base + per_header * (expected number of headers) */
2222
static constexpr int64_t HEADERS_DOWNLOAD_TIMEOUT_BASE = 15 * 60 * 1000000; // 15 minutes
2323
static constexpr int64_t HEADERS_DOWNLOAD_TIMEOUT_PER_HEADER = 1000; // 1ms/header
24+
/** Protect at least this many outbound peers from disconnection due to slow/
25+
* behind headers chain.
26+
*/
27+
static constexpr int32_t MAX_OUTBOUND_PEERS_TO_PROTECT_FROM_DISCONNECT = 4;
28+
/** Timeout for (unprotected) outbound peers to sync to our chainwork, in seconds */
29+
static constexpr int64_t CHAIN_SYNC_TIMEOUT = 20 * 60; // 20 minutes
2430

2531
class PeerLogicValidation : public CValidationInterface, public NetEventsInterface {
2632
private:
@@ -47,6 +53,8 @@ class PeerLogicValidation : public CValidationInterface, public NetEventsInterfa
4753
* @return True if there is more work to be done
4854
*/
4955
bool SendMessages(CNode* pto, std::atomic<bool>& interrupt) override;
56+
57+
void ConsiderEviction(CNode *pto, int64_t time_in_seconds);
5058
};
5159

5260
struct CNodeStateStats {

0 commit comments

Comments
 (0)