Skip to content

Commit 0565951

Browse files
mzumsande and vasild committed
p2p: Make block stalling timeout adaptive
This makes the stalling detection mechanism (previously a fixed timeout of 2s) adaptive: if we disconnect a peer for stalling, we double the timeout for the next peer (capped at 64s) and let it slowly relax back to its default value, shrinking it by 15% each time the tip advances. (Idea by Pieter Wuille.) This makes it less likely that we keep disconnecting many of our peers for stalling when the actual problem is that our own bandwidth is insufficient to download a block in 2 seconds. Co-authored-by: Vasil Dimov <[email protected]>
1 parent d480586 commit 0565951

File tree

1 file changed

+28
-4
lines changed

1 file changed

+28
-4
lines changed

src/net_processing.cpp

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -110,8 +110,11 @@ static constexpr auto GETDATA_TX_INTERVAL{60s};
110110
static const unsigned int MAX_GETDATA_SZ = 1000;
111111
/** Number of blocks that can be requested at any given time from a single peer. */
112112
static const int MAX_BLOCKS_IN_TRANSIT_PER_PEER = 16;
113-
/** Time during which a peer must stall block download progress before being disconnected. */
114-
static constexpr auto BLOCK_STALLING_TIMEOUT{2s};
113+
/** Default time during which a peer must stall block download progress before being disconnected.
114+
* The actual timeout is temporarily increased if peers are disconnected for hitting the timeout. */
115+
static constexpr auto BLOCK_STALLING_TIMEOUT_DEFAULT{2s};
116+
/** Maximum timeout for stalling block download. */
117+
static constexpr auto BLOCK_STALLING_TIMEOUT_MAX{64s};
115118
/** Number of headers sent in one getheaders result. We rely on the assumption that if a peer sends
116119
* less than this number, we reached its tip. Changing this value is a protocol upgrade. */
117120
static const unsigned int MAX_HEADERS_RESULTS = 2000;
@@ -705,6 +708,9 @@ class PeerManagerImpl final : public PeerManager
705708
/** Number of preferable block download peers. */
706709
int m_num_preferred_download_peers GUARDED_BY(cs_main){0};
707710

711+
/** Stalling timeout for blocks in IBD */
712+
std::atomic<std::chrono::seconds> m_block_stalling_timeout{BLOCK_STALLING_TIMEOUT_DEFAULT};
713+
708714
bool AlreadyHaveTx(const GenTxid& gtxid)
709715
EXCLUSIVE_LOCKS_REQUIRED(cs_main, !m_recent_confirmed_transactions_mutex);
710716

@@ -1700,7 +1706,8 @@ void PeerManagerImpl::StartScheduledTasks(CScheduler& scheduler)
17001706
/**
17011707
* Evict orphan txn pool entries based on a newly connected
17021708
* block, remember the recently confirmed transactions, and delete tracked
1703-
* announcements for them. Also save the time of the last tip update.
1709+
* announcements for them. Also save the time of the last tip update and
1710+
* possibly reduce dynamic block stalling timeout.
17041711
*/
17051712
void PeerManagerImpl::BlockConnected(const std::shared_ptr<const CBlock>& pblock, const CBlockIndex* pindex)
17061713
{
@@ -1723,6 +1730,16 @@ void PeerManagerImpl::BlockConnected(const std::shared_ptr<const CBlock>& pblock
17231730
m_txrequest.ForgetTxHash(ptx->GetWitnessHash());
17241731
}
17251732
}
1733+
1734+
// In case the dynamic timeout was doubled once or more, reduce it slowly back to its default value
1735+
auto stalling_timeout = m_block_stalling_timeout.load();
1736+
Assume(stalling_timeout >= BLOCK_STALLING_TIMEOUT_DEFAULT);
1737+
if (stalling_timeout != BLOCK_STALLING_TIMEOUT_DEFAULT) {
1738+
const auto new_timeout = std::max(std::chrono::duration_cast<std::chrono::seconds>(stalling_timeout * 0.85), BLOCK_STALLING_TIMEOUT_DEFAULT);
1739+
if (m_block_stalling_timeout.compare_exchange_strong(stalling_timeout, new_timeout)) {
1740+
LogPrint(BCLog::NET, "Decreased stalling timeout to %d seconds\n", new_timeout.count());
1741+
}
1742+
}
17261743
}
17271744

17281745
void PeerManagerImpl::BlockDisconnected(const std::shared_ptr<const CBlock> &block, const CBlockIndex* pindex)
@@ -5225,12 +5242,19 @@ bool PeerManagerImpl::SendMessages(CNode* pto)
52255242
m_connman.PushMessage(pto, msgMaker.Make(NetMsgType::INV, vInv));
52265243

52275244
// Detect whether we're stalling
5228-
if (state.m_stalling_since.count() && state.m_stalling_since < current_time - BLOCK_STALLING_TIMEOUT) {
5245+
auto stalling_timeout = m_block_stalling_timeout.load();
5246+
if (state.m_stalling_since.count() && state.m_stalling_since < current_time - stalling_timeout) {
52295247
// Stalling only triggers when the block download window cannot move. During normal steady state,
52305248
// the download window should be much larger than the to-be-downloaded set of blocks, so disconnection
52315249
// should only happen during initial block download.
52325250
LogPrintf("Peer=%d is stalling block download, disconnecting\n", pto->GetId());
52335251
pto->fDisconnect = true;
5252+
// Increase timeout for the next peer so that we don't disconnect multiple peers if our own
5253+
// bandwidth is insufficient.
5254+
const auto new_timeout = std::min(2 * stalling_timeout, BLOCK_STALLING_TIMEOUT_MAX);
5255+
if (stalling_timeout != new_timeout && m_block_stalling_timeout.compare_exchange_strong(stalling_timeout, new_timeout)) {
5256+
LogPrint(BCLog::NET, "Increased stalling timeout temporarily to %d seconds\n", m_block_stalling_timeout.load().count());
5257+
}
52345258
return true;
52355259
}
52365260
// In case there is a block that has been in flight from this peer for block_interval * (1 + 0.5 * N)

0 commit comments

Comments
 (0)