Commit af07a53

Beacon sync reorg headers download processing (#3359)
* Update/generalise last-slow-peer management

  why:
    With slow-peer management, the last remaining sync peer is never
    zombified if it is *slow* but still delivers some data. This was
    implemented for the blocks download only; it is now extended to the
    headers download.

* Reorg headers download and stashing on header chain cache

  why:
    Headers download and stashing on the header chain cache is now modelled
    after how it is done for blocks (which in turn was originally re-modelled
    after the headers download in PR #3306.) Apart from a code cleanup, the
    main change is that each queued record now holds only a single sync peer
    response (previously it was a list of several concatenated responses.)

* Remove restriction on the number of sync peers

  why:
    This restriction is a legacy construct which was used for
    + allowing a single-peer run for testing
    + implicitly restricting the header and block queues while their size
      was bounded by a high-water mark rather than a strict upper bound.

* Reduce number of headers requested at a time via ethXX to 800

  why:
    This reduces the need for in-memory cache space. When downloading 22.6m
    headers from `mainnet` with a download request size of 1024, one can make
    it in just under an hour on a well exposed site (so that enough peers are
    available.) Reducing the request size to 800, one gets just some minutes
    over an hour.

* Update copyright year
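
As a rough cross-check of the request-size trade-off above (an illustration only; the 22.6m header count and the roughly-one-hour figure come from the commit message, everything else is plain arithmetic):

```nim
# Back-of-envelope estimate for the 1024 -> 800 request-size change.
# Only the header count and the ~1 hour total are taken from the commit
# message; the rest is arithmetic, not a measurement.
let
  nHeaders = 22_600_000u64                 # mainnet headers quoted above
  oldReqs  = (nHeaders + 1023) div 1024    # ~22_071 requests of 1024 headers
  newReqs  = (nHeaders + 799) div 800      # 28_250 requests of 800 headers

# About 28% more round trips, each response carrying ~22% less data, which
# is where the reduced in-memory cache pressure comes from. At roughly one
# hour total this is on the order of 6-8 header requests per second.
echo "requests: ", oldReqs, " -> ", newReqs
```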
1 parent 5413777 commit af07a53

10 files changed: +186 -387 lines changed

execution_chain/sync/beacon/worker.nim

Lines changed: 1 addition & 5 deletions
@@ -60,10 +60,6 @@ proc start*(buddy: BeaconBuddyRef; info: static[string]): bool =
     peer = buddy.peer
     ctx = buddy.ctx
 
-  if runsThisManyPeersOnly <= buddy.ctx.pool.nBuddies:
-    if not ctx.hibernate: debug info & ": peers limit reached", peer
-    return false
-
   if not ctx.pool.seenData and buddy.peerID in ctx.pool.failedPeers:
     if not ctx.hibernate: debug info & ": useless peer already tried", peer
     return false
@@ -73,7 +69,7 @@ proc start*(buddy: BeaconBuddyRef; info: static[string]): bool =
     return false
 
   if not ctx.hibernate: debug info & ": new peer",
-    peer, nSyncPeers=buddy.ctx.pool.nBuddies
+    peer, nSyncPeers=ctx.pool.nBuddies
   true
 
 proc stop*(buddy: BeaconBuddyRef; info: static[string]) =

execution_chain/sync/beacon/worker/blocks_staged/bodies_fetch.nim

Lines changed: 2 additions & 2 deletions
@@ -32,7 +32,7 @@ proc fetchRegisterError*(buddy: BeaconBuddyRef, slowPeer = false) =
       # It would have been zombified if it were not the last one. This can be
       # used in functions -- depending on context -- that will trigger if the
       # if the pool of available sync peers becomes empty.
-      buddy.ctx.pool.blkLastSlowPeer = Opt.some(buddy.peerID)
+      buddy.ctx.pool.lastSlowPeer = Opt.some(buddy.peerID)
     else:
       buddy.ctrl.zombie = true # abandon slow peer unless last one
 
@@ -97,7 +97,7 @@ proc bodiesFetch*(
     buddy.fetchRegisterError(slowPeer=true)
   else:
     buddy.only.nRespErrors.blk = 0 # reset error count
-    buddy.ctx.pool.blkLastSlowPeer = Opt.none(Hash) # not last one or not error
+    buddy.ctx.pool.lastSlowPeer = Opt.none(Hash) # not last one or not error
 
   trace trEthRecvReceivedBlockBodies, peer, nReq, nResp=b.len,
     elapsed=elapsed.toStr, syncState=($buddy.syncState),

execution_chain/sync/beacon/worker/headers_staged.nim

Lines changed: 44 additions & 83 deletions
@@ -15,7 +15,7 @@ import
   pkg/eth/common,
   pkg/stew/[interval_set, sorted_set],
   ../worker_desc,
-  ./headers_staged/[headers_fetch, staged_collect, staged_headers],
+  ./headers_staged/[headers_fetch, staged_headers],
   ./headers_unproc
 
 # ------------------------------------------------------------------------------
@@ -27,7 +27,7 @@ func headersStagedCollectOk*(buddy: BeaconBuddyRef): bool =
   if buddy.ctrl.running:
     let ctx = buddy.ctx
     if 0 < ctx.headersUnprocAvail() and
-       not ctx.collectModeStopped():
+       not ctx.headersModeStopped():
       return true
   false
 
@@ -93,92 +93,56 @@ proc headersStagedCollect*(
       discard ctx.headersUnprocFetch(top - dangling).expect("iv")
 
       let
-        # Reserve the full range of block numbers so they can be appended in a
-        # row. This avoid some fragmentation when header chains are stashed by
-        # multiple peers, i.e. they interleave peer task-wise.
-        iv = ctx.headersUnprocFetch(nFetchHeadersBatchListLen).valueOr:
-          break fetchHeadersBody # done, exit this function
-
         # Get parent hash from the most senior stored header
         parent = ctx.hdrCache.antecedent.parentHash
 
-        # Fetch headers and store them on the header chain cache. The function
-        # returns the last unprocessed block number
-        bottom = await buddy.collectAndStashOnDiskCache(iv, parent, info)
-
-      # Check whether there were some headers fetched at all
-      if bottom < iv.maxPt:
-        nStored += (iv.maxPt - bottom) # statistics
-        ctx.pool.seenData = true # header data exist
+        # Fetch some headers
+        rev = (await buddy.headersFetch(
+          parent, nFetchHeadersRequest, info)).valueOr:
+          break fetchHeadersBody # error => exit block
 
-      # Job might have been cancelled or completed while downloading headers.
-      # If so, no more bookkeeping of headers must take place. The *books*
-      # might have been reset and prepared for the next stage.
-      if ctx.collectModeStopped():
-        trace info & ": stopped fetching/storing headers", peer, iv,
-          bottom=bottom.bnStr, nStored, syncState=($buddy.syncState)
-        break fetchHeadersBody # done, exit this function
+      ctx.pool.seenData = true # header data exist
 
-      # Commit partially processed block numbers
-      if iv.minPt <= bottom:
-        ctx.headersUnprocCommit(iv,iv.minPt,bottom) # partial success only
-        break fetchHeadersBody # done, exit this function
+      # Store it on the header chain cache
+      let dTop = ctx.hdrCache.antecedent.number # current antecedent
+      if not buddy.headersStashOnDisk(rev, buddy.peerID, info):
+        break fetchHeadersBody # error => exit block
 
-      ctx.headersUnprocCommit(iv) # all headers processed
+      let dBottom = ctx.hdrCache.antecedent.number # update new antecedent
+      nStored += (dTop - dBottom) # statistics
 
-      debug info & ": fetched headers count", peer,
-        unprocTop=ctx.headersUnprocAvailTop.bnStr,
-        D=ctx.hdrCache.antecedent.bnStr, nStored, nStagedQ=ctx.hdr.staged.len,
-        syncState=($buddy.syncState)
+      if dBottom == dTop:
+        break fetchHeadersBody # nothing achieved
 
-      # Buddy might have been cancelled while downloading headers.
-      if buddy.ctrl.stopped:
-        break fetchHeadersBody
+      if buddy.ctrl.stopped: # peer was cancelled
+        break fetchHeadersBody # done, exit this block
 
     # End while: `collectAndStashOnDiskCache()`
 
     # Continue opportunistically fetching by block number rather than hash. The
     # fetched headers need to be staged and checked/serialised later.
     if ctx.hdr.staged.len + ctx.hdr.reserveStaged < headersStagedQueueLengthMax:
 
-      let
-        # Comment see deterministic case
-        iv = ctx.headersUnprocFetch(nFetchHeadersBatchListLen).valueOr:
-          break fetchHeadersBody # done, exit this function
-
-        # This record will accumulate the fetched headers. It must be on the
-        # heap so that `async` can capture that properly.
-        lhc = (ref LinkedHChain)(peerID: buddy.peerID)
-
-      # Fetch headers and fill up the headers list of `lhc`. The function
-      # returns the last unprocessed block number.
+      # Fetch headers
       ctx.hdr.reserveStaged.inc # Book a slot on `staged`
-      let bottom = await buddy.collectAndStageOnMemQueue(iv, lhc, info)
+      let rc = await buddy.headersFetch(
+        EMPTY_ROOT_HASH, nFetchHeadersRequest, info)
       ctx.hdr.reserveStaged.dec # Free that slot again
 
-      nQueued = lhc.revHdrs.len # statistics
+      if rc.isErr:
+        break fetchHeadersBody # done, exit this block
 
-      # Job might have been cancelled or completed while downloading headers.
-      # If so, no more bookkeeping of headers must take place. The *books*
-      # might have been reset and prepared for the next stage.
-      if ctx.collectModeStopped():
-        trace info & ": stopped fetching/staging headers", peer, iv,
-          bottom=bottom.bnStr, nStored, syncState=($buddy.syncState)
-        break fetchHeadersBody # done, exit this function
-
-      # Store `lhc` chain on the `staged` queue if there is any
-      if 0 < lhc.revHdrs.len:
-        let qItem = ctx.hdr.staged.insert(iv.maxPt).valueOr:
-          raiseAssert info & ": duplicate key on staged queue iv=" & $iv
-        qItem.data = lhc[]
-
-        # Commit processed block numbers
-        if iv.minPt <= bottom:
-          ctx.headersUnprocCommit(iv,iv.minPt,bottom) # partial success only
-          break fetchHeadersBody # done, exit this function
+      let
+        # Insert headers list on the `staged` queue
+        key = rc.value[0].number
+        qItem = ctx.hdr.staged.insert(key).valueOr:
+          raiseAssert info & ": duplicate key on staged queue" &
+            " iv=" & (rc.value[^1].number,key).bnStr
+      qItem.data.revHdrs = rc.value
+      qItem.data.peerID = buddy.peerID
 
-        ctx.headersUnprocCommit(iv) # all headers processed
-      # End inner block
+      nQueued = rc.value.len # statistics
+      # End if
 
   # End block: `fetchHeadersBody`
 
@@ -196,9 +160,10 @@ proc headersStagedCollect*(
     return
 
   info "Queued/staged or DB/stored headers",
-    unprocTop=(if ctx.collectModeStopped(): "n/a"
+    unprocTop=(if ctx.headersModeStopped(): "n/a"
                else: ctx.headersUnprocAvailTop.bnStr),
-    nQueued, nStored, nStagedQ=ctx.hdr.staged.len, nSyncPeers=ctx.pool.nBuddies
+    nQueued, nStored, nStagedQ=ctx.hdr.staged.len,
+    nSyncPeers=ctx.pool.nBuddies
 
 
 proc headersStagedProcess*(buddy: BeaconBuddyRef; info: static[string]): bool =
@@ -216,22 +181,22 @@ proc headersStagedProcess*(buddy: BeaconBuddyRef; info: static[string]): bool =
     return false # switch peer
 
   var
-    nStored = 0 # statistics
-    switchPeer = false # for return code
+    nStored = 0u64 # statistics
+    switchPeer = false # for return code
 
   while ctx.hdrCache.state == collecting:
 
     # Fetch list with largest block numbers
     let qItem = ctx.hdr.staged.le(high BlockNumber).valueOr:
-      break # all done
+      break # all done
 
     let
      minNum = qItem.data.revHdrs[^1].number
      maxNum = qItem.data.revHdrs[0].number
      dangling = ctx.hdrCache.antecedent.number
    if maxNum + 1 < dangling:
-      debug info & ": gap, serialisation postponed", peer,
-        qItem=qItem.data.bnStr, D=dangling.bnStr, nStored,
+      trace info & ": gap, serialisation postponed", peer,
+        qItem=qItem.data.revHdrs.bnStr, D=dangling.bnStr, nStored,
        nStagedQ=ctx.hdr.staged.len, nSyncPeers=ctx.pool.nBuddies
      switchPeer = true # there is a gap -- come back later
      break
@@ -240,18 +205,14 @@ proc headersStagedProcess*(buddy: BeaconBuddyRef; info: static[string]): bool =
     discard ctx.hdr.staged.delete(qItem.key)
 
     # Store headers on database
-    if not buddy.headersStashOnDisk(qItem.data.revHdrs, info):
-      # Error mark buddy that produced that unusable headers list
-      ctx.incHdrProcErrors qItem.data.peerID
-
+    if not buddy.headersStashOnDisk(
+        qItem.data.revHdrs, qItem.data.peerID, info):
       ctx.headersUnprocAppend(minNum, maxNum)
       switchPeer = true
       break
 
-    # Antecedent `dangling` of the header cache might not be at `revHdrs[^1]`.
-    let revHdrsLen = maxNum - ctx.hdrCache.antecedent.number + 1
-
-    nStored += revHdrsLen.int # count headers
+    # Antecedent of the header cache might not be at `revHdrs[^1]`.
+    nStored += (maxNum - ctx.hdrCache.antecedent.number + 1) # count headers
   # End while loop
 
   if 0 < nStored:
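
The reorganised staging scheme above can be summarised as: each record on the `staged` queue now holds exactly one peer response as a reversed header list, keyed by its highest block number, and serialisation only proceeds while the top record joins up with the current antecedent. A minimal self-contained sketch (illustrative types only, not the actual `sorted_set`/`hdrCache` based implementation):

```nim
# Simplified sketch of the one-response-per-record staging scheme.
# `Header`, `StagedItem` and the plain table are stand-ins; the real code
# keeps a sorted_set keyed by block number and stashes via the header
# chain cache (see the diff above).
import std/tables

type
  Header = object
    number: uint64                     # block number
  StagedItem = object
    revHdrs: seq[Header]               # one peer response, highest block first
    peerID: int                        # peer that delivered it

var staged = initTable[uint64, StagedItem]()

proc stage(resp: seq[Header]; peerID: int) =
  ## Queue a single fetched response, keyed by its top block number.
  let key = resp[0].number
  doAssert key notin staged, "duplicate key on staged queue"
  staged[key] = StagedItem(revHdrs: resp, peerID: peerID)

proc serialise(antecedent: var uint64) =
  ## Consume records from the top while they join up with the antecedent.
  while 0 < staged.len:
    var key = 0u64
    for k in staged.keys:              # record with the largest key
      if key < k: key = k
    if key + 1 < antecedent:
      break                            # gap below the antecedent, retry later
    let item = staged[key]
    staged.del key
    antecedent = item.revHdrs[^1].number # headers now stored down to here
```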

execution_chain/sync/beacon/worker/headers_staged/headers_fetch.nim

Lines changed: 10 additions & 3 deletions
@@ -24,7 +24,13 @@ import
 proc registerError(buddy: BeaconBuddyRef, slowPeer = false) =
   buddy.only.nRespErrors.hdr.inc
   if nFetchHeadersErrThreshold < buddy.only.nRespErrors.hdr:
-    if 1 < buddy.ctx.pool.nBuddies or not slowPeer:
+    if buddy.ctx.pool.nBuddies == 1 and slowPeer:
+      # Remember that the current peer is the last one and is lablelled slow.
+      # It would have been zombified if it were not the last one. This can be
+      # used in functions -- depending on context -- that will trigger if the
+      # if the pool of available sync peers becomes empty.
+      buddy.ctx.pool.lastSlowPeer = Opt.some(buddy.peerID)
+    else:
       buddy.ctrl.zombie = true # abandon slow peer unless last one
 
 # ------------------------------------------------------------------------------
@@ -131,9 +137,10 @@ proc headersFetchReversed*(
   # mimimum share of the number of requested headers expected, typically 10%.
   if fetchHeadersErrTimeout < elapsed or
      h.len.uint64 * 100 < req.maxResults * fetchHeadersMinResponsePC:
-    buddy.registerError()
+    buddy.registerError(slowPeer=true)
   else:
-    buddy.only.nRespErrors.hdr = 0 # reset error count
+    buddy.only.nRespErrors.hdr = 0 # reset error count
+    buddy.ctx.pool.lastSlowPeer = Opt.none(Hash) # not last one or not error
 
   trace trEthRecvReceivedBlockHeaders, peer, nReq=req.maxResults,
     hash=topHash.toStr, ivResp=BnRange.new(h[^1].number,h[0].number),
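
With this change the headers fetcher follows the same last-slow-peer convention as the block-bodies fetcher above. In outline (a hedged sketch with stand-in types, not the actual worker-descriptor code): a peer that exceeds the error threshold is zombified unless it is the only peer left and merely slow, in which case its ID is parked in `lastSlowPeer`; a usable response clears both the error count and that marker.

```nim
# Stand-alone sketch of the shared slow-peer bookkeeping. `Pool`, `Buddy`
# and `errThreshold` are stand-ins for the worker descriptor fields
# (`ctx.pool.lastSlowPeer`, `buddy.only.nRespErrors`, the error thresholds).
import std/options

type
  PeerID = int
  Pool = object
    nBuddies: int                  # currently connected sync peers
    lastSlowPeer: Option[PeerID]   # set when the last peer is merely slow
  Buddy = object
    peerID: PeerID
    nRespErrors: int
    zombie: bool

const errThreshold = 2

proc registerError(pool: var Pool; buddy: var Buddy; slowPeer = false) =
  inc buddy.nRespErrors
  if errThreshold < buddy.nRespErrors:
    if pool.nBuddies == 1 and slowPeer:
      # Last peer and merely slow: remember it rather than dropping it.
      pool.lastSlowPeer = some(buddy.peerID)
    else:
      buddy.zombie = true          # abandon slow peer unless it is the last one

proc registerSuccess(pool: var Pool; buddy: var Buddy) =
  buddy.nRespErrors = 0            # reset error count
  pool.lastSlowPeer = none(PeerID) # not the last one, or no error
```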
