Skip to content

Commit 058c5fe

Browse files
committed
[release/1.4.6] eth/downloader: adaptive quality of service tuning
(cherry picked from commit 88f174a)
1 parent a29bdf5 commit 058c5fe

File tree

3 files changed

+235
-48
lines changed

3 files changed

+235
-48
lines changed

eth/downloader/downloader.go

Lines changed: 119 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -54,14 +54,15 @@ var (
5454
blockTargetRTT = 3 * time.Second / 2 // [eth/61] Target time for completing a block retrieval request
5555
blockTTL = 3 * blockTargetRTT // [eth/61] Maximum time allowance before a block request is considered expired
5656

57-
headerTargetRTT = time.Second // [eth/62] Target time for completing a header retrieval request (only for measurements for now)
58-
headerTTL = 3 * time.Second // [eth/62] Time it takes for a header request to time out
59-
bodyTargetRTT = 3 * time.Second / 2 // [eth/62] Target time for completing a block body retrieval request
60-
bodyTTL = 3 * bodyTargetRTT // [eth/62] Maximum time allowance before a block body request is considered expired
61-
receiptTargetRTT = 3 * time.Second / 2 // [eth/63] Target time for completing a receipt retrieval request
62-
receiptTTL = 3 * receiptTargetRTT // [eth/63] Maximum time allowance before a receipt request is considered expired
63-
stateTargetRTT = 2 * time.Second / 2 // [eth/63] Target time for completing a state trie retrieval request
64-
stateTTL = 3 * stateTargetRTT // [eth/63] Maximum time allowance before a node data request is considered expired
57+
rttMinEstimate = 2 * time.Second // Minimum round-trip time to target for download requests
58+
rttMaxEstimate = 20 * time.Second // Maximum round-trip time to target for download requests
59+
rttMinConfidence = 0.1 // Worst confidence factor in our estimated RTT value
60+
ttlScaling = 3 // Constant scaling factor for RTT -> TTL conversion
61+
ttlLimit = time.Minute // Maximum TTL allowance to prevent reaching crazy timeouts
62+
63+
qosTuningPeers = 5 // Number of peers to tune based on (best peers)
64+
qosConfidenceCap = 10 // Number of peers above which not to modify RTT confidence
65+
qosTuningImpact = 0.25 // Impact that a new tuning target has on the previous value
6566

6667
maxQueuedHashes = 32 * 1024 // [eth/61] Maximum number of hashes to queue for import (DOS protection)
6768
maxQueuedHeaders = 32 * 1024 // [eth/62] Maximum number of headers to queue for import (DOS protection)
@@ -113,7 +114,8 @@ type Downloader struct {
113114
fsPivotLock *types.Header // Pivot header on critical section entry (cannot change between retries)
114115
fsPivotFails int // Number of fast sync failures in the critical section
115116

116-
interrupt int32 // Atomic boolean to signal termination
117+
rttEstimate uint64 // Round trip time to target for download requests
118+
rttConfidence uint64 // Confidence in the estimated RTT (unit: millionths to allow atomic ops)
117119

118120
// Statistics
119121
syncStatsChainOrigin uint64 // Origin block number where syncing started at
@@ -159,6 +161,9 @@ type Downloader struct {
159161
cancelCh chan struct{} // Channel to cancel mid-flight syncs
160162
cancelLock sync.RWMutex // Lock to protect the cancel channel in delivers
161163

164+
quitCh chan struct{} // Quit channel to signal termination
165+
quitLock sync.RWMutex // Lock to prevent double closes
166+
162167
// Testing hooks
163168
syncInitHook func(uint64, uint64) // Method to call upon initiating a new sync run
164169
bodyFetchHook func([]*types.Header) // Method to call upon starting a block body fetch
@@ -172,11 +177,13 @@ func New(stateDb ethdb.Database, mux *event.TypeMux, hasHeader headerCheckFn, ha
172177
headFastBlock headFastBlockRetrievalFn, commitHeadBlock headBlockCommitterFn, getTd tdRetrievalFn, insertHeaders headerChainInsertFn,
173178
insertBlocks blockChainInsertFn, insertReceipts receiptChainInsertFn, rollback chainRollbackFn, dropPeer peerDropFn) *Downloader {
174179

175-
return &Downloader{
180+
dl := &Downloader{
176181
mode: FullSync,
177182
mux: mux,
178183
queue: newQueue(stateDb),
179184
peers: newPeerSet(),
185+
rttEstimate: uint64(rttMaxEstimate),
186+
rttConfidence: uint64(1000000),
180187
hasHeader: hasHeader,
181188
hasBlockAndState: hasBlockAndState,
182189
getHeader: getHeader,
@@ -203,7 +210,10 @@ func New(stateDb ethdb.Database, mux *event.TypeMux, hasHeader headerCheckFn, ha
203210
receiptWakeCh: make(chan bool, 1),
204211
stateWakeCh: make(chan bool, 1),
205212
headerProcCh: make(chan []*types.Header, 1),
213+
quitCh: make(chan struct{}),
206214
}
215+
go dl.qosTuner()
216+
return dl
207217
}
208218

209219
// Progress retrieves the synchronisation boundaries, specifically the origin
@@ -250,6 +260,8 @@ func (d *Downloader) RegisterPeer(id string, version int, head common.Hash,
250260
glog.V(logger.Error).Infoln("Register failed:", err)
251261
return err
252262
}
263+
d.qosReduceConfidence()
264+
253265
return nil
254266
}
255267

@@ -515,7 +527,16 @@ func (d *Downloader) cancel() {
515527
// Terminate interrupts the downloader, canceling all pending operations.
516528
// The downloader cannot be reused after calling Terminate.
517529
func (d *Downloader) Terminate() {
518-
atomic.StoreInt32(&d.interrupt, 1)
530+
// Close the termination channel (make sure double close is allowed)
531+
d.quitLock.Lock()
532+
select {
533+
case <-d.quitCh:
534+
default:
535+
close(d.quitCh)
536+
}
537+
d.quitLock.Unlock()
538+
539+
// Cancel any pending download requests
519540
d.cancel()
520541
}
521542

@@ -932,7 +953,7 @@ func (d *Downloader) fetchBlocks61(from uint64) error {
932953
// Reserve a chunk of hashes for a peer. A nil can mean either that
933954
// no more hashes are available, or that the peer is known not to
934955
// have them.
935-
request := d.queue.ReserveBlocks(peer, peer.BlockCapacity())
956+
request := d.queue.ReserveBlocks(peer, peer.BlockCapacity(blockTargetRTT))
936957
if request == nil {
937958
continue
938959
}
@@ -973,7 +994,7 @@ func (d *Downloader) fetchHeight(p *peer) (*types.Header, error) {
973994
// Request the advertised remote head block and wait for the response
974995
go p.getRelHeaders(p.head, 1, 0, false)
975996

976-
timeout := time.After(headerTTL)
997+
timeout := time.After(d.requestTTL())
977998
for {
978999
select {
9791000
case <-d.cancelCh:
@@ -1041,7 +1062,7 @@ func (d *Downloader) findAncestor(p *peer, height uint64) (uint64, error) {
10411062

10421063
// Wait for the remote response to the head fetch
10431064
number, hash := uint64(0), common.Hash{}
1044-
timeout := time.After(hashTTL)
1065+
timeout := time.After(d.requestTTL())
10451066

10461067
for finished := false; !finished; {
10471068
select {
@@ -1118,7 +1139,7 @@ func (d *Downloader) findAncestor(p *peer, height uint64) (uint64, error) {
11181139
// Split our chain interval in two, and request the hash to cross check
11191140
check := (start + end) / 2
11201141

1121-
timeout := time.After(hashTTL)
1142+
timeout := time.After(d.requestTTL())
11221143
go p.getAbsHeaders(uint64(check), 1, 0, false)
11231144

11241145
// Wait until a reply arrives to this request
@@ -1199,7 +1220,7 @@ func (d *Downloader) fetchHeaders(p *peer, from uint64) error {
11991220

12001221
getHeaders := func(from uint64) {
12011222
request = time.Now()
1202-
timeout.Reset(headerTTL)
1223+
timeout.Reset(d.requestTTL())
12031224

12041225
if skeleton {
12051226
glog.V(logger.Detail).Infof("%v: fetching %d skeleton headers from #%d", p, MaxHeaderFetch, from)
@@ -1311,13 +1332,13 @@ func (d *Downloader) fillHeaderSkeleton(from uint64, skeleton []*types.Header) (
13111332
pack := packet.(*headerPack)
13121333
return d.queue.DeliverHeaders(pack.peerId, pack.headers, d.headerProcCh)
13131334
}
1314-
expire = func() map[string]int { return d.queue.ExpireHeaders(headerTTL) }
1335+
expire = func() map[string]int { return d.queue.ExpireHeaders(d.requestTTL()) }
13151336
throttle = func() bool { return false }
13161337
reserve = func(p *peer, count int) (*fetchRequest, bool, error) {
13171338
return d.queue.ReserveHeaders(p, count), false, nil
13181339
}
13191340
fetch = func(p *peer, req *fetchRequest) error { return p.FetchHeaders(req.From, MaxHeaderFetch) }
1320-
capacity = func(p *peer) int { return p.HeaderCapacity() }
1341+
capacity = func(p *peer) int { return p.HeaderCapacity(d.requestRTT()) }
13211342
setIdle = func(p *peer, accepted int) { p.SetHeadersIdle(accepted) }
13221343
)
13231344
err := d.fetchParts(errCancelHeaderFetch, d.headerCh, deliver, d.queue.headerContCh, expire,
@@ -1341,9 +1362,9 @@ func (d *Downloader) fetchBodies(from uint64) error {
13411362
pack := packet.(*bodyPack)
13421363
return d.queue.DeliverBodies(pack.peerId, pack.transactions, pack.uncles)
13431364
}
1344-
expire = func() map[string]int { return d.queue.ExpireBodies(bodyTTL) }
1365+
expire = func() map[string]int { return d.queue.ExpireBodies(d.requestTTL()) }
13451366
fetch = func(p *peer, req *fetchRequest) error { return p.FetchBodies(req) }
1346-
capacity = func(p *peer) int { return p.BlockCapacity() }
1367+
capacity = func(p *peer) int { return p.BlockCapacity(d.requestRTT()) }
13471368
setIdle = func(p *peer, accepted int) { p.SetBodiesIdle(accepted) }
13481369
)
13491370
err := d.fetchParts(errCancelBodyFetch, d.bodyCh, deliver, d.bodyWakeCh, expire,
@@ -1365,9 +1386,9 @@ func (d *Downloader) fetchReceipts(from uint64) error {
13651386
pack := packet.(*receiptPack)
13661387
return d.queue.DeliverReceipts(pack.peerId, pack.receipts)
13671388
}
1368-
expire = func() map[string]int { return d.queue.ExpireReceipts(receiptTTL) }
1389+
expire = func() map[string]int { return d.queue.ExpireReceipts(d.requestTTL()) }
13691390
fetch = func(p *peer, req *fetchRequest) error { return p.FetchReceipts(req) }
1370-
capacity = func(p *peer) int { return p.ReceiptCapacity() }
1391+
capacity = func(p *peer) int { return p.ReceiptCapacity(d.requestRTT()) }
13711392
setIdle = func(p *peer, accepted int) { p.SetReceiptsIdle(accepted) }
13721393
)
13731394
err := d.fetchParts(errCancelReceiptFetch, d.receiptCh, deliver, d.receiptWakeCh, expire,
@@ -1417,13 +1438,13 @@ func (d *Downloader) fetchNodeData() error {
14171438
}
14181439
})
14191440
}
1420-
expire = func() map[string]int { return d.queue.ExpireNodeData(stateTTL) }
1441+
expire = func() map[string]int { return d.queue.ExpireNodeData(d.requestTTL()) }
14211442
throttle = func() bool { return false }
14221443
reserve = func(p *peer, count int) (*fetchRequest, bool, error) {
14231444
return d.queue.ReserveNodeData(p, count), false, nil
14241445
}
14251446
fetch = func(p *peer, req *fetchRequest) error { return p.FetchNodeData(req) }
1426-
capacity = func(p *peer) int { return p.NodeDataCapacity() }
1447+
capacity = func(p *peer) int { return p.NodeDataCapacity(d.requestRTT()) }
14271448
setIdle = func(p *peer, accepted int) { p.SetNodeDataIdle(accepted) }
14281449
)
14291450
err := d.fetchParts(errCancelStateFetch, d.stateCh, deliver, d.stateWakeCh, expire,
@@ -1799,8 +1820,10 @@ func (d *Downloader) processContent() error {
17991820
}
18001821
for len(results) != 0 {
18011822
// Check for any termination requests
1802-
if atomic.LoadInt32(&d.interrupt) == 1 {
1823+
select {
1824+
case <-d.quitCh:
18031825
return errCancelContentProcessing
1826+
default:
18041827
}
18051828
// Retrieve the a batch of results to import
18061829
var (
@@ -1901,3 +1924,74 @@ func (d *Downloader) deliver(id string, destCh chan dataPack, packet dataPack, i
19011924
return errNoSyncActive
19021925
}
19031926
}
1927+
1928+
// qosTuner is the quality of service tuning loop that occasionally gathers the
1929+
// peer latency statistics and updates the estimated request round trip time.
1930+
func (d *Downloader) qosTuner() {
1931+
for {
1932+
// Retrieve the current median RTT and integrate into the previoust target RTT
1933+
rtt := time.Duration(float64(1-qosTuningImpact)*float64(atomic.LoadUint64(&d.rttEstimate)) + qosTuningImpact*float64(d.peers.medianRTT()))
1934+
atomic.StoreUint64(&d.rttEstimate, uint64(rtt))
1935+
1936+
// A new RTT cycle passed, increase our confidence in the estimated RTT
1937+
conf := atomic.LoadUint64(&d.rttConfidence)
1938+
conf = conf + (1000000-conf)/2
1939+
atomic.StoreUint64(&d.rttConfidence, conf)
1940+
1941+
// Log the new QoS values and sleep until the next RTT
1942+
glog.V(logger.Debug).Infof("Quality of service: rtt %v, conf %.3f, ttl %v", rtt, float64(conf)/1000000.0, d.requestTTL())
1943+
select {
1944+
case <-d.quitCh:
1945+
return
1946+
case <-time.After(rtt):
1947+
}
1948+
}
1949+
}
1950+
1951+
// qosReduceConfidence is meant to be called when a new peer joins the downloader's
1952+
// peer set, needing to reduce the confidence we have in out QoS estimates.
1953+
func (d *Downloader) qosReduceConfidence() {
1954+
// If we have a single peer, confidence is always 1
1955+
peers := uint64(d.peers.Len())
1956+
if peers == 1 {
1957+
atomic.StoreUint64(&d.rttConfidence, 1000000)
1958+
return
1959+
}
1960+
// If we have a ton of peers, don't drop confidence)
1961+
if peers >= uint64(qosConfidenceCap) {
1962+
return
1963+
}
1964+
// Otherwise drop the confidence factor
1965+
conf := atomic.LoadUint64(&d.rttConfidence) * (peers - 1) / peers
1966+
if float64(conf)/1000000 < rttMinConfidence {
1967+
conf = uint64(rttMinConfidence * 1000000)
1968+
}
1969+
atomic.StoreUint64(&d.rttConfidence, conf)
1970+
1971+
rtt := time.Duration(atomic.LoadUint64(&d.rttEstimate))
1972+
glog.V(logger.Debug).Infof("Quality of service: rtt %v, conf %.3f, ttl %v", rtt, float64(conf)/1000000.0, d.requestTTL())
1973+
}
1974+
1975+
// requestRTT returns the current target round trip time for a download request
1976+
// to complete in.
1977+
//
1978+
// Note, the returned RTT is .9 of the actually estimated RTT. The reason is that
1979+
// the downloader tries to adapt queries to the RTT, so multiple RTT values can
1980+
// be adapted to, but smaller ones are preffered (stabler download stream).
1981+
func (d *Downloader) requestRTT() time.Duration {
1982+
return time.Duration(atomic.LoadUint64(&d.rttEstimate)) * 9 / 10
1983+
}
1984+
1985+
// requestTTL returns the current timeout allowance for a single download request
1986+
// to finish under.
1987+
func (d *Downloader) requestTTL() time.Duration {
1988+
var (
1989+
rtt = time.Duration(atomic.LoadUint64(&d.rttEstimate))
1990+
conf = float64(atomic.LoadUint64(&d.rttConfidence)) / 1000000.0
1991+
)
1992+
ttl := time.Duration(ttlScaling) * time.Duration(float64(rtt)/conf)
1993+
if ttl > ttlLimit {
1994+
ttl = ttlLimit
1995+
}
1996+
return ttl
1997+
}

0 commit comments

Comments
 (0)