
Commit 27a89b2 (parent: 3cde31a)

StateDownloader: avoid infinite loop when no peers have nodes we want

Now if none of our peers have any of the nodes we want, those nodes are added to a list of missing node keys and retried after a while.
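In outline, the fix splits the old "no eligible peers" handling into two cases: when peers are merely busy we sleep briefly, and when no peer has the data at all we record the batch in a timestamp-keyed dict of missing node keys and retry it once it is old enough. A minimal, self-contained sketch of that bookkeeping (the names here are illustrative; the real implementation is the TrieNodeRequestTracker class added below):

    import time
    from typing import Dict, List

    REPLY_TIMEOUT = 5.0  # seconds; illustrative, the downloader has its own _reply_timeout

    # request time -> node keys that no peer had when we asked
    missing: Dict[float, List[bytes]] = {}

    def record_missing(node_keys: List[bytes]) -> None:
        # Batches are keyed by the time we gave up on them.
        missing[time.time()] = node_keys

    def pop_retriable() -> List[bytes]:
        # Return (and forget) every batch that has waited at least REPLY_TIMEOUT seconds.
        now = time.time()
        expired = [t for t in missing if now - t > REPLY_TIMEOUT]
        return [key for t in expired for key in missing.pop(t)]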

File tree: 3 files changed (+173, -35 lines)

p2p/exceptions.py

Lines changed: 8 additions & 1 deletion

@@ -109,7 +109,14 @@ class NoConnectedPeers(BaseP2PError):
 
 class NoEligiblePeers(BaseP2PError):
     """
-    Raised when none of our peers have the blocks we want.
+    Raised when none of our peers have the data we want.
+    """
+    pass
+
+
+class NoIdlePeers(BaseP2PError):
+    """
+    Raised when none of our peers is idle and can be used for data requests.
     """
     pass
 
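With the split, callers can tell "every peer that might help is busy" apart from "nobody has this data at all" and recover differently. A hedged sketch of the two paths (this mirrors the request_nodes() change in p2p/state.py below; it is not new API):

    # Sketch only: the authoritative version is request_nodes() in p2p/state.py.
    async def fetch_batch(self, node_keys):
        try:
            peer = await self.get_peer_for_request(node_keys)
        except NoIdlePeers:
            # Peers that may have the data exist but are busy: back off briefly.
            await self.sleep(0.2)
            return
        except NoEligiblePeers:
            # No connected peer has any of these nodes: park them for a later retry.
            self.request_tracker.missing[time.time()] = list(node_keys)
            return
        peer.sub_proto.send_get_node_data(list(node_keys))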

p2p/state.py

Lines changed: 79 additions & 33 deletions

@@ -1,5 +1,6 @@
 import asyncio
 import collections
+import itertools
 import logging
 import time
 from typing import (
@@ -14,6 +15,8 @@
     Union,
 )
 
+import cytoolz
+
 import rlp
 
 from trie.sync import (
@@ -40,11 +43,12 @@
 )
 from eth.db.backends.base import BaseDB
 from eth.rlp.accounts import Account
+from eth.utils.logging import TraceLogger
 
 from p2p import eth
 from p2p import protocol
 from p2p.chain import PeerRequestHandler
-from p2p.exceptions import NoEligiblePeers
+from p2p.exceptions import NoEligiblePeers, NoIdlePeers
 from p2p.peer import BasePeer, ETHPeer, HeaderRequest, PeerPool, PeerSubscriber
 from p2p.service import BaseService
 from p2p.utils import get_asyncio_executor, Timer
@@ -73,7 +77,7 @@ def __init__(self,
         self.root_hash = root_hash
         self.scheduler = StateSync(root_hash, account_db)
         self._handler = PeerRequestHandler(self.chaindb, self.logger, self.cancel_token)
-        self._active_requests: Dict[ETHPeer, Tuple[float, List[Hash32]]] = {}
+        self.request_tracker = TrieNodeRequestTracker(self._reply_timeout, self.logger)
         self._peer_missing_nodes: Dict[ETHPeer, Set[Hash32]] = collections.defaultdict(set)
         self._executor = get_asyncio_executor()
@@ -91,17 +95,27 @@ def deregister_peer(self, peer: BasePeer) -> None:
         self._peer_missing_nodes.pop(cast(ETHPeer, peer), None)
 
     async def get_peer_for_request(self, node_keys: Set[Hash32]) -> ETHPeer:
-        """Return an idle peer that may have any of the trie nodes in node_keys."""
+        """Return an idle peer that may have any of the trie nodes in node_keys.
+
+        If none of our peers have any of the given node keys, raise NoEligiblePeers. If none of
+        the peers which may have at least one of the given node keys is idle, raise NoIdlePeers.
+        """
+        has_eligible_peers = False
         async for peer in self.peer_pool:
             peer = cast(ETHPeer, peer)
-            if peer in self._active_requests:
+            if self._peer_missing_nodes[peer].issuperset(node_keys):
+                self.logger.trace("%s doesn't have any of the nodes we want, skipping it", peer)
+                continue
+            has_eligible_peers = True
+            if peer in self.request_tracker.active_requests:
                 self.logger.trace("%s is not idle, skipping it", peer)
                 continue
-            if node_keys.difference(self._peer_missing_nodes[peer]):
-                return peer
-            else:
-                self.logger.trace("%s doesn't have the nodes we want, skipping it", peer)
-        raise NoEligiblePeers()
+            return peer
+
+        if not has_eligible_peers:
+            raise NoEligiblePeers()
+        else:
+            raise NoIdlePeers()
 
     async def _handle_msg_loop(self) -> None:
         while self.is_running:
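The eligibility test above is a plain set operation: a peer is ruled out only when every key we want is already in its known-missing set, i.e. when that set is a superset of the request. A small standalone illustration of the predicate (peer names and keys are made up):

    # Made-up peers and keys, purely to illustrate the issuperset() check.
    node_keys = {b'key-a', b'key-b'}
    peer_missing = {
        'peer1': {b'key-a', b'key-b', b'key-c'},  # missing all requested keys -> ineligible
        'peer2': {b'key-a'},                      # may still have key-b -> eligible
    }
    for peer, missing in peer_missing.items():
        eligible = not missing.issuperset(node_keys)
        print(peer, 'eligible' if eligible else 'ineligible')
    # -> peer1 ineligible, peer2 eligible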
@@ -141,7 +155,7 @@ async def _handle_msg(
             pass
         elif isinstance(cmd, eth.NodeData):
             msg = cast(List[bytes], msg)
-            if peer not in self._active_requests:
+            if peer not in self.request_tracker.active_requests:
                 # This is probably a batch that we retried after a timeout and ended up receiving
                 # more than once, so ignore but log as an INFO just in case.
                 self.logger.info(
@@ -150,7 +164,7 @@ async def _handle_msg(
                 return
 
             self.logger.debug("Got %d NodeData entries from %s", len(msg), peer)
-            _, requested_node_keys = self._active_requests.pop(peer)
+            _, requested_node_keys = self.request_tracker.active_requests.pop(peer)
 
             loop = asyncio.get_event_loop()
             node_keys = await loop.run_in_executor(self._executor, list, map(keccak, msg))
@@ -201,46 +215,49 @@ async def request_nodes(self, node_keys: Iterable[Hash32]) -> None:
         while not_yet_requested:
             try:
                 peer = await self.get_peer_for_request(not_yet_requested)
-            except NoEligiblePeers:
+            except NoIdlePeers:
                 self.logger.debug(
                     "No idle peers have any of the trie nodes we want, sleeping a bit")
                 await self.sleep(0.2)
                 continue
+            except NoEligiblePeers:
+                self.request_tracker.missing[time.time()] = list(not_yet_requested)
+                self.logger.debug(
+                    "No peers have any of the trie nodes in this batch, will retry later")
+                # TODO: disconnect a peer if the pool is full
+                return
 
             candidates = list(not_yet_requested.difference(self._peer_missing_nodes[peer]))
             batch = candidates[:eth.MAX_STATE_FETCH]
             not_yet_requested = not_yet_requested.difference(batch)
-            self._active_requests[peer] = (time.time(), batch)
+            self.request_tracker.active_requests[peer] = (time.time(), batch)
             self.logger.debug("Requesting %d trie nodes to %s", len(batch), peer)
             peer.sub_proto.send_get_node_data(batch)
 
-    async def _periodically_retry_timedout(self) -> None:
+    async def _periodically_retry_timedout_and_missing(self) -> None:
         while self.is_running:
-            now = time.time()
-            oldest_request_time = now
-            timed_out = []
-            # Iterate over a copy of our dict's items as we're going to mutate it.
-            for peer, (req_time, node_keys) in list(self._active_requests.items()):
-                if now - req_time > self._reply_timeout:
-                    self.logger.debug(
-                        "Timed out waiting for %d nodes from %s", len(node_keys), peer)
-                    timed_out.extend(node_keys)
-                    self._active_requests.pop(peer)
-                elif req_time < oldest_request_time:
-                    oldest_request_time = req_time
+            timed_out = self.request_tracker.get_timed_out()
             if timed_out:
-                self.logger.debug("Re-requesting %d trie nodes", len(timed_out))
+                self.logger.debug("Re-requesting %d timed out trie nodes", len(timed_out))
                 self._total_timeouts += len(timed_out)
                 try:
                     await self.request_nodes(timed_out)
                 except OperationCancelled:
                     break
 
-            # Finally, sleep until the time our oldest request is scheduled to timeout.
-            now = time.time()
-            sleep_duration = (oldest_request_time + self._reply_timeout) - now
+            retriable_missing = self.request_tracker.get_retriable_missing()
+            if retriable_missing:
+                self.logger.debug("Re-requesting %d missing trie nodes", len(retriable_missing))
+                try:
+                    await self.request_nodes(retriable_missing)
+                except OperationCancelled:
+                    break
+
+            # Finally, sleep until the time either our oldest request is scheduled to timeout or
+            # one of our missing batches is scheduled to be retried.
+            next_timeout = self.request_tracker.get_next_timeout()
             try:
-                await self.sleep(sleep_duration)
+                await self.sleep(next_timeout - time.time())
             except OperationCancelled:
                 break
 
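To see the sleep computation at the end of the loop with concrete numbers (illustrative values, not from the codebase): with a reply timeout of 5s, one active request made at t=100 and one missing batch recorded at t=103, get_next_timeout() evaluated at t=104 picks the oldest timestamp and adds the timeout:

    # oldest of (now=104, missing key 103, active request time 100) is 100
    oldest = min(104, 103, 100)
    next_timeout = oldest + 5        # 105: when the t=100 request times out
    sleep_for = next_timeout - 104   # sleep 1 second, then re-check both lists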

@@ -253,7 +270,7 @@ async def _run(self) -> None:
         self.logger.info("Starting state sync for root hash %s", encode_hex(self.root_hash))
         asyncio.ensure_future(self._handle_msg_loop())
         asyncio.ensure_future(self._periodically_report_progress())
-        asyncio.ensure_future(self._periodically_retry_timedout())
+        asyncio.ensure_future(self._periodically_retry_timedout_and_missing())
         with self.subscribe(self.peer_pool):
             while self.scheduler.has_pending_requests:
                 # This ensures we yield control and give _handle_msg() a chance to process any nodes
@@ -278,7 +295,7 @@ async def _run(self) -> None:
     async def _periodically_report_progress(self) -> None:
         while self.is_running:
             requested_nodes = sum(
-                len(node_keys) for _, node_keys in self._active_requests.values())
+                len(node_keys) for _, node_keys in self.request_tracker.active_requests.values())
             msg = "processed: %11d, " % self._total_processed_nodes
             msg += "tnps: %5d, " % (self._total_processed_nodes / self._timer.elapsed)
             msg += "committed: %11d, " % self.scheduler.committed_nodes
@@ -292,6 +309,35 @@ async def _periodically_report_progress(self) -> None:
                 break
 
 
+class TrieNodeRequestTracker:
+
+    def __init__(self, reply_timeout: int, logger: TraceLogger) -> None:
+        self.reply_timeout = reply_timeout
+        self.logger = logger
+        self.active_requests: Dict[ETHPeer, Tuple[float, List[Hash32]]] = {}
+        self.missing: Dict[float, List[Hash32]] = {}
+
+    def get_timed_out(self) -> List[Hash32]:
+        timed_out = cytoolz.valfilter(
+            lambda v: time.time() - v[0] > self.reply_timeout, self.active_requests)
+        for peer, (_, node_keys) in timed_out.items():
+            self.logger.debug(
+                "Timed out waiting for %d nodes from %s", len(node_keys), peer)
+        self.active_requests = cytoolz.dissoc(self.active_requests, *timed_out.keys())
+        return list(cytoolz.concat(node_keys for _, node_keys in timed_out.values()))
+
+    def get_retriable_missing(self) -> List[Hash32]:
+        retriable = cytoolz.keyfilter(
+            lambda k: time.time() - k > self.reply_timeout, self.missing)
+        self.missing = cytoolz.dissoc(self.missing, *retriable.keys())
+        return list(cytoolz.concat(retriable.values()))
+
+    def get_next_timeout(self) -> float:
+        active_req_times = [req_time for (req_time, _) in self.active_requests.values()]
+        oldest = min(itertools.chain([time.time()], self.missing.keys(), active_req_times))
+        return oldest + self.reply_timeout
+
+
 class StateSync(HexaryTrieSync):
 
     def __init__(self, root_hash: Hash32, db: BaseDB) -> None:
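The tracker leans on four cytoolz helpers: valfilter() and keyfilter() select dict entries by value or key, dissoc() returns a copy of a dict without the given keys, and concat() flattens an iterable of sequences. A quick standalone demonstration (cytoolz and its pure-Python counterpart toolz share this API):

    import cytoolz  # the pure-Python `toolz` package exposes the same functions

    d = {'a': 1, 'b': 7, 'c': 3}
    print(cytoolz.valfilter(lambda v: v > 2, d))     # {'b': 7, 'c': 3}
    print(cytoolz.keyfilter(lambda k: k != 'a', d))  # {'b': 7, 'c': 3}
    print(cytoolz.dissoc(d, 'b', 'c'))               # {'a': 1}; d itself is unchanged
    print(list(cytoolz.concat([[1, 2], [3]])))       # [1, 2, 3]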

tests/p2p/test_state_sync.py

Lines changed: 86 additions & 1 deletion

@@ -1,10 +1,12 @@
 import os
 import random
+import time
 
 from eth.db.backends.memory import MemoryDB
 from eth.db.account import AccountDB
+from eth.utils.logging import TraceLogger
 
-from p2p.state import StateSync
+from p2p.state import StateSync, TrieNodeRequestTracker
 
 
 def make_random_state(n):
@@ -46,3 +48,86 @@ def test_state_sync():
     assert result_account_db.get_nonce(addr) == nonce
     assert result_account_db.get_storage(addr, 0) == storage
     assert result_account_db.get_code(addr) == code
+
+
+REPLY_TIMEOUT = 5
+
+
+def test_node_request_tracker_get_timed_out():
+    tracker = TrieNodeRequestTracker(REPLY_TIMEOUT, TraceLogger('name'))
+    peer1, peer2, peer3, peer4 = object(), object(), object(), object()
+    peer_nodes = dict(
+        (peer, [os.urandom(32) for _ in range(3)])
+        for peer in [peer1, peer2, peer3, peer4])
+    now = time.time()
+    # Populate the tracker's active_requests with 4 requests, 2 of them made more than
+    # REPLY_TIMEOUT seconds in the past and 2 made less than REPLY_TIMEOUT seconds ago.
+    tracker.active_requests[peer1] = (now, peer_nodes[peer1])
+    tracker.active_requests[peer2] = (now - REPLY_TIMEOUT - 1, peer_nodes[peer2])
+    tracker.active_requests[peer3] = (now - REPLY_TIMEOUT - 2, peer_nodes[peer3])
+    tracker.active_requests[peer4] = (now - REPLY_TIMEOUT + 1, peer_nodes[peer4])
+
+    # get_timed_out() must return all node keys from requests made more than REPLY_TIMEOUT
+    # seconds in the past.
+    expected = set(peer_nodes[peer2] + peer_nodes[peer3])
+    timed_out = tracker.get_timed_out()
+    assert len(timed_out) == len(expected)
+    assert set(timed_out) == expected
+
+    # and it should remove the entries for those from the active_requests dict.
+    assert peer2 not in tracker.active_requests
+    assert peer3 not in tracker.active_requests
+    assert peer1 in tracker.active_requests
+    assert peer4 in tracker.active_requests
+
+
+def test_node_request_tracker_get_retriable_missing():
+    tracker = TrieNodeRequestTracker(REPLY_TIMEOUT, TraceLogger('name'))
+    now = time.time()
+    # Populate the tracker's missing dict with 4 requests, 2 of them made more than
+    # REPLY_TIMEOUT seconds in the past and 2 made less than REPLY_TIMEOUT seconds ago.
+    req1_time, req1_nodes = now, [os.urandom(32) for _ in range(3)]
+    req2_time, req2_nodes = (now - REPLY_TIMEOUT - 1), [os.urandom(32) for _ in range(3)]
+    req3_time, req3_nodes = (now - REPLY_TIMEOUT - 2), [os.urandom(32) for _ in range(3)]
+    req4_time, req4_nodes = (now - REPLY_TIMEOUT + 1), [os.urandom(32) for _ in range(3)]
+    tracker.missing[req1_time] = req1_nodes
+    tracker.missing[req2_time] = req2_nodes
+    tracker.missing[req3_time] = req3_nodes
+    tracker.missing[req4_time] = req4_nodes
+
+    expected = set(req2_nodes + req3_nodes)
+    retriable_missing = tracker.get_retriable_missing()
+    assert len(retriable_missing) == len(expected)
+    assert set(retriable_missing) == expected
+
+    assert req2_time not in tracker.missing
+    assert req3_time not in tracker.missing
+    assert req1_time in tracker.missing
+    assert req4_time in tracker.missing
+
+
+def test_node_request_tracker_get_next_timeout():
+    tracker = TrieNodeRequestTracker(REPLY_TIMEOUT, TraceLogger('name'))
+    oldest_req_time = 1234
+
+    # Populate the tracker with missing and active requests, one of each made at oldest_req_time
+    # and one of each made 1s after that.
+    peer1, peer2 = object(), object()
+    tracker.missing[oldest_req_time] = []
+    tracker.missing[oldest_req_time + 1] = []
+    tracker.active_requests[peer1] = (oldest_req_time, [])
+    tracker.active_requests[peer2] = (oldest_req_time + 1, [])
+
+    # Our next scheduled timeout must be the oldest_req_time + REPLY_TIMEOUT
+    assert tracker.get_next_timeout() == oldest_req_time + REPLY_TIMEOUT
+
+    # Now, if we pop any of the requests made at oldest_req_time, but leave one behind, the next
+    # scheduled timeout will still be the same since we still have one request made at
+    # oldest_req_time.
+    tracker.missing.pop(oldest_req_time)
+    assert tracker.get_next_timeout() == oldest_req_time + REPLY_TIMEOUT
+
+    # Removing the last remaining request made at oldest_req_time will cause the next scheduled
+    # timeout to be (oldest_req_time + 1) + REPLY_TIMEOUT as expected.
+    tracker.active_requests.pop(peer1)
+    assert tracker.get_next_timeout() == oldest_req_time + 1 + REPLY_TIMEOUT
