
Commit 0c5facc

Also import all state entries, add some args for controlling behavior
1 parent 997f43f commit 0c5facc

File tree

3 files changed: +326 -10 lines changed

eth/db/chain.py

Lines changed: 1 addition & 0 deletions

@@ -35,6 +35,7 @@
     GENESIS_PARENT_HASH,
 )
 from eth.exceptions import (
+    CanonicalHeadNotFound,
     HeaderNotFound,
     ReceiptNotFound,
     TransactionNotFound,

eth/db/trie_iteration.py

Lines changed: 235 additions & 0 deletions (new file)

@@ -0,0 +1,235 @@
import enum
from typing import (
    cast,
    Iterable,
    List,
    NamedTuple,
    Tuple,
)

from eth.db.chain import ChainDB

from eth_utils import (
    to_tuple,
)

from eth_typing import (
    Hash32,
)

from trie.constants import (
    NODE_TYPE_BLANK,
    NODE_TYPE_BRANCH,
    NODE_TYPE_EXTENSION,
    NODE_TYPE_LEAF,
)

from trie.utils.nodes import (
    get_common_prefix_length,
    decode_node,
    extract_key,
    get_node_type,
)


Nibbles = Tuple[int, ...]


class NodeKind(enum.Enum):
    BLANK = NODE_TYPE_BLANK
    LEAF = NODE_TYPE_LEAF
    EXTENSION = NODE_TYPE_EXTENSION
    BRANCH = NODE_TYPE_BRANCH


class TrieNode(NamedTuple):
    kind: NodeKind
    rlp: bytes
    obj: List[bytes]  # this type is wrong but mypy doesn't support recursive types
    keccak: Hash32

    def __str__(self) -> str:
        if self.kind == NodeKind.EXTENSION:
            return (
                "TrieNode<Extension>("
                f"hash={self.keccak.hex()}"
                f" path={self.path_rest}"
                f" child={self.obj[1].hex()}"
                ")"
            )
        if self.kind == NodeKind.LEAF:
            return (
                "TrieNode<Leaf>("
                f"hash={self.keccak.hex()}"
                f" path={self.path_rest[:10]}..."
                ")"
            )
        return f"TrieNode(kind={self.kind.name} hash={self.keccak.hex()})"

    @property
    def path_rest(self) -> Nibbles:
        # careful: this doesn't make any sense for branches
        return cast(Nibbles, extract_key(self.obj))


def is_subtree(prefix: Nibbles, nibbles: Nibbles) -> bool:
    """
    Returns True if {nibbles} represents a subtree of {prefix}.
    """
    if len(nibbles) < len(prefix):
        # nibbles represents a bigger tree than prefix does
        return False
    return get_common_prefix_length(prefix, nibbles) == len(prefix)


@to_tuple
def _get_children_with_nibbles(node: TrieNode, prefix: Nibbles) -> Iterable[Tuple[Nibbles, Hash32]]:
    """
    Return the children of the given node at the given path, including their full paths
    """
    if node.kind == NodeKind.BLANK:
        return
    elif node.kind == NodeKind.LEAF:
        full_path = prefix + node.path_rest
        yield (full_path, cast(Hash32, node.obj[1]))
    elif node.kind == NodeKind.EXTENSION:
        full_path = prefix + node.path_rest
        # TODO: this cast to a Hash32 is not right, nodes smaller than 32 are inlined
        yield (full_path, cast(Hash32, node.obj[1]))
    elif node.kind == NodeKind.BRANCH:
        for i in range(17):
            full_path = prefix + (i,)
            yield (full_path, cast(Hash32, node.obj[i]))


def _get_node(db: ChainDB, node_hash: Hash32) -> TrieNode:
    if len(node_hash) < 32:
        node_rlp = node_hash
    else:
        node_rlp = db.get(node_hash)

    node = decode_node(node_rlp)
    node_type = get_node_type(node)

    return TrieNode(kind=NodeKind(node_type), rlp=node_rlp, obj=node, keccak=node_hash)


def _iterate_trie(db: ChainDB,
                  node: TrieNode,  # the node we should look at
                  sub_trie: Nibbles,  # which sub_trie to return nodes from
                  prefix: Nibbles,  # our current path in the trie
                  ) -> Iterable[Tuple[Nibbles, TrieNode]]:

    if node.kind == NodeKind.BLANK:
        return

    if node.kind == NodeKind.LEAF:
        full_path = prefix + node.path_rest

        if is_subtree(sub_trie, prefix) or is_subtree(sub_trie, full_path):
            # also check full_path because either the node or the item the node points to
            # might be part of the desired subtree
            yield (prefix, node)

        # there's no need to recur, this is a leaf
        return

    child_of_sub_trie = is_subtree(sub_trie, prefix)

    if child_of_sub_trie:
        # this node is part of the subtrie which should be returned
        yield (prefix, node)

    parent_of_sub_trie = is_subtree(prefix, sub_trie)

    if child_of_sub_trie or parent_of_sub_trie:
        for path, child_hash in _get_children_with_nibbles(node, prefix):
            child_node = _get_node(db, child_hash)
            yield from _iterate_trie(db, child_node, sub_trie, path)


def iterate_trie(db: ChainDB, root_hash: Hash32,
                 sub_trie: Nibbles = ()) -> Iterable[Tuple[Nibbles, TrieNode]]:

    root_node = _get_node(db, root_hash)

    yield from _iterate_trie(
        db, root_node, sub_trie,
        prefix=(),
    )


def iterate_leaves(db: ChainDB, root_hash: Hash32,
                   sub_trie: Nibbles = ()) -> Iterable[Tuple[Nibbles, bytes]]:
    """
    Rather than returning the raw nodes, this returns just the leaves (usually, accounts),
    along with their full paths
    """

    node_iterator = iterate_trie(db, root_hash, sub_trie)

    for path, node in node_iterator:
        if node.kind == NodeKind.LEAF:
            full_path = path + node.path_rest
            yield (full_path, node.obj[1])


def _iterate_node_chunk(db: ChainDB,
                        node: TrieNode,
                        sub_trie: Nibbles,
                        prefix: Nibbles,
                        target_depth: int) -> Iterable[Tuple[Nibbles, TrieNode]]:

    def recur(new_depth: int) -> Iterable[Tuple[Nibbles, TrieNode]]:
        for path, child_hash in _get_children_with_nibbles(node, prefix):
            child_node = _get_node(db, child_hash)
            yield from _iterate_node_chunk(db, child_node, sub_trie, path, new_depth)

    if node.kind == NodeKind.BLANK:
        return

    if node.kind == NodeKind.LEAF:
        full_path = prefix + node.path_rest

        if is_subtree(sub_trie, prefix) or is_subtree(sub_trie, full_path):
            yield (prefix, node)

        # there's no need to recur, this is a leaf
        return

    child_of_sub_trie = is_subtree(sub_trie, prefix)

    if child_of_sub_trie:
        # the node is part of the sub_trie which we want to return
        yield (prefix, node)

    if target_depth == 0:
        # there's no point in recursing
        return

    parent_of_sub_trie = is_subtree(prefix, sub_trie)

    if child_of_sub_trie:
        # if we're returning nodes start decrementing the count
        yield from recur(target_depth - 1)
    elif parent_of_sub_trie:
        # if we're still looking for the sub_trie just recur
        yield from recur(target_depth)


def iterate_node_chunk(db: ChainDB,
                       root_hash: Hash32,
                       sub_trie: Nibbles,
                       target_depth: int) -> Iterable[Tuple[Nibbles, TrieNode]]:
    """
    Get all the nodes up to {target_depth} deep from the given sub_trie.

    Does a truncated breadth-first search rooted at the given node and returns everything
    it finds.
    """
    # TODO: notice BLANK_NODE_HASH and fail fast?
    root_node = _get_node(db, root_hash)

    yield from _iterate_node_chunk(
        db, root_node, sub_trie, prefix=(), target_depth=target_depth,
    )
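
For orientation (not part of the commit): a minimal sketch of how these helpers might be driven against a small in-memory trie. HexaryTrie comes from the py-trie package; the keys and values below are hypothetical, and any dict-like object with a get() method works as the node database.

from trie import HexaryTrie

from eth.db.trie_iteration import iterate_leaves, iterate_node_chunk

# node database mapping node hash -> node rlp; a plain dict is enough here
node_db = {}
trie = HexaryTrie(node_db)

# hypothetical keys/values, long enough that the nodes are stored by hash
# rather than inlined into their parents
trie.set(b'\x01\x02', b'first value ' * 4)
trie.set(b'\x01\x03', b'second value ' * 4)

# walk every leaf under the root, left to right, yielding (nibble path, value) pairs
for path, value in iterate_leaves(node_db, trie.root_hash):
    print(path, value)

# or fetch only the nodes within two levels of the given sub-trie
for path, node in iterate_node_chunk(node_db, trie.root_hash, sub_trie=(), target_depth=2):
    print(path, node)

In the importer below the same iterate_leaves call is pointed at a real state root, so each yielded value is an rlp-encoded Account rather than a made-up string.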

scripts/gethimport.py

Lines changed: 90 additions & 10 deletions

@@ -20,9 +20,15 @@
 from rlp.sedes import CountableList

 from eth.chains.mainnet import MAINNET_GENESIS_HEADER, MainnetChain
+from eth.constants import BLANK_ROOT_HASH, EMPTY_SHA3
 from eth.db.backends.level import LevelDB
 from eth.rlp.headers import BlockHeader
 from eth.rlp.transactions import BaseTransactionFields
+from eth.rlp.accounts import Account
+
+from eth.db.trie_iteration import iterate_leaves
+
+from trie.utils.nibbles import nibbles_to_bytes


 logger = logging.getLogger('importer')

@@ -223,6 +229,26 @@ def block_body(self, block_number: int, header_hash: bytes = None):
         return rlp.decode(raw_data, sedes=BlockBody)


+class ImportDatabase:
+    "Creates a 'ChainDB' which can be passed to the trie_iteration utils"
+    def __init__(self, gethdb, trinitydb):
+        self.gethdb = gethdb
+        self.trinitydb = trinitydb
+
+    def get(self, node_hash):
+        trinity_result = self.trinitydb.get(node_hash)
+        if trinity_result is not None:
+            return trinity_result
+
+        geth_result = self.gethdb.get(node_hash)
+        if geth_result is None:
+            logger.error(f'could not find node for hash: {node_hash.hex()}')
+            assert False
+
+        self.trinitydb.put(node_hash, geth_result)
+        return geth_result
+
+
 def main(args):
     # 1. Open Geth database

@@ -263,27 +289,74 @@ def main(args):
     geth_header = gethdb.block_header(canonical_head.block_number, canonical_head.hash)
     assert geth_header.hash == canonical_head.hash

-    for i in range(canonical_head.block_number, last_block_num + 1):
+    final_block_to_sync = last_block_num
+    if args.syncuntil:
+        final_block_to_sync = min(args.syncuntil, final_block_to_sync)
+
+    for i in range(canonical_head.block_number, final_block_to_sync + 1):
         header_hash = gethdb.header_hash_for_block_number(i)
         header = gethdb.block_header(i, header_hash)

-        body = gethdb.block_body(i)
-        block_class = chain.get_vm_class(header).get_block_class()
-        block = block_class(header, body.transactions, body.uncles)
-        chain.chaindb.persist_block(block)
+        if not args.nobodies:
+            body = gethdb.block_body(i)
+            block_class = chain.get_vm_class(header).get_block_class()
+            block = block_class(header, body.transactions, body.uncles)
+            chain.chaindb.persist_block(block)
+        else:
+            headerdb.persist_header(header)

         if i % 1000 == 0:
             logger.debug(f'current canonical header: {headerdb.get_canonical_head()}')

-    # some final checks, these should never fail
     canonical_head = headerdb.get_canonical_head()
-    geth_last_block_hash = gethdb.last_block_hash
-    geth_last_block_num = gethdb.block_num_for_hash(geth_last_block_hash)
-    assert canonical_head.hash == geth_last_block_hash
-    assert canonical_head.block_number == geth_last_block_num
+    if not args.syncuntil:
+        # similar checks should be run if we added sync until!
+        # some final checks, these should never fail
+        geth_last_block_hash = gethdb.last_block_hash
+        geth_last_block_num = gethdb.block_num_for_hash(geth_last_block_hash)
+        assert canonical_head.hash == geth_last_block_hash
+        assert canonical_head.block_number == geth_last_block_num

     logger.info('finished importing headers + bodies')

+    if args.justblocks:
+        return
+
+    state_root = canonical_head.state_root
+    logger.info(f'starting state trie import: {humanize_hash(state_root)}')
+
+    # 4. Import the state trie + storage tries
+    # Write something which iterates over the entire trie, from left to right
+    # Pass it a database which first looks in the trinity db, and if nothing is there
+    # copies the requested node from geth->trinity before returning it
+
+    imported_leaf_count = 0
+    importdb = ImportDatabase(gethdb=gethdb.db, trinitydb=leveldb.db)
+    for path, leaf_data in iterate_leaves(importdb, state_root):
+        account = rlp.decode(leaf_data, sedes=Account)
+        addr_hash = nibbles_to_bytes(path)
+
+        if account.code_hash != EMPTY_SHA3:
+            bytecode = importdb.get(account.code_hash)
+
+        if account.storage_root == BLANK_ROOT_HASH:
+            imported_leaf_count += 1
+
+            if imported_leaf_count % 1000 == 0:
+                logger.debug(f'progress sha(addr)={addr_hash.hex()}')
+            continue
+
+        for path, leaf_data in iterate_leaves(importdb, account.storage_root):
+            item_addr = nibbles_to_bytes(path)
+            imported_leaf_count += 1
+
+            if imported_leaf_count % 1000 == 0:
+                logger.debug(f'progress sha(addr)={addr_hash.hex()} sha(item)={item_addr.hex()}')
+
+    logger.info('successfully imported state trie and all storage tries')
+

 if __name__ == "__main__":
     logging.basicConfig(

@@ -295,6 +368,13 @@ def main(args):
     parser = argparse.ArgumentParser()
     parser.add_argument('-gethdb', type=str, required=True)
     parser.add_argument('-destdb', type=str, required=True)
+    parser.add_argument('-justblocks', action='store_true')
+    parser.add_argument('-nobodies', action='store_true')
+    parser.add_argument('-syncuntil', type=int, action='store')
     args = parser.parse_args()

     main(args)
+
+    logger.warning('Some features are not yet implemented:')
+    logger.warning('- Receipts were not imported')
+    logger.warning('- This script did not verify that the chain configs match')
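
For reference (not part of the commit): a small sketch of the copy-on-read behaviour the state import relies on. ImportDatabase reads fall through to the geth database and are written back into the trinity database, so a full trie walk doubles as an import. The DictDB stand-in below is hypothetical; the real script passes the geth and trinity LevelDB wrappers.

# assumes ImportDatabase from scripts/gethimport.py is in scope
class DictDB:
    """Hypothetical stand-in for the LevelDB wrappers, exposing get/put."""
    def __init__(self):
        self.kv = {}

    def get(self, key):
        return self.kv.get(key)

    def put(self, key, value):
        self.kv[key] = value


source = DictDB()                      # stands in for the geth database
source.put(b'node-hash', b'node-rlp')
dest = DictDB()                        # stands in for the trinity database

importdb = ImportDatabase(gethdb=source, trinitydb=dest)
assert importdb.get(b'node-hash') == b'node-rlp'
assert dest.get(b'node-hash') == b'node-rlp'  # copied over as a side effect of the read

The new flags compose with this: -syncuntil N caps the header/body range, -nobodies persists headers only, and -justblocks returns before the state trie import starts.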
