
Commit 0c5facc

Also import all state entries, add some args for controlling behavior
1 parent 997f43f commit 0c5facc

File tree

3 files changed: +326 -10 lines changed

eth/db/chain.py

Lines changed: 1 addition & 0 deletions

@@ -35,6 +35,7 @@
     GENESIS_PARENT_HASH,
 )
 from eth.exceptions import (
+    CanonicalHeadNotFound,
     HeaderNotFound,
     ReceiptNotFound,
     TransactionNotFound,

eth/db/trie_iteration.py

Lines changed: 235 additions & 0 deletions (new file)

@@ -0,0 +1,235 @@
import enum
from typing import (
    cast,
    Iterable,
    List,
    NamedTuple,
    Tuple,
)

from eth.db.chain import ChainDB

from eth_utils import (
    to_tuple,
)

from eth_typing import (
    Hash32,
)

from trie.constants import (
    NODE_TYPE_BLANK,
    NODE_TYPE_BRANCH,
    NODE_TYPE_EXTENSION,
    NODE_TYPE_LEAF,
)

from trie.utils.nodes import (
    get_common_prefix_length,
    decode_node,
    extract_key,
    get_node_type,
)


Nibbles = Tuple[int, ...]


class NodeKind(enum.Enum):
    BLANK = NODE_TYPE_BLANK
    LEAF = NODE_TYPE_LEAF
    EXTENSION = NODE_TYPE_EXTENSION
    BRANCH = NODE_TYPE_BRANCH


class TrieNode(NamedTuple):
    kind: NodeKind
    rlp: bytes
    obj: List[bytes]  # this type is wrong but mypy doesn't support recursive types
    keccak: Hash32

    def __str__(self) -> str:
        if self.kind == NodeKind.EXTENSION:
            return (
                "TrieNode<Extension>("
                f"hash={self.keccak.hex()}"
                f" path={self.path_rest}"
                f" child={self.obj[1].hex()}"
                ")"
            )
        if self.kind == NodeKind.LEAF:
            return (
                "TrieNode<Leaf>("
                f"hash={self.keccak.hex()}"
                f" path={self.path_rest[:10]}..."
                ")"
            )
        return f"TrieNode(kind={self.kind.name} hash={self.keccak.hex()})"

    @property
    def path_rest(self) -> Nibbles:
        # careful: this doesn't make any sense for branches
        return cast(Nibbles, extract_key(self.obj))


def is_subtree(prefix: Nibbles, nibbles: Nibbles) -> bool:
    """
    Returns True if {nibbles} represents a subtree of {prefix}.
    """
    if len(nibbles) < len(prefix):
        # nibbles represents a bigger tree than prefix does
        return False
    return get_common_prefix_length(prefix, nibbles) == len(prefix)


@to_tuple
def _get_children_with_nibbles(node: TrieNode, prefix: Nibbles) -> Iterable[Tuple[Nibbles, Hash32]]:
    """
    Return the children of the given node at the given path, including their full paths
    """
    if node.kind == NodeKind.BLANK:
        return
    elif node.kind == NodeKind.LEAF:
        full_path = prefix + node.path_rest
        yield (full_path, cast(Hash32, node.obj[1]))
    elif node.kind == NodeKind.EXTENSION:
        full_path = prefix + node.path_rest
        # TODO: this cast to a Hash32 is not right, nodes smaller than 32 are inlined
        yield (full_path, cast(Hash32, node.obj[1]))
    elif node.kind == NodeKind.BRANCH:
        for i in range(17):
            full_path = prefix + (i,)
            yield (full_path, cast(Hash32, node.obj[i]))


def _get_node(db: ChainDB, node_hash: Hash32) -> TrieNode:
    if len(node_hash) < 32:
        node_rlp = node_hash
    else:
        node_rlp = db.get(node_hash)

    node = decode_node(node_rlp)
    node_type = get_node_type(node)

    return TrieNode(kind=NodeKind(node_type), rlp=node_rlp, obj=node, keccak=node_hash)


def _iterate_trie(db: ChainDB,
                  node: TrieNode,  # the node we should look at
                  sub_trie: Nibbles,  # which sub_trie to return nodes from
                  prefix: Nibbles,  # our current path in the trie
                  ) -> Iterable[Tuple[Nibbles, TrieNode]]:

    if node.kind == NodeKind.BLANK:
        return

    if node.kind == NodeKind.LEAF:
        full_path = prefix + node.path_rest

        if is_subtree(sub_trie, prefix) or is_subtree(sub_trie, full_path):
            # also check full_path because either the node or the item the node points to
            # might be part of the desired subtree
            yield (prefix, node)

        # there's no need to recur, this is a leaf
        return

    child_of_sub_trie = is_subtree(sub_trie, prefix)

    if child_of_sub_trie:
        # this node is part of the subtrie which should be returned
        yield (prefix, node)

    parent_of_sub_trie = is_subtree(prefix, sub_trie)

    if child_of_sub_trie or parent_of_sub_trie:
        for path, child_hash in _get_children_with_nibbles(node, prefix):
            child_node = _get_node(db, child_hash)
            yield from _iterate_trie(db, child_node, sub_trie, path)


def iterate_trie(db: ChainDB, root_hash: Hash32,
                 sub_trie: Nibbles = ()) -> Iterable[Tuple[Nibbles, TrieNode]]:

    root_node = _get_node(db, root_hash)

    yield from _iterate_trie(
        db, root_node, sub_trie,
        prefix=(),
    )


def iterate_leaves(db: ChainDB, root_hash: Hash32,
                   sub_trie: Nibbles = ()) -> Iterable[Tuple[Nibbles, bytes]]:
    """
    Rather than returning the raw nodes, this returns just the leaves (usually, accounts),
    along with their full paths
    """

    node_iterator = iterate_trie(db, root_hash, sub_trie)

    for path, node in node_iterator:
        if node.kind == NodeKind.LEAF:
            full_path = path + node.path_rest
            yield (full_path, node.obj[1])


def _iterate_node_chunk(db: ChainDB,
                        node: TrieNode,
                        sub_trie: Nibbles,
                        prefix: Nibbles,
                        target_depth: int) -> Iterable[Tuple[Nibbles, TrieNode]]:

    def recur(new_depth: int) -> Iterable[Tuple[Nibbles, TrieNode]]:
        for path, child_hash in _get_children_with_nibbles(node, prefix):
            child_node = _get_node(db, child_hash)
            yield from _iterate_node_chunk(db, child_node, sub_trie, path, new_depth)

    if node.kind == NodeKind.BLANK:
        return

    if node.kind == NodeKind.LEAF:
        full_path = prefix + node.path_rest

        if is_subtree(sub_trie, prefix) or is_subtree(sub_trie, full_path):
            yield (prefix, node)

        # there's no need to recur, this is a leaf
        return

    child_of_sub_trie = is_subtree(sub_trie, prefix)

    if child_of_sub_trie:
        # the node is part of the sub_trie which we want to return
        yield (prefix, node)

    if target_depth == 0:
        # there's no point in recursing
        return

    parent_of_sub_trie = is_subtree(prefix, sub_trie)

    if child_of_sub_trie:
        # if we're returning nodes start decrementing the count
        yield from recur(target_depth - 1)
    elif parent_of_sub_trie:
        # if we're still looking for the sub_trie just recur
        yield from recur(target_depth)


def iterate_node_chunk(db: ChainDB,
                       root_hash: Hash32,
                       sub_trie: Nibbles,
                       target_depth: int) -> Iterable[Tuple[Nibbles, TrieNode]]:
    """
    Get all the nodes up to {target_depth} deep from the given sub_trie.

    Does a truncated breadth-first search rooted at the given node and returns everything
    it finds.
    """
    # TODO: notice BLANK_NODE_HASH and fail fast?
    root_node = _get_node(db, root_hash)

    yield from _iterate_node_chunk(
        db, root_node, sub_trie, prefix=(), target_depth=target_depth,
    )
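
For orientation (not part of the commit): a minimal sketch of how these helpers might be driven against a small in-memory trie. HexaryTrie comes from the py-trie package; the keys and values below are hypothetical, and any dict-like object with a get() method works as the node database.

from trie import HexaryTrie

from eth.db.trie_iteration import iterate_leaves, iterate_node_chunk

# node database mapping node hash -> node rlp; a plain dict is enough here
node_db = {}
trie = HexaryTrie(node_db)

# hypothetical keys/values, long enough that the nodes are stored by hash
# rather than inlined into their parents
trie.set(b'\x01\x02', b'first value ' * 4)
trie.set(b'\x01\x03', b'second value ' * 4)

# walk every leaf under the root, left to right, yielding (nibble path, value) pairs
for path, value in iterate_leaves(node_db, trie.root_hash):
    print(path, value)

# or fetch only the nodes within two levels of the given sub-trie
for path, node in iterate_node_chunk(node_db, trie.root_hash, sub_trie=(), target_depth=2):
    print(path, node)

In the importer below the same iterate_leaves call is pointed at a real state root, so each yielded value is an rlp-encoded Account rather than a made-up string.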

scripts/gethimport.py

Lines changed: 90 additions & 10 deletions

@@ -20,9 +20,15 @@
 from rlp.sedes import CountableList

 from eth.chains.mainnet import MAINNET_GENESIS_HEADER, MainnetChain
+from eth.constants import BLANK_ROOT_HASH, EMPTY_SHA3
 from eth.db.backends.level import LevelDB
 from eth.rlp.headers import BlockHeader
 from eth.rlp.transactions import BaseTransactionFields
+from eth.rlp.accounts import Account
+
+from eth.db.trie_iteration import iterate_leaves
+
+from trie.utils.nibbles import nibbles_to_bytes


 logger = logging.getLogger('importer')

@@ -223,6 +229,26 @@ def block_body(self, block_number: int, header_hash: bytes = None):
         return rlp.decode(raw_data, sedes=BlockBody)


+class ImportDatabase:
+    "Creates a 'ChainDB' which can be passed to the trie_iteration utils"
+    def __init__(self, gethdb, trinitydb):
+        self.gethdb = gethdb
+        self.trinitydb = trinitydb
+
+    def get(self, node_hash):
+        trinity_result = self.trinitydb.get(node_hash)
+        if trinity_result is not None:
+            return trinity_result
+
+        geth_result = self.gethdb.get(node_hash)
+        if geth_result is None:
+            logger.error(f'could not find node for hash: {node_hash.hex()}')
+            assert False
+
+        self.trinitydb.put(node_hash, geth_result)
+        return geth_result
+
+
 def main(args):
     # 1. Open Geth database

@@ -263,27 +289,74 @@ def main(args):
     geth_header = gethdb.block_header(canonical_head.block_number, canonical_head.hash)
     assert geth_header.hash == canonical_head.hash

-    for i in range(canonical_head.block_number, last_block_num + 1):
+    final_block_to_sync = last_block_num
+    if args.syncuntil:
+        final_block_to_sync = min(args.syncuntil, final_block_to_sync)
+
+    for i in range(canonical_head.block_number, final_block_to_sync + 1):
         header_hash = gethdb.header_hash_for_block_number(i)
         header = gethdb.block_header(i, header_hash)

-        body = gethdb.block_body(i)
-        block_class = chain.get_vm_class(header).get_block_class()
-        block = block_class(header, body.transactions, body.uncles)
-        chain.chaindb.persist_block(block)
+        if not args.nobodies:
+            body = gethdb.block_body(i)
+            block_class = chain.get_vm_class(header).get_block_class()
+            block = block_class(header, body.transactions, body.uncles)
+            chain.chaindb.persist_block(block)
+        else:
+            headerdb.persist_header(header)

         if i % 1000 == 0:
             logger.debug(f'current canonical header: {headerdb.get_canonical_head()}')

-    # some final checks, these should never fail
     canonical_head = headerdb.get_canonical_head()
-    geth_last_block_hash = gethdb.last_block_hash
-    geth_last_block_num = gethdb.block_num_for_hash(geth_last_block_hash)
-    assert canonical_head.hash == geth_last_block_hash
-    assert canonical_head.block_number == geth_last_block_num
+    if not args.syncuntil:
+        # similar checks should be run if we added sync until!
+        # some final checks, these should never fail
+        geth_last_block_hash = gethdb.last_block_hash
+        geth_last_block_num = gethdb.block_num_for_hash(geth_last_block_hash)
+        assert canonical_head.hash == geth_last_block_hash
+        assert canonical_head.block_number == geth_last_block_num

     logger.info('finished importing headers + bodies')

+    if args.justblocks:
+        return
+
+    state_root = canonical_head.state_root
+    logger.info(f'starting state trie import: {humanize_hash(state_root)}')
+
+    # 4. Import the state trie + storage tries
+    # Write something which iterates over the entire trie, from left to right
+    # Pass it a database which first looks in the trinity db, and if nothing is there
+    # copies the requested node from geth->trinity before returning it
+
+    imported_leaf_count = 0
+    importdb = ImportDatabase(gethdb=gethdb.db, trinitydb=leveldb.db)
+    for path, leaf_data in iterate_leaves(importdb, state_root):
+        account = rlp.decode(leaf_data, sedes=Account)
+        addr_hash = nibbles_to_bytes(path)
+
+        if account.code_hash != EMPTY_SHA3:
+            bytecode = importdb.get(account.code_hash)
+
+        if account.storage_root == BLANK_ROOT_HASH:
+            imported_leaf_count += 1
+
+            if imported_leaf_count % 1000 == 0:
+                logger.debug(f'progress sha(addr)={addr_hash.hex()}')
+            continue
+
+        for path, leaf_data in iterate_leaves(importdb, account.storage_root):
+            item_addr = nibbles_to_bytes(path)
+            imported_leaf_count += 1
+
+            if imported_leaf_count % 1000 == 0:
+                logger.debug(f'progress sha(addr)={addr_hash.hex()} sha(item)={item_addr.hex()}')
+
+    logger.info('successfully imported state trie and all storage tries')
+

 if __name__ == "__main__":
     logging.basicConfig(

@@ -295,6 +368,13 @@ def main(args):
     parser = argparse.ArgumentParser()
     parser.add_argument('-gethdb', type=str, required=True)
     parser.add_argument('-destdb', type=str, required=True)
+    parser.add_argument('-justblocks', action='store_true')
+    parser.add_argument('-nobodies', action='store_true')
+    parser.add_argument('-syncuntil', type=int, action='store')
     args = parser.parse_args()

     main(args)
+
+    logger.warning('Some features are not yet implemented:')
+    logger.warning('- Receipts were not imported')
+    logger.warning('- This script did not verify that the chain configs match')
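
For reference (not part of the commit): a small sketch of the copy-on-read behaviour the state import relies on. ImportDatabase reads fall through to the geth database and are written back into the trinity database, so a full trie walk doubles as an import. The DictDB stand-in below is hypothetical; the real script passes the geth and trinity LevelDB wrappers.

# assumes ImportDatabase from scripts/gethimport.py is in scope
class DictDB:
    """Hypothetical stand-in for the LevelDB wrappers, exposing get/put."""
    def __init__(self):
        self.kv = {}

    def get(self, key):
        return self.kv.get(key)

    def put(self, key, value):
        self.kv[key] = value


source = DictDB()                      # stands in for the geth database
source.put(b'node-hash', b'node-rlp')
dest = DictDB()                        # stands in for the trinity database

importdb = ImportDatabase(gethdb=source, trinitydb=dest)
assert importdb.get(b'node-hash') == b'node-rlp'
assert dest.get(b'node-hash') == b'node-rlp'  # copied over as a side effect of the read

The new flags compose with this: -syncuntil N caps the header/body range, -nobodies persists headers only, and -justblocks returns before the state trie import starts.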
