Skip to content

Commit 561e26a

Browse files
committed
Initial prototype - imports block headers from geth
1 parent 10df2a6 commit 561e26a

File tree

2 files changed

+250
-0
lines changed

2 files changed

+250
-0
lines changed

scripts/gethimport.py

Lines changed: 249 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,249 @@
1+
#!/usr/bin/env python
2+
3+
"""
4+
Create a Trinity database by importing the current state of a Geth database
5+
"""
6+
7+
import argparse
8+
import os
9+
import os.path
10+
from pathlib import Path
11+
import shutil
12+
import snappy
13+
import struct
14+
15+
import plyvel
16+
17+
from eth_utils import humanize_hash
18+
import rlp
19+
20+
from eth.chains.mainnet import MAINNET_GENESIS_HEADER, MainnetChain
21+
from eth.db.backends.level import LevelDB
22+
from eth.rlp.headers import BlockHeader
23+
24+
25+
class GethKeys:
26+
# from https://github.com/ethereum/go-ethereum/blob/master/core/rawdb/schema.go
27+
DatabaseVersion = b'DatabaseVersion'
28+
HeadBlock = b'LastBlock'
29+
30+
headerPrefix = b'h'
31+
headerNumberPrefix = b'H'
32+
headerHashSuffix = b'n'
33+
34+
@classmethod
35+
def header_hash_for_block_number(cls, block_number: int) -> bytes:
36+
"The key to get the hash of the header with the given block number"
37+
packed_block_number = struct.pack('>Q', block_number)
38+
return cls.headerPrefix + packed_block_number + cls.headerHashSuffix
39+
40+
@classmethod
41+
def block_number_for_header_hash(cls, header_hash: bytes) -> bytes:
42+
"The key to get the block number of the header with the given hash"
43+
return cls.headerNumberPrefix + header_hash
44+
45+
@classmethod
46+
def block_header(cls, block_number: int, header_hash: bytes) -> bytes:
47+
packed_block_number = struct.pack('>Q', block_number)
48+
return cls.headerPrefix + packed_block_number + header_hash
49+
50+
51+
class GethFreezerIndexEntry:
52+
def __init__(self, filenum: int, offset: int):
53+
self.filenum = filenum
54+
self.offset = offset
55+
56+
@classmethod
57+
def from_bytes(cls, data: bytes) -> 'GethFreezerIndexEntry':
58+
assert len(data) == 6
59+
filenum, offset = struct.unpack('>HI', data)
60+
return cls(filenum, offset)
61+
62+
def __repr__(self):
63+
return f'IndexEntry(filenum={self.filenum}, offset={self.offset})'
64+
65+
66+
class GethFreezerTable:
67+
def __init__(self, ancient_path, name, uses_compression):
68+
self.ancient_path = ancient_path
69+
self.name = name
70+
self.uses_compression = uses_compression
71+
print(f'opening freezer table. name={self.name}')
72+
73+
self.index_file = open(os.path.join(ancient_path, self.index_file_name), 'rb')
74+
stat_result = os.stat(self.index_file.fileno())
75+
index_file_size = stat_result.st_size
76+
assert index_file_size % 6 == 0, index_file_size
77+
print(f'index_size={index_file_size} ({index_file_size // 6} entries)')
78+
self.entries = index_file_size // 6
79+
80+
first_index_bytes = self.index_file.read(6)
81+
first_index = GethFreezerIndexEntry.from_bytes(first_index_bytes)
82+
print(f'first_index={first_index}')
83+
84+
self.index_file.seek(-6, 2)
85+
last_index_bytes = self.index_file.read(6)
86+
last_index = GethFreezerIndexEntry.from_bytes(last_index_bytes)
87+
print(f'last_index={last_index}')
88+
89+
self._data_files = dict()
90+
91+
@property
92+
def index_file_name(self):
93+
suffix = 'cidx' if self.uses_compression else 'ridx'
94+
return f'{self.name}.{suffix}'
95+
96+
def data_file_name(self, number: int):
97+
suffix = 'cdat' if self.uses_compression else 'rdat'
98+
return f'{self.name}.{number:04d}.{suffix}'
99+
100+
def _data_file(self, number: int):
101+
if number not in self._data_files:
102+
path = os.path.join(self.ancient_path, self.data_file_name(number))
103+
data_file = open(path, 'rb')
104+
self._data_files[number] = data_file
105+
106+
return self._data_files[number]
107+
108+
def get(self, number: int) -> bytes:
109+
assert number < self.entries
110+
111+
self.index_file.seek(number * 6)
112+
entry_bytes = self.index_file.read(6)
113+
start_entry = GethFreezerIndexEntry.from_bytes(entry_bytes)
114+
115+
# What happens if we're trying to read the last item? Won't this fail?
116+
# Is there always one extra entry in the index file?
117+
self.index_file.seek((number+1) * 6)
118+
entry_bytes = self.index_file.read(6)
119+
end_entry = GethFreezerIndexEntry.from_bytes(entry_bytes)
120+
121+
if start_entry.filenum != end_entry.filenum:
122+
# Duplicates logic from freezer_table.go:getBounds
123+
start_entry = GethFreezerIndexEntry(end_entry.filenum, offset=0)
124+
125+
data_file = self._data_file(start_entry.filenum)
126+
data_file.seek(start_entry.offset)
127+
data = data_file.read(end_entry.offset - start_entry.offset)
128+
129+
if not self.uses_compression:
130+
return data
131+
132+
return snappy.decompress(data)
133+
134+
def __del__(self) -> None:
135+
for f in self._data_files.values():
136+
f.close()
137+
self.index_file.close()
138+
139+
140+
class GethDatabase:
141+
def __init__(self, path):
142+
self.db = plyvel.DB(
143+
path,
144+
create_if_missing=False,
145+
error_if_exists=False,
146+
max_open_files=16
147+
)
148+
149+
ancient_path = os.path.join(path, 'ancient')
150+
self.ancient_hashes = GethFreezerTable(ancient_path, 'hashes', False)
151+
self.ancient_headers = GethFreezerTable(ancient_path, 'headers', True)
152+
153+
if self.database_version != b'\x07':
154+
raise Exception(f'geth database version {self.database_version} is not supported')
155+
156+
@property
157+
def database_version(self) -> bytes:
158+
raw_version = self.db.get(GethKeys.DatabaseVersion)
159+
return rlp.decode(raw_version)
160+
161+
@property
162+
def last_block_hash(self) -> bytes:
163+
return self.db.get(GethKeys.HeadBlock)
164+
165+
def block_num_for_hash(self, header_hash: bytes) -> int:
166+
raw_num = self.db.get(GethKeys.block_number_for_header_hash(header_hash))
167+
return struct.unpack('>Q', raw_num)[0]
168+
169+
def block_header(self, block_number: int, header_hash: bytes) -> BlockHeader:
170+
# This also needs to check the ancient db
171+
raw_data = self.db.get(GethKeys.block_header(block_number, header_hash))
172+
if raw_data is not None:
173+
return rlp.decode(raw_data, sedes=BlockHeader)
174+
175+
raw_data = self.ancient_headers.get(block_number)
176+
return rlp.decode(raw_data, sedes=BlockHeader)
177+
178+
def header_hash_for_block_number(self, block_number: int) -> bytes:
179+
# This needs to check the ancient db (freezerHashTable)
180+
result = self.db.get(GethKeys.header_hash_for_block_number(block_number))
181+
182+
if result is not None:
183+
return result
184+
185+
return self.ancient_hashes.get(block_number)
186+
187+
188+
def main(args):
189+
# Open geth database
190+
gethdb = GethDatabase(args.gethdb)
191+
192+
last_block = gethdb.last_block_hash
193+
last_block_num = gethdb.block_num_for_hash(last_block)
194+
print('geth database opened')
195+
print(f'found chain tip: header_hash={humanize_hash(last_block)} block_number={last_block_num}')
196+
197+
print(f'header: {len(gethdb.block_header(last_block_num, last_block))}')
198+
199+
genesis_hash = gethdb.header_hash_for_block_number(0)
200+
genesis_header = gethdb.block_header(0, genesis_hash)
201+
print(f'genesis header: {genesis_header}')
202+
assert genesis_header == MAINNET_GENESIS_HEADER
203+
204+
first_hash = gethdb.header_hash_for_block_number(1)
205+
first_block = gethdb.block_header(1, first_hash)
206+
print(f'first header: {first_block}')
207+
208+
# Create trinity database
209+
210+
db_already_existed = False
211+
if os.path.exists(args.destdb):
212+
db_already_existed = True
213+
214+
leveldb = LevelDB(db_path=Path(args.destdb), max_open_files=16)
215+
216+
if not db_already_existed:
217+
print(f'Trinity database did not already exist, initializing it now')
218+
chain = MainnetChain.from_genesis_header(leveldb, MAINNET_GENESIS_HEADER)
219+
else:
220+
chain = MainnetChain(leveldb)
221+
222+
headerdb = chain.headerdb
223+
224+
canonical_head = headerdb.get_canonical_head()
225+
print(f'starting copy from trinity\'s canonical head: {canonical_head}')
226+
227+
# verify the trinity database matches what geth has
228+
geth_header = gethdb.block_header(canonical_head.block_number, canonical_head.hash)
229+
assert geth_header.hash == canonical_head.hash
230+
231+
for i in range(canonical_head.block_number, last_block_num + 1):
232+
header_hash = gethdb.header_hash_for_block_number(i)
233+
header = gethdb.block_header(i, header_hash)
234+
235+
headerdb.persist_header(header)
236+
237+
if i % 1000 == 0:
238+
print(f'current canonical header: {headerdb.get_canonical_head()}')
239+
240+
return
241+
242+
243+
if __name__ == "__main__":
244+
parser = argparse.ArgumentParser()
245+
parser.add_argument('-gethdb', type=str, required=True)
246+
parser.add_argument('-destdb', type=str, required=True)
247+
args = parser.parse_args()
248+
249+
main(args)

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
"pyethash>=0.1.27,<1.0.0",
1818
"rlp>=1.1.0,<2.0.0",
1919
"trie>=1.4.0,<2.0.0",
20+
"python-snappy==0.5.4",
2021
],
2122
# The eth-extra sections is for libraries that the evm does not
2223
# explicitly need to function and hence should not depend on.

0 commit comments

Comments
 (0)