Skip to content

Commit c576fa1

Browse files
zzyalbertrjl493456442holiman
authored
core: fix snapshot missing when recovery from crash (#23496)
The bug occurs because writing a known block only checks for the block and its state, not for the snapshot, which can leave a gap between the newest snapshot and the newest block state. New blocks that would have repaired the snapshot were then ignored, since their state was already known. Co-authored-by: Gary Rong <[email protected]> Co-authored-by: Martin Holst Swende <[email protected]>
1 parent c2e64db commit c576fa1

File tree

3 files changed

+182
-10
lines changed

3 files changed

+182
-10
lines changed

core/blockchain.go

Lines changed: 53 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1436,11 +1436,10 @@ func (bc *BlockChain) insertChain(chain types.Blocks, verifySeals bool) (int, er
14361436

14371437
// Peek the error for the first block to decide the directing import logic
14381438
it := newInsertIterator(chain, results, bc.validator)
1439-
14401439
block, err := it.next()
14411440

1442-
// Left-trim all the known blocks
1443-
if err == ErrKnownBlock {
1441+
// Left-trim all the known blocks that don't need to build snapshot
1442+
if bc.skipBlock(err, it) {
14441443
// First block (and state) is known
14451444
// 1. We did a roll-back, and should now do a re-import
14461445
// 2. The block is stored as a sidechain, and is lying about its stateroot, and passes a stateroot
@@ -1451,7 +1450,7 @@ func (bc *BlockChain) insertChain(chain types.Blocks, verifySeals bool) (int, er
14511450
localTd = bc.GetTd(current.Hash(), current.NumberU64())
14521451
externTd = bc.GetTd(block.ParentHash(), block.NumberU64()-1) // The first block can't be nil
14531452
)
1454-
for block != nil && err == ErrKnownBlock {
1453+
for block != nil && bc.skipBlock(err, it) {
14551454
externTd = new(big.Int).Add(externTd, block.Difficulty())
14561455
if localTd.Cmp(externTd) < 0 {
14571456
break
@@ -1469,7 +1468,7 @@ func (bc *BlockChain) insertChain(chain types.Blocks, verifySeals bool) (int, er
14691468
// When node runs a fast sync again, it can re-import a batch of known blocks via
14701469
// `insertChain` while a part of them have higher total difficulty than current
14711470
// head full block(new pivot point).
1472-
for block != nil && err == ErrKnownBlock {
1471+
for block != nil && bc.skipBlock(err, it) {
14731472
log.Debug("Writing previously known block", "number", block.Number(), "hash", block.Hash())
14741473
if err := bc.writeKnownBlock(block); err != nil {
14751474
return it.index, err
@@ -1501,8 +1500,10 @@ func (bc *BlockChain) insertChain(chain types.Blocks, verifySeals bool) (int, er
15011500
// If there are any still remaining, mark as ignored
15021501
return it.index, err
15031502

1504-
// Some other error occurred, abort
1505-
case err != nil:
1503+
// Some other error(except ErrKnownBlock) occurred, abort.
1504+
// ErrKnownBlock is allowed here since some known blocks
1505+
// still need re-execution to generate snapshots that are missing
1506+
case err != nil && !errors.Is(err, ErrKnownBlock):
15061507
bc.futureBlocks.Remove(block.Hash())
15071508
stats.ignored += len(it.chain)
15081509
bc.reportBlock(block, nil, err)
@@ -1520,7 +1521,7 @@ func (bc *BlockChain) insertChain(chain types.Blocks, verifySeals bool) (int, er
15201521
}
15211522
}()
15221523

1523-
for ; block != nil && err == nil || err == ErrKnownBlock; block, err = it.next() {
1524+
for ; block != nil && err == nil || errors.Is(err, ErrKnownBlock); block, err = it.next() {
15241525
// If the chain is terminating, stop processing blocks
15251526
if bc.insertStopped() {
15261527
log.Debug("Abort during block processing")
@@ -1535,8 +1536,9 @@ func (bc *BlockChain) insertChain(chain types.Blocks, verifySeals bool) (int, er
15351536
// Clique blocks where they can share state among each other, so importing an
15361537
// older block might complete the state of the subsequent one. In this case,
15371538
// just skip the block (we already validated it once fully (and crashed), since
1538-
// its header and body was already in the database).
1539-
if err == ErrKnownBlock {
1539+
// its header and body was already in the database). But if the corresponding
1540+
// snapshot layer is missing, forcibly rerun the execution to build it.
1541+
if bc.skipBlock(err, it) {
15401542
logger := log.Debug
15411543
if bc.chainConfig.Clique == nil {
15421544
logger = log.Warn
@@ -2013,6 +2015,47 @@ func (bc *BlockChain) futureBlocksLoop() {
20132015
}
20142016
}
20152017

2018+
// skipBlock returns 'true', if the block being imported can be skipped over, meaning
2019+
// that the block does not need to be processed but can be considered already fully 'done'.
2020+
func (bc *BlockChain) skipBlock(err error, it *insertIterator) bool {
2021+
// We can only ever bypass processing if the only error returned by the validator
2022+
// is ErrKnownBlock, which means all checks passed, but we already have the block
2023+
// and state.
2024+
if !errors.Is(err, ErrKnownBlock) {
2025+
return false
2026+
}
2027+
// If we're not using snapshots, we can skip this, since we have both block
2028+
// and (trie-) state
2029+
if bc.snaps == nil {
2030+
return true
2031+
}
2032+
var (
2033+
header = it.current() // header can't be nil
2034+
parentRoot common.Hash
2035+
)
2036+
// If we also have the snapshot-state, we can skip the processing.
2037+
if bc.snaps.Snapshot(header.Root) != nil {
2038+
return true
2039+
}
2040+
// In this case, we have the trie-state but not snapshot-state. If the parent
2041+
// snapshot-state exists, we need to process this in order to not get a gap
2042+
// in the snapshot layers.
2043+
// Resolve parent block
2044+
if parent := it.previous(); parent != nil {
2045+
parentRoot = parent.Root
2046+
} else if parent = bc.GetHeaderByHash(header.ParentHash); parent != nil {
2047+
parentRoot = parent.Root
2048+
}
2049+
if parentRoot == (common.Hash{}) {
2050+
return false // Theoretically impossible case
2051+
}
2052+
// Parent is also missing snapshot: we can skip this. Otherwise process.
2053+
if bc.snaps.Snapshot(parentRoot) == nil {
2054+
return true
2055+
}
2056+
return false
2057+
}
2058+
20162059
// maintainTxIndex is responsible for the construction and deletion of the
20172060
// transaction index.
20182061
//

core/blockchain_insert.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,14 @@ func (it *insertIterator) previous() *types.Header {
150150
return it.chain[it.index-1].Header()
151151
}
152152

153+
// current returns the current header that is being processed, or nil.
154+
func (it *insertIterator) current() *types.Header {
155+
if it.index == -1 || it.index >= len(it.chain) {
156+
return nil
157+
}
158+
return it.chain[it.index].Header()
159+
}
160+
153161
// first returns the first block in the it.
154162
func (it *insertIterator) first() *types.Block {
155163
return it.chain[0]

core/blockchain_repair_test.go

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1863,3 +1863,124 @@ func testRepair(t *testing.T, tt *rewindTest, snapshots bool) {
18631863
t.Errorf("Frozen block count mismatch: have %d, want %d", frozen, tt.expFrozen)
18641864
}
18651865
}
1866+
1867+
// TestIssue23496 tests scenario described in https://github.com/ethereum/go-ethereum/pull/23496#issuecomment-926393893
1868+
// Credits to @zzyalbert for finding the issue.
1869+
//
1870+
// Local chain owns these blocks:
1871+
// G B1 B2 B3 B4
1872+
// B1: state committed
1873+
// B2: snapshot disk layer
1874+
// B3: state committed
1875+
// B4: head block
1876+
//
1877+
// Crash happens without fully persisting snapshot and in-memory states,
1878+
// chain rewinds itself to the B1 (skip B3 in order to recover snapshot)
1879+
// In this case the snapshot layer of B3 is not created because of existent
1880+
// state.
1881+
func TestIssue23496(t *testing.T) {
1882+
// It's hard to follow the test case, visualize the input
1883+
//log.Root().SetHandler(log.LvlFilterHandler(log.LvlTrace, log.StreamHandler(os.Stderr, log.TerminalFormat(true))))
1884+
1885+
// Create a temporary persistent database
1886+
datadir, err := ioutil.TempDir("", "")
1887+
if err != nil {
1888+
t.Fatalf("Failed to create temporary datadir: %v", err)
1889+
}
1890+
os.RemoveAll(datadir)
1891+
1892+
db, err := rawdb.NewLevelDBDatabaseWithFreezer(datadir, 0, 0, datadir, "", false)
1893+
if err != nil {
1894+
t.Fatalf("Failed to create persistent database: %v", err)
1895+
}
1896+
defer db.Close() // Might double close, should be fine
1897+
1898+
// Initialize a fresh chain
1899+
var (
1900+
genesis = (&Genesis{BaseFee: big.NewInt(params.InitialBaseFee)}).MustCommit(db)
1901+
engine = ethash.NewFullFaker()
1902+
config = &CacheConfig{
1903+
TrieCleanLimit: 256,
1904+
TrieDirtyLimit: 256,
1905+
TrieTimeLimit: 5 * time.Minute,
1906+
SnapshotLimit: 256,
1907+
SnapshotWait: true,
1908+
}
1909+
)
1910+
chain, err := NewBlockChain(db, config, params.AllEthashProtocolChanges, engine, vm.Config{}, nil, nil)
1911+
if err != nil {
1912+
t.Fatalf("Failed to create chain: %v", err)
1913+
}
1914+
blocks, _ := GenerateChain(params.TestChainConfig, genesis, engine, rawdb.NewMemoryDatabase(), 4, func(i int, b *BlockGen) {
1915+
b.SetCoinbase(common.Address{0x02})
1916+
b.SetDifficulty(big.NewInt(1000000))
1917+
})
1918+
1919+
// Insert block B1 and commit the state into disk
1920+
if _, err := chain.InsertChain(blocks[:1]); err != nil {
1921+
t.Fatalf("Failed to import canonical chain start: %v", err)
1922+
}
1923+
chain.stateCache.TrieDB().Commit(blocks[0].Root(), true, nil)
1924+
1925+
// Insert block B2 and commit the snapshot into disk
1926+
if _, err := chain.InsertChain(blocks[1:2]); err != nil {
1927+
t.Fatalf("Failed to import canonical chain start: %v", err)
1928+
}
1929+
if err := chain.snaps.Cap(blocks[1].Root(), 0); err != nil {
1930+
t.Fatalf("Failed to flatten snapshots: %v", err)
1931+
}
1932+
1933+
// Insert block B3 and commit the state into disk
1934+
if _, err := chain.InsertChain(blocks[2:3]); err != nil {
1935+
t.Fatalf("Failed to import canonical chain start: %v", err)
1936+
}
1937+
chain.stateCache.TrieDB().Commit(blocks[2].Root(), true, nil)
1938+
1939+
// Insert the remaining blocks
1940+
if _, err := chain.InsertChain(blocks[3:]); err != nil {
1941+
t.Fatalf("Failed to import canonical chain tail: %v", err)
1942+
}
1943+
1944+
// Pull the plug on the database, simulating a hard crash
1945+
db.Close()
1946+
1947+
// Start a new blockchain back up and see where the repair leads us
1948+
db, err = rawdb.NewLevelDBDatabaseWithFreezer(datadir, 0, 0, datadir, "", false)
1949+
if err != nil {
1950+
t.Fatalf("Failed to reopen persistent database: %v", err)
1951+
}
1952+
defer db.Close()
1953+
1954+
chain, err = NewBlockChain(db, nil, params.AllEthashProtocolChanges, engine, vm.Config{}, nil, nil)
1955+
if err != nil {
1956+
t.Fatalf("Failed to recreate chain: %v", err)
1957+
}
1958+
defer chain.Stop()
1959+
1960+
if head := chain.CurrentHeader(); head.Number.Uint64() != uint64(4) {
1961+
t.Errorf("Head header mismatch: have %d, want %d", head.Number, 4)
1962+
}
1963+
if head := chain.CurrentFastBlock(); head.NumberU64() != uint64(4) {
1964+
t.Errorf("Head fast block mismatch: have %d, want %d", head.NumberU64(), uint64(4))
1965+
}
1966+
if head := chain.CurrentBlock(); head.NumberU64() != uint64(1) {
1967+
t.Errorf("Head block mismatch: have %d, want %d", head.NumberU64(), uint64(1))
1968+
}
1969+
1970+
// Reinsert B2-B4
1971+
if _, err := chain.InsertChain(blocks[1:]); err != nil {
1972+
t.Fatalf("Failed to import canonical chain tail: %v", err)
1973+
}
1974+
if head := chain.CurrentHeader(); head.Number.Uint64() != uint64(4) {
1975+
t.Errorf("Head header mismatch: have %d, want %d", head.Number, 4)
1976+
}
1977+
if head := chain.CurrentFastBlock(); head.NumberU64() != uint64(4) {
1978+
t.Errorf("Head fast block mismatch: have %d, want %d", head.NumberU64(), uint64(4))
1979+
}
1980+
if head := chain.CurrentBlock(); head.NumberU64() != uint64(4) {
1981+
t.Errorf("Head block mismatch: have %d, want %d", head.NumberU64(), uint64(4))
1982+
}
1983+
if layer := chain.Snapshots().Snapshot(blocks[2].Root()); layer == nil {
1984+
t.Error("Failed to regenerate the snapshot of known state")
1985+
}
1986+
}

0 commit comments

Comments
 (0)