core: improve snapshot journal recovery (#21594)

* core/state/snapshot: introduce snapshot journal version

* core: update the disk layer in an atomic way

* core: persist the disk layer generator periodically

* core/state/snapshot: improve logging

* core/state/snapshot: forcibly ensure the legacy snapshot is matched

* core/state/snapshot: add debug logs

* core, tests: fix tests and special recovery case

* core: polish

* core: add more blockchain tests for snapshot recovery

* core/state: fix comment

* core: add recovery flag for snapshot

* core: add restart after start-after-crash tests

* core/rawdb: fix imports

* core: fix tests

* core: remove log

* core/state/snapshot: fix snapshot

* core: avoid callbacks in SetHead

* core: fix setHead cornercase where the threshold root has state

* core: small docs for the test cases

Co-authored-by: Péter Szilágyi <peterke@gmail.com>
Author: gary rong
Date: 2020-10-30 03:01:58 +08:00
Committed by: GitHub
Parent: 43c278cdf9
Commit: b63e3c37a6
11 changed files with 1792 additions and 159 deletions


@@ -29,6 +29,7 @@ import (
"github.com/ethereum/go-ethereum/ethdb"
"github.com/ethereum/go-ethereum/log"
"github.com/ethereum/go-ethereum/metrics"
"github.com/ethereum/go-ethereum/rlp"
"github.com/ethereum/go-ethereum/trie"
)
@@ -136,6 +137,10 @@ type snapshot interface {
// flattening everything down (bad for reorgs).
Journal(buffer *bytes.Buffer) (common.Hash, error)
// LegacyJournal is basically identical to Journal. It's the legacy version for
// flushing the legacy journal. Currently the only purpose of this function is testing.
LegacyJournal(buffer *bytes.Buffer) (common.Hash, error)
// Stale returns whether this layer has become stale (was flattened across) or
// if it's still live.
Stale() bool
@@ -168,10 +173,12 @@ type Tree struct {
// store (with a number of memory layers from a journal), ensuring that the head
// of the snapshot matches the expected one.
//
// If the snapshot is missing or inconsistent, the entirety is deleted and will
// be reconstructed from scratch based on the tries in the key-value store, on a
// background thread.
func New(diskdb ethdb.KeyValueStore, triedb *trie.Database, cache int, root common.Hash, async bool) *Tree {
// If the snapshot is missing or the disk layer is broken, the entire snapshot
// is deleted and will be reconstructed from scratch based on the tries in the
// key-value store, on a background thread. If the memory layers from the journal
// are not continuous with the disk layer, or the journal is missing, all diffs
// will be discarded iff it's in "recovery" mode, otherwise a rebuild is mandatory.
func New(diskdb ethdb.KeyValueStore, triedb *trie.Database, cache int, root common.Hash, async bool, recovery bool) *Tree {
// Create a new, empty snapshot tree
snap := &Tree{
diskdb: diskdb,
@@ -183,7 +190,7 @@ func New(diskdb ethdb.KeyValueStore, triedb *trie.Database, cache int, root comm
defer snap.waitBuild()
}
// Attempt to load a previously persisted snapshot and rebuild one if failed
head, err := loadSnapshot(diskdb, triedb, cache, root)
head, err := loadSnapshot(diskdb, triedb, cache, root, recovery)
if err != nil {
log.Warn("Failed to load snapshot, regenerating", "err", err)
snap.Rebuild(root)
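
As a usage sketch only (not part of this diff; the helper name and the in-memory database are illustrative assumptions), the new constructor signature with the recovery flag might be wired up like this:

package snapsketch

import (
	"github.com/ethereum/go-ethereum/common"
	"github.com/ethereum/go-ethereum/core/rawdb"
	"github.com/ethereum/go-ethereum/core/state/snapshot"
	"github.com/ethereum/go-ethereum/trie"
)

// openSnaps shows how a caller might pass the new recovery flag. In the real
// blockchain code the flag would be derived from a persisted recovery marker
// left behind by an unclean-shutdown rewind; here it is just a parameter.
func openSnaps(headRoot common.Hash, recovered bool) *snapshot.Tree {
	diskdb := rawdb.NewMemoryDatabase() // illustrative; normally the chain's key-value store
	triedb := trie.NewDatabase(diskdb)

	// async=true generates the snapshot on a background thread; recovery=recovered
	// decides whether a non-continuous or missing diff journal merely discards the
	// diffs or forces a full rebuild.
	return snapshot.New(diskdb, triedb, 256, headRoot, true, recovered)
}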
@@ -198,7 +205,7 @@ func New(diskdb ethdb.KeyValueStore, triedb *trie.Database, cache int, root comm
}
// waitBuild blocks until the snapshot finishes rebuilding. This method is meant
// to be used by tests to ensure we're testing what we believe we are.
func (t *Tree) waitBuild() {
// Find the rebuild termination channel
var done chan struct{}
@@ -415,6 +422,9 @@ func (t *Tree) cap(diff *diffLayer, layers int) *diskLayer {
// diffToDisk merges a bottom-most diff into the persistent disk layer underneath
// it. The method will panic if called on a non-bottom-most diff layer.
//
// The disk layer persistence should be operated in an atomic way. All updates should
// be discarded if the whole transition is not finished.
func diffToDisk(bottom *diffLayer) *diskLayer {
var (
base = bottom.parent.(*diskLayer)
@@ -427,8 +437,7 @@ func diffToDisk(bottom *diffLayer) *diskLayer {
base.genAbort <- abort
stats = <-abort
}
// Start by temporarily deleting the current snapshot block marker. This
// ensures that in the case of a crash, the entire snapshot is invalidated.
// Put the deletion in the batch writer and flush all updates in the final step.
rawdb.DeleteSnapshotRoot(batch)
// Mark the original base as stale as we're going to create a new wrapper
@@ -471,12 +480,6 @@ func diffToDisk(bottom *diffLayer) *diskLayer {
base.cache.Set(hash[:], data)
snapshotCleanAccountWriteMeter.Mark(int64(len(data)))
if batch.ValueSize() > ethdb.IdealBatchSize {
if err := batch.Write(); err != nil {
log.Crit("Failed to write account snapshot", "err", err)
}
batch.Reset()
}
snapshotFlushAccountItemMeter.Mark(1)
snapshotFlushAccountSizeMeter.Mark(int64(len(data)))
}
@@ -505,18 +508,33 @@ func diffToDisk(bottom *diffLayer) *diskLayer {
snapshotFlushStorageItemMeter.Mark(1)
snapshotFlushStorageSizeMeter.Mark(int64(len(data)))
}
if batch.ValueSize() > ethdb.IdealBatchSize {
if err := batch.Write(); err != nil {
log.Crit("Failed to write storage snapshot", "err", err)
}
batch.Reset()
}
}
// Update the snapshot block marker and write any remaining data
rawdb.WriteSnapshotRoot(batch, bottom.root)
// Write out the generator marker
entry := journalGenerator{
Done: base.genMarker == nil,
Marker: base.genMarker,
}
if stats != nil {
entry.Wiping = (stats.wiping != nil)
entry.Accounts = stats.accounts
entry.Slots = stats.slots
entry.Storage = uint64(stats.storage)
}
blob, err := rlp.EncodeToBytes(entry)
if err != nil {
panic(fmt.Sprintf("Failed to RLP encode generator %v", err))
}
rawdb.WriteSnapshotGenerator(batch, blob)
// Flush all the updates in a single db operation. Ensure the
// disk layer transition is atomic.
if err := batch.Write(); err != nil {
log.Crit("Failed to write leftover snapshot", "err", err)
}
log.Debug("Journalled disk layer", "root", bottom.root, "complete", base.genMarker == nil)
res := &diskLayer{
root: bottom.root,
cache: base.cache,
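
A stripped-down sketch (illustrative, not the PR's code) of the atomic write pattern described above: every mutation of the disk layer is staged in one batch and committed with a single Write, so a crash leaves either the old or the new disk layer on disk, never a half-updated mix:

package snapsketch

import (
	"github.com/ethereum/go-ethereum/common"
	"github.com/ethereum/go-ethereum/core/rawdb"
	"github.com/ethereum/go-ethereum/ethdb"
	"github.com/ethereum/go-ethereum/log"
)

// atomicDiskUpdate stages every mutation of the disk layer in one batch, so the
// transition from the old root marker to the new one is all-or-nothing.
func atomicDiskUpdate(db ethdb.KeyValueStore, newRoot common.Hash, generatorBlob []byte) {
	batch := db.NewBatch()

	// Stage the invalidation of the old root marker...
	rawdb.DeleteSnapshotRoot(batch)

	// ...the account/storage snapshot entries would be staged here (omitted)...

	// ...and finally the new root marker plus the generator progress marker.
	rawdb.WriteSnapshotRoot(batch, newRoot)
	rawdb.WriteSnapshotGenerator(batch, generatorBlob)

	// A single write makes the disk layer transition atomic.
	if err := batch.Write(); err != nil {
		log.Crit("Failed to write snapshot batch", "err", err)
	}
}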
@@ -554,7 +572,21 @@ func (t *Tree) Journal(root common.Hash) (common.Hash, error) {
t.lock.Lock()
defer t.lock.Unlock()
// Firstly write out the metadata of the journal
journal := new(bytes.Buffer)
if err := rlp.Encode(journal, journalVersion); err != nil {
return common.Hash{}, err
}
diskroot := t.diskRoot()
if diskroot == (common.Hash{}) {
return common.Hash{}, errors.New("invalid disk root")
}
// Secondly write out the disk layer root, ensuring the
// diff journal is continuous with the disk layer.
if err := rlp.Encode(journal, diskroot); err != nil {
return common.Hash{}, err
}
// Finally write out the journal of each layer in reverse order.
base, err := snap.(snapshot).Journal(journal)
if err != nil {
return common.Hash{}, err
@@ -564,6 +596,29 @@ func (t *Tree) Journal(root common.Hash) (common.Hash, error) {
return base, nil
}
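
For illustration (a sketch under the assumption that the journal is an RLP stream of version, disk root, then per-layer journals, as written above; the function name is hypothetical), a loader could validate the new header before trusting any diff layers:

package snapsketch

import (
	"bytes"
	"errors"
	"fmt"

	"github.com/ethereum/go-ethereum/common"
	"github.com/ethereum/go-ethereum/rlp"
)

// checkJournalHeader decodes the two metadata entries the Journal method writes
// first (version, then disk layer root) and verifies the diff journal is usable.
func checkJournalHeader(blob []byte, haveDiskRoot common.Hash, wantVersion uint64) error {
	r := rlp.NewStream(bytes.NewReader(blob), 0)

	// First entry: the journal version. Anything unknown means the journal
	// format changed and the diffs cannot be trusted.
	var version uint64
	if err := r.Decode(&version); err != nil {
		return fmt.Errorf("failed to decode journal version: %v", err)
	}
	if version != wantVersion {
		return fmt.Errorf("journal version mismatch: have %d, want %d", version, wantVersion)
	}
	// Second entry: the disk layer root. The diff layers are only meaningful
	// if they are continuous with the disk layer actually present on disk.
	var root common.Hash
	if err := r.Decode(&root); err != nil {
		return fmt.Errorf("failed to decode disk root: %v", err)
	}
	if root != haveDiskRoot {
		return errors.New("journal is not continuous with the disk layer")
	}
	return nil
}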
// LegacyJournal is basically identical to Journal. It's the legacy
// version for flushing the legacy journal. Currently the only purpose
// of this function is testing.
func (t *Tree) LegacyJournal(root common.Hash) (common.Hash, error) {
// Retrieve the head snapshot to journal
snap := t.Snapshot(root)
if snap == nil {
return common.Hash{}, fmt.Errorf("snapshot [%#x] missing", root)
}
// Run the journaling
t.lock.Lock()
defer t.lock.Unlock()
journal := new(bytes.Buffer)
base, err := snap.(snapshot).LegacyJournal(journal)
if err != nil {
return common.Hash{}, err
}
// Store the journal into the database and return
rawdb.WriteSnapshotJournal(t.diskdb, journal.Bytes())
return base, nil
}
// Rebuild wipes all available snapshot data from the persistent database and
// discards all caches and diff layers. Afterwards, it starts a new snapshot
// generator with the given root hash.
@@ -571,6 +626,10 @@ func (t *Tree) Rebuild(root common.Hash) {
t.lock.Lock()
defer t.lock.Unlock()
// Firstly delete any recovery flag in the database, because now we are
// building a brand new snapshot.
rawdb.DeleteSnapshotRecoveryNumber(t.diskdb)
// Track whether there's a wipe currently running and keep it alive if so
var wiper chan struct{}
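
For context on the recovery flag cleared above, here is a sketch of its assumed lifecycle using the rawdb recovery-number accessors this commit introduces (exact signatures are assumptions of this sketch, not quoted from the diff):

package snapsketch

import (
	"github.com/ethereum/go-ethereum/core/rawdb"
	"github.com/ethereum/go-ethereum/ethdb"
)

// recoveryMarkerLifecycle sketches how the recovery marker is assumed to flow:
// written when the head is rewound without a matching snapshot, consulted on
// startup to decide whether journalled diffs may simply be discarded, and
// deleted (as Rebuild does above) once a brand new snapshot generation begins.
func recoveryMarkerLifecycle(db ethdb.KeyValueStore, rewoundTo uint64) bool {
	// Unclean rewind: remember the block number the snapshot was left at.
	rawdb.WriteSnapshotRecoveryNumber(db, rewoundTo)

	// Startup: a present marker means "recovery mode", so a non-continuous
	// journal only drops the diffs instead of forcing a full rebuild.
	recovering := rawdb.ReadSnapshotRecoveryNumber(db) != nil

	// Rebuild: the marker is dropped because the new snapshot starts clean.
	rawdb.DeleteSnapshotRecoveryNumber(db)
	return recovering
}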
@@ -657,6 +716,16 @@ func (t *Tree) disklayer() *diskLayer {
}
}
// diskRoot is an internal helper function to return the disk layer root.
// The lock of snapTree is assumed to be held already.
func (t *Tree) diskRoot() common.Hash {
disklayer := t.disklayer()
if disklayer == nil {
return common.Hash{}
}
return disklayer.Root()
}
// generating is an internal helper function which reports whether the snapshot
// is still under construction.
func (t *Tree) generating() (bool, error) {
@@ -671,3 +740,11 @@ func (t *Tree) generating() (bool, error) {
defer layer.lock.RUnlock()
return layer.genMarker != nil, nil
}
// DiskRoot is an external helper function to return the disk layer root.
func (t *Tree) DiskRoot() common.Hash {
t.lock.Lock()
defer t.lock.Unlock()
return t.diskRoot()
}