eth: request id dispatcher and direct req/reply APIs (#23576)

* eth: request ID based message dispatcher

* eth: fix dispatcher cancellation, rework fetchers idleness tracker

* eth/downloader: drop peers who refuse to serve advertised chains
Péter Szilágyi (committed by GitHub), 2021-11-26 13:26:03 +02:00
parent 3038e480f5, commit c10a0a62c3
52 changed files with 3213 additions and 3400 deletions
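The mechanism named in the title, a request-ID based message dispatcher, tags every outbound request with a unique id, parks the caller's reply channel under that id, and routes each inbound response straight back to whoever issued the matching request. The snippet below is a minimal, self-contained sketch of that pattern in Go; the `dispatcher`, `request` and `response` names are illustrative stand-ins, not the actual types this commit adds to the eth protocol code.

```go
package main

import (
	"errors"
	"fmt"
	"sync"
)

// response is a hypothetical reply type, carrying the id of the request it answers.
type response struct {
	ID   uint64
	Data interface{}
}

// dispatcher pairs inbound replies with pending requests by request id. It is a
// simplified stand-in for the request-id dispatcher this commit introduces.
type dispatcher struct {
	mu      sync.Mutex
	nextID  uint64
	pending map[uint64]chan *response // request id -> waiting caller
}

func newDispatcher() *dispatcher {
	return &dispatcher{pending: make(map[uint64]chan *response)}
}

// request allocates a fresh request id and registers the channel on which the
// eventual reply must be delivered.
func (d *dispatcher) request() (uint64, chan *response) {
	d.mu.Lock()
	defer d.mu.Unlock()

	d.nextID++
	ch := make(chan *response, 1)
	d.pending[d.nextID] = ch
	return d.nextID, ch
}

// deliver routes an inbound reply to the caller that issued the matching
// request, and rejects replies whose id is unknown (stale or unsolicited).
func (d *dispatcher) deliver(res *response) error {
	d.mu.Lock()
	ch, ok := d.pending[res.ID]
	delete(d.pending, res.ID)
	d.mu.Unlock()

	if !ok {
		return errors.New("unsolicited reply")
	}
	ch <- res
	return nil
}

func main() {
	d := newDispatcher()

	// Caller side: issue a request and keep its private reply channel.
	id, resCh := d.request()

	// Network side: a reply tagged with the same id is routed straight back.
	if err := d.deliver(&response{ID: id, Data: "block headers"}); err != nil {
		fmt.Println("delivery failed:", err)
		return
	}
	fmt.Println((<-resCh).Data)
}
```

With this kind of pairing in place, a caller can block on its own channel for exactly the reply it asked for instead of funnelling every inbound message through shared handler state, which is the style the handler changes below move towards.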

eth/handler.go

@@ -83,8 +83,8 @@ type handlerConfig struct {
TxPool txPool // Transaction pool to propagate from
Merger *consensus.Merger // The manager for eth1/2 transition
Network uint64 // Network identifier to advertise
- Sync downloader.SyncMode // Whether to fast or full sync
- BloomCache uint64 // Megabytes to alloc for fast sync bloom
+ Sync downloader.SyncMode // Whether to snap or full sync
+ BloomCache uint64 // Megabytes to alloc for snap sync bloom
EventMux *event.TypeMux // Legacy event mux, deprecate for `feed`
Checkpoint *params.TrustedCheckpoint // Hard coded checkpoint for sync challenges
Whitelist map[uint64]common.Hash // Hard coded whitelist for sync challenges
@@ -94,8 +94,7 @@ type handler struct {
networkID uint64
forkFilter forkid.Filter // Fork ID filter, constant across the lifetime of the node
- fastSync uint32 // Flag whether fast sync is enabled (gets disabled if we already have blocks)
- snapSync uint32 // Flag whether fast sync should operate on top of the snap protocol
+ snapSync uint32 // Flag whether snap sync is enabled (gets disabled if we already have blocks)
acceptTxs uint32 // Flag whether we're considered synchronised (enables transaction processing)
checkpointNumber uint64 // Block number for the sync progress validator to cross reference
@@ -147,29 +146,26 @@ func newHandler(config *handlerConfig) (*handler, error) {
quitSync: make(chan struct{}),
}
if config.Sync == downloader.FullSync {
- // The database seems empty as the current block is the genesis. Yet the fast
- // block is ahead, so fast sync was enabled for this node at a certain point.
+ // The database seems empty as the current block is the genesis. Yet the snap
+ // block is ahead, so snap sync was enabled for this node at a certain point.
// The scenarios where this can happen are:
- // * if the user manually (or via a bad block) rolled back a fast sync node
+ // * if the user manually (or via a bad block) rolled back a snap sync node
// below the sync point.
- // * the last fast sync is not finished while user specifies a full sync this
+ // * the last snap sync is not finished while user specifies a full sync this
// time. But we don't have any recent state for full sync.
- // In these cases however it's safe to reenable fast sync.
+ // In these cases however it's safe to reenable snap sync.
fullBlock, fastBlock := h.chain.CurrentBlock(), h.chain.CurrentFastBlock()
if fullBlock.NumberU64() == 0 && fastBlock.NumberU64() > 0 {
- h.fastSync = uint32(1)
- log.Warn("Switch sync mode from full sync to fast sync")
+ h.snapSync = uint32(1)
+ log.Warn("Switch sync mode from full sync to snap sync")
}
} else {
if h.chain.CurrentBlock().NumberU64() > 0 {
- // Print warning log if database is not empty to run fast sync.
- log.Warn("Switch sync mode from fast sync to full sync")
+ // Print warning log if database is not empty to run snap sync.
+ log.Warn("Switch sync mode from snap sync to full sync")
} else {
- // If fast sync was requested and our database is empty, grant it
- h.fastSync = uint32(1)
- if config.Sync == downloader.SnapSync {
- h.snapSync = uint32(1)
- }
+ // If snap sync was requested and our database is empty, grant it
+ h.snapSync = uint32(1)
}
}
// If we have trusted checkpoints, enforce them on the chain
@@ -177,14 +173,14 @@ func newHandler(config *handlerConfig) (*handler, error) {
h.checkpointNumber = (config.Checkpoint.SectionIndex+1)*params.CHTFrequency - 1
h.checkpointHash = config.Checkpoint.SectionHead
}
- // Construct the downloader (long sync) and its backing state bloom if fast
+ // Construct the downloader (long sync) and its backing state bloom if snap
// sync is requested. The downloader is responsible for deallocating the state
// bloom when it's done.
// Note: we don't enable it if snap-sync is performed, since it's very heavy
- // and the heal-portion of the snap sync is much lighter than fast. What we particularly
+ // and the heal-portion of the snap sync is much lighter than snap. What we particularly
// want to avoid, is a 90%-finished (but restarted) snap-sync to begin
// indexing the entire trie
- if atomic.LoadUint32(&h.fastSync) == 1 && atomic.LoadUint32(&h.snapSync) == 0 {
+ if atomic.LoadUint32(&h.snapSync) == 1 && atomic.LoadUint32(&h.snapSync) == 0 {
h.stateBloom = trie.NewSyncBloom(config.BloomCache, config.Database)
}
h.downloader = downloader.New(h.checkpointNumber, config.Database, h.stateBloom, h.eventMux, h.chain, nil, h.removePeer)
@@ -236,12 +232,12 @@ func newHandler(config *handlerConfig) (*handler, error) {
log.Warn("Unsynced yet, discarded propagated block", "number", blocks[0].Number(), "hash", blocks[0].Hash())
return 0, nil
}
- // If fast sync is running, deny importing weird blocks. This is a problematic
- // clause when starting up a new network, because fast-syncing miners might not
+ // If snap sync is running, deny importing weird blocks. This is a problematic
+ // clause when starting up a new network, because snap-syncing miners might not
// accept each others' blocks until a restart. Unfortunately we haven't figured
// out a way yet where nodes can decide unilaterally whether the network is new
// or not. This should be fixed if we figure out a solution.
- if atomic.LoadUint32(&h.fastSync) == 1 {
+ if atomic.LoadUint32(&h.snapSync) == 1 {
log.Warn("Fast syncing, discarded propagated block", "number", blocks[0].Number(), "hash", blocks[0].Hash())
return 0, nil
}
@@ -365,30 +361,93 @@ func (h *handler) runEthPeer(peer *eth.Peer, handler eth.Handler) error {
// after this will be sent via broadcasts.
h.syncTransactions(peer)
+ // Create a notification channel for pending requests if the peer goes down
+ dead := make(chan struct{})
+ defer close(dead)
// If we have a trusted CHT, reject all peers below that (avoid fast sync eclipse)
if h.checkpointHash != (common.Hash{}) {
// Request the peer's checkpoint header for chain height/weight validation
- if err := peer.RequestHeadersByNumber(h.checkpointNumber, 1, 0, false); err != nil {
+ resCh := make(chan *eth.Response)
+ if _, err := peer.RequestHeadersByNumber(h.checkpointNumber, 1, 0, false, resCh); err != nil {
return err
}
- // Start a timer to disconnect if the peer doesn't reply in time
- p.syncDrop = time.AfterFunc(syncChallengeTimeout, func() {
- peer.Log().Warn("Checkpoint challenge timed out, dropping", "addr", peer.RemoteAddr(), "type", peer.Name())
- h.removePeer(peer.ID())
- })
- // Make sure it's cleaned up if the peer dies off
- defer func() {
- if p.syncDrop != nil {
- p.syncDrop.Stop()
- p.syncDrop = nil
- }
- }()
+ go func() {
+ timeout := time.NewTimer(syncChallengeTimeout)
+ defer timeout.Stop()
+ select {
+ case res := <-resCh:
+ headers := ([]*types.Header)(*res.Res.(*eth.BlockHeadersPacket))
+ if len(headers) == 0 {
+ // If we're doing a snap sync, we must enforce the checkpoint
+ // block to avoid eclipse attacks. Unsynced nodes are welcome
+ // to connect after we're done joining the network.
+ if atomic.LoadUint32(&h.snapSync) == 1 {
+ peer.Log().Warn("Dropping unsynced node during sync", "addr", peer.RemoteAddr(), "type", peer.Name())
+ res.Done <- errors.New("unsynced node cannot serve sync")
+ return
+ }
+ res.Done <- nil
+ return
+ }
+ // Validate the header and either drop the peer or continue
+ if len(headers) > 1 {
+ res.Done <- errors.New("too many headers in checkpoint response")
+ return
+ }
+ if headers[0].Hash() != h.checkpointHash {
+ res.Done <- errors.New("checkpoint hash mismatch")
+ return
+ }
+ res.Done <- nil
+ case <-timeout.C:
+ peer.Log().Warn("Checkpoint challenge timed out, dropping", "addr", peer.RemoteAddr(), "type", peer.Name())
+ h.removePeer(peer.ID())
+ case <-dead:
+ // Peer handler terminated, abort all goroutines
+ }
+ }()
}
// If we have any explicit whitelist block hashes, request them
- for number := range h.whitelist {
- if err := peer.RequestHeadersByNumber(number, 1, 0, false); err != nil {
+ for number, hash := range h.whitelist {
+ resCh := make(chan *eth.Response)
+ if _, err := peer.RequestHeadersByNumber(number, 1, 0, false, resCh); err != nil {
return err
}
+ go func(number uint64, hash common.Hash) {
+ timeout := time.NewTimer(syncChallengeTimeout)
+ defer timeout.Stop()
+ select {
+ case res := <-resCh:
+ headers := ([]*types.Header)(*res.Res.(*eth.BlockHeadersPacket))
+ if len(headers) == 0 {
+ // Whitelisted blocks are allowed to be missing if the remote
+ // node is not yet synced
+ res.Done <- nil
+ return
+ }
+ // Validate the header and either drop the peer or continue
+ if len(headers) > 1 {
+ res.Done <- errors.New("too many headers in whitelist response")
+ return
+ }
+ if headers[0].Number.Uint64() != number || headers[0].Hash() != hash {
+ peer.Log().Info("Whitelist mismatch, dropping peer", "number", number, "hash", headers[0].Hash(), "want", hash)
+ res.Done <- errors.New("whitelist block mismatch")
+ return
+ }
+ peer.Log().Debug("Whitelist block verified", "number", number, "hash", hash)
+ case <-timeout.C:
+ peer.Log().Warn("Whitelist challenge timed out, dropping", "addr", peer.RemoteAddr(), "type", peer.Name())
+ h.removePeer(peer.ID())
+ }
+ }(number, hash)
}
// Handle incoming messages until the connection is torn down
return handler(peer)
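The checkpoint and whitelist challenges added above share one caller-side recipe: make a fresh response channel, hand it to the request method, wait in a goroutine for the reply or a timeout, and report the verdict on the response's Done channel. Below is a compact, self-contained sketch of that recipe using hypothetical `reply` and `challenge` types; it mirrors the shape of the code above rather than the exact eth.Response API.

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// reply is a hypothetical response object: a payload plus a Done channel on
// which the consumer reports whether the delivery was acceptable.
type reply struct {
	Headers []uint64   // stand-in for the returned block headers
	Done    chan error // consumer verdict, read back by the delivering side
}

// challenge mirrors the wait-validate-report flow used above for the checkpoint
// and whitelist requests: wait for the reply or a timeout, then either accept
// it or signal that the peer misbehaved.
func challenge(resCh chan *reply, want uint64, timeout time.Duration, drop func()) {
	timer := time.NewTimer(timeout)
	defer timer.Stop()

	select {
	case res := <-resCh:
		switch {
		case len(res.Headers) != 1:
			res.Done <- errors.New("unexpected number of headers")
		case res.Headers[0] != want:
			res.Done <- errors.New("header mismatch")
		default:
			res.Done <- nil // reply accepted
		}
	case <-timer.C:
		drop() // the peer never answered, disconnect it
	}
}

func main() {
	resCh := make(chan *reply, 1)
	done := make(chan error, 1)

	// Pretend the dispatcher delivered a matching reply before the deadline.
	resCh <- &reply{Headers: []uint64{42}, Done: done}

	go challenge(resCh, 42, time.Second, func() { fmt.Println("dropping peer") })
	fmt.Println("verdict:", <-done) // prints: verdict: <nil>
}
```

The Done channel is what carries the caller's verdict back to whatever delivered the response, so bad replies can be acted on rather than silently discarded, in line with the commit message's "drop peers who refuse to serve advertised chains".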