Fix node sync issue that caused failure to sign and nodes being kicked out (#2564)

* Only roll up last crosslink for beacon chain

* Fix node syncing issue

* Add more logs

* Fix sync loop

* Remove prepared check during syncing

* Update debug info
Committed by Rongjian Lan via GitHub
parent 146e1c2f80
commit 2cbb3fb2e1
Changed files (lines changed):
  1. api/service/syncing/syncing.go (2)
  2. consensus/checks.go (15)
  3. consensus/consensus.go (9)
  4. consensus/consensus_v2.go (18)
  5. consensus/validator.go (44)
  6. core/offchain.go (19)
  7. node/node_handler.go (38)
  8. node/node_syncing.go (89)

api/service/syncing/syncing.go

@@ -805,7 +805,7 @@ Loop:
 		}
 		ss.purgeOldBlocksFromCache()
 		if consensus != nil {
-			consensus.UpdateConsensusInformation()
+			consensus.SetMode(consensus.UpdateConsensusInformation())
 		}
 	}
 }
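The point of this one-liner: UpdateConsensusInformation appears to return the mode the node should run in after the committee is refreshed, and its return value was previously dropped; feeding it into SetMode means a node that is not (or no longer) in the committee ends up observing instead of trying to sign. A minimal sketch of that pattern with hypothetical stand-in types (the Listening-when-not-in-committee rule is an assumption for illustration, not the real implementation):

package main

import "fmt"

// Hypothetical stand-ins: the real Consensus.UpdateConsensusInformation
// returns a mode value that SetMode accepts.
type Mode int

const (
	Normal Mode = iota
	Listening
	Syncing
)

type Consensus struct {
	mode        Mode
	inCommittee bool
}

func (c *Consensus) UpdateConsensusInformation() Mode {
	if !c.inCommittee {
		return Listening // assumption: observe only, do not sign
	}
	return Normal
}

func (c *Consensus) SetMode(m Mode) { c.mode = m }

func main() {
	c := &Consensus{inCommittee: false}
	// Before the fix the returned mode was discarded; now it is applied.
	c.SetMode(c.UpdateConsensusInformation())
	fmt.Println(c.mode == Listening) // true
}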

consensus/checks.go

@@ -11,6 +11,11 @@ import (
 const MaxBlockNumDiff = 100

 func (consensus *Consensus) validatorSanityChecks(msg *msg_pb.Message) bool {
+	consensus.getLogger().Debug().
+		Uint64("blockNum", msg.GetConsensus().BlockNum).
+		Uint64("viewID", msg.GetConsensus().ViewId).
+		Str("msgType", msg.Type.String()).
+		Msg("[validatorSanityChecks] Checking new message")
 	senderKey, err := consensus.verifySenderKey(msg)
 	if err != nil {
 		if err == shard.ErrValidNotInCommittee {
@@ -42,6 +47,11 @@ func (consensus *Consensus) validatorSanityChecks(msg *msg_pb.Message) bool {
 }

 func (consensus *Consensus) leaderSanityChecks(msg *msg_pb.Message) bool {
+	consensus.getLogger().Debug().
+		Uint64("blockNum", msg.GetConsensus().BlockNum).
+		Uint64("viewID", msg.GetConsensus().ViewId).
+		Str("msgType", msg.Type.String()).
+		Msg("[leaderSanityChecks] Checking new message")
 	senderKey, err := consensus.verifySenderKey(msg)
 	if err != nil {
 		if err == shard.ErrValidNotInCommittee {
@@ -186,6 +196,11 @@ func (consensus *Consensus) onPreparedSanityChecks(
 }

 func (consensus *Consensus) viewChangeSanityCheck(msg *msg_pb.Message) bool {
+	consensus.getLogger().Debug().
+		Uint64("blockNum", msg.GetConsensus().BlockNum).
+		Uint64("viewID", msg.GetConsensus().ViewId).
+		Str("msgType", msg.Type.String()).
+		Msg("[viewChangeSanityCheck] Checking new message")
 	senderKey, err := consensus.verifyViewChangeSenderKey(msg)
 	if err != nil {
 		consensus.getLogger().Error().Err(err).Msgf(

consensus/consensus.go

@@ -112,7 +112,7 @@ type Consensus struct {
 	// verified block to state sync broadcast
 	VerifiedNewBlock chan *types.Block
 	// will trigger state syncing when blockNum is low
-	blockNumLowChan chan struct{}
+	BlockNumLowChan chan struct{}
 	// Channel for DRG protocol to send pRnd (preimage of randomness resulting from combined vrf
 	// randomnesses) to consensus. The first 32 bytes are randomness, the rest is for bitmap.
 	PRndChannel chan []byte
@@ -163,11 +163,6 @@ func (consensus *Consensus) BlocksNotSynchronized() {
 	consensus.syncNotReadyChan <- struct{}{}
 }

-// WaitForSyncing informs the node syncing service to start syncing
-func (consensus *Consensus) WaitForSyncing() {
-	<-consensus.blockNumLowChan
-}
-
 // VdfSeedSize returns the number of VRFs for VDF computation
 func (consensus *Consensus) VdfSeedSize() int {
 	return int(consensus.Decider.ParticipantsCount()) * 2 / 3
@@ -204,7 +199,7 @@ func New(
 	consensus.Decider = Decider
 	consensus.host = host
 	consensus.msgSender = NewMessageSender(host)
-	consensus.blockNumLowChan = make(chan struct{})
+	consensus.BlockNumLowChan = make(chan struct{})
 	// FBFT related
 	consensus.FBFTLog = NewFBFTLog()
 	consensus.phase = FBFTAnnounce
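The blocking WaitForSyncing helper is removed and the channel is exported, so the node package can select on it directly (see the node_syncing.go hunk below), while the sender side pushes into it from a goroutine (see the onCommitted change in validator.go) so consensus never stalls on the unbuffered channel. A rough, self-contained sketch of that producer/consumer shape; the timeouts are illustrative and not in the original code:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Plays the role of the exported, unbuffered Consensus.BlockNumLowChan.
	blockNumLowChan := make(chan struct{})

	// Producer (roughly what onCommitted does when it sees it is behind):
	// send from a goroutine via select so the message handler never blocks.
	go func() {
		select {
		case blockNumLowChan <- struct{}{}:
			fmt.Println("signaled: switch to Syncing, stop consensus timers")
		case <-time.After(time.Second): // illustrative guard, not in the original
		}
	}()

	// Consumer (roughly what the node sync loop does): wait on the channel
	// instead of calling the removed blocking WaitForSyncing helper.
	select {
	case <-blockNumLowChan:
		fmt.Println("triggering state sync")
	case <-time.After(time.Second):
	}
}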

consensus/consensus_v2.go

@@ -32,12 +32,9 @@ func (consensus *Consensus) handleMessageUpdate(payload []byte) {
 	// when node is in ViewChanging mode, it still accepts normal messages into FBFTLog
 	// in order to avoid possible trap forever but drop PREPARE and COMMIT
 	// which are message types specifically for a node acting as leader
-	switch {
-	case (consensus.current.Mode() == ViewChanging) &&
+	if (consensus.current.Mode() == ViewChanging) &&
 		(msg.Type == msg_pb.MessageType_PREPARE ||
-			msg.Type == msg_pb.MessageType_COMMIT):
-		return
-	case consensus.current.Mode() == Listening:
+			msg.Type == msg_pb.MessageType_COMMIT) {
 		return
 	}
@@ -62,16 +59,8 @@ func (consensus *Consensus) handleMessageUpdate(payload []byte) {
 		}
 	}

-	notMemberButStillCatchup := !consensus.Decider.AmIMemberOfCommitee() &&
-		msg.Type == msg_pb.MessageType_COMMITTED
-
-	if notMemberButStillCatchup {
-		consensus.onCommitted(msg)
-		return
-	}
-
 	intendedForValidator, intendedForLeader :=
-		!(consensus.IsLeader() && consensus.current.Mode() == Normal),
+		!consensus.IsLeader(),
 		consensus.IsLeader()

 	switch t := msg.Type; true {
@@ -485,7 +474,6 @@ func (consensus *Consensus) Start(
 			consensus.announce(newBlock)
 		case msg := <-consensus.MsgChan:
-			consensus.getLogger().Debug().Msg("[ConsensusMainLoop] MsgChan")
 			consensus.handleMessageUpdate(msg)
 		case viewID := <-consensus.commitFinishChan:

consensus/validator.go

@@ -69,15 +69,17 @@ func (consensus *Consensus) prepare() {
 	}

 	// TODO: this will not return immediatey, may block
-	if err := consensus.msgSender.SendWithoutRetry(
-		groupID,
-		host.ConstructP2pMessage(byte(17), networkMessage.Bytes),
-	); err != nil {
-		consensus.getLogger().Warn().Err(err).Msg("[OnAnnounce] Cannot send prepare message")
-	} else {
-		consensus.getLogger().Info().
-			Str("blockHash", hex.EncodeToString(consensus.blockHash[:])).
-			Msg("[OnAnnounce] Sent Prepare Message!!")
+	if consensus.current.Mode() != Listening {
+		if err := consensus.msgSender.SendWithoutRetry(
+			groupID,
+			host.ConstructP2pMessage(byte(17), networkMessage.Bytes),
+		); err != nil {
+			consensus.getLogger().Warn().Err(err).Msg("[OnAnnounce] Cannot send prepare message")
+		} else {
+			consensus.getLogger().Info().
+				Str("blockHash", hex.EncodeToString(consensus.blockHash[:])).
+				Msg("[OnAnnounce] Sent Prepare Message!!")
+		}
 	}
 	}
 	consensus.getLogger().Debug().
@@ -207,16 +209,18 @@ func (consensus *Consensus) onPrepared(msg *msg_pb.Message) {
 		time.Sleep(consensus.delayCommit)
 	}

-	if err := consensus.msgSender.SendWithoutRetry(
-		groupID,
-		host.ConstructP2pMessage(byte(17), networkMessage.Bytes),
-	); err != nil {
-		consensus.getLogger().Warn().Msg("[OnPrepared] Cannot send commit message!!")
-	} else {
-		consensus.getLogger().Info().
-			Uint64("blockNum", consensus.blockNum).
-			Hex("blockHash", consensus.blockHash[:]).
-			Msg("[OnPrepared] Sent Commit Message!!")
+	if consensus.current.Mode() != Listening {
+		if err := consensus.msgSender.SendWithoutRetry(
+			groupID,
+			host.ConstructP2pMessage(byte(17), networkMessage.Bytes),
+		); err != nil {
+			consensus.getLogger().Warn().Msg("[OnPrepared] Cannot send commit message!!")
+		} else {
+			consensus.getLogger().Info().
+				Uint64("blockNum", consensus.blockNum).
+				Hex("blockHash", consensus.blockHash[:]).
+				Msg("[OnPrepared] Sent Commit Message!!")
+		}
 	}
 	}
 	consensus.getLogger().Debug().
@@ -277,7 +281,7 @@ func (consensus *Consensus) onCommitted(msg *msg_pb.Message) {
 		consensus.getLogger().Debug().Uint64("MsgBlockNum", recvMsg.BlockNum).Msg("[OnCommitted] out of sync")
 		go func() {
 			select {
-			case consensus.blockNumLowChan <- struct{}{}:
+			case consensus.BlockNumLowChan <- struct{}{}:
 				consensus.current.SetMode(Syncing)
 				for _, v := range consensus.consensusTimeout {
 					v.Stop()
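The effect of the two Listening guards above: a node that is only observing still processes the leader's announce/prepared messages, but it never broadcasts a prepare or commit vote of its own. A toy illustration of that guard; every type and function here is a simplified stand-in, not the Harmony API:

package main

import "fmt"

// Simplified stand-ins for the consensus mode and a validator handler.
type Mode int

const (
	Normal Mode = iota
	Listening
)

type validator struct{ mode Mode }

func (v *validator) onAnnounce() {
	// Signature checks and local bookkeeping happen for every node...
	if v.mode != Listening {
		// ...but only non-listening nodes broadcast their prepare vote.
		fmt.Println("sent prepare vote")
		return
	}
	fmt.Println("listening only: recorded the block, no vote sent")
}

func main() {
	(&validator{mode: Normal}).onAnnounce()
	(&validator{mode: Listening}).onAnnounce()
}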

core/offchain.go

@@ -147,7 +147,7 @@ func (bc *BlockChain) CommitOffChainData(
 	}

 	// Writing beacon chain cross links
-	if header.ShardID() == shard.BeaconChainShardID &&
+	if isBeaconChain &&
 		bc.chainConfig.IsCrossLink(block.Epoch()) &&
 		len(header.CrossLinks()) > 0 {
 		crossLinks := &types.CrossLinks{}
@@ -192,13 +192,16 @@ func (bc *BlockChain) CommitOffChainData(
 			Msgf(msg, len(*crossLinks), num)
 		utils.Logger().Debug().Msgf(msg, len(*crossLinks), num)
 	}
-	// Roll up latest crosslinks
-	for i, c := uint32(0), shard.Schedule.InstanceForEpoch(
-		epoch,
-	).NumShards(); i < c; i++ {
-		if err := bc.LastContinuousCrossLink(batch, i); err != nil {
-			utils.Logger().Info().
-				Err(err).Msg("could not batch process last continuous crosslink")
+
+	if isBeaconChain {
+		// Roll up latest crosslinks
+		for i, c := uint32(0), shard.Schedule.InstanceForEpoch(
+			epoch,
+		).NumShards(); i < c; i++ {
+			if err := bc.LastContinuousCrossLink(batch, i); err != nil {
+				utils.Logger().Info().
+					Err(err).Msg("could not batch process last continuous crosslink")
+			}
 		}
 	}

node/node_handler.go

@@ -7,6 +7,8 @@ import (
 	"math/rand"
 	"time"

+	"github.com/harmony-one/harmony/consensus"
+
 	"github.com/ethereum/go-ethereum/rlp"
 	"github.com/harmony-one/bls/ffi/go/bls"
 	"github.com/harmony-one/harmony/api/proto"
@@ -456,23 +458,25 @@ func (node *Node) PostConsensusProcessing(
 		}
 		node.BroadcastCXReceipts(newBlock, commitSigAndBitmap)
 	} else {
-		utils.Logger().Info().
-			Uint64("blockNum", newBlock.NumberU64()).
-			Uint64("epochNum", newBlock.Epoch().Uint64()).
-			Uint64("ViewId", newBlock.Header().ViewID().Uint64()).
-			Str("blockHash", newBlock.Hash().String()).
-			Int("numTxns", len(newBlock.Transactions())).
-			Int("numStakingTxns", len(newBlock.StakingTransactions())).
-			Msg("BINGO !!! Reached Consensus")
-		// 1% of the validator also need to do broadcasting
-		rand.Seed(time.Now().UTC().UnixNano())
-		rnd := rand.Intn(100)
-		if rnd < 1 {
-			// Beacon validators also broadcast new blocks to make sure beacon sync is strong.
-			if node.NodeConfig.ShardID == shard.BeaconChainShardID {
-				node.BroadcastNewBlock(newBlock)
-			}
-			node.BroadcastCXReceipts(newBlock, commitSigAndBitmap)
-		}
+		if node.Consensus.Mode() != consensus.Listening {
+			utils.Logger().Info().
+				Uint64("blockNum", newBlock.NumberU64()).
+				Uint64("epochNum", newBlock.Epoch().Uint64()).
+				Uint64("ViewId", newBlock.Header().ViewID().Uint64()).
+				Str("blockHash", newBlock.Hash().String()).
+				Int("numTxns", len(newBlock.Transactions())).
+				Int("numStakingTxns", len(newBlock.StakingTransactions())).
+				Msg("BINGO !!! Reached Consensus")
+			// 1% of the validator also need to do broadcasting
+			rand.Seed(time.Now().UTC().UnixNano())
+			rnd := rand.Intn(100)
+			if rnd < 1 {
+				// Beacon validators also broadcast new blocks to make sure beacon sync is strong.
+				if node.NodeConfig.ShardID == shard.BeaconChainShardID {
+					node.BroadcastNewBlock(newBlock)
+				}
+				node.BroadcastCXReceipts(newBlock, commitSigAndBitmap)
+			}
+		}
 	}
 }
@@ -481,7 +485,7 @@ func (node *Node) PostConsensusProcessing(
 	// Update consensus keys at last so the change of leader status doesn't mess up normal flow
 	if len(newBlock.Header().ShardState()) > 0 {
-		node.Consensus.UpdateConsensusInformation()
+		node.Consensus.SetMode(node.Consensus.UpdateConsensusInformation())
 	}
 	if h := node.NodeConfig.WebHooks.Hooks; h != nil {
 		if h.Availability != nil {
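One detail from the else branch above: the "1% of the validator also need to do broadcasting" comment is implemented by drawing rand.Intn(100) and rebroadcasting only when the draw is below 1, so each validator independently rebroadcasts roughly 1% of blocks. A quick self-contained check of that sampling; the loop and counts are illustrative, not from the original:

package main

import (
	"fmt"
	"math/rand"
	"time"
)

func main() {
	rand.Seed(time.Now().UTC().UnixNano())

	// Same predicate as in PostConsensusProcessing: rand.Intn(100) < 1
	// is true for exactly one value (0) out of 100, i.e. ~1% of the time.
	broadcasts := 0
	for i := 0; i < 100000; i++ {
		if rand.Intn(100) < 1 {
			broadcasts++
		}
	}
	fmt.Printf("rebroadcast rate: %.2f%%\n", 100*float64(broadcasts)/100000)
}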

node/node_syncing.go

@@ -203,55 +203,62 @@ func (node *Node) DoBeaconSyncing() {
 // DoSyncing keep the node in sync with other peers, willJoinConsensus means the node will try to join consensus after catch up
 func (node *Node) DoSyncing(bc *core.BlockChain, worker *worker.Worker, willJoinConsensus bool) {
+	ticker := time.NewTicker(time.Duration(node.syncFreq) * time.Second)
+
 	// TODO ek – infinite loop; add shutdown/cleanup logic
-SyncingLoop:
 	for {
-		if node.stateSync == nil {
-			node.stateSync = syncing.CreateStateSync(node.SelfPeer.IP, node.SelfPeer.Port, node.GetSyncID())
-			utils.Logger().Debug().Msg("[SYNC] initialized state sync")
-		}
-		if node.stateSync.GetActivePeerNumber() < MinConnectedPeers {
-			shardID := bc.ShardID()
-			peers, err := node.SyncingPeerProvider.SyncingPeers(shardID)
-			if err != nil {
-				utils.Logger().Warn().
-					Err(err).
-					Uint32("shard_id", shardID).
-					Msg("cannot retrieve syncing peers")
-				continue SyncingLoop
-			}
-			if err := node.stateSync.CreateSyncConfig(peers, false); err != nil {
-				utils.Logger().Warn().
-					Err(err).
-					Interface("peers", peers).
-					Msg("[SYNC] create peers error")
-				continue SyncingLoop
-			}
-			utils.Logger().Debug().Int("len", node.stateSync.GetActivePeerNumber()).Msg("[SYNC] Get Active Peers")
-		}
-		// TODO: treat fake maximum height
-		if node.stateSync.IsOutOfSync(bc) {
-			node.stateMutex.Lock()
-			node.State = NodeNotInSync
-			node.stateMutex.Unlock()
-			if willJoinConsensus {
-				node.Consensus.BlocksNotSynchronized()
-			}
-			node.stateSync.SyncLoop(bc, worker, false, node.Consensus)
-			if willJoinConsensus {
-				node.stateMutex.Lock()
-				node.State = NodeReadyForConsensus
-				node.stateMutex.Unlock()
-				node.Consensus.BlocksSynchronized()
-			}
-		}
-		node.stateMutex.Lock()
-		node.State = NodeReadyForConsensus
-		node.stateMutex.Unlock()
-		// TODO on demand syncing
-		time.Sleep(time.Duration(node.syncFreq) * time.Second)
+		select {
+		case <-ticker.C:
+			node.doSync(bc, worker, willJoinConsensus)
+		case <-node.Consensus.BlockNumLowChan:
+			node.doSync(bc, worker, willJoinConsensus)
+		}
+	}
+}
+
+// doSync keep the node in sync with other peers, willJoinConsensus means the node will try to join consensus after catch up
+func (node *Node) doSync(bc *core.BlockChain, worker *worker.Worker, willJoinConsensus bool) {
+	if node.stateSync == nil {
+		node.stateSync = syncing.CreateStateSync(node.SelfPeer.IP, node.SelfPeer.Port, node.GetSyncID())
+		utils.Logger().Debug().Msg("[SYNC] initialized state sync")
+	}
+	if node.stateSync.GetActivePeerNumber() < MinConnectedPeers {
+		shardID := bc.ShardID()
+		peers, err := node.SyncingPeerProvider.SyncingPeers(shardID)
+		if err != nil {
+			utils.Logger().Warn().
+				Err(err).
+				Uint32("shard_id", shardID).
+				Msg("cannot retrieve syncing peers")
+			return
+		}
+		if err := node.stateSync.CreateSyncConfig(peers, false); err != nil {
+			utils.Logger().Warn().
+				Err(err).
+				Interface("peers", peers).
+				Msg("[SYNC] create peers error")
+			return
+		}
+		utils.Logger().Debug().Int("len", node.stateSync.GetActivePeerNumber()).Msg("[SYNC] Get Active Peers")
+	}
+	// TODO: treat fake maximum height
+	if node.stateSync.IsOutOfSync(bc) {
+		node.stateMutex.Lock()
+		node.State = NodeNotInSync
+		node.stateMutex.Unlock()
+		if willJoinConsensus {
+			node.Consensus.BlocksNotSynchronized()
+		}
+		node.stateSync.SyncLoop(bc, worker, false, node.Consensus)
+		if willJoinConsensus {
+			node.stateMutex.Lock()
+			node.State = NodeReadyForConsensus
+			node.stateMutex.Unlock()
+			node.Consensus.BlocksSynchronized()
+		}
 	}
+	node.stateMutex.Lock()
+	node.State = NodeReadyForConsensus
+	node.stateMutex.Unlock()
 }

 // SupportBeaconSyncing sync with beacon chain for archival node in beacon chan or non-beacon node
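Reading the old and new DoSyncing together: the old loop used `continue SyncingLoop` on peer-setup errors, which skipped the trailing time.Sleep and retried immediately, and it only ever woke up on its own schedule. The new shape paces every attempt with a ticker and also wakes up immediately when consensus signals on BlockNumLowChan, while errors inside doSync simply return until the next trigger. A runnable sketch of that loop shape; all names and durations below are illustrative, not the Harmony APIs:

package main

import (
	"errors"
	"fmt"
	"time"
)

// doSync stands in for Node.doSync: try one sync pass, bail out early on error.
func doSync(attempt int) error {
	if attempt == 0 {
		return errors.New("cannot retrieve syncing peers") // early return; no hot retry loop
	}
	fmt.Println("sync pass completed")
	return nil
}

func main() {
	ticker := time.NewTicker(30 * time.Millisecond) // stands in for node.syncFreq seconds
	defer ticker.Stop()

	blockNumLow := make(chan struct{}, 1) // stands in for Consensus.BlockNumLowChan
	blockNumLow <- struct{}{}             // consensus noticed the node is behind

	for attempt := 0; attempt < 3; attempt++ {
		select {
		case <-blockNumLow:
			fmt.Println("on-demand sync triggered by consensus")
		case <-ticker.C:
			fmt.Println("periodic sync tick")
		}
		if err := doSync(attempt); err != nil {
			fmt.Println("sync error:", err)
		}
	}
}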
