Fix node sync issue that caused failure to sign and nodes being kicked out (#2564)

* Only roll up last crosslink for beacon chain

* Fix node syncing issue

* Add more logs

* Fix sync loop

* Remove prepared check during syncing

* Update debug info
Committed by Rongjian Lan via GitHub
parent 146e1c2f80
commit 2cbb3fb2e1
Changed files (lines changed):
  1. api/service/syncing/syncing.go (2)
  2. consensus/checks.go (15)
  3. consensus/consensus.go (9)
  4. consensus/consensus_v2.go (18)
  5. consensus/validator.go (44)
  6. core/offchain.go (19)
  7. node/node_handler.go (38)
  8. node/node_syncing.go (89)

api/service/syncing/syncing.go

@@ -805,7 +805,7 @@ Loop:
 		}
 		ss.purgeOldBlocksFromCache()
 		if consensus != nil {
-			consensus.UpdateConsensusInformation()
+			consensus.SetMode(consensus.UpdateConsensusInformation())
 		}
 	}
 }
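The point of this one-liner: UpdateConsensusInformation appears to return the mode the node should run in after the committee is refreshed, and its return value was previously dropped; feeding it into SetMode means a node that is not (or no longer) in the committee ends up observing instead of trying to sign. A minimal sketch of that pattern with hypothetical stand-in types (the Listening-when-not-in-committee rule is an assumption for illustration, not the real implementation):

package main

import "fmt"

// Hypothetical stand-ins: the real Consensus.UpdateConsensusInformation
// returns a mode value that SetMode accepts.
type Mode int

const (
	Normal Mode = iota
	Listening
	Syncing
)

type Consensus struct {
	mode        Mode
	inCommittee bool
}

func (c *Consensus) UpdateConsensusInformation() Mode {
	if !c.inCommittee {
		return Listening // assumption: observe only, do not sign
	}
	return Normal
}

func (c *Consensus) SetMode(m Mode) { c.mode = m }

func main() {
	c := &Consensus{inCommittee: false}
	// Before the fix the returned mode was discarded; now it is applied.
	c.SetMode(c.UpdateConsensusInformation())
	fmt.Println(c.mode == Listening) // true
}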

consensus/checks.go

@@ -11,6 +11,11 @@ import (
 const MaxBlockNumDiff = 100

 func (consensus *Consensus) validatorSanityChecks(msg *msg_pb.Message) bool {
+	consensus.getLogger().Debug().
+		Uint64("blockNum", msg.GetConsensus().BlockNum).
+		Uint64("viewID", msg.GetConsensus().ViewId).
+		Str("msgType", msg.Type.String()).
+		Msg("[validatorSanityChecks] Checking new message")
 	senderKey, err := consensus.verifySenderKey(msg)
 	if err != nil {
 		if err == shard.ErrValidNotInCommittee {
@@ -42,6 +47,11 @@ func (consensus *Consensus) validatorSanityChecks(msg *msg_pb.Message) bool {
 }

 func (consensus *Consensus) leaderSanityChecks(msg *msg_pb.Message) bool {
+	consensus.getLogger().Debug().
+		Uint64("blockNum", msg.GetConsensus().BlockNum).
+		Uint64("viewID", msg.GetConsensus().ViewId).
+		Str("msgType", msg.Type.String()).
+		Msg("[leaderSanityChecks] Checking new message")
 	senderKey, err := consensus.verifySenderKey(msg)
 	if err != nil {
 		if err == shard.ErrValidNotInCommittee {
@@ -186,6 +196,11 @@ func (consensus *Consensus) onPreparedSanityChecks(
 }

 func (consensus *Consensus) viewChangeSanityCheck(msg *msg_pb.Message) bool {
+	consensus.getLogger().Debug().
+		Uint64("blockNum", msg.GetConsensus().BlockNum).
+		Uint64("viewID", msg.GetConsensus().ViewId).
+		Str("msgType", msg.Type.String()).
+		Msg("[viewChangeSanityCheck] Checking new message")
 	senderKey, err := consensus.verifyViewChangeSenderKey(msg)
 	if err != nil {
 		consensus.getLogger().Error().Err(err).Msgf(

consensus/consensus.go

@@ -112,7 +112,7 @@ type Consensus struct {
 	// verified block to state sync broadcast
 	VerifiedNewBlock chan *types.Block
 	// will trigger state syncing when blockNum is low
-	blockNumLowChan chan struct{}
+	BlockNumLowChan chan struct{}
 	// Channel for DRG protocol to send pRnd (preimage of randomness resulting from combined vrf
 	// randomnesses) to consensus. The first 32 bytes are randomness, the rest is for bitmap.
 	PRndChannel chan []byte
@@ -163,11 +163,6 @@ func (consensus *Consensus) BlocksNotSynchronized() {
 	consensus.syncNotReadyChan <- struct{}{}
 }

-// WaitForSyncing informs the node syncing service to start syncing
-func (consensus *Consensus) WaitForSyncing() {
-	<-consensus.blockNumLowChan
-}
-
 // VdfSeedSize returns the number of VRFs for VDF computation
 func (consensus *Consensus) VdfSeedSize() int {
 	return int(consensus.Decider.ParticipantsCount()) * 2 / 3
@@ -204,7 +199,7 @@ func New(
 	consensus.Decider = Decider
 	consensus.host = host
 	consensus.msgSender = NewMessageSender(host)
-	consensus.blockNumLowChan = make(chan struct{})
+	consensus.BlockNumLowChan = make(chan struct{})
 	// FBFT related
 	consensus.FBFTLog = NewFBFTLog()
 	consensus.phase = FBFTAnnounce
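The blocking WaitForSyncing helper is removed and the channel is exported, so the node package can select on it directly (see the node_syncing.go hunk below), while the sender side pushes into it from a goroutine (see the onCommitted change in validator.go) so consensus never stalls on the unbuffered channel. A rough, self-contained sketch of that producer/consumer shape; the timeouts are illustrative and not in the original code:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Plays the role of the exported, unbuffered Consensus.BlockNumLowChan.
	blockNumLowChan := make(chan struct{})

	// Producer (roughly what onCommitted does when it sees it is behind):
	// send from a goroutine via select so the message handler never blocks.
	go func() {
		select {
		case blockNumLowChan <- struct{}{}:
			fmt.Println("signaled: switch to Syncing, stop consensus timers")
		case <-time.After(time.Second): // illustrative guard, not in the original
		}
	}()

	// Consumer (roughly what the node sync loop does): wait on the channel
	// instead of calling the removed blocking WaitForSyncing helper.
	select {
	case <-blockNumLowChan:
		fmt.Println("triggering state sync")
	case <-time.After(time.Second):
	}
}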

consensus/consensus_v2.go

@@ -32,12 +32,9 @@ func (consensus *Consensus) handleMessageUpdate(payload []byte) {
 	// when node is in ViewChanging mode, it still accepts normal messages into FBFTLog
 	// in order to avoid possible trap forever but drop PREPARE and COMMIT
 	// which are message types specifically for a node acting as leader
-	switch {
-	case (consensus.current.Mode() == ViewChanging) &&
+	if (consensus.current.Mode() == ViewChanging) &&
 		(msg.Type == msg_pb.MessageType_PREPARE ||
-			msg.Type == msg_pb.MessageType_COMMIT):
-		return
-	case consensus.current.Mode() == Listening:
+			msg.Type == msg_pb.MessageType_COMMIT) {
 		return
 	}
@@ -62,16 +59,8 @@ func (consensus *Consensus) handleMessageUpdate(payload []byte) {
 		}
 	}

-	notMemberButStillCatchup := !consensus.Decider.AmIMemberOfCommitee() &&
-		msg.Type == msg_pb.MessageType_COMMITTED
-
-	if notMemberButStillCatchup {
-		consensus.onCommitted(msg)
-		return
-	}
-
 	intendedForValidator, intendedForLeader :=
-		!(consensus.IsLeader() && consensus.current.Mode() == Normal),
+		!consensus.IsLeader(),
 		consensus.IsLeader()

 	switch t := msg.Type; true {
@@ -485,7 +474,6 @@ func (consensus *Consensus) Start(
 			consensus.announce(newBlock)
 		case msg := <-consensus.MsgChan:
-			consensus.getLogger().Debug().Msg("[ConsensusMainLoop] MsgChan")
 			consensus.handleMessageUpdate(msg)
 		case viewID := <-consensus.commitFinishChan:

consensus/validator.go

@@ -69,15 +69,17 @@ func (consensus *Consensus) prepare() {
 	}

 	// TODO: this will not return immediatey, may block
-	if err := consensus.msgSender.SendWithoutRetry(
-		groupID,
-		host.ConstructP2pMessage(byte(17), networkMessage.Bytes),
-	); err != nil {
-		consensus.getLogger().Warn().Err(err).Msg("[OnAnnounce] Cannot send prepare message")
-	} else {
-		consensus.getLogger().Info().
-			Str("blockHash", hex.EncodeToString(consensus.blockHash[:])).
-			Msg("[OnAnnounce] Sent Prepare Message!!")
+	if consensus.current.Mode() != Listening {
+		if err := consensus.msgSender.SendWithoutRetry(
+			groupID,
+			host.ConstructP2pMessage(byte(17), networkMessage.Bytes),
+		); err != nil {
+			consensus.getLogger().Warn().Err(err).Msg("[OnAnnounce] Cannot send prepare message")
+		} else {
+			consensus.getLogger().Info().
+				Str("blockHash", hex.EncodeToString(consensus.blockHash[:])).
+				Msg("[OnAnnounce] Sent Prepare Message!!")
+		}
 	}
 	}
 	consensus.getLogger().Debug().
@@ -207,16 +209,18 @@ func (consensus *Consensus) onPrepared(msg *msg_pb.Message) {
 		time.Sleep(consensus.delayCommit)
 	}

-	if err := consensus.msgSender.SendWithoutRetry(
-		groupID,
-		host.ConstructP2pMessage(byte(17), networkMessage.Bytes),
-	); err != nil {
-		consensus.getLogger().Warn().Msg("[OnPrepared] Cannot send commit message!!")
-	} else {
-		consensus.getLogger().Info().
-			Uint64("blockNum", consensus.blockNum).
-			Hex("blockHash", consensus.blockHash[:]).
-			Msg("[OnPrepared] Sent Commit Message!!")
+	if consensus.current.Mode() != Listening {
+		if err := consensus.msgSender.SendWithoutRetry(
+			groupID,
+			host.ConstructP2pMessage(byte(17), networkMessage.Bytes),
+		); err != nil {
+			consensus.getLogger().Warn().Msg("[OnPrepared] Cannot send commit message!!")
+		} else {
+			consensus.getLogger().Info().
+				Uint64("blockNum", consensus.blockNum).
+				Hex("blockHash", consensus.blockHash[:]).
+				Msg("[OnPrepared] Sent Commit Message!!")
+		}
 	}
 	}
 	consensus.getLogger().Debug().
@@ -277,7 +281,7 @@ func (consensus *Consensus) onCommitted(msg *msg_pb.Message) {
 		consensus.getLogger().Debug().Uint64("MsgBlockNum", recvMsg.BlockNum).Msg("[OnCommitted] out of sync")
 		go func() {
 			select {
-			case consensus.blockNumLowChan <- struct{}{}:
+			case consensus.BlockNumLowChan <- struct{}{}:
 				consensus.current.SetMode(Syncing)
 				for _, v := range consensus.consensusTimeout {
 					v.Stop()
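The effect of the two Listening guards above: a node that is only observing still processes the leader's announce/prepared messages, but it never broadcasts a prepare or commit vote of its own. A toy illustration of that guard; every type and function here is a simplified stand-in, not the Harmony API:

package main

import "fmt"

// Simplified stand-ins for the consensus mode and a validator handler.
type Mode int

const (
	Normal Mode = iota
	Listening
)

type validator struct{ mode Mode }

func (v *validator) onAnnounce() {
	// Signature checks and local bookkeeping happen for every node...
	if v.mode != Listening {
		// ...but only non-listening nodes broadcast their prepare vote.
		fmt.Println("sent prepare vote")
		return
	}
	fmt.Println("listening only: recorded the block, no vote sent")
}

func main() {
	(&validator{mode: Normal}).onAnnounce()
	(&validator{mode: Listening}).onAnnounce()
}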

core/offchain.go

@@ -147,7 +147,7 @@ func (bc *BlockChain) CommitOffChainData(
 	}

 	// Writing beacon chain cross links
-	if header.ShardID() == shard.BeaconChainShardID &&
+	if isBeaconChain &&
 		bc.chainConfig.IsCrossLink(block.Epoch()) &&
 		len(header.CrossLinks()) > 0 {
 		crossLinks := &types.CrossLinks{}
@@ -192,13 +192,16 @@ func (bc *BlockChain) CommitOffChainData(
 			Msgf(msg, len(*crossLinks), num)
 		utils.Logger().Debug().Msgf(msg, len(*crossLinks), num)
 	}
-	// Roll up latest crosslinks
-	for i, c := uint32(0), shard.Schedule.InstanceForEpoch(
-		epoch,
-	).NumShards(); i < c; i++ {
-		if err := bc.LastContinuousCrossLink(batch, i); err != nil {
-			utils.Logger().Info().
-				Err(err).Msg("could not batch process last continuous crosslink")
+
+	if isBeaconChain {
+		// Roll up latest crosslinks
+		for i, c := uint32(0), shard.Schedule.InstanceForEpoch(
+			epoch,
+		).NumShards(); i < c; i++ {
+			if err := bc.LastContinuousCrossLink(batch, i); err != nil {
+				utils.Logger().Info().
+					Err(err).Msg("could not batch process last continuous crosslink")
+			}
 		}
 	}

node/node_handler.go

@@ -7,6 +7,8 @@ import (
 	"math/rand"
 	"time"

+	"github.com/harmony-one/harmony/consensus"
+
 	"github.com/ethereum/go-ethereum/rlp"
 	"github.com/harmony-one/bls/ffi/go/bls"
 	"github.com/harmony-one/harmony/api/proto"
@@ -456,23 +458,25 @@ func (node *Node) PostConsensusProcessing(
 		}
 		node.BroadcastCXReceipts(newBlock, commitSigAndBitmap)
 	} else {
-		utils.Logger().Info().
-			Uint64("blockNum", newBlock.NumberU64()).
-			Uint64("epochNum", newBlock.Epoch().Uint64()).
-			Uint64("ViewId", newBlock.Header().ViewID().Uint64()).
-			Str("blockHash", newBlock.Hash().String()).
-			Int("numTxns", len(newBlock.Transactions())).
-			Int("numStakingTxns", len(newBlock.StakingTransactions())).
-			Msg("BINGO !!! Reached Consensus")
-		// 1% of the validator also need to do broadcasting
-		rand.Seed(time.Now().UTC().UnixNano())
-		rnd := rand.Intn(100)
-		if rnd < 1 {
-			// Beacon validators also broadcast new blocks to make sure beacon sync is strong.
-			if node.NodeConfig.ShardID == shard.BeaconChainShardID {
-				node.BroadcastNewBlock(newBlock)
-			}
-			node.BroadcastCXReceipts(newBlock, commitSigAndBitmap)
-		}
+		if node.Consensus.Mode() != consensus.Listening {
+			utils.Logger().Info().
+				Uint64("blockNum", newBlock.NumberU64()).
+				Uint64("epochNum", newBlock.Epoch().Uint64()).
+				Uint64("ViewId", newBlock.Header().ViewID().Uint64()).
+				Str("blockHash", newBlock.Hash().String()).
+				Int("numTxns", len(newBlock.Transactions())).
+				Int("numStakingTxns", len(newBlock.StakingTransactions())).
+				Msg("BINGO !!! Reached Consensus")
+			// 1% of the validator also need to do broadcasting
+			rand.Seed(time.Now().UTC().UnixNano())
+			rnd := rand.Intn(100)
+			if rnd < 1 {
+				// Beacon validators also broadcast new blocks to make sure beacon sync is strong.
+				if node.NodeConfig.ShardID == shard.BeaconChainShardID {
+					node.BroadcastNewBlock(newBlock)
+				}
+				node.BroadcastCXReceipts(newBlock, commitSigAndBitmap)
+			}
+		}
 	}
 }
@@ -481,7 +485,7 @@ func (node *Node) PostConsensusProcessing(
 	// Update consensus keys at last so the change of leader status doesn't mess up normal flow
 	if len(newBlock.Header().ShardState()) > 0 {
-		node.Consensus.UpdateConsensusInformation()
+		node.Consensus.SetMode(node.Consensus.UpdateConsensusInformation())
 	}
 	if h := node.NodeConfig.WebHooks.Hooks; h != nil {
 		if h.Availability != nil {
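One detail from the else branch above: the "1% of the validator also need to do broadcasting" comment is implemented by drawing rand.Intn(100) and rebroadcasting only when the draw is below 1, so each validator independently rebroadcasts roughly 1% of blocks. A quick self-contained check of that sampling; the loop and counts are illustrative, not from the original:

package main

import (
	"fmt"
	"math/rand"
	"time"
)

func main() {
	rand.Seed(time.Now().UTC().UnixNano())

	// Same predicate as in PostConsensusProcessing: rand.Intn(100) < 1
	// is true for exactly one value (0) out of 100, i.e. ~1% of the time.
	broadcasts := 0
	for i := 0; i < 100000; i++ {
		if rand.Intn(100) < 1 {
			broadcasts++
		}
	}
	fmt.Printf("rebroadcast rate: %.2f%%\n", 100*float64(broadcasts)/100000)
}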

node/node_syncing.go

@@ -203,55 +203,62 @@ func (node *Node) DoBeaconSyncing() {
 // DoSyncing keep the node in sync with other peers, willJoinConsensus means the node will try to join consensus after catch up
 func (node *Node) DoSyncing(bc *core.BlockChain, worker *worker.Worker, willJoinConsensus bool) {
+	ticker := time.NewTicker(time.Duration(node.syncFreq) * time.Second)
+
 	// TODO ek – infinite loop; add shutdown/cleanup logic
-SyncingLoop:
 	for {
-		if node.stateSync == nil {
-			node.stateSync = syncing.CreateStateSync(node.SelfPeer.IP, node.SelfPeer.Port, node.GetSyncID())
-			utils.Logger().Debug().Msg("[SYNC] initialized state sync")
-		}
-		if node.stateSync.GetActivePeerNumber() < MinConnectedPeers {
-			shardID := bc.ShardID()
-			peers, err := node.SyncingPeerProvider.SyncingPeers(shardID)
-			if err != nil {
-				utils.Logger().Warn().
-					Err(err).
-					Uint32("shard_id", shardID).
-					Msg("cannot retrieve syncing peers")
-				continue SyncingLoop
-			}
-			if err := node.stateSync.CreateSyncConfig(peers, false); err != nil {
-				utils.Logger().Warn().
-					Err(err).
-					Interface("peers", peers).
-					Msg("[SYNC] create peers error")
-				continue SyncingLoop
-			}
-			utils.Logger().Debug().Int("len", node.stateSync.GetActivePeerNumber()).Msg("[SYNC] Get Active Peers")
-		}
-		// TODO: treat fake maximum height
-		if node.stateSync.IsOutOfSync(bc) {
-			node.stateMutex.Lock()
-			node.State = NodeNotInSync
-			node.stateMutex.Unlock()
-			if willJoinConsensus {
-				node.Consensus.BlocksNotSynchronized()
-			}
-			node.stateSync.SyncLoop(bc, worker, false, node.Consensus)
-			if willJoinConsensus {
-				node.stateMutex.Lock()
-				node.State = NodeReadyForConsensus
-				node.stateMutex.Unlock()
-				node.Consensus.BlocksSynchronized()
-			}
-		}
-		node.stateMutex.Lock()
-		node.State = NodeReadyForConsensus
-		node.stateMutex.Unlock()
-		// TODO on demand syncing
-		time.Sleep(time.Duration(node.syncFreq) * time.Second)
+		select {
+		case <-ticker.C:
+			node.doSync(bc, worker, willJoinConsensus)
+		case <-node.Consensus.BlockNumLowChan:
+			node.doSync(bc, worker, willJoinConsensus)
+		}
+	}
+}
+
+// doSync keep the node in sync with other peers, willJoinConsensus means the node will try to join consensus after catch up
+func (node *Node) doSync(bc *core.BlockChain, worker *worker.Worker, willJoinConsensus bool) {
+	if node.stateSync == nil {
+		node.stateSync = syncing.CreateStateSync(node.SelfPeer.IP, node.SelfPeer.Port, node.GetSyncID())
+		utils.Logger().Debug().Msg("[SYNC] initialized state sync")
+	}
+	if node.stateSync.GetActivePeerNumber() < MinConnectedPeers {
+		shardID := bc.ShardID()
+		peers, err := node.SyncingPeerProvider.SyncingPeers(shardID)
+		if err != nil {
+			utils.Logger().Warn().
+				Err(err).
+				Uint32("shard_id", shardID).
+				Msg("cannot retrieve syncing peers")
+			return
+		}
+		if err := node.stateSync.CreateSyncConfig(peers, false); err != nil {
+			utils.Logger().Warn().
+				Err(err).
+				Interface("peers", peers).
+				Msg("[SYNC] create peers error")
+			return
+		}
+		utils.Logger().Debug().Int("len", node.stateSync.GetActivePeerNumber()).Msg("[SYNC] Get Active Peers")
+	}
+	// TODO: treat fake maximum height
+	if node.stateSync.IsOutOfSync(bc) {
+		node.stateMutex.Lock()
+		node.State = NodeNotInSync
+		node.stateMutex.Unlock()
+		if willJoinConsensus {
+			node.Consensus.BlocksNotSynchronized()
+		}
+		node.stateSync.SyncLoop(bc, worker, false, node.Consensus)
+		if willJoinConsensus {
+			node.stateMutex.Lock()
+			node.State = NodeReadyForConsensus
+			node.stateMutex.Unlock()
+			node.Consensus.BlocksSynchronized()
+		}
 	}
+	node.stateMutex.Lock()
+	node.State = NodeReadyForConsensus
+	node.stateMutex.Unlock()
 }

 // SupportBeaconSyncing sync with beacon chain for archival node in beacon chan or non-beacon node
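Reading the old and new DoSyncing together: the old loop used `continue SyncingLoop` on peer-setup errors, which skipped the trailing time.Sleep and retried immediately, and it only ever woke up on its own schedule. The new shape paces every attempt with a ticker and also wakes up immediately when consensus signals on BlockNumLowChan, while errors inside doSync simply return until the next trigger. A runnable sketch of that loop shape; all names and durations below are illustrative, not the Harmony APIs:

package main

import (
	"errors"
	"fmt"
	"time"
)

// doSync stands in for Node.doSync: try one sync pass, bail out early on error.
func doSync(attempt int) error {
	if attempt == 0 {
		return errors.New("cannot retrieve syncing peers") // early return; no hot retry loop
	}
	fmt.Println("sync pass completed")
	return nil
}

func main() {
	ticker := time.NewTicker(30 * time.Millisecond) // stands in for node.syncFreq seconds
	defer ticker.Stop()

	blockNumLow := make(chan struct{}, 1) // stands in for Consensus.BlockNumLowChan
	blockNumLow <- struct{}{}             // consensus noticed the node is behind

	for attempt := 0; attempt < 3; attempt++ {
		select {
		case <-blockNumLow:
			fmt.Println("on-demand sync triggered by consensus")
		case <-ticker.C:
			fmt.Println("periodic sync tick")
		}
		if err := doSync(attempt); err != nil {
			fmt.Println("sync error:", err)
		}
	}
}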
