fix(cosmos): validator checkpoint & agent indexing improvements (#3958)

- Propagates RPC errors to the indexing logic, so errors cause sleeps instead
of overwhelming the RPCs. This also greatly reduces cosmos log noisiness:
from 9.5k logs per minute to 2.5k logs per minute.
- Checks for published checkpoints in reverse order, to prioritize recent
messages among the checkpoints from before the validator was spun up.
- Fixes a validator bug where an error while publishing a checkpoint would
cause the entire checkpoint queue to be iterated through again from scratch
(both the submission and error-propagation patterns are sketched right after
this list).
- Builds on top of
https://github.com/hyperlane-xyz/hyperlane-monorepo/pull/3887 so we can
test this out on osmosis.
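
As a quick illustration of the checkpoint-related bullets above, here is a minimal, self-contained sketch of the new submission shape: each queued checkpoint is retried indefinitely on its own, and the queue is walked newest-first. Everything here (`Checkpoint`, `try_submit`, `submit_with_indefinite_retry`) is an illustrative stand-in rather than the agent's real API (the real code uses `call_and_retry_indefinitely` and the checkpoint syncer, as shown in the diff below), and the example assumes a `tokio` dependency with the `macros`, `rt-multi-thread`, and `time` features.

```rust
use std::sync::atomic::{AtomicU32, Ordering};
use std::time::Duration;

use tokio::time::sleep;

/// Illustrative stand-in for CheckpointWithMessageId.
#[derive(Clone, Copy, Debug)]
struct Checkpoint {
    index: u32,
}

static ATTEMPTS: AtomicU32 = AtomicU32::new(0);

/// Stand-in for writing a signed checkpoint to the store (S3/GCS in the real
/// agent); fails every third attempt to mimic occasional rate limiting.
async fn try_submit(checkpoint: Checkpoint) -> Result<(), String> {
    if ATTEMPTS.fetch_add(1, Ordering::Relaxed) % 3 == 2 {
        return Err(format!("rate limited at index {}", checkpoint.index));
    }
    println!("submitted checkpoint {}", checkpoint.index);
    Ok(())
}

/// Retry a single checkpoint until it succeeds, so one flaky submission never
/// forces the whole queue to be rebuilt and walked again from scratch.
async fn submit_with_indefinite_retry(checkpoint: Checkpoint) {
    loop {
        match try_submit(checkpoint).await {
            Ok(()) => return,
            Err(err) => {
                eprintln!("submission failed ({err}), retrying after a short sleep");
                sleep(Duration::from_millis(100)).await;
            }
        }
    }
}

#[tokio::main]
async fn main() {
    // Pretend these were queued while catching up to the correctness checkpoint.
    let queue: Vec<Checkpoint> = (0..10).map(|index| Checkpoint { index }).collect();

    // Walk the queue newest-first: among checkpoints from before the validator
    // was spun up, the most recent ones are the likeliest to make pending
    // messages processable.
    for checkpoint in queue.into_iter().rev() {
        submit_with_indefinite_retry(checkpoint).await;
    }
}
```

And a small sketch of the error-propagation idiom behind the first bullet, mirroring the new `execute_and_parse_log_futures` helper: collecting per-block `Result`s into a single `Result<Vec<_>, _>` surfaces the first RPC error to the caller (which then sleeps and retries the range) instead of silently skipping failed blocks. The log and block types here are simplified placeholders.

```rust
/// Simplified per-block fetch results: each entry is either the logs for one
/// block or the RPC error that block hit.
fn flatten_block_logs(
    per_block: Vec<Result<Vec<(&'static str, u32)>, String>>,
) -> Result<Vec<(&'static str, u32)>, String> {
    per_block
        .into_iter()
        // Collecting into Result<Vec<_>, _> stops at the first error, so the
        // caller sees the failure and can back off and retry the whole range.
        .collect::<Result<Vec<_>, _>>()
        .map(|blocks| blocks.into_iter().flatten().collect())
}

fn main() {
    let all_ok = vec![Ok(vec![("dispatch", 100)]), Ok(vec![("gas_payment", 101)])];
    let one_failed = vec![Ok(vec![("dispatch", 100)]), Err("rpc timeout".to_string())];

    assert!(flatten_block_logs(all_ok).is_ok());
    assert!(flatten_block_logs(one_failed).is_err());
    println!("error propagation works as expected");
}
```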


### Description

<!--
What's included in this PR?
-->

### Drive-by changes

<!--
Are there any minor or drive-by changes also included?
-->

### Related issues

<!--
- Fixes #[issue number here]
-->

### Backward compatibility

<!--
Are these changes backward compatible? Are there any infrastructure
implications, e.g. changes that would prohibit deploying older commits
using this infra tooling?

Yes/No
-->

### Testing

<!--
What kind of testing have these changes undergone?

None/Manual/Unit Tests
-->
Daniel Savu authored; committed by GitHub
parent bbbe93206f
commit 3bb9d0a767
Changed files (lines changed per file):
  1. rust/agents/validator/src/submit.rs (103)
  2. rust/chains/hyperlane-cosmos/src/interchain_gas.rs (29)
  3. rust/chains/hyperlane-cosmos/src/mailbox.rs (26)
  4. rust/chains/hyperlane-cosmos/src/merkle_tree_hook.rs (28)
  5. rust/chains/hyperlane-cosmos/src/providers/rpc.rs (15)
  6. rust/chains/hyperlane-cosmos/src/utils.rs (31)
  7. rust/config/mainnet_config.json (4)
  8. rust/hyperlane-core/src/accumulator/incremental.rs (2)
  9. rust/hyperlane-core/src/rpc_clients/retry.rs (8)
  10. rust/hyperlane-core/src/types/indexing.rs (8)
  11. rust/sealevel/programs/mailbox-test/src/functional.rs (2)
  12. rust/utils/run-locally/src/cosmos/types.rs (5)
  13. typescript/infra/config/environments/mainnet3/agent.ts (4)

@@ -4,7 +4,7 @@ use std::time::{Duration, Instant};
use std::vec;
use hyperlane_core::rpc_clients::call_and_retry_indefinitely;
use hyperlane_core::{ChainCommunicationError, ChainResult, MerkleTreeHook};
use hyperlane_core::{ChainResult, MerkleTreeHook};
use prometheus::IntGauge;
use tokio::time::sleep;
use tracing::{debug, error, info};
@@ -61,16 +61,7 @@ impl ValidatorSubmitter {
/// Runs idly forever once the target checkpoint is reached to avoid exiting the task.
pub(crate) async fn backfill_checkpoint_submitter(self, target_checkpoint: Checkpoint) {
let mut tree = IncrementalMerkle::default();
call_and_retry_indefinitely(|| {
let target_checkpoint = target_checkpoint;
let self_clone = self.clone();
Box::pin(async move {
self_clone
.submit_checkpoints_until_correctness_checkpoint(&mut tree, &target_checkpoint)
.await?;
Ok(())
})
})
self.submit_checkpoints_until_correctness_checkpoint(&mut tree, &target_checkpoint)
.await;
info!(
@@ -132,20 +123,7 @@ impl ValidatorSubmitter {
sleep(self.interval).await;
continue;
}
tree = call_and_retry_indefinitely(|| {
let mut tree = tree;
let self_clone = self.clone();
Box::pin(async move {
self_clone
.submit_checkpoints_until_correctness_checkpoint(
&mut tree,
&latest_checkpoint,
)
.await?;
Ok(tree)
})
})
self.submit_checkpoints_until_correctness_checkpoint(&mut tree, &latest_checkpoint)
.await;
self.metrics
@@ -162,7 +140,7 @@ impl ValidatorSubmitter {
&self,
tree: &mut IncrementalMerkle,
correctness_checkpoint: &Checkpoint,
) -> ChainResult<()> {
) {
// This should never be called with a tree that is ahead of the correctness checkpoint.
assert!(
!tree_exceeds_checkpoint(correctness_checkpoint, tree),
@@ -182,7 +160,14 @@ impl ValidatorSubmitter {
while tree.count() as u32 <= correctness_checkpoint.index {
if let Some(insertion) = self
.message_db
.retrieve_merkle_tree_insertion_by_leaf_index(&(tree.count() as u32))?
.retrieve_merkle_tree_insertion_by_leaf_index(&(tree.count() as u32))
.unwrap_or_else(|err| {
panic!(
"Error fetching merkle tree insertion for leaf index {}: {}",
tree.count(),
err
)
})
{
debug!(
index = insertion.index(),
@@ -225,9 +210,7 @@ impl ValidatorSubmitter {
?correctness_checkpoint,
"Incorrect tree root, something went wrong"
);
return Err(ChainCommunicationError::CustomError(
"Incorrect tree root, something went wrong".to_string(),
));
panic!("Incorrect tree root, something went wrong");
}
if !checkpoint_queue.is_empty() {
@@ -236,56 +219,70 @@ impl ValidatorSubmitter {
queue_len = checkpoint_queue.len(),
"Reached tree consistency"
);
self.sign_and_submit_checkpoints(checkpoint_queue).await?;
self.sign_and_submit_checkpoints(checkpoint_queue).await;
info!(
index = checkpoint.index,
"Signed all queued checkpoints until index"
);
}
Ok(())
}
/// Signs and submits any previously unsubmitted checkpoints.
async fn sign_and_submit_checkpoints(
async fn sign_and_submit_checkpoint(
&self,
checkpoints: Vec<CheckpointWithMessageId>,
checkpoint: CheckpointWithMessageId,
) -> ChainResult<()> {
let last_checkpoint = checkpoints.as_slice()[checkpoints.len() - 1];
for queued_checkpoint in checkpoints {
let existing = self
.checkpoint_syncer
.fetch_checkpoint(queued_checkpoint.index)
.fetch_checkpoint(checkpoint.index)
.await?;
if existing.is_some() {
debug!(
index = queued_checkpoint.index,
"Checkpoint already submitted"
);
continue;
debug!(index = checkpoint.index, "Checkpoint already submitted");
return Ok(());
}
let signed_checkpoint = self.signer.sign(queued_checkpoint).await?;
let signed_checkpoint = self.signer.sign(checkpoint).await?;
self.checkpoint_syncer
.write_checkpoint(&signed_checkpoint)
.await?;
debug!(
index = queued_checkpoint.index,
"Signed and submitted checkpoint"
);
debug!(index = checkpoint.index, "Signed and submitted checkpoint");
// TODO: move these into S3 implementations
// small sleep before signing next checkpoint to avoid rate limiting
sleep(Duration::from_millis(100)).await;
Ok(())
}
self.checkpoint_syncer
.update_latest_index(last_checkpoint.index)
/// Signs and submits any previously unsubmitted checkpoints.
async fn sign_and_submit_checkpoints(&self, checkpoints: Vec<CheckpointWithMessageId>) {
let last_checkpoint = checkpoints.as_slice()[checkpoints.len() - 1];
// Submits checkpoints to the store in reverse order. This speeds up processing historic checkpoints (those before the validator is spun up),
// since those are the most likely to make messages become processable.
// A side effect is that new checkpoints will also be submitted in reverse order.
for queued_checkpoint in checkpoints.into_iter().rev() {
// certain checkpoint stores rate limit very aggressively, so we retry indefinitely
call_and_retry_indefinitely(|| {
let self_clone = self.clone();
Box::pin(async move {
self_clone
.sign_and_submit_checkpoint(queued_checkpoint)
.await?;
Ok(())
})
})
.await;
}
call_and_retry_indefinitely(|| {
let self_clone = self.clone();
Box::pin(async move {
self_clone
.checkpoint_syncer
.update_latest_index(last_checkpoint.index)
.await?;
Ok(())
})
})
.await;
}
}

@@ -1,6 +1,5 @@
use async_trait::async_trait;
use base64::{engine::general_purpose::STANDARD as BASE64, Engine};
use futures::future;
use hyperlane_core::{
ChainCommunicationError, ChainResult, ContractLocator, HyperlaneChain, HyperlaneContract,
HyperlaneDomain, HyperlaneProvider, Indexed, Indexer, InterchainGasPaymaster,
@@ -9,12 +8,15 @@ use hyperlane_core::{
use once_cell::sync::Lazy;
use std::ops::RangeInclusive;
use tendermint::abci::EventAttribute;
use tracing::{instrument, warn};
use tracing::instrument;
use crate::{
rpc::{CosmosWasmIndexer, ParsedEvent, WasmIndexer},
signers::Signer,
utils::{CONTRACT_ADDRESS_ATTRIBUTE_KEY, CONTRACT_ADDRESS_ATTRIBUTE_KEY_BASE64},
utils::{
execute_and_parse_log_futures, CONTRACT_ADDRESS_ATTRIBUTE_KEY,
CONTRACT_ADDRESS_ATTRIBUTE_KEY_BASE64,
},
ConnectionConf, CosmosProvider, HyperlaneCosmosError,
};
@@ -223,26 +225,7 @@ impl Indexer<InterchainGasPayment> for CosmosInterchainGasPaymasterIndexer {
})
.collect();
// TODO: this can be refactored when we rework indexing, to be part of the block-by-block indexing
let result = future::join_all(logs_futures)
.await
.into_iter()
.flatten()
.map(|(logs, block_number)| {
if let Err(err) = &logs {
warn!(?err, ?block_number, "Failed to fetch logs for block");
}
logs
})
// Propagate errors from any of the queries. This will cause the entire range to be retried,
// including successful ones, but we don't have a way to handle partial failures in a range for now.
.collect::<Result<Vec<_>, _>>()?
.into_iter()
.flatten()
.map(|(log, meta)| (Indexed::new(log), meta))
.collect();
Ok(result)
execute_and_parse_log_futures(logs_futures).await
}
async fn get_finalized_block_number(&self) -> ChainResult<u32> {

@@ -1,5 +1,4 @@
use base64::{engine::general_purpose::STANDARD as BASE64, Engine};
use futures::future;
use std::{
fmt::{Debug, Formatter},
io::Cursor,
@@ -8,14 +7,15 @@ use std::{
str::FromStr,
};
use crate::payloads::mailbox::{
GeneralMailboxQuery, ProcessMessageRequest, ProcessMessageRequestInner,
};
use crate::payloads::{general, mailbox};
use crate::rpc::{CosmosWasmIndexer, ParsedEvent, WasmIndexer};
use crate::CosmosProvider;
use crate::{address::CosmosAddress, types::tx_response_to_outcome};
use crate::{grpc::WasmProvider, HyperlaneCosmosError};
use crate::{
payloads::mailbox::{GeneralMailboxQuery, ProcessMessageRequest, ProcessMessageRequestInner},
utils::execute_and_parse_log_futures,
};
use crate::{signers::Signer, utils::get_block_height_for_lag, ConnectionConf};
use async_trait::async_trait;
use cosmrs::proto::cosmos::base::abci::v1beta1::TxResponse;
@@ -371,23 +371,7 @@ impl Indexer<HyperlaneMessage> for CosmosMailboxIndexer {
})
.collect();
// TODO: this can be refactored when we rework indexing, to be part of the block-by-block indexing
let result = future::join_all(logs_futures)
.await
.into_iter()
.flatten()
.filter_map(|(logs_res, block_number)| match logs_res {
Ok(logs) => Some(logs),
Err(err) => {
warn!(?err, ?block_number, "Failed to fetch logs for block");
None
}
})
.flatten()
.map(|(log, meta)| (log.into(), meta))
.collect();
Ok(result)
execute_and_parse_log_futures(logs_futures).await
}
async fn get_finalized_block_number(&self) -> ChainResult<u32> {

@@ -2,7 +2,6 @@ use std::{fmt::Debug, num::NonZeroU64, ops::RangeInclusive, str::FromStr};
use async_trait::async_trait;
use base64::{engine::general_purpose::STANDARD as BASE64, Engine};
use futures::future;
use hyperlane_core::{
accumulator::incremental::IncrementalMerkle, ChainCommunicationError, ChainResult, Checkpoint,
ContractLocator, HyperlaneChain, HyperlaneContract, HyperlaneDomain, HyperlaneProvider,
@@ -10,17 +9,14 @@ use hyperlane_core::{
};
use once_cell::sync::Lazy;
use tendermint::abci::EventAttribute;
use tracing::{instrument, warn};
use tracing::instrument;
use crate::{
grpc::WasmProvider,
payloads::{
general::{self},
merkle_tree_hook,
},
payloads::{general, merkle_tree_hook},
rpc::{CosmosWasmIndexer, ParsedEvent, WasmIndexer},
utils::{
get_block_height_for_lag, CONTRACT_ADDRESS_ATTRIBUTE_KEY,
execute_and_parse_log_futures, get_block_height_for_lag, CONTRACT_ADDRESS_ATTRIBUTE_KEY,
CONTRACT_ADDRESS_ATTRIBUTE_KEY_BASE64,
},
ConnectionConf, CosmosProvider, HyperlaneCosmosError, Signer,
@@ -304,23 +300,7 @@ impl Indexer<MerkleTreeInsertion> for CosmosMerkleTreeHookIndexer {
})
.collect();
// TODO: this can be refactored when we rework indexing, to be part of the block-by-block indexing
let result = future::join_all(logs_futures)
.await
.into_iter()
.flatten()
.filter_map(|(logs_res, block_number)| match logs_res {
Ok(logs) => Some(logs),
Err(err) => {
warn!(?err, ?block_number, "Failed to fetch logs for block");
None
}
})
.flatten()
.map(|(log, meta)| (log.into(), meta))
.collect();
Ok(result)
execute_and_parse_log_futures(logs_futures).await
}
/// Get the chain's latest block number that has reached finality

@@ -1,6 +1,5 @@
use async_trait::async_trait;
use cosmrs::rpc::client::Client;
use hyperlane_core::rpc_clients::call_with_retry;
use hyperlane_core::{ChainCommunicationError, ChainResult, ContractLocator, LogMeta, H256, U256};
use sha256::digest;
use std::fmt::Debug;
@@ -216,9 +215,7 @@ impl CosmosWasmIndexer {
impl WasmIndexer for CosmosWasmIndexer {
#[instrument(err, skip(self))]
async fn get_finalized_block_number(&self) -> ChainResult<u32> {
let latest_block =
call_with_retry(move || Box::pin(Self::get_latest_block(self.provider.rpc().clone())))
.await?;
let latest_block = Self::get_latest_block(self.provider.rpc().clone()).await?;
let latest_height: u32 = latest_block
.block
.header
@@ -242,11 +239,11 @@ impl WasmIndexer for CosmosWasmIndexer {
let client = self.provider.rpc().clone();
debug!(?block_number, cursor_label, domain=?self.provider.domain, "Getting logs in block");
let (block, block_results) = tokio::join!(
call_with_retry(|| { Box::pin(Self::get_block(client.clone(), block_number)) }),
call_with_retry(|| { Box::pin(Self::get_block_results(client.clone(), block_number)) }),
);
// The two calls below could be made in parallel, but on cosmos rate limiting is a bigger problem
// than indexing latency, so we do them sequentially.
let block = Self::get_block(client.clone(), block_number).await?;
let block_results = Self::get_block_results(client.clone(), block_number).await?;
Ok(self.handle_txs(block?, block_results?, parser, cursor_label))
Ok(self.handle_txs(block, block_results, parser, cursor_label))
}
}

@@ -1,8 +1,11 @@
use std::num::NonZeroU64;
use base64::{engine::general_purpose::STANDARD as BASE64, Engine};
use hyperlane_core::ChainResult;
use futures::future;
use hyperlane_core::{ChainCommunicationError, ChainResult, Indexed, LogMeta};
use once_cell::sync::Lazy;
use tokio::task::JoinHandle;
use tracing::warn;
use crate::grpc::{WasmGrpcProvider, WasmProvider};
@@ -31,6 +34,32 @@ pub(crate) async fn get_block_height_for_lag(
Ok(block_height)
}
#[allow(clippy::type_complexity)]
pub(crate) async fn execute_and_parse_log_futures<T: Into<Indexed<T>>>(
logs_futures: Vec<JoinHandle<(Result<Vec<(T, LogMeta)>, ChainCommunicationError>, u32)>>,
) -> ChainResult<Vec<(Indexed<T>, LogMeta)>> {
// TODO: this can be refactored when we rework indexing, to be part of the block-by-block indexing
let result = future::join_all(logs_futures)
.await
.into_iter()
.flatten()
.map(|(logs, block_number)| {
if let Err(err) = &logs {
warn!(?err, ?block_number, "Failed to fetch logs for block");
}
logs
})
// Propagate errors from any of the queries. This will cause the entire range to be retried,
// including successful ones, but we don't have a way to handle partial failures in a range for now.
// This is also why cosmos indexing should be run with small chunks (currently set to 5).
.collect::<Result<Vec<_>, _>>()?
.into_iter()
.flatten()
.map(|(log, meta)| (log.into(), meta))
.collect();
Ok(result)
}
#[cfg(test)]
/// Helper function to create a Vec<EventAttribute> from a JSON string -
/// crate::payloads::general::EventAttribute has a Deserialize impl while

@@ -715,7 +715,7 @@
}
],
"index": {
"chunk": 25,
"chunk": 5,
"from": 58419500
},
"interchainGasPaymaster": "0x27ae52298e5b53b34b7ae0ca63e05845c31e1f59",
@@ -1007,7 +1007,7 @@
}
],
"index": {
"chunk": 50,
"chunk": 5,
"from": 4000000
},
"interchainGasPaymaster": "0x504ee9ac43ec5814e00c7d21869a90ec52becb489636bdf893b7df9d606b5d67",

@@ -7,7 +7,7 @@ use crate::accumulator::{
H256, TREE_DEPTH, ZERO_HASHES,
};
#[derive(BorshDeserialize, BorshSerialize, Debug, Clone, Copy, new, PartialEq, Eq)]
#[derive(BorshDeserialize, BorshSerialize, Debug, Clone, new, PartialEq, Eq)]
/// An incremental merkle tree, modeled on the eth2 deposit contract
pub struct IncrementalMerkle {
/// The branch of the tree

@@ -34,14 +34,6 @@ pub async fn call_and_retry_n_times<T>(
))
}
/// Retry calling a fallible async function a predefined number of times
#[instrument(err, skip(f))]
pub async fn call_with_retry<T>(
f: impl FnMut() -> Pin<Box<dyn Future<Output = ChainResult<T>> + Send>>,
) -> ChainResult<T> {
call_and_retry_n_times(f, DEFAULT_MAX_RPC_RETRIES).await
}
/// Retry calling a fallible async function indefinitely, until it succeeds
pub async fn call_and_retry_indefinitely<T>(
f: impl FnMut() -> Pin<Box<dyn Future<Output = ChainResult<T>> + Send>>,

@@ -1,6 +1,6 @@
use derive_new::new;
use crate::{HyperlaneMessage, MerkleTreeInsertion, Sequenced};
use crate::{HyperlaneMessage, InterchainGasPayment, MerkleTreeInsertion, Sequenced};
/// Wrapper struct that adds indexing information to a type
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, new)]
@@ -73,3 +73,9 @@ impl From<MerkleTreeInsertion> for Indexed<MerkleTreeInsertion> {
Indexed::new(value).with_sequence(sequence as _)
}
}
impl From<InterchainGasPayment> for Indexed<InterchainGasPayment> {
fn from(value: InterchainGasPayment) -> Self {
Indexed::new(value)
}
}

@@ -212,7 +212,7 @@ async fn test_dispatch_from_eoa() {
local_domain: LOCAL_DOMAIN,
outbox_bump_seed: mailbox_accounts.outbox_bump_seed,
owner: Some(payer.pubkey()),
tree: expected_tree,
tree: expected_tree.clone(),
},
)
.await;

@@ -177,10 +177,7 @@ impl AgentConfig {
amount: "0.05".to_string(),
},
contract_address_bytes: 32,
index: AgentConfigIndex {
from: 1,
chunk: 100,
},
index: AgentConfigIndex { from: 1, chunk: 5 },
}
}
}

@@ -231,7 +231,7 @@ const hyperlane: RootAgentConfig = {
validators: {
docker: {
repo,
tag: '59451d6-20240612-171611',
tag: '47fbe58-20240617-173324',
},
rpcConsensusType: RpcConsensusType.Quorum,
chains: validatorChainConfig(Contexts.Hyperlane),
@@ -265,7 +265,7 @@ const releaseCandidate: RootAgentConfig = {
validators: {
docker: {
repo,
tag: 'c9c5d37-20240510-014327',
tag: '47fbe58-20240617-173324',
},
rpcConsensusType: RpcConsensusType.Quorum,
chains: validatorChainConfig(Contexts.ReleaseCandidate),
