feat: support multiple validator sets for a single app context in the relayer's observed validator metrics (#4495)

### Description

- Fixes https://github.com/hyperlane-xyz/hyperlane-monorepo/issues/4030
- The general idea of this change is that if the function
`set_validator_latest_checkpoints` is called multiple times in a single
message's metadata building, the reset of metrics relating to previously
set validators now won't always occur. Atm, if two different multisig
ISM validator sets are aggregated, they race each other to update the
metric. With this change I expect the contention here to not occur
anymore, and to still be able to handle changes in validator sets
- This is a bit of a quick fix rather than something super robust tbh -
I think in a vacuum I'd prefer something not dependent on time but
instead dependent on a context of what message is being processed or
something. But at the moment we're seeing very noisy alerts because of
this issue, so would like to fix it ASAP.

### Drive-by changes

<!--
Are there any minor or drive-by changes also included?
-->

### Related issues

<!--
- Fixes #[issue number here]
-->

### Backward compatibility

<!--
Are these changes backward compatible? Are there any infrastructure
implications, e.g. changes that would prohibit deploying older commits
using this infra tooling?

Yes/No
-->

### Testing

<!--
What kind of testing have these changes undergone?

None/Manual/Unit Tests
-->
pull/4502/head
Trevor Porter 2 months ago committed by GitHub
parent 78d6fcf5aa
commit 57b0735472
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 36
      rust/main/hyperlane-base/src/metrics/core.rs

@ -1,6 +1,7 @@
use std::collections::{HashMap, HashSet};
use std::collections::HashMap;
use std::fmt::{Debug, Formatter};
use std::sync::OnceLock;
use std::time;
use eyre::Result;
use hyperlane_core::{HyperlaneDomain, H160};
@ -458,11 +459,19 @@ struct AppContextKey {
app_context: String,
}
/// If this period has elapsed since a validator was last updated in the metrics,
/// it will be removed from the metrics.
const MIN_VALIDATOR_METRIC_RESET_PERIOD: time::Duration = time::Duration::from_secs(60 * 3);
/// Manages metrics for observing sets of validators.
pub struct ValidatorObservabilityMetricManager {
observed_validator_latest_index: IntGaugeVec,
app_context_validators: RwLock<HashMap<AppContextKey, HashSet<H160>>>,
// AppContextKey -> Validator -> Last updated at
// Used to track the last time a validator was updated in the metrics, allowing
// for the removal of validators that have not been updated in a while to support
// changing validator sets.
app_context_validators: RwLock<HashMap<AppContextKey, HashMap<H160, time::Instant>>>,
}
impl ValidatorObservabilityMetricManager {
@ -490,10 +499,23 @@ impl ValidatorObservabilityMetricManager {
let mut app_context_validators = self.app_context_validators.write().await;
// First, clear out all previous metrics for the app context.
let mut new_set = HashMap::new();
// First, attempt to clear out all previous metrics for the app context.
// This is necessary because the set of validators may have changed.
if let Some(prev_validators) = app_context_validators.get(&key) {
for validator in prev_validators {
// If the validator was last updated in the metrics more than
// a certain period ago, remove it from the metrics.
// Some leniency is given here to allow this function to be called
// multiple times in a short period without clearing out the metrics,
// e.g. when a message's ISM aggregates multiple different validator sets.
for (validator, last_updated_at) in prev_validators {
if last_updated_at.elapsed() < MIN_VALIDATOR_METRIC_RESET_PERIOD {
// If the last metric refresh was too recent, keep the validator
// and the time of its last metric update.
new_set.insert(*validator, *last_updated_at);
continue;
}
// We unwrap because an error here occurs if the # of labels
// provided is incorrect, and we'd like to loudly fail in e2e if that
// happens.
@ -508,8 +530,6 @@ impl ValidatorObservabilityMetricManager {
}
}
let mut set = HashSet::new();
// Then set the new metrics and update the cached set of validators.
for (validator, latest_checkpoint) in latest_checkpoints {
self.observed_validator_latest_index
@ -522,9 +542,9 @@ impl ValidatorObservabilityMetricManager {
// If the latest checkpoint is None, set to -1 to indicate that
// the validator did not provide a valid latest checkpoint index.
.set(latest_checkpoint.map(|i| i as i64).unwrap_or(-1));
set.insert(*validator);
new_set.insert(*validator, time::Instant::now());
}
app_context_validators.insert(key, set);
app_context_validators.insert(key, new_set);
}
/// Gauge for reporting recently observed latest checkpoint indices for validator sets.

Loading…
Cancel
Save