Skip to content

Commit 09fcd78

Browse files
authored
RPC Resolver: If all are unhealthy, return the default RPC (#77)
# What
Improved the RPC fallback selection logic to choose default-priority (0) entries when no healthy RPCs are available. New (not-yet-polled) RPCs are now initialized as `Unhealthy` instead of `Healthy` with a fake latency based on order of definition. This change removes the reliance on the order in which RPCs are defined in rpc-config.yaml. Follow-up to #73.

# Why
Some of our services (prov bootstrap) request RPCs before the healthcheck poller has had a chance to run. We thus have no idea which RPC from the list to return and should prefer the one with the highest likelihood of being available. Previously, new entries that had not yet been polled were initialized as `Healthy`, which, under the new selection method, led to the replica IP being selected (highest priority) before the poller could prove it to be `Unhealthy`; this broke prov bootstrap. The previous selection algorithm simply selected by order of definition in this case (the first RPC defined in the file), which is brittle and dark-magic-y as well. This change ensures we use a more reliable default-priority remote RPC any time we don't know which RPC is actually available; this is typically a public or private remote RPC to that chain and can more safely be assumed to be available. This default (priority-0) RPC is now mandatory for every chain that might be used in this way, and priority 0 now has a special meaning.

# Background
During prov bootstrap, the contract resolver requests a yellowstone RPC immediately on boot, without waiting for the healthcheck poller to complete a cycle. Previously, a replica URL was selected because of its high priority, since not-yet-polled RPCs were initialized as `Healthy`, which made prov boot fail.
1 parent affe197 commit 09fcd78

File tree

1 file changed

+61
-39
lines changed
  • rust/lit-core/lit-blockchain/src/resolver/rpc

1 file changed

+61
-39
lines changed

rust/lit-core/lit-blockchain/src/resolver/rpc/mod.rs

Lines changed: 61 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ use std::sync::atomic::{AtomicUsize, Ordering};
1212
use std::sync::{Arc, OnceLock};
1313
use std::time::Duration;
1414
use std::time::SystemTime;
15-
use tracing::trace;
15+
use tracing::{error, trace, warn};
1616
use url::Url;
1717

1818
use ethers::prelude::*;
@@ -43,8 +43,9 @@ pub struct StandardRpcHealthcheckPoller<'a> {
4343

4444
/// Select the best RPC entry from a non-empty list.
4545
/// Order: healthy > higher priority > lower latency > lexicographically smaller URL.
46-
/// Entries missing in `latencies` are treated as unhealthy; if none are healthy, fall back to
47-
/// highest priority.
46+
/// Entries missing in `latencies` are treated as unhealthy; if none are healthy, fall back to the
47+
/// lexicographically smallest URL among default-priority (0) entries.
48+
/// If no default-priority entry exists either, log an error and return the first entry.
4849
///
4950
/// # Panics
5051
/// Panics if `entries` is empty. Callers must ensure the list is non-empty.
@@ -73,14 +74,23 @@ fn select_rpc_entry<'a>(
7374
return entry;
7475
}
7576

76-
// No healthy entries - fall back to highest priority regardless of health.
77-
// Uses same comparator pattern as healthy path for consistency and readability.
78-
entries
79-
.iter()
80-
.min_by(|a, b| {
81-
Reverse(a.priority()).cmp(&Reverse(b.priority())).then_with(|| a.url().cmp(b.url()))
82-
})
83-
.expect("entries is non-empty")
77+
// No healthy entries. This happens when the healthcheck poller has not completed a cycle but the
78+
// contract resolver immediately needs an RPC (e.g. prov bootstrap). In that case, all entries are
79+
// default-unhealthy due to missing latency data. We must avoid picking a high-priority replica
80+
// that may not exist; instead use the default-priority remote RPC that is most likely to be
81+
// available and warn that fallback was used. This makes a priority-0 default entry mandatory
82+
// for every chain that may be used during initialization (yellowstone, litChain).
83+
let fallback_entry =
84+
entries.iter().filter(|entry| entry.priority() == 0).min_by(|a, b| a.url().cmp(b.url()));
85+
86+
if let Some(entry) = fallback_entry {
87+
warn!(url = entry.url(), "RPC healthcheck fallback URL selected");
88+
return entry;
89+
}
90+
91+
let entry = entries.first().expect("entries is non-empty");
92+
error!(url = entry.url(), "No default priority RPC provided; falling back to first entry");
93+
entry
8494
}
8595

8696
impl<'a> StandardRpcHealthcheckPoller<'a> {
@@ -318,11 +328,8 @@ pub trait RpcHealthcheckPoller: Sync {
318328
ArcSwap::from(Arc::new({
319329
let resolver = rpc_resolver.load();
320330
let chains = resolver.config.chains();
321-
let key_values = chains
322-
.values()
323-
.flat_map(|v| v.iter().rev())
324-
.zip((0..).map(|t| Duration::MAX.saturating_sub(Duration::from_secs(t))))
325-
.map(|(k, v)| (k.clone(), Latency::Healthy(v)));
331+
let key_values =
332+
chains.values().flat_map(|v| v.iter()).map(|k| (k.clone(), Latency::Unhealthy));
326333
let mut m = im::hashmap::HashMap::new();
327334
m.extend(key_values);
328335
m
@@ -417,13 +424,9 @@ impl RpcResolver {
417424
config.chains().values().flat_map(|v| v.iter()).collect();
418425

419426
latencies.retain(|e, _| rpc_entries.contains(e));
420-
for (d, rpc_entry) in config.chains().values().flat_map(|v| {
421-
v.iter().enumerate().rev().map(|(i, v)| {
422-
(Duration::MAX.saturating_sub(Duration::from_secs(164 + i as u64)), v)
423-
})
424-
}) {
427+
for rpc_entry in config.chains().values().flat_map(|v| v.iter()) {
425428
if !latencies.contains_key(rpc_entry) {
426-
latencies.insert(rpc_entry.clone(), Latency::Healthy(d));
429+
latencies.insert(rpc_entry.clone(), Latency::Unhealthy);
427430
}
428431
}
429432

@@ -752,19 +755,22 @@ mod tests {
752755
}
753756

754757
#[test]
755-
fn test_select_rpc_entry_falls_back_to_priority_when_none_healthy() {
756-
let e_prio_1 =
757-
RpcEntry::new(RpcKind::EVM, "https://p1".into(), None, None).with_priority(1);
758-
let e_prio_2 =
759-
RpcEntry::new(RpcKind::EVM, "https://p2".into(), None, None).with_priority(2);
758+
fn test_select_rpc_entry_falls_back_to_default_priority_when_none_healthy() {
759+
let e_default_zeta =
760+
RpcEntry::new(RpcKind::EVM, "https://zeta".into(), None, None).with_priority(0);
761+
let e_default_alpha =
762+
RpcEntry::new(RpcKind::EVM, "https://alpha".into(), None, None).with_priority(0);
763+
let e_high_prio =
764+
RpcEntry::new(RpcKind::EVM, "https://high".into(), None, None).with_priority(10);
760765

761-
let entries = vec![e_prio_1.clone(), e_prio_2.clone()];
766+
let entries = vec![e_default_zeta.clone(), e_high_prio.clone(), e_default_alpha.clone()];
762767
let mut latencies = im::hashmap::HashMap::new();
763-
latencies.insert(e_prio_1.clone(), Latency::Unhealthy);
764-
latencies.insert(e_prio_2.clone(), Latency::Unhealthy);
768+
latencies.insert(e_default_zeta.clone(), Latency::Unhealthy);
769+
latencies.insert(e_default_alpha.clone(), Latency::Unhealthy);
770+
latencies.insert(e_high_prio.clone(), Latency::Unhealthy);
765771

766772
let selected = select_rpc_entry(&entries, &latencies);
767-
assert_eq!(selected.url(), e_prio_2.url());
773+
assert_eq!(selected.url(), e_default_alpha.url());
768774
}
769775

770776
#[test]
@@ -783,17 +789,33 @@ mod tests {
783789
}
784790

785791
#[test]
786-
fn test_select_rpc_entry_unknown_entries_fallback_respects_priority() {
787-
let e_low_prio =
788-
RpcEntry::new(RpcKind::EVM, "https://low".into(), None, None).with_priority(1);
792+
fn test_select_rpc_entry_unknown_entries_fallback_prefers_default_priority() {
793+
let e_default_beta =
794+
RpcEntry::new(RpcKind::EVM, "https://beta".into(), None, None).with_priority(0);
795+
let e_default_alpha =
796+
RpcEntry::new(RpcKind::EVM, "https://alpha".into(), None, None).with_priority(0);
789797
let e_high_prio =
790798
RpcEntry::new(RpcKind::EVM, "https://high".into(), None, None).with_priority(10);
791799

792-
let entries = vec![e_low_prio.clone(), e_high_prio.clone()];
800+
let entries = vec![e_default_beta.clone(), e_high_prio.clone(), e_default_alpha.clone()];
793801
let latencies = im::hashmap::HashMap::new();
794802

795803
let selected = select_rpc_entry(&entries, &latencies);
796-
assert_eq!(selected.url(), e_high_prio.url());
804+
assert_eq!(selected.url(), e_default_alpha.url());
805+
}
806+
807+
#[test]
808+
fn test_select_rpc_entry_fallback_uses_first_when_no_default_priority() {
809+
let e_first =
810+
RpcEntry::new(RpcKind::EVM, "https://first".into(), None, None).with_priority(5);
811+
let e_second =
812+
RpcEntry::new(RpcKind::EVM, "https://second".into(), None, None).with_priority(10);
813+
814+
let entries = vec![e_first.clone(), e_second.clone()];
815+
let latencies = im::hashmap::HashMap::new();
816+
817+
let selected = select_rpc_entry(&entries, &latencies);
818+
assert_eq!(selected.url(), e_first.url());
797819
}
798820

799821
#[test]
@@ -822,9 +844,9 @@ mod tests {
822844
#[test]
823845
fn test_select_rpc_entry_url_tiebreaker_in_fallback_path() {
824846
let e_alpha = RpcEntry::new(RpcKind::EVM, "https://alpha.example.com".into(), None, None)
825-
.with_priority(5);
847+
.with_priority(0);
826848
let e_zeta = RpcEntry::new(RpcKind::EVM, "https://zeta.example.com".into(), None, None)
827-
.with_priority(5);
849+
.with_priority(0);
828850

829851
for entries in
830852
[vec![e_alpha.clone(), e_zeta.clone()], vec![e_zeta.clone(), e_alpha.clone()]]
@@ -837,7 +859,7 @@ mod tests {
837859
assert_eq!(
838860
selected.url(),
839861
e_alpha.url(),
840-
"Fallback path should use same URL tie-breaking as healthy path"
862+
"Fallback path should use lexicographically smallest default URL"
841863
);
842864
}
843865
}

0 commit comments

Comments
 (0)