Skip to content

Commit 2ebabb7

Browse files
committed
fix(iroh): Keep track of non-best-addr path validation and switch to them if best addr is soon outdated
1 parent 31895bf commit 2ebabb7

File tree

4 files changed

+149
-21
lines changed

4 files changed

+149
-21
lines changed

iroh/src/magicsock/node_map/best_addr.rs

Lines changed: 77 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,51 @@ use n0_future::time::{Duration, Instant};
66
use tracing::{debug, info};
77

88
/// How long we trust a UDP address as the exclusive path (without using relay) without having heard a Pong reply.
9-
const TRUST_UDP_ADDR_DURATION: Duration = Duration::from_millis(6500);
9+
pub(super) const TRUST_UDP_ADDR_DURATION: Duration = Duration::from_millis(6500);
10+
11+
/// The grace period at which we consider switching away from our best addr
12+
/// to another address that we've received data on.
13+
///
14+
/// The trusted address lifecycle goes as follows:
15+
/// - A UDP DICSO pong is received, this validates that the path is for sure valid.
16+
/// - The disco path that seems to have the lowest latency is the path we use to send on.
17+
/// - We trust this path as a path to send on for at least TRUST_UDP_ADDR_DURATION.
18+
/// - This time is extended every time we receive a UDP DISCO pong on the address.
19+
/// - This time is *also* extended every time we receive *application* payloads on this
20+
/// address (i.e. QUIC datagrams).
21+
/// - If our best address becomes outdated (TRUST_UDP_ADDR_DURATION expires) without
22+
/// another pong or payload data, then we'll start sending over the relay, too!
23+
/// (we switch to ConnectionType::Mixed)
24+
///
25+
/// However, we might not get any UDP DISCO pongs because they're UDP packets and get
26+
/// lost under e.g. high load.
27+
///
28+
/// This is usually fine, because we also receive on the best addr, and that extends its
29+
/// "trust period" just as well.
30+
///
31+
/// However, if *additionally* we send on a different address than the one we receive on,
32+
/// then this extension doesn't happen.
33+
///
34+
/// To fix this, we also apply the same path validation logic to non-best addresses.
35+
/// I.e. we keep track of when they last received a pong, at which point we consider them
36+
/// validated, and then extend this validation period when we receive application data
37+
/// while they're valid (or when we receive another pong).
38+
///
39+
/// Now, when our best address becomes outdated, we need to switch to another valid path.
40+
///
41+
/// We could switch to another path once the best address becomes outdated, but then we'd
42+
/// already start sending on the relay for a couple of iterations!
43+
///
44+
/// So instead, we switch to another path when it looks like the best address becomes
45+
/// outdated.
46+
/// Not just any path, but the path that we're currently receiving from for this node.
47+
///
48+
/// Since we might not be receiving constantly from the remote side (e.g. if it's a
49+
/// one-sided transfer), we need to take care to do consider switching early enough.
50+
///
51+
/// So this duration is chosen as at least 1 keep alive interval (1s default in iroh atm)
52+
/// + at maximum 400ms of latency spike.
53+
const TRUST_UDP_ADDR_SOON_OUTDATED: Duration = Duration::from_millis(1400);
1054

1155
#[derive(Debug, Default)]
1256
pub(super) struct BestAddr(Option<BestAddrInner>);
@@ -38,12 +82,12 @@ pub(super) enum Source {
3882
}
3983

4084
impl Source {
41-
fn trust_until(&self, from: Instant) -> Instant {
85+
fn trust_duration(&self) -> Duration {
4286
match self {
43-
Source::ReceivedPong => from + TRUST_UDP_ADDR_DURATION,
87+
Source::ReceivedPong => TRUST_UDP_ADDR_DURATION,
4488
// TODO: Fix time
45-
Source::BestCandidate => from + Duration::from_secs(60 * 60),
46-
Source::Udp => from + TRUST_UDP_ADDR_DURATION,
89+
Source::BestCandidate => Duration::from_secs(60 * 60),
90+
Source::Udp => TRUST_UDP_ADDR_DURATION,
4791
}
4892
}
4993
}
@@ -115,30 +159,48 @@ impl BestAddr {
115159
latency: Duration,
116160
source: Source,
117161
confirmed_at: Instant,
162+
now: Instant,
118163
) {
119164
match self.0.as_mut() {
120165
None => {
121166
self.insert(addr, latency, source, confirmed_at);
122167
}
123168
Some(state) => {
124169
let candidate = AddrLatency { addr, latency };
125-
if !state.is_trusted(confirmed_at) || candidate.is_better_than(&state.addr) {
170+
if !state.is_trusted(now) || candidate.is_better_than(&state.addr) {
126171
self.insert(addr, latency, source, confirmed_at);
127172
} else if state.addr.addr == addr {
128173
state.confirmed_at = confirmed_at;
129-
state.trust_until = Some(source.trust_until(confirmed_at));
174+
state.trust_until = Some(confirmed_at + source.trust_duration());
130175
}
131176
}
132177
}
133178
}
134179

135-
/// Reset the expiry, if the passed in addr matches the currently used one.
136-
#[cfg(not(wasm_browser))]
137-
pub fn reconfirm_if_used(&mut self, addr: SocketAddr, source: Source, confirmed_at: Instant) {
138-
if let Some(state) = self.0.as_mut() {
139-
if state.addr.addr == addr {
140-
state.confirmed_at = confirmed_at;
141-
state.trust_until = Some(source.trust_until(confirmed_at));
180+
pub fn insert_if_soon_outdated_or_reconfirm(
181+
&mut self,
182+
addr: SocketAddr,
183+
latency: Duration,
184+
source: Source,
185+
confirmed_at: Instant,
186+
now: Instant,
187+
) {
188+
match self.0.as_mut() {
189+
None => {
190+
self.insert(addr, latency, source, confirmed_at);
191+
}
192+
Some(state) => {
193+
// If the current best addr will soon be outdated
194+
// and the given candidate will be trusted for longer
195+
if !state.is_trusted(now + TRUST_UDP_ADDR_SOON_OUTDATED)
196+
&& state.confirmed_at < confirmed_at
197+
{
198+
println!("best addr will soon not be trusted, let's switch");
199+
self.insert(addr, latency, source, confirmed_at);
200+
} else if state.addr.addr == addr {
201+
state.confirmed_at = confirmed_at;
202+
state.trust_until = Some(confirmed_at + source.trust_duration());
203+
}
142204
}
143205
}
144206
}
@@ -150,7 +212,7 @@ impl BestAddr {
150212
source: Source,
151213
confirmed_at: Instant,
152214
) {
153-
let trust_until = source.trust_until(confirmed_at);
215+
let trust_until = confirmed_at + source.trust_duration();
154216

155217
if self
156218
.0

iroh/src/magicsock/node_map/node_state.rs

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -978,6 +978,7 @@ impl NodeState {
978978
latency,
979979
best_addr::Source::ReceivedPong,
980980
now,
981+
now,
981982
);
982983
}
983984

@@ -1064,18 +1065,26 @@ impl NodeState {
10641065
debug_assert!(false, "node map inconsistency by_ip_port <-> direct addr");
10651066
return;
10661067
};
1067-
state.last_payload_msg = Some(now);
1068+
state.receive_payload(now);
10681069
self.last_used = Some(now);
1070+
if let Some((latency, confirmed_at)) = state.valid_best_addr_candidate(now) {
10691071
self.udp_paths
10701072
.best_addr
1071-
.reconfirm_if_used(addr.into(), BestAddrSource::Udp, now);
1073+
.insert_if_soon_outdated_or_reconfirm(
1074+
addr.into(),
1075+
latency,
1076+
BestAddrSource::Udp,
1077+
confirmed_at,
1078+
now,
1079+
);
1080+
}
10721081
}
10731082

10741083
pub(super) fn receive_relay(&mut self, url: &RelayUrl, src: NodeId, now: Instant) {
10751084
match self.relay_url.as_mut() {
10761085
Some((current_home, state)) if current_home == url => {
10771086
// We received on the expected url. update state.
1078-
state.last_payload_msg = Some(now);
1087+
state.receive_payload(now);
10791088
}
10801089
Some((_current_home, _state)) => {
10811090
// we have a different url. we only update on ping, not on receive_relay.

iroh/src/magicsock/node_map/path_state.rs

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,10 @@ use super::{
1414
node_state::{ControlMsg, PongReply, SESSION_ACTIVE_TIMEOUT},
1515
IpPort, PingRole, Source,
1616
};
17-
use crate::{disco::SendAddr, magicsock::HEARTBEAT_INTERVAL};
17+
use crate::{
18+
disco::SendAddr,
19+
magicsock::{node_map::best_addr::TRUST_UDP_ADDR_DURATION, HEARTBEAT_INTERVAL},
20+
};
1821

1922
/// The minimum time between pings to an endpoint.
2023
///
@@ -53,6 +56,9 @@ pub(super) struct PathState {
5356
///
5457
/// This excludes DISCO messages.
5558
pub(super) last_payload_msg: Option<Instant>,
59+
/// Whether the last payload msg was within [`TRUST_UDP_ADDR_DURATION`] of either
60+
/// the last trusted payload message or a recent pong.
61+
pub(super) last_payload_trusted: bool,
5662
/// Sources is a map of [`Source`]s to [`Instant`]s, keeping track of all the ways we have
5763
/// learned about this path
5864
///
@@ -73,6 +79,7 @@ impl PathState {
7379
call_me_maybe_time: None,
7480
recent_pong: None,
7581
last_payload_msg: None,
82+
last_payload_trusted: false,
7683
sources,
7784
}
7885
}
@@ -100,6 +107,7 @@ impl PathState {
100107
call_me_maybe_time: None,
101108
recent_pong: None,
102109
last_payload_msg: Some(now),
110+
last_payload_trusted: false,
103111
sources,
104112
}
105113
}
@@ -129,6 +137,26 @@ impl PathState {
129137
}
130138
}
131139
self.recent_pong = Some(r);
140+
141+
if let Some(last_payload_msg) = self.last_payload_msg {
142+
self.last_payload_trusted = self.within_trusted_pong_duration(last_payload_msg);
143+
}
144+
}
145+
146+
fn within_trusted_pong_duration(&self, instant: Instant) -> bool {
147+
if let Some(pong) = &self.recent_pong {
148+
return pong.pong_at <= instant && instant < pong.pong_at + TRUST_UDP_ADDR_DURATION;
149+
}
150+
151+
false
152+
}
153+
154+
pub(super) fn receive_payload(&mut self, now: Instant) {
155+
self.last_payload_msg = Some(now);
156+
157+
if self.within_trusted_pong_duration(now) {
158+
self.last_payload_trusted = true;
159+
}
132160
}
133161

134162
#[cfg(test)]
@@ -141,6 +169,7 @@ impl PathState {
141169
call_me_maybe_time: None,
142170
recent_pong: Some(r),
143171
last_payload_msg: None,
172+
last_payload_trusted: false,
144173
sources: HashMap::new(),
145174
}
146175
}
@@ -187,6 +216,21 @@ impl PathState {
187216
.copied()
188217
}
189218

219+
fn last_confirmed_at(&self) -> Option<Instant> {
220+
let last_trusted_payload_msg = if self.last_payload_trusted {
221+
self.last_payload_msg
222+
} else {
223+
None
224+
};
225+
226+
self.recent_pong
227+
.as_ref()
228+
.map(|pong| pong.pong_at)
229+
.into_iter()
230+
.chain(last_trusted_payload_msg)
231+
.max()
232+
}
233+
190234
/// The last control or DISCO message **about** this path.
191235
///
192236
/// This is the most recent instant among:
@@ -222,6 +266,18 @@ impl PathState {
222266
self.recent_pong.as_ref().map(|p| p.latency)
223267
}
224268

269+
/// Returns the latency and confirmed_at time if this path would make for a valid best addr `now`
270+
pub(crate) fn valid_best_addr_candidate(&self, now: Instant) -> Option<(Duration, Instant)> {
271+
let latency = self.recent_pong.as_ref()?.latency;
272+
let confirmed_at = self.last_confirmed_at()?;
273+
274+
if confirmed_at <= now && now < confirmed_at + TRUST_UDP_ADDR_DURATION {
275+
return Some((latency, confirmed_at));
276+
}
277+
278+
None
279+
}
280+
225281
pub(super) fn needs_ping(&self, now: &Instant) -> bool {
226282
match self.last_ping {
227283
None => true,

iroh/src/magicsock/node_map/udp_paths.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ impl NodeUdpPaths {
9898
/// it should be `&self` and not `&mut self`. This is only possible once the state from
9999
/// [`NodeUdpPaths`] is no longer modified from outside.
100100
pub(super) fn send_addr(&mut self, now: Instant, have_ipv6: bool) -> UdpSendAddr {
101-
self.assign_best_addr_from_candidates_if_empty();
101+
self.assign_best_addr_from_candidates_if_empty(now);
102102
match self.best_addr.state(now) {
103103
best_addr::State::Valid(addr) => UdpSendAddr::Valid(addr.addr),
104104
best_addr::State::Outdated(addr) => UdpSendAddr::Outdated(addr.addr),
@@ -139,7 +139,7 @@ impl NodeUdpPaths {
139139
/// If somehow we end up in a state where we failed to set a best_addr, while we do have
140140
/// valid candidates, this will chose a candidate and set best_addr again. Most likely
141141
/// this is a bug elsewhere though.
142-
fn assign_best_addr_from_candidates_if_empty(&mut self) {
142+
fn assign_best_addr_from_candidates_if_empty(&mut self, now: Instant) {
143143
if !self.best_addr.is_empty() {
144144
return;
145145
}
@@ -173,6 +173,7 @@ impl NodeUdpPaths {
173173
pong.latency,
174174
best_addr::Source::BestCandidate,
175175
pong.pong_at,
176+
now,
176177
)
177178
}
178179
}

0 commit comments

Comments
 (0)