Skip to content

Commit bb51fae

Browse files
authored
Merge pull request #25 from omnia-network/dev
v1.3.3
2 parents 3a72dd7 + 55065ed commit bb51fae

File tree

6 files changed

+66
-35
lines changed

6 files changed

+66
-35
lines changed

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/gateway-state/src/lib.rs

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -127,18 +127,18 @@ impl GatewayState {
127127
&self,
128128
canister_id: CanisterPrincipal,
129129
client_key: ClientKey,
130-
) -> ClientEntry {
130+
) -> ClientRemovalResult {
131131
// START OF THE CRITICAL SECTION
132132
if let Entry::Occupied(mut entry) = self.inner.data.entry(canister_id) {
133133
let poller_state = entry.get_mut();
134134

135135
// even if this is the last client session for the canister, do not remove the canister from the gateway state
136136
// this will be done by the poller task
137-
// returns 'ClientEntry::Removed' if the client was removed, 'ClientEntry::Vacant' if there was no such client
137+
// returns 'ClientRemovalResult::Removed' if the client was removed, 'ClientRemovalResult::Vacant' if there was no such client
138138
return {
139139
match poller_state.remove(&client_key) {
140-
Some(_) => ClientEntry::Removed(client_key),
141-
None => ClientEntry::Vacant,
140+
Some(_) => ClientRemovalResult::Removed(client_key),
141+
None => ClientRemovalResult::Vacant,
142142
}
143143
};
144144
}
@@ -148,7 +148,7 @@ impl GatewayState {
148148
// indeed, a client session might get an error before the poller side of the channel has been dropped - but after the poller state has been removed -
149149
// in such a case, the client state has already been removed by the poller, together with the whole poller state
150150
// therefore there is no need to do anything else here and we pretend that there is no such entry
151-
ClientEntry::Vacant
151+
ClientRemovalResult::Vacant
152152
}
153153

154154
/// SAFETY:
@@ -160,17 +160,20 @@ impl GatewayState {
160160
/// Therefore, this function is executed atomically.
161161
///
162162
/// This function shall be called only if it is guaranteed that the canister entry exists in the gateway state.
163-
pub fn remove_canister_if_empty(&self, canister_id: CanisterPrincipal) -> CanisterEntry {
163+
pub fn remove_canister_if_empty(
164+
&self,
165+
canister_id: CanisterPrincipal,
166+
) -> CanisterRemovalResult {
164167
// remove_if returns None if the condition is not met, otherwise it returns Some(<entry>)
165-
// if Some, the poller state is empty and therefore the poller shall terminate - return 'CanisterEntry::RemovedEmpty'
166-
// if None, the poller state is not empty and therefore there are still clients connected and the poller shall not terminate - return 'CanisterEntry::NotEmpty'
168+
// if Some, the poller state is empty and therefore the poller shall terminate - return 'CanisterRemovalResult::Empty'
169+
// if None, the poller state is not empty and therefore there are still clients connected and the poller shall not terminate - return 'CanisterRemovalResult::NotEmpty'
167170
match self
168171
.inner
169172
.data
170173
.remove_if(&canister_id, |_, poller_state| poller_state.is_empty())
171174
{
172-
Some(_) => CanisterEntry::RemovedEmpty,
173-
None => CanisterEntry::NotEmpty,
175+
Some(_) => CanisterRemovalResult::Empty,
176+
None => CanisterRemovalResult::NotEmpty,
174177
}
175178
}
176179

@@ -211,13 +214,19 @@ impl GatewayStateInner {
211214
/// and the state associated to each client
212215
pub type PollerState = Arc<DashMap<ClientKey, ClientSender>>;
213216

214-
pub enum ClientEntry {
217+
/// Determines whether the client was removed from the poller state or if there was no such client
218+
pub enum ClientRemovalResult {
219+
/// The client was removed from the poller state
215220
Removed(ClientKey),
221+
/// The client was not present in the poller state
216222
Vacant,
217223
}
218224

219-
pub enum CanisterEntry {
220-
RemovedEmpty,
225+
/// Determines whether the canister was removed from the gateway state or not (in case there are still clients connected)
226+
pub enum CanisterRemovalResult {
227+
/// The canister was removed from the gateway state
228+
Empty,
229+
/// The canister was not removed from the gateway state
221230
NotEmpty,
222231
}
223232

src/ic-websocket-gateway/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "ic_websocket_gateway"
3-
version = "1.3.2"
3+
version = "1.3.3"
44
edition.workspace = true
55
rust-version.workspace = true
66
repository.workspace = true

src/ic-websocket-gateway/src/canister_poller.rs

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@ use canister_utils::{
33
ws_get_messages, CanisterOutputCertifiedMessages, CanisterToClientMessage,
44
CanisterWsGetMessagesArguments, IcError, IcWsCanisterMessage,
55
};
6-
use gateway_state::{CanisterEntry, CanisterPrincipal, ClientSender, GatewayState, PollerState};
7-
use ic_agent::{Agent, AgentError};
6+
use gateway_state::{
7+
CanisterPrincipal, CanisterRemovalResult, ClientSender, GatewayState, PollerState,
8+
};
9+
use ic_agent::{agent::RejectCode, Agent, AgentError};
810
use std::{sync::Arc, time::Duration};
911
use tokio::{sync::mpsc::Sender, time::timeout};
1012
use tracing::{error, span, trace, warn, Instrument, Level, Span};
@@ -13,11 +15,15 @@ pub(crate) const POLLING_TIMEOUT_MS: u64 = 5_000;
1315

1416
type PollingTimeout = Duration;
1517

18+
/// Result of the polling iteration
1619
#[derive(Debug, PartialEq, Eq)]
1720
pub(crate) enum PollingStatus {
21+
/// No messages polled
1822
NoMessagesPolled,
23+
/// Some messages polled
1924
MessagesPolled(CanisterOutputCertifiedMessages),
20-
PollerTimedOut,
25+
/// Request timed out
26+
TimedOut,
2127
}
2228

2329
/// Poller which periodically queries a canister for new messages and relays them to the client
@@ -130,7 +136,7 @@ impl CanisterPoller {
130136
return Ok(());
131137
}
132138
},
133-
PollingStatus::PollerTimedOut => {
139+
PollingStatus::TimedOut => {
134140
// if the poller timed out, it already waited way too long... return immediately so that the next polling iteration can be started
135141
warn!("Poller timed out. Polling immediately");
136142
return Ok(());
@@ -196,7 +202,7 @@ impl CanisterPoller {
196202
Ok(Err(IcError::Cdk(e))) => Err(format!("Unrecoverable CDK error: {:?}", e)),
197203
Err(e) => {
198204
warn!("Poller took too long to retrieve messages: {:?}", e);
199-
Ok(PollingStatus::PollerTimedOut)
205+
Ok(PollingStatus::TimedOut)
200206
},
201207
}
202208
}
@@ -221,7 +227,7 @@ impl CanisterPoller {
221227
.get(&canister_output_message.client_key)
222228
.as_deref()
223229
{
224-
let canister_message_span = span!(parent: client_session_span, Level::TRACE, "Canister Message", message_key = canister_to_client_message.key);
230+
let canister_message_span = span!(parent: client_session_span, Level::TRACE, "Canister Message", message_key = canister_to_client_message.key, %self.canister_id);
225231
canister_message_span.follows_from(Span::current().id());
226232
let canister_message = canister_message_span.in_scope(|| {
227233
trace!("Start relaying message",);
@@ -313,8 +319,8 @@ impl CanisterPoller {
313319
.gateway_state
314320
.remove_canister_if_empty(self.canister_id)
315321
{
316-
CanisterEntry::RemovedEmpty => true,
317-
CanisterEntry::NotEmpty => false,
322+
CanisterRemovalResult::Empty => true,
323+
CanisterRemovalResult::NotEmpty => false,
318324
}
319325
}
320326
}
@@ -353,7 +359,6 @@ fn is_recoverable_error(e: &AgentError) -> bool {
353359
AgentError::InvalidReplicaUrl(_)
354360
| AgentError::TimeoutWaitingForResponse()
355361
| AgentError::InvalidCborData(_)
356-
| AgentError::ReplicaError(_)
357362
| AgentError::HttpError(_)
358363
| AgentError::InvalidReplicaStatus
359364
| AgentError::RequestStatusDoneNoReply(_)
@@ -366,6 +371,9 @@ fn is_recoverable_error(e: &AgentError) -> bool {
366371
| AgentError::TransportError(_)
367372
| AgentError::CallDataMismatch { .. }
368373
| AgentError::InvalidRejectCode(_) => true,
374+
// in case of a replica error, we recover only if the error is transient
375+
// all other errors (SysFatal, DestinationInvalid, CanisterReject, CanisterError) are considered permanent
376+
AgentError::ReplicaError(e) => e.reject_code == RejectCode::SysTransient,
369377
_ => false,
370378
}
371379
}

src/ic-websocket-gateway/src/client_session_handler.rs

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use crate::{
55
};
66
use canister_utils::{ws_close, CanisterWsCloseArguments, ClientKey, IcWsCanisterMessage};
77
use futures_util::StreamExt;
8-
use gateway_state::{CanisterPrincipal, ClientEntry, GatewayState, PollerState};
8+
use gateway_state::{CanisterPrincipal, ClientRemovalResult, GatewayState, PollerState};
99
use ic_agent::Agent;
1010
use std::sync::Arc;
1111
use tokio::{
@@ -135,21 +135,31 @@ impl ClientSessionHandler {
135135
.gateway_state
136136
.insert_client_channel_and_get_new_poller_state(
137137
canister_id,
138-
client_key,
138+
client_key.clone(),
139139
// important not to clone 'client_channel_tx' as otherwise the client session will not receive None in case of a poller error
140140
client_channel_tx.take().expect("must be set only once"),
141141
client_session_span.clone(),
142142
);
143+
debug!("Client added to gateway state");
143144

144145
client_session_span.record("canister_id", canister_id.to_string());
145146

147+
// ensure this is done after the gateway state has been updated
146148
// TODO: figure out if it is guaranteed that all threads see the updated state of the gateway
147149
// before relaying the message to the IC
148-
client_session
150+
if let Err(e) = client_session
149151
.relay_client_message(ws_open_message)
150152
.instrument(client_session_span.clone())
151153
.await
152-
.map_err(|e| format!("Could not relay WS open message to IC: {:?}", e))?;
154+
{
155+
// if the message could not be relayed to the IC, remove the client from the gateway state
156+
// before returning the error and terminating the session handler
157+
self.gateway_state
158+
.remove_client(canister_id, client_key.clone());
159+
debug!("Client removed from gateway state");
160+
161+
return Err(format!("Could not relay WS open message to IC: {:?}", e))?;
162+
}
153163

154164
client_session_span.in_scope(|| {
155165
debug!("Client session setup");
@@ -180,9 +190,10 @@ impl ClientSessionHandler {
180190

181191
let canister_id = self.get_canister_id(&client_session);
182192
let client_key = self.get_client_key(&client_session);
183-
// remove client from poller state
193+
// remove client from gateway state
184194
self.gateway_state
185195
.remove_client(canister_id, client_key.clone());
196+
debug!("Client removed from gateway state");
186197

187198
self.call_ws_close(&canister_id, client_key).await;
188199

@@ -194,20 +205,26 @@ impl ClientSessionHandler {
194205
continue;
195206
},
196207
Err(e) => {
208+
client_session_span.in_scope(|| {
209+
debug!("Client session error");
210+
});
197211
if let IcWsError::Poller(e) = e {
198212
// no need to remove the client as the whole poller state has already been removed by the poller task
199-
return Err(format!("Poller error: {:?}", e));
213+
let err_msg = format!("Poller error: {:?}", e);
214+
warn!(err_msg);
215+
return Err(err_msg);
200216
}
201217
let canister_id = self.get_canister_id(&client_session);
202218
let client_key = self.get_client_key(&client_session);
203219
// if the error is not due to a failed poller
204220
// remove client from poller state, if it is present
205221
// error might have happened before the client session was Setup
206222
// if so, there is no need to remove the client as it is not yet in the poller state
207-
if let ClientEntry::Removed(client_key) = self
223+
if let ClientRemovalResult::Removed(client_key) = self
208224
.gateway_state
209225
.remove_client_if_exists(canister_id, client_key)
210226
{
227+
debug!("Client removed from gateway state");
211228
self.call_ws_close(&canister_id, client_key).await;
212229

213230
// return Err as the session had an error and cannot be updated anymore

src/ic-websocket-gateway/src/tests/canister_poller.rs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -345,10 +345,7 @@ mod test {
345345
let mut poller = create_poller(polling_interval_ms, client_channel_tx);
346346

347347
// check that the poller times out
348-
assert_eq!(
349-
Ok(PollingStatus::PollerTimedOut),
350-
poller.poll_canister().await
351-
);
348+
assert_eq!(Ok(PollingStatus::TimedOut), poller.poll_canister().await);
352349

353350
// check that the poller does not wait for a polling interval after timing out
354351
let start_polling_instant = tokio::time::Instant::now();

0 commit comments

Comments
 (0)