-
Notifications
You must be signed in to change notification settings - Fork 88
fix(lit-core): LIT-4016 - Enhance error handling during epoch changes #710
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
f2f46fd
bb21608
d99e8b2
cb696f8
25dab7b
75ff638
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -1,4 +1,5 @@ | ||||||
| import { ethers } from 'ethers'; | ||||||
| import { EventEmitter } from 'eventemitter3'; | ||||||
|
|
||||||
| import { | ||||||
| canonicalAccessControlConditionFormatter, | ||||||
|
|
@@ -63,7 +64,6 @@ import { | |||||
| NodeClientErrorV0, | ||||||
| NodeClientErrorV1, | ||||||
| NodeCommandServerKeysResponse, | ||||||
| NodeErrorV3, | ||||||
| RejectedNodePromises, | ||||||
| SendNodeCommand, | ||||||
| SessionSigsMap, | ||||||
|
|
@@ -132,7 +132,7 @@ const FALLBACK_RPC_URLS = [ | |||||
| 'https://eth.llamarpc.com', | ||||||
| ]; | ||||||
|
|
||||||
| export class LitCore { | ||||||
| export class LitCore extends EventEmitter { | ||||||
| config: LitNodeClientConfigWithDefaults = { | ||||||
| alertWhenUnauthorized: false, | ||||||
| debug: true, | ||||||
|
|
@@ -165,6 +165,8 @@ export class LitCore { | |||||
|
|
||||||
| // ========== Constructor ========== | ||||||
| constructor(config: LitNodeClientConfig | CustomNetwork) { | ||||||
| super(); | ||||||
|
|
||||||
| if (!(config.litNetwork in LIT_NETWORKS)) { | ||||||
| const supportedNetwork = Object.values(LIT_NETWORK).join(', '); | ||||||
|
|
||||||
|
|
@@ -308,20 +310,16 @@ export class LitCore { | |||||
|
|
||||||
| // ========== Scoped Class Helpers ========== | ||||||
| private async _handleStakingContractStateChange(state: StakingStates) { | ||||||
| log(`New state detected: "${state}"`); | ||||||
|
|
||||||
| const validatorData = await this._getValidatorData(); | ||||||
| try { | ||||||
| log(`New state detected: "${state}"`); | ||||||
|
|
||||||
| if (state === StakingStates.Active) { | ||||||
| // We always want to track the most recent epoch number on _all_ networks | ||||||
|
|
||||||
| this._epochState = await this._fetchCurrentEpochState( | ||||||
| validatorData.epochInfo | ||||||
| ); | ||||||
| if (state === StakingStates.Active) { | ||||||
| const validatorData = await this._getValidatorData(); | ||||||
|
|
||||||
| if (CENTRALISATION_BY_NETWORK[this.config.litNetwork] !== 'centralised') { | ||||||
| // We don't need to handle node urls changing on centralised networks, since their validator sets are static | ||||||
| try { | ||||||
| if ( | ||||||
| CENTRALISATION_BY_NETWORK[this.config.litNetwork] !== 'centralised' | ||||||
| ) { | ||||||
| log( | ||||||
| 'State found to be new validator set locked, checking validator set' | ||||||
| ); | ||||||
|
|
@@ -330,39 +328,55 @@ export class LitCore { | |||||
| const delta: string[] = validatorData.bootstrapUrls.filter((item) => | ||||||
| existingNodeUrls.includes(item) | ||||||
| ); | ||||||
| // if the sets differ we reconnect. | ||||||
|
|
||||||
| // check if the node sets are non-matching and re-connect if they do not. | ||||||
| if (delta.length > 1) { | ||||||
| // check if the node sets are non-matching and re-connect if they do not. | ||||||
| /* | ||||||
| TODO: This covers *most* cases where a node may come in or out of the active | ||||||
| set which we will need to re attest to the execution environments. | ||||||
| However, the sdk currently does not know if there is an active network operation pending. | ||||||
| Such that the state when the request was sent will now mutate when the response is sent back. | ||||||
| The sdk should be able to understand its current execution environment and wait on an active | ||||||
| network request to the previous epoch's node set before changing over. | ||||||
| */ | ||||||
| TODO: This covers *most* cases where a node may come in or out of the active | ||||||
| set which we will need to re attest to the execution environments. | ||||||
| However, the sdk currently does not know if there is an active network operation pending. | ||||||
| Such that the state when the request was sent will now mutate when the response is sent back. | ||||||
| The sdk should be able to understand its current execution environment and wait on an active | ||||||
| network request to the previous epoch's node set before changing over. | ||||||
| */ | ||||||
| log( | ||||||
| 'Active validator sets changed, new validators ', | ||||||
| delta, | ||||||
| 'starting node connection' | ||||||
| ); | ||||||
| await this.connect(); // Will update `epochInfo` | ||||||
| } | ||||||
|
|
||||||
| await this.connect(); | ||||||
| } catch (err: unknown) { | ||||||
| // FIXME: We should emit an error event so that consumers know that we are de-synced and can connect() again | ||||||
| // But for now, our every-30-second network sync will fix things in at most 30s from now. | ||||||
| // this.ready = false; Should we assume core is invalid if we encountered errors refreshing from an epoch change? | ||||||
| const { message = '' } = err as | ||||||
| | Error | ||||||
| | NodeClientErrorV0 | ||||||
| | NodeClientErrorV1; | ||||||
| logError( | ||||||
| 'Error while attempting to reconnect to nodes after epoch transition:', | ||||||
| message | ||||||
| } else { | ||||||
| // In case of centralised networks, we don't run `connect()` flow, so we will manually update epochInfo here | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is a centralized network treated differently? The only difference should be in the attestation but the handshake should be invariant to the network?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My understanding is that for centralized networks, the node list doesn't change, so re-handshaking with all the nodes on epoch change would be redundant; if that's not true, we should definitely fix it...
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we can treat them the same way as decentralized I would say we do so, specially considering the "centralization" is more like a special thing instead of something really relevant in the network design Also this way we can test nodes change in local (centralized)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree - let's change this for v8 |
||||||
| this._epochState = await this._fetchCurrentEpochState( | ||||||
| validatorData.epochInfo | ||||||
| ); | ||||||
| } | ||||||
| } | ||||||
| } catch (err: unknown) { | ||||||
| // Ensure that any methods that check `this.ready` throw errors to the caller, and any consumers can check appropriately | ||||||
| this.ready = false; | ||||||
|
|
||||||
| const { message = '' } = err as | ||||||
| | Error | ||||||
| | NodeClientErrorV0 | ||||||
| | NodeClientErrorV1; | ||||||
| logError( | ||||||
| 'Error while attempting to reconnect to nodes after epoch transition:', | ||||||
| message | ||||||
| ); | ||||||
|
|
||||||
| // Signal to any listeners that we've encountered a fatal error | ||||||
| this.emit( | ||||||
| 'error', | ||||||
| new Error( | ||||||
| 'Error while attempting to reconnect to nodes after epoch transition:' + | ||||||
| message | ||||||
| ) | ||||||
| ); | ||||||
|
|
||||||
| // Signal to any listeners that we're 'disconnected' from LIT network | ||||||
| this.emit('disconnected', true); | ||||||
|
||||||
| this.emit('disconnected', true); | |
| this.emit('connected', false); |
As a listener I would expect to listen to connection status as this is likely the status I'm interested in (to know if I can use Lit), instead of just one of those events
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Emitting disconnect and connect events is a pretty stand/common pattern, since the action taken is typically very different for each case; it's very common to define onDisconnect and onConnect functions in the app logic layer and just bind them directly as event handlers. If we emit a single connection event with the state of the connection, then every event handler in the app layer is probably going to start with an if() else {} block that needs to check the payload emitted to the event, and identify the type, and call the appropriate connect or disconnect handler.
I personally think emitting separate events is a nice devex -- but I've got no argument against adding another connection type event that is emitted every time the connection state changes, either connecting or disconnecting, if you think that would be a good addition to the API 👍
One thing to consider is that we can also strongly type disconnectHandler and connectHandler functions -- and the payload provided to the listener would likely make sense to actually be a different type. Actually, I think that my making this always emitt a boolean is probably a poor API, now that you mention it -- what do you think about modifying the disconnect event to emit the Error that was encountered during the disconnect, e.g. this.emit('disconnected', { reason: error })?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Great points. The thing that tickled me was that we were emitting events with a boolean that is always the same, seems like it is neither a connected status API nor a onConnect + onDisconnect events
But your last suggestion I think nails the thing. And we can make something similar for connected when we see the need for it
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed this call to stop listening -- if the code inside our listener calls connect(), it will just chain on any already-pending connect() logic (see connect() logic and promise chain on `this._connectingPromise.
This doesn't make the client self-healing, but it does mean that a failure won't leave the client in a state where it is no longer listening for further epoch change events.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But will it chain the promises since we specifically check that if a pending connection is open then just return it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We always re-set _connectionPromise to null when a connect() call finishes -- so if we are connecting, multiple calls to connect() will all return the same promise. This means that if we're still processing connect() when we receive another epoch change event, it will be effectively a no-op.
It occurred to me that we could implement cancellation across this entire callstack, and then cancel the entire existing call to connect() (along with any pending fetch() calls that it is running), and then run it again from the top. I'd like to implement a much more robust network handling layer based on the v7 branch and land this as-is for now, since it's an improvement over what we've got.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Shouldn't we throw/disconnect when the new state is
Paused?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good call, if so -- if
Pausedstate on the contract means the existing node set can no longer serve requests, we should do exactly that. I was under the impression thatPausedmeant epochs might not be changing, but the current epoch of nodes still functioned. If I'm wrong, we should definitely change it as you suggested