Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions apps/fortuna/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,60 @@ Please add the changed files in the `.sqlx` folder to your git commit.
The Fortuna binary has a command-line interface to perform useful operations on the contract, such as
registering a new randomness provider, or drawing a random value. To see the available commands, simply run `cargo run`.

## Multiple Replica Setup

Fortuna supports running multiple replica instances for high availability and reliability. This prevents service interruption if one instance goes down and distributes the workload across multiple instances.

### How Replica Assignment Works

- Each replica is assigned a unique `replica_id` (0, 1, 2, etc.)
- Requests are distributed using modulo assignment: `sequence_number % total_replicas`
- Each replica primarily handles requests assigned to its ID
- After a configurable delay, each replica will also process requests assigned to other replicas, acting as a backup (failover)

### Example Configurations

**Two Replica Setup (Blue/Green):**
```yaml
# Replica 0 (Blue) - handles even sequence numbers (0, 2, 4, ...)
keeper:
replica_config:
replica_id: 0
total_replicas: 2
backup_delay_seconds: 30

# Replica 1 (Green) - handles odd sequence numbers (1, 3, 5, ...)
keeper:
replica_config:
replica_id: 1
total_replicas: 2
backup_delay_seconds: 30
```

**Three Replica Setup:**
```yaml
# Replica 0 - handles sequence numbers 0, 3, 6, 9, ...
keeper:
replica_config:
replica_id: 0
total_replicas: 3
backup_delay_seconds: 45
```

### Deployment Considerations

1. **Separate Wallets**: Each replica MUST use a different private key to avoid nonce conflicts
2. **Backup Delay**: Set `backup_delay_seconds` long enough to allow primary replica to process requests, but short enough for acceptable failover time (recommended: 30-60 seconds)
3. **Monitoring**: Monitor each replica's processing metrics to ensure proper load distribution
4. **Gas Management**: Each replica needs sufficient ETH balance for gas fees

### Failover Behavior

- Primary replica processes requests immediately
- Backup replicas wait for `backup_delay_seconds` before checking whether the request is still unfulfilled
- If the request has already been fulfilled during the delay, the backup replica skips processing it
- This prevents duplicate transactions and wasted gas while ensuring reliability

## Local Development

To start an instance of the webserver for local testing, you first need to perform a few setup steps:
Expand Down
18 changes: 18 additions & 0 deletions apps/fortuna/config.sample.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ chains:
# blocks after 5 blocks, then again after 10 blocks, and finally after 20 blocks.
block_delays: [5, 10, 20]



# Historical commitments -- delete this block for local development purposes
commitments:
# prettier-ignore
Expand Down Expand Up @@ -86,3 +88,19 @@ keeper:
value: 0xabcd
# For production, you can store the private key in a file.
# file: keeper-key.txt
# Multi-replica configuration (optional) -- see the "Multiple Replica Setup" section of the README

# Optional: Multi-replica configuration for high availability and load distribution
# Uncomment and configure for production deployments with multiple Fortuna instances
# replica_config:
# replica_id: 0 # Unique identifier for this replica (0, 1, 2, ...)
# total_replicas: 2 # Total number of replica instances running
# backup_delay_seconds: 30 # Seconds to wait before processing other replicas' requests
#
# Example configurations:
#
# Two-replica setup (Blue/Green):
# - Replica 0: handles even sequence numbers (0, 2, 4, ...)
# - Replica 1: handles odd sequence numbers (1, 3, 5, ...)
#
# IMPORTANT: Each replica must use a different private_key to avoid nonce conflicts!
27 changes: 27 additions & 0 deletions apps/fortuna/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,18 @@ impl Config {
}
}

if let Some(replica_config) = &config.keeper.replica_config {
if replica_config.total_replicas == 0 {
return Err(anyhow!("Keeper replica configuration is invalid. total_replicas must be greater than 0."));
}
if replica_config.replica_id >= replica_config.total_replicas {
return Err(anyhow!("Keeper replica configuration is invalid. replica_id must be less than total_replicas."));
}
if replica_config.backup_delay_seconds == 0 {
return Err(anyhow!("Keeper replica configuration is invalid. backup_delay_seconds must be greater than 0 to prevent race conditions."));
}
}

Ok(config)
}

Expand Down Expand Up @@ -333,6 +345,18 @@ fn default_chain_sample_interval() -> u64 {
1
}

/// Configuration for running multiple Fortuna keeper replicas for high availability.
///
/// Requests are partitioned across replicas by `sequence_number % total_replicas`;
/// a replica whose id does not match acts as a delayed backup for that request.
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
pub struct ReplicaConfig {
    /// Unique identifier of this replica instance (0, 1, 2, ...).
    /// Validated at load time to be strictly less than `total_replicas`.
    pub replica_id: u64,
    /// Total number of replica instances running. Validated to be non-zero,
    /// since it is used as the modulus for request assignment.
    pub total_replicas: u64,
    /// Seconds a backup replica waits before processing a request assigned to
    /// another replica. Defaults to 30; validated to be non-zero to prevent
    /// races with the primary replica.
    #[serde(default = "default_backup_delay_seconds")]
    pub backup_delay_seconds: u64,
}

/// Serde default for `ReplicaConfig::backup_delay_seconds`: 30 seconds.
fn default_backup_delay_seconds() -> u64 {
    // Keep the default in a named constant so the value is self-describing.
    const DEFAULT_BACKUP_DELAY_SECONDS: u64 = 30;
    DEFAULT_BACKUP_DELAY_SECONDS
}

/// Configuration values for the keeper service that are shared across chains.
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
pub struct KeeperConfig {
Expand All @@ -342,6 +366,9 @@ pub struct KeeperConfig {
/// This key *does not need to be a registered provider*. In particular, production deployments
/// should ensure this is a different key in order to reduce the severity of security breaches.
pub private_key: SecretString,

#[serde(default)]
pub replica_config: Option<ReplicaConfig>,
}

// A secret is a string that can be provided either as a literal in the config,
Expand Down
7 changes: 7 additions & 0 deletions apps/fortuna/src/keeper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,13 @@ pub async fn run_keeper_threads(
contract: contract.clone(),
gas_limit,
escalation_policy: chain_eth_config.escalation_policy.to_policy(),
keeper_config: crate::config::KeeperConfig {
private_key: crate::config::SecretString {
value: None,
file: None,
},
replica_config: None,
},
metrics: metrics.clone(),
fulfilled_requests_cache,
history,
Expand Down
1 change: 1 addition & 0 deletions apps/fortuna/src/keeper/block.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ pub struct ProcessParams {
pub gas_limit: U256,
pub escalation_policy: EscalationPolicy,
pub chain_state: BlockchainState,
pub keeper_config: crate::config::KeeperConfig,
pub metrics: Arc<KeeperMetrics>,
pub history: Arc<History>,
pub fulfilled_requests_cache: Arc<RwLock<HashSet<u64>>>,
Expand Down
60 changes: 60 additions & 0 deletions apps/fortuna/src/keeper/process_event.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,66 @@ pub async fn process_event_with_backoff(
return Ok(());
}

let is_primary_replica =
if let Some(replica_config) = &process_param.keeper_config.replica_config {
let assigned_replica = event.sequence_number % replica_config.total_replicas;
if assigned_replica != replica_config.replica_id {
tracing::debug!(
sequence_number = event.sequence_number,
assigned_replica = assigned_replica,
our_replica_id = replica_config.replica_id,
"Processing request as backup replica"
);
false
} else {
true
}
} else {
true // No replica config, process all requests
};

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can merge these 2 logs

if !is_primary_replica {
if let Some(replica_config) = &process_param.keeper_config.replica_config {
tracing::info!(
sequence_number = event.sequence_number,
delay_seconds = replica_config.backup_delay_seconds,
"Waiting before processing as backup replica"
);
tokio::time::sleep(tokio::time::Duration::from_secs(
replica_config.backup_delay_seconds,
))
.await;
}

match chain_state
.contract
.get_request(event.provider_address, event.sequence_number)
.await
{
Ok(Some(_)) => {
tracing::info!(
sequence_number = event.sequence_number,
"Request still open after delay, processing as backup replica"
);
}
Ok(None) => {
tracing::debug!(
sequence_number = event.sequence_number,
"Request already fulfilled by primary replica during delay, skipping"
);
return Ok(());
}
Err(e) => {
tracing::warn!(
sequence_number = event.sequence_number,
error = ?e,
"Error checking request status after delay, skipping"
);
return Ok(());
}
}
}

let account_label = AccountLabel {
chain_id: chain_state.id.clone(),
address: chain_state.provider_address.to_string(),
Expand Down
Loading