Skip to content

Commit ef894d5

Browse files
committed
Merge branch 'rumenov/rejjf' into 'master'
chore: move the initialization of some variables only when they are needed and not immediately when the replica is started. See merge request dfinity-lab/public/ic!12590
2 parents 7fe64b2 + 300442b commit ef894d5

File tree

5 files changed

+71
-128
lines changed

5 files changed

+71
-128
lines changed

rs/replay/src/player.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -249,10 +249,10 @@ impl Player {
249249
}
250250

251251
let subnet_type = get_subnet_type(
252-
registry.as_ref(),
252+
&log,
253253
subnet_id,
254254
registry.get_latest_version(),
255-
&log,
255+
registry.as_ref(),
256256
);
257257
let metrics_registry = MetricsRegistry::new();
258258
let subnet_config = SubnetConfig::new(subnet_type);

rs/replica/src/main.rs

Lines changed: 16 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,16 @@ use ic_config::Config;
55
use ic_crypto_sha::Sha256;
66
use ic_crypto_tls_interfaces::TlsHandshake;
77
use ic_http_endpoints_metrics::MetricsHttpEndpoint;
8-
use ic_interfaces_registry::{LocalStoreCertifiedTimeReader, RegistryClient};
98
use ic_logger::{info, new_replica_logger_from_config};
109
use ic_metrics::MetricsRegistry;
1110
use ic_onchain_observability_server::spawn_onchain_observability_grpc_server_and_register_metrics;
12-
use ic_registry_client_helpers::subnet::SubnetRegistry;
1311
use ic_replica::setup;
1412
use ic_sys::PAGE_SIZE;
1513
use ic_types::consensus::CatchUpPackage;
1614
use ic_types::{replica_version::REPLICA_BINARY_HASH, PrincipalId, ReplicaVersion, SubnetId};
1715
use nix::unistd::{setpgid, Pid};
1816
use static_assertions::assert_eq_size;
19-
use std::{env, io, path::PathBuf, str::FromStr, sync::Arc, time::Duration};
17+
use std::{env, fs, io, path::PathBuf, str::FromStr, sync::Arc, time::Duration};
2018
use tokio::signal::unix::{signal, SignalKind};
2119

2220
#[cfg(target_os = "linux")]
@@ -30,7 +28,6 @@ use jemallocator::Jemalloc;
3028
#[cfg(target_os = "linux")]
3129
static ALLOC: Jemalloc = Jemalloc;
3230

33-
use ic_registry_local_store::LocalStoreImpl;
3431
#[cfg(feature = "profiler")]
3532
use pprof::{protos::Message, ProfilerGuard};
3633
#[cfg(feature = "profiler")]
@@ -43,15 +40,15 @@ use std::io::Write;
4340
/// Determine sha256 hash of the current replica binary
4441
///
4542
/// Returns tuple (path of the replica binary, hex encoded sha256 of binary)
46-
fn get_replica_binary_hash() -> std::result::Result<(PathBuf, String), String> {
43+
fn get_replica_binary_hash() -> Result<(PathBuf, String), String> {
4744
let mut hasher = Sha256::new();
4845
let replica_binary_path = env::current_exe()
4946
.map_err(|e| format!("Failed to determine replica binary path: {:?}", e))?;
5047

51-
let mut binary_file = std::fs::File::open(&replica_binary_path)
48+
let mut binary_file = fs::File::open(&replica_binary_path)
5249
.map_err(|e| format!("Failed to open replica binary to calculate hash: {:?}", e))?;
5350

54-
std::io::copy(&mut binary_file, &mut hasher)
51+
io::copy(&mut binary_file, &mut hasher)
5552
.map_err(|e| format!("Failed to calculate hash for replica binary: {:?}", e))?;
5653

5754
Ok((replica_binary_path, hex::encode(hasher.finish())))
@@ -60,10 +57,8 @@ fn get_replica_binary_hash() -> std::result::Result<(PathBuf, String), String> {
6057
fn main() -> io::Result<()> {
6158
// We do not support 32 bits architectures and probably never will.
6259
assert_eq_size!(usize, u64);
63-
6460
// Ensure that the hardcoded constant matches the OS page size.
6561
assert_eq!(ic_sys::sysconf_page_size(), PAGE_SIZE);
66-
6762
// At this point we need to setup a new process group. This is
6863
// done to ensure all our children processes belong to the same
6964
// process group (as policy wise in production we restrict setpgid
@@ -75,7 +70,7 @@ fn main() -> io::Result<()> {
7570
eprintln!("Failed to setup a new process group for replica.");
7671
// This is a generic exit error. At this point sandboxing is
7772
// not turned on so we can do a simple exit with cleanup.
78-
return Err(std::io::Error::new(std::io::ErrorKind::Other, err));
73+
return Err(io::Error::new(io::ErrorKind::Other, err));
7974
}
8075

8176
#[cfg(feature = "profiler")]
@@ -139,6 +134,11 @@ fn main() -> io::Result<()> {
139134
e.print().expect("Failed to print CLI argument error.");
140135
}
141136

137+
// We abort the whole program with a core dump if a single thread panics.
138+
// This way we can capture all the context if a critical error
139+
// happens.
140+
abort_on_panic();
141+
142142
let config_source = setup::get_config_source(&replica_args);
143143
// Setup temp directory for the configuration.
144144
let tmpdir = tempfile::Builder::new()
@@ -148,12 +148,14 @@ fn main() -> io::Result<()> {
148148
let config = Config::load_with_tmpdir(config_source, tmpdir.path().to_path_buf());
149149

150150
let (logger, async_log_guard) = new_replica_logger_from_config(&config.logger);
151-
152151
let metrics_registry = MetricsRegistry::global();
153-
154152
#[cfg(target_os = "linux")]
155153
metrics_registry.register(jemalloc_metrics::JemallocMetrics::new());
156154

155+
let cup = setup::get_catch_up_package(&replica_args, &logger)
156+
.as_ref()
157+
.map(|c| CatchUpPackage::try_from(c).expect("deserializing CUP failed"));
158+
157159
// Set the replica verison and report as metric
158160
setup::set_replica_version(&replica_args, &logger);
159161
{
@@ -172,17 +174,13 @@ fn main() -> io::Result<()> {
172174
}
173175

174176
let (registry, crypto) = setup::setup_crypto_registry(
175-
config.clone(),
177+
&config,
176178
rt_main.handle().clone(),
177-
Some(&metrics_registry),
179+
&metrics_registry,
178180
logger.clone(),
179181
);
180182

181183
let node_id = crypto.get_node_id();
182-
let cup_proto = setup::get_catch_up_package(&replica_args, &logger);
183-
let cup = cup_proto
184-
.as_ref()
185-
.map(|c| CatchUpPackage::try_from(c).expect("deserializing CUP failed"));
186184

187185
let subnet_id = match &replica_args {
188186
Ok(args) => {
@@ -198,23 +196,6 @@ fn main() -> io::Result<()> {
198196
Err(_) => setup::get_subnet_id(node_id, registry.as_ref(), cup.as_ref(), &logger),
199197
};
200198

201-
let subnet_type = setup::get_subnet_type(
202-
registry.as_ref(),
203-
subnet_id,
204-
registry.get_latest_version(),
205-
&logger,
206-
);
207-
208-
// Read the root subnet id from registry
209-
let root_subnet_id = registry
210-
.get_root_subnet_id(
211-
cup.as_ref()
212-
.map(|c| c.content.registry_version())
213-
.unwrap_or_else(|| registry.get_latest_version()),
214-
)
215-
.expect("cannot read from registry")
216-
.expect("cannot find root subnet id");
217-
218199
// Set node_id and subnet_id in the logging context
219200
let mut context = logger.get_context();
220201
context.node_id = format!("{}", node_id.get());
@@ -228,13 +209,6 @@ fn main() -> io::Result<()> {
228209
let _ = REPLICA_BINARY_HASH.set(hash);
229210
}
230211

231-
// We abort the whole program with a core dump if a single thread panics.
232-
// This way we can capture all the context if a critical error
233-
// happens.
234-
abort_on_panic();
235-
236-
setup::create_consensus_pool_dir(&config);
237-
238212
let crypto = Arc::new(crypto);
239213
let _metrics_endpoint = MetricsHttpEndpoint::new(
240214
rt_http.handle().clone(),
@@ -245,10 +219,6 @@ fn main() -> io::Result<()> {
245219
&logger.inner_logger.root,
246220
);
247221

248-
let registry_certified_time_reader: Arc<dyn LocalStoreCertifiedTimeReader> = Arc::new(
249-
LocalStoreImpl::new(config.registry_client.local_store.clone()),
250-
);
251-
252222
info!(logger, "Constructing IC stack");
253223
let (_, _, _p2p_thread_joiner, _, _xnet_endpoint) =
254224
ic_replica::setup_ic_stack::construct_ic_stack(
@@ -260,12 +230,9 @@ fn main() -> io::Result<()> {
260230
config.clone(),
261231
node_id,
262232
subnet_id,
263-
subnet_type,
264-
root_subnet_id,
265233
registry,
266234
crypto,
267235
cup,
268-
registry_certified_time_reader,
269236
)?;
270237

271238
info!(logger, "Constructed IC stack");

rs/replica/src/setup.rs

Lines changed: 15 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,9 @@ use ic_registry_client_helpers::subnet::{SubnetListRegistry, SubnetRegistry};
1111
use ic_registry_local_store::LocalStoreImpl;
1212
use ic_registry_subnet_type::SubnetType;
1313
use ic_types::{
14-
consensus::catchup::CatchUpPackage,
15-
{NodeId, RegistryVersion, ReplicaVersion, SubnetId},
14+
consensus::catchup::CatchUpPackage, NodeId, RegistryVersion, ReplicaVersion, SubnetId,
1615
};
17-
use std::{convert::TryFrom, env, path::PathBuf, sync::Arc};
16+
use std::{env, path::PathBuf, sync::Arc};
1817

1918
/// Parse command-line args into `ReplicaArgs`
2019
pub fn parse_args() -> Result<ReplicaArgs, clap::Error> {
@@ -129,10 +128,10 @@ pub fn get_subnet_id(
129128

130129
/// Return the subnet type of the given subnet.
131130
pub fn get_subnet_type(
132-
registry: &dyn RegistryClient,
131+
logger: &ReplicaLogger,
133132
subnet_id: SubnetId,
134133
registry_version: RegistryVersion,
135-
logger: &ReplicaLogger,
134+
registry: &dyn RegistryClient,
136135
) -> SubnetType {
137136
loop {
138137
match registry.get_subnet_record(subnet_id, registry_version) {
@@ -181,26 +180,20 @@ pub fn get_config_source(replica_args: &Result<ReplicaArgs, clap::Error>) -> Con
181180
}
182181
}
183182

184-
/// Create the consensus pool directory (if none exists)
185-
pub fn create_consensus_pool_dir(config: &Config) {
186-
std::fs::create_dir_all(&config.artifact_pool.consensus_pool_path).unwrap_or_else(|err| {
187-
panic!(
188-
"Failed to create consensus pool directory {}: {}",
189-
config.artifact_pool.consensus_pool_path.display(),
190-
err
191-
)
192-
});
193-
}
194-
195183
pub fn setup_crypto_registry(
196-
config: Config,
184+
config: &Config,
197185
tokio_runtime_handle: tokio::runtime::Handle,
198-
metrics_registry: Option<&MetricsRegistry>,
186+
metrics_registry: &MetricsRegistry,
199187
logger: ReplicaLogger,
200188
) -> (std::sync::Arc<RegistryClientImpl>, CryptoComponent) {
201-
let data_provider = Arc::new(LocalStoreImpl::new(config.registry_client.local_store));
189+
let data_provider = Arc::new(LocalStoreImpl::new(
190+
config.registry_client.local_store.clone(),
191+
));
202192

203-
let registry = Arc::new(RegistryClientImpl::new(data_provider, metrics_registry));
193+
let registry = Arc::new(RegistryClientImpl::new(
194+
data_provider,
195+
Some(metrics_registry),
196+
));
204197

205198
// The registry must be initialized before setting up the crypto component
206199
if let Err(e) = registry.fetch_and_start_polling() {
@@ -282,14 +275,14 @@ pub fn setup_crypto_provider(
282275
tokio_runtime_handle: tokio::runtime::Handle,
283276
registry: Arc<dyn RegistryClient>,
284277
replica_logger: ReplicaLogger,
285-
metrics_registry: Option<&MetricsRegistry>,
278+
metrics_registry: &MetricsRegistry,
286279
) -> CryptoComponent {
287280
CryptoConfig::check_dir_has_required_permissions(&config.crypto_root).unwrap();
288281
CryptoComponent::new(
289282
config,
290283
Some(tokio_runtime_handle),
291284
registry,
292285
replica_logger,
293-
metrics_registry,
286+
Some(metrics_registry),
294287
)
295288
}

rs/replica/src/setup_ic_stack.rs

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
use crate::setup::get_subnet_type;
12
use ic_btc_adapter_client::{setup_bitcoin_adapter_clients, BitcoinAdapterClients};
23
use ic_btc_consensus::BitcoinPayloadBuilder;
34
use ic_config::{artifact_pool::ArtifactPoolConfig, subnet_config::SubnetConfig, Config};
@@ -16,7 +17,8 @@ use ic_logger::{info, ReplicaLogger};
1617
use ic_messaging::MessageRoutingImpl;
1718
use ic_metrics::MetricsRegistry;
1819
use ic_pprof::Pprof;
19-
use ic_registry_subnet_type::SubnetType;
20+
use ic_registry_client_helpers::subnet::SubnetRegistry;
21+
use ic_registry_local_store::LocalStoreImpl;
2022
use ic_replica_setup_ic_network::{
2123
create_networking_stack, init_artifact_pools, P2PStateSyncClient,
2224
};
@@ -27,6 +29,17 @@ use ic_xnet_endpoint::{XNetEndpoint, XNetEndpointConfig};
2729
use ic_xnet_payload_builder::XNetPayloadBuilderImpl;
2830
use std::sync::Arc;
2931

32+
/// Create the consensus pool directory (if none exists)
33+
fn create_consensus_pool_dir(config: &Config) {
34+
std::fs::create_dir_all(&config.artifact_pool.consensus_pool_path).unwrap_or_else(|err| {
35+
panic!(
36+
"Failed to create consensus pool directory {}: {}",
37+
config.artifact_pool.consensus_pool_path.display(),
38+
err
39+
)
40+
});
41+
}
42+
3043
#[allow(clippy::too_many_arguments, clippy::type_complexity)]
3144
pub fn construct_ic_stack(
3245
log: &ReplicaLogger,
@@ -37,12 +50,9 @@ pub fn construct_ic_stack(
3750
config: Config,
3851
node_id: NodeId,
3952
subnet_id: SubnetId,
40-
subnet_type: SubnetType,
41-
root_subnet_id: SubnetId,
4253
registry: Arc<dyn RegistryClient + Send + Sync>,
4354
crypto: Arc<CryptoComponent>,
4455
catch_up_package: Option<CatchUpPackage>,
45-
local_store_time_reader: Arc<dyn LocalStoreCertifiedTimeReader>,
4656
) -> std::io::Result<(
4757
// TODO: remove this return value since it is used only in tests
4858
Arc<StateManagerImpl>,
@@ -54,13 +64,10 @@ pub fn construct_ic_stack(
5464
XNetEndpoint,
5565
)> {
5666
// ---------- ARTIFACT POOLS DEPS FOLLOW ----------
67+
create_consensus_pool_dir(&config);
5768
// Determine the correct catch-up package.
5869
let catch_up_package = {
5970
use ic_types::consensus::HasHeight;
60-
let make_registry_cup = || {
61-
ic_consensus::dkg::make_registry_cup(&*registry, subnet_id, None)
62-
.expect("Couldn't create a registry CUP")
63-
};
6471
match catch_up_package {
6572
// The replica was started on a CUP persisted by the orchestrator.
6673
Some(cup_from_orc) => {
@@ -79,6 +86,10 @@ pub fn construct_ic_stack(
7986
// This case is only possible if the replica is started without an orchestrator which
8087
// is currently only possible in the local development mode with `dfx`.
8188
None => {
89+
let make_registry_cup = || {
90+
ic_consensus::dkg::make_registry_cup(&*registry, subnet_id, None)
91+
.expect("Couldn't create a registry CUP")
92+
};
8293
let registry_cup = CatchUpPackage::try_from(&make_registry_cup())
8394
.expect("deserializing CUP failed");
8495
info!(
@@ -90,6 +101,11 @@ pub fn construct_ic_stack(
90101
}
91102
}
92103
};
104+
let root_subnet_id = registry
105+
.get_root_subnet_id(catch_up_package.content.registry_version())
106+
.expect("cannot read from registry")
107+
.expect("cannot find root subnet id");
108+
93109
let artifact_pool_config = ArtifactPoolConfig::from(config.artifact_pool.clone());
94110
let artifact_pools = init_artifact_pools(
95111
node_id,
@@ -99,8 +115,14 @@ pub fn construct_ic_stack(
99115
log.clone(),
100116
catch_up_package,
101117
);
102-
103118
// ---------- REPLICATED STATE DEPS FOLLOW ----------
119+
let subnet_type = get_subnet_type(
120+
log,
121+
subnet_id,
122+
registry.get_latest_version(),
123+
registry.as_ref(),
124+
);
125+
104126
let consensus_pool_cache = artifact_pools.consensus_pool.read().unwrap().get_cache();
105127
let verifier = Arc::new(VerifierImpl::new(crypto.clone()));
106128
let state_manager = Arc::new(StateManagerImpl::new(
@@ -219,6 +241,9 @@ pub fn construct_ic_stack(
219241
// ---------- CONSENSUS AND P2P DEPS FOLLOW ----------
220242
let state_sync = StateSync::new(state_manager.clone(), log.clone());
221243
let sev_handshake = Arc::new(Sev::new(node_id, registry.clone()));
244+
let local_store_cert_time_reader: Arc<dyn LocalStoreCertifiedTimeReader> = Arc::new(
245+
LocalStoreImpl::new(config.registry_client.local_store.clone()),
246+
);
222247
let (ingress_ingestion_service, p2p_runner) = create_networking_stack(
223248
metrics_registry,
224249
log.clone(),
@@ -245,7 +270,7 @@ pub fn construct_ic_stack(
245270
execution_services.ingress_history_reader,
246271
artifact_pools,
247272
cycles_account_manager,
248-
local_store_time_reader,
273+
local_store_cert_time_reader,
249274
canister_http_adapter_client,
250275
config.nns_registry_replicator.poll_delay_duration_ms,
251276
);

0 commit comments

Comments
 (0)