Skip to content

Commit 20ae3f5

Browse files
committed
Use L3 cache topology instead of NUMA topology for default plotting threads
1 parent ca2f3f6 commit 20ae3f5

File tree

2 files changed

+42
-17
lines changed

2 files changed

+42
-17
lines changed

crates/subspace-farmer/src/bin/subspace-farmer/commands/farm.rs

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ use subspace_farmer::utils::readers_and_pieces::ReadersAndPieces;
3131
use subspace_farmer::utils::ss58::parse_ss58_reward_address;
3232
use subspace_farmer::utils::{
3333
all_cpu_cores, create_plotting_thread_pool_manager, parse_cpu_cores_sets,
34-
run_future_in_dedicated_thread, thread_pool_core_indices, AsyncJoinOnDrop,
34+
recommended_number_of_farming_threads, run_future_in_dedicated_thread,
35+
thread_pool_core_indices, AsyncJoinOnDrop,
3536
};
3637
use subspace_farmer::{Identity, NodeClient, NodeRpcClient};
3738
use subspace_farmer_components::plotting::PlottedSector;
@@ -113,7 +114,8 @@ pub(crate) struct FarmingArgs {
113114
#[arg(long)]
114115
sector_downloading_concurrency: Option<NonZeroUsize>,
115116
/// Defines how many sectors farmer will encode concurrently, defaults to 1 on UMA system and
116-
/// number of NUMA nodes on NUMA system. It is further restricted by
117+
/// number of NUMA nodes on NUMA system or L3 cache groups on large CPUs. It is further
118+
/// restricted by
117119
/// `--sector-downloading-concurrency` and setting this option higher than
118120
/// `--sector-downloading-concurrency` will have no effect.
119121
#[arg(long)]
@@ -130,7 +132,8 @@ pub(crate) struct FarmingArgs {
130132
#[arg(long)]
131133
farming_thread_pool_size: Option<NonZeroUsize>,
132134
/// Size of one thread pool used for plotting, defaults to number of logical CPUs available
133-
/// on UMA system and number of logical CPUs available in NUMA node on NUMA system.
135+
/// on UMA system and number of logical CPUs available in NUMA node on NUMA system or L3 cache
136+
/// groups on large CPUs.
134137
///
135138
/// Number of thread pools is defined by `--sector-encoding-concurrency` option, different
136139
/// thread pools might have different number of threads if NUMA nodes do not have the same size.
@@ -151,7 +154,8 @@ pub(crate) struct FarmingArgs {
151154
plotting_cpu_cores: Option<String>,
152155
/// Size of one thread pool used for replotting, typically smaller pool than for plotting
153156
/// to not affect farming as much, defaults to half of the number of logical CPUs available on
154-
/// UMA system and number of logical CPUs available in NUMA node on NUMA system.
157+
/// UMA system and number of logical CPUs available in NUMA node on NUMA system or L3 cache
158+
/// groups on large CPUs.
155159
///
156160
/// Number of thread pools is defined by `--sector-encoding-concurrency` option, different
157161
/// thread pools might have different number of threads if NUMA nodes do not have the same size.
@@ -495,22 +499,16 @@ where
495499
.unwrap_or(plotting_thread_pool_core_indices.len() + 1),
496500
));
497501

498-
let all_cpu_cores = all_cpu_cores();
499502
let plotting_thread_pool_manager = create_plotting_thread_pool_manager(
500503
plotting_thread_pool_core_indices
501504
.into_iter()
502505
.zip(replotting_thread_pool_core_indices),
503506
)?;
504507
let farming_thread_pool_size = farming_thread_pool_size
505508
.map(|farming_thread_pool_size| farming_thread_pool_size.get())
506-
.unwrap_or_else(|| {
507-
all_cpu_cores
508-
.first()
509-
.expect("Not empty according to function description; qed")
510-
.cpu_cores()
511-
.len()
512-
});
509+
.unwrap_or_else(recommended_number_of_farming_threads);
513510

511+
let all_cpu_cores = all_cpu_cores();
514512
if all_cpu_cores.len() > 1 {
515513
info!(numa_nodes = %all_cpu_cores.len(), "NUMA system detected");
516514

crates/subspace-farmer/src/utils.rs

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ use crate::thread_pool_manager::{PlottingThreadPoolManager, PlottingThreadPoolPa
99
use futures::channel::oneshot;
1010
use futures::channel::oneshot::Canceled;
1111
use futures::future::Either;
12+
use hwlocality::object::types::ObjectType;
1213
use rayon::{ThreadBuilder, ThreadPool, ThreadPoolBuildError, ThreadPoolBuilder};
1314
use std::future::Future;
1415
use std::num::{NonZeroUsize, ParseIntError};
@@ -187,7 +188,30 @@ impl CpuCoreSet {
187188
}
188189
}
189190

190-
/// Get all cpu cores, grouped into sets according to NUMA nodes.
191+
/// Recommended number of thread pool size for farming, equal to number of CPU cores in the first
192+
/// NUMA node
193+
pub fn recommended_number_of_farming_threads() -> usize {
194+
#[cfg(feature = "numa")]
195+
match hwlocality::Topology::new().map(std::sync::Arc::new) {
196+
Ok(topology) => {
197+
return topology
198+
// Iterate over NUMA nodes
199+
.objects_at_depth(hwlocality::object::depth::Depth::NUMANode)
200+
// For each NUMA nodes get CPU set
201+
.filter_map(|node| node.cpuset())
202+
// Get number of CPU cores
203+
.map(|cpuset| cpuset.iter_set().count())
204+
.find(|&count| count > 0)
205+
.unwrap_or_else(num_cpus::get);
206+
}
207+
Err(error) => {
208+
warn!(%error, "Failed to get NUMA topology");
209+
}
210+
}
211+
num_cpus::get()
212+
}
213+
214+
/// Get all cpu cores, grouped into sets according to NUMA nodes or L3 cache groups on large CPUs.
191215
///
192216
/// Returned vector is guaranteed to have at least one element and have non-zero number of CPU cores
193217
/// in each set.
@@ -196,8 +220,8 @@ pub fn all_cpu_cores() -> Vec<CpuCoreSet> {
196220
match hwlocality::Topology::new().map(std::sync::Arc::new) {
197221
Ok(topology) => {
198222
let cpu_cores = topology
199-
// Iterate over NUMA nodes
200-
.objects_at_depth(hwlocality::object::depth::Depth::NUMANode)
223+
// Iterate over groups of L3 caches
224+
.objects_with_type(ObjectType::L3Cache)
201225
// For each NUMA nodes get CPU set
202226
.filter_map(|node| node.cpuset())
203227
// For each CPU set extract individual cores
@@ -214,7 +238,7 @@ pub fn all_cpu_cores() -> Vec<CpuCoreSet> {
214238
}
215239
}
216240
Err(error) => {
217-
warn!(%error, "Failed to get CPU topology");
241+
warn!(%error, "Failed to get L3 cache topology");
218242
}
219243
}
220244
vec![CpuCoreSet {
@@ -227,6 +251,9 @@ pub fn all_cpu_cores() -> Vec<CpuCoreSet> {
227251
/// Parse space-separated set of groups of CPU cores (individual cores are coma-separated) into
228252
/// vector of CPU core sets that can be used for creation of plotting/replotting thread pools.
229253
pub fn parse_cpu_cores_sets(s: &str) -> Result<Vec<CpuCoreSet>, ParseIntError> {
254+
#[cfg(feature = "numa")]
255+
let topology = hwlocality::Topology::new().map(std::sync::Arc::new).ok();
256+
230257
s.split(' ')
231258
.map(|s| {
232259
let cores = s
@@ -237,7 +264,7 @@ pub fn parse_cpu_cores_sets(s: &str) -> Result<Vec<CpuCoreSet>, ParseIntError> {
237264
Ok(CpuCoreSet {
238265
cores,
239266
#[cfg(feature = "numa")]
240-
topology: hwlocality::Topology::new().map(std::sync::Arc::new).ok(),
267+
topology: topology.clone(),
241268
})
242269
})
243270
.collect()

0 commit comments

Comments
 (0)