Skip to content

Commit 9f8480e

Browse files
authored
Configure tuning parameters via CMake interface (#183)
* Configure tuning parameters via CMake interface
* Install tuning_params.hpp
1 parent a5fdb08 commit 9f8480e

File tree

3 files changed

+76
-12
lines changed

3 files changed

+76
-12
lines changed

CMakeLists.txt

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,64 @@ option(ALUMINUM_ENABLE_BENCHMARKS
144144
"Build benchmarks."
145145
OFF)
146146

147+
# Tuning parameters (in the order they appear in the file). Recall:
148+
# Cache values previously set are not modified. These only take effect
149+
# if the cache values do not already exist.
150+
#
151+
# See extended documentation in cmake/tuning_params.hpp.in.
152+
set(AL_PE_NUM_CONCURRENT_OPS 4
153+
CACHE STRING
154+
"Number of concurrent operations the progress engine will perform")
155+
156+
set(AL_PE_NUM_STREAMS 64
157+
CACHE STRING
158+
"Max number of streams the progress engine supports")
159+
160+
set(AL_PE_NUM_PIPELINE_STAGES 2
161+
CACHE STRING
162+
"Max number of pipeline stages the progress engine supports")
163+
164+
set(AL_PE_INPUT_QUEUE_SIZE 8192
165+
CACHE STRING
166+
"Max number of entries in each stream's input queue")
167+
168+
option(AL_PE_ADD_DEFAULT_STREAM
169+
"Automatically add a default stream entry for the progress engine"
170+
OFF)
171+
172+
option(AL_PE_STREAM_QUEUE_CACHE
173+
"Use thread-local cache to map streams to input queues"
174+
OFF)
175+
176+
option(AL_PE_START_ON_DEMAND
177+
"Delay starting the progress engine until needed"
178+
ON)
179+
180+
set(AL_SYNC_MEM_PREALLOC 1024
181+
CACHE STRING
182+
"Amount of sync object memory to preallocate in the pool")
183+
184+
set(AL_DEFAULT_CACHE_LINE_SIZE 64) # x86_64
185+
if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^ppc")
186+
set(AL_DEFAULT_CACHE_LINE_SIZE 128) # power
187+
endif ()
188+
# TODO: I'd like to also detect A64FX but fugaku head nodes are x86
189+
# and cross-compiles are wonky and that's too much work. For now, just
190+
# manually set this on the command line on A64FX.
191+
set(AL_CACHE_LINE_SIZE ${AL_DEFAULT_CACHE_LINE_SIZE}
192+
CACHE STRING
193+
"Cache line size in bytes (x86: 64; POWER: 128; A64FX: 256)")
194+
195+
set(AL_DESTRUCTIVE_INTERFERENCE_SIZE 128
196+
CACHE STRING
197+
"Minimum size in bytes to avoid destructive interference")
198+
199+
set(AL_CUDA_STREAM_POOL_SIZE 5
200+
CACHE STRING
201+
"Number of CUDA streams in the default stream pool")
202+
203+
# END Tuning parameters
204+
147205
if (ALUMINUM_HAS_GPU
148206
AND NOT ALUMINUM_ENABLE_NCCL
149207
AND NOT ALUMINUM_ENABLE_MPI_CUDA
@@ -406,6 +464,10 @@ configure_file(
406464
"${CMAKE_SOURCE_DIR}/cmake/Al_config.hpp.in"
407465
"${CMAKE_BINARY_DIR}/Al_config.hpp" @ONLY)
408466

467+
configure_file(
468+
"${CMAKE_SOURCE_DIR}/cmake/tuning_params.hpp.in"
469+
"${CMAKE_BINARY_DIR}/aluminum/tuning_params.hpp" @ONLY)
470+
409471
# Macro for setting full paths to source files.
410472
macro(set_source_path VAR)
411473
unset(__tmp_names)
@@ -475,6 +537,9 @@ install(FILES
475537
DESTINATION ${CMAKE_INSTALL_DIR})
476538
install(FILES
477539
"${CMAKE_BINARY_DIR}/Al_config.hpp" DESTINATION ${INCLUDE_INSTALL_DIRS})
540+
install(FILES
541+
"${CMAKE_BINARY_DIR}/aluminum/tuning_params.hpp"
542+
DESTINATION ${INCLUDE_INSTALL_DIRS}/aluminum)
478543

479544
# Install the CMake modules we need
480545
install(FILES
cmake/tuning_params.hpp.in

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -33,21 +33,21 @@
3333
#pragma once
3434

3535
/** Number of concurrent operations the progress engine will perform. */
36-
#define AL_PE_NUM_CONCURRENT_OPS 4
36+
#define AL_PE_NUM_CONCURRENT_OPS @AL_PE_NUM_CONCURRENT_OPS@
3737
/** Max number of streams the progress engine supports. */
38-
#define AL_PE_NUM_STREAMS 64
38+
#define AL_PE_NUM_STREAMS @AL_PE_NUM_STREAMS@
3939
/** Max number of pipeline stages the progress engine supports. */
40-
#define AL_PE_NUM_PIPELINE_STAGES 2
40+
#define AL_PE_NUM_PIPELINE_STAGES @AL_PE_NUM_PIPELINE_STAGES@
4141
/** Max number of entries in each stream's input queue. */
42-
#define AL_PE_INPUT_QUEUE_SIZE 8192
42+
#define AL_PE_INPUT_QUEUE_SIZE @AL_PE_INPUT_QUEUE_SIZE@
4343
/**
4444
* Whether to have a default stream entry for the progress engine
4545
* added automatically.
4646
*
4747
* This makes sense when using MPI, but not so when using the
4848
* host-transfer backend, which does not use the default stream.
4949
*/
50-
// #define AL_PE_ADD_DEFAULT_STREAM 1
50+
#cmakedefine AL_PE_ADD_DEFAULT_STREAM
5151
/**
5252
* Whether to use a thread-local cache to map streams to input queues
5353
* for the progress engine.
@@ -56,25 +56,25 @@
5656
* is unlikely to help, since searching it will take as long as
5757
* searching the actual list.
5858
*/
59-
// #define AL_PE_STREAM_QUEUE_CACHE 1
59+
#cmakedefine AL_PE_STREAM_QUEUE_CACHE
6060

6161
/**
6262
* Whether to delay starting the progress engine until it is actually
6363
* needed. This results in a one-time penalty on the first call to an
6464
* operation that uses the progress engine, but only a quick check
6565
* thereafter.
6666
*/
67-
#define AL_PE_START_ON_DEMAND 1
67+
#cmakedefine AL_PE_START_ON_DEMAND
6868

6969
/** Amount of sync object memory to preallocate in the pool. */
70-
#define AL_SYNC_MEM_PREALLOC 1024
70+
#define AL_SYNC_MEM_PREALLOC @AL_SYNC_MEM_PREALLOC@
7171

7272
/**
7373
* Cache line size in bytes.
7474
*
7575
* On x86 this is usually 64. On POWER this is 128. On A64FX this is 256.
7676
*/
77-
#define AL_CACHE_LINE_SIZE 64
77+
#define AL_CACHE_LINE_SIZE @AL_CACHE_LINE_SIZE@
7878

7979
/**
8080
* Minimum size in bytes to avoid destructive interference.
@@ -83,7 +83,7 @@
8383
* be twice the cache line size, because Intel processors can fetch
8484
* two adjacent cache lines (see Intel Optimization Manual, 3.7.3).
8585
*/
86-
#define AL_DESTRUCTIVE_INTERFERENCE_SIZE 128
86+
#define AL_DESTRUCTIVE_INTERFERENCE_SIZE @AL_DESTRUCTIVE_INTERFERENCE_SIZE@
8787

8888
/** Number of CUDA streams in the default stream pool. */
89-
#define AL_CUDA_STREAM_POOL_SIZE 5
89+
#define AL_CUDA_STREAM_POOL_SIZE @AL_CUDA_STREAM_POOL_SIZE@

include/aluminum/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ set_source_path(THIS_DIR_HEADERS
99
progress.hpp
1010
state.hpp
1111
trace.hpp
12-
tuning_params.hpp
1312
)
1413
set_source_path(THIS_DIR_CUDA_HEADERS
1514
cuda.hpp

0 commit comments

Comments
 (0)