Skip to content

Commit eeedec7

Browse files
added environment variables (#653)
which enable configuring QuEST's execution after compilation, before QuEST environment initialisation, solving some of the issues lamented in #645 and generally being more sensible/convenient. It also patched an esoteric bug in the parsing of floating-point numbers, affecting functions like initInlinePauliStrSum(). Refactor included: - adding (basic) utilities for parsing environment variables. - changing PERMIT_NODES_TO_SHARE_GPU and DEFAULT_VALIDATION_EPSILON from macros to environment variables. The latter empowers users to disable all numerically-sensitive validation without modifying or recompiling their code. - patching the parsing of non-quadruple-precision floats which would previously see numbers beyond the qcomp-range silently overflow or underflow instead of throwing an error (see commit a66f797). - inserting whitespace into the cmake error message about MacOS multithreading to make the advised commands clearer. A subsequent commit will refactor some unit-testing macros to non-QuEST-managed environment variables.
1 parent f28691c commit eeedec7

22 files changed

+652
-167
lines changed

.github/workflows/test_paid.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -257,12 +257,15 @@ jobs:
257257
-DCMAKE_CUDA_ARCHITECTURES=${{ env.cuda_arch }}
258258
-DTEST_ALL_DEPLOYMENTS=${{ env.test_all_deploys }}
259259
-DTEST_NUM_MIXED_DEPLOYMENT_REPETITIONS=${{ env.test_repetitions }}
260-
-DPERMIT_NODES_TO_SHARE_GPU=${{ env.mpi_share_gpu }}
261260
-DCMAKE_CXX_FLAGS=${{ matrix.mpi == 'ON' && matrix.cuda == 'ON' && '-fno-lto' || '' }}
262261
263262
- name: Compile
264263
run: cmake --build ${{ env.build_dir }} --parallel
265264

265+
# permit use of single GPU by multiple MPI processes (degrades performance)
266+
- name: Set env-var to permit GPU sharing
267+
run: echo "PERMIT_NODES_TO_SHARE_GPU=${{ env.mpi_share_gpu }}" >> $GITHUB_ENV
268+
266269
# cannot use ctests when distributed, grr!
267270
- name: Run GPU + distributed v4 mixed tests (4 nodes sharing 1 GPU)
268271
run: |

CMakeLists.txt

Lines changed: 1 addition & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -231,15 +231,6 @@ if ((ENABLE_CUDA OR ENABLE_HIP) AND FLOAT_PRECISION STREQUAL 4)
231231
message(FATAL_ERROR "Quad precision is not supported on GPU. Please disable GPU acceleration or lower precision.")
232232
endif()
233233

234-
option(
235-
PERMIT_NODES_TO_SHARE_GPU
236-
"Whether to permit multiple distributed nodes to share a single GPU at the detriment of performance. Turned OFF by default."
237-
OFF
238-
)
239-
if (ENABLE_DISTRIBUTION AND (ENABLE_CUDA OR ENABLE_HIP))
240-
message(STATUS "Permitting nodes to share GPUs is turned ${PERMIT_NODES_TO_SHARE_GPU}. Set PERMIT_NODES_TO_SHARE_GPU to modify.")
241-
endif()
242-
243234
# Deprecated API
244235
option(
245236
ENABLE_DEPRECATED_API
@@ -318,7 +309,7 @@ if (ENABLE_MULTITHREADING)
318309
if (NOT OpenMP_FOUND)
319310
set(ErrorMsg "Could not find OpenMP, necessary for enabling multithreading.")
320311
if (APPLE AND CMAKE_CXX_COMPILER_ID MATCHES "Clang")
321-
string(APPEND ErrorMsg " Try first calling `brew install libomp` then `export OpenMP_ROOT=$(brew --prefix)/opt/libomp`")
312+
string(APPEND ErrorMsg " Try first calling \n\tbrew install libomp\nthen\n\texport OpenMP_ROOT=$(brew --prefix)/opt/libomp")
322313
endif()
323314
message(FATAL_ERROR ${ErrorMsg})
324315
endif()
@@ -434,14 +425,6 @@ else()
434425
endif()
435426

436427

437-
if (ENABLE_DISTRIBUTION AND (ENABLE_CUDA OR ENABLE_HIP))
438-
target_compile_definitions(
439-
QuEST PRIVATE
440-
PERMIT_NODES_TO_SHARE_GPU=$<IF:$<BOOL:${PERMIT_NODES_TO_SHARE_GPU}>,1,0>
441-
)
442-
endif()
443-
444-
445428
# add math library
446429
if (NOT MSVC)
447430
target_link_libraries(QuEST PRIVATE ${MATH_LIBRARY})

docs/launch.md

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,18 +22,19 @@ Launching your [compiled](compile.md) QuEST application can be as straightforwar
2222
> - <a href="#launch_tests">Tests</a>
2323
> * <a href="#launch_v4">v4</a>
2424
> * <a href="#launch_v3">v3</a>
25+
> - <a href="#launch_configuring">Configuring</a>
2526
> - <a href="#launch_multithreading">Multithreading</a>
2627
> * <a href="#launch_choosing-threads">Choosing threads</a>
2728
> * <a href="#launch_monitoring-utilisation">Monitoring utilisation</a>
2829
> * <a href="#launch_improving-performance">Improving performance</a>
2930
> - <a href="#launch_gpu-acceleration">GPU-acceleration</a>
3031
> * <a href="#launch_launching">Launching</a>
3132
> * <a href="#launch_monitoring">Monitoring</a>
32-
> * <a href="#launch_configuring">Configuring</a>
33+
> * <a href="#launch_configuring-1">Configuring</a>
3334
> * <a href="#launch_benchmarking">Benchmarking</a>
3435
> - <a href="#launch_distribution">Distribution</a>
3536
> * <a href="#launch_launching-1">Launching</a>
36-
> * <a href="#launch_configuring-1">Configuring</a>
37+
> * <a href="#launch_configuring-2">Configuring</a>
3738
> * <a href="#launch_benchmarking-1">Benchmarking</a>
3839
> - <a href="#launch_multi-gpu">Multi-GPU</a>
3940
> - <a href="#launch_supercomputers">Supercomputers</a>
@@ -243,6 +244,21 @@ ctest
243244

244245

245246

247+
---------------------
248+
249+
<!-- permit doxygen to reference section -->
250+
<a id="launch_configuring"></a>
251+
252+
## Configuring
253+
254+
QuEST execution can be configured prior to runtime using the below [environment variables](https://en.wikipedia.org/wiki/Environment_variable).
255+
256+
- [`PERMIT_NODES_TO_SHARE_GPU`](https://quest-kit.github.io/QuEST/group__modes.html#ga7e12922138caa68ddaa6221e40f62dda)
257+
- [`DEFAULT_VALIDATION_EPSILON`](https://quest-kit.github.io/QuEST/group__modes.html#ga55810d6f3d23de810cd9b12a2bbb8cc2)
258+
259+
260+
261+
246262
---------------------
247263

248264

@@ -429,7 +445,7 @@ Usage of GPU-acceleration can be (inadvisably) forced using [`createForcedQureg(
429445

430446

431447
<!-- permit doxygen to reference section -->
432-
<a id="launch_configuring"></a>
448+
<a id="launch_configuring-1"></a>
433449

434450
### Configuring
435451

@@ -514,7 +530,7 @@ mpirun -np 1024 --oversubscribe ./mytests
514530
515531

516532
<!-- permit doxygen to reference section -->
517-
<a id="launch_configuring-1"></a>
533+
<a id="launch_configuring-2"></a>
518534

519535
### Configuring
520536

quest/include/environment.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ typedef struct {
4040
// deployment modes which cannot be directly changed after compilation
4141
int isCuQuantumEnabled;
4242

43+
// deployment configurations which can be changed via environment variables
44+
int isGpuSharingEnabled;
45+
4346
// distributed configuration
4447
int rank;
4548
int numNodes;

quest/include/modes.h

Lines changed: 67 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -56,10 +56,6 @@
5656

5757
// define optional-macro defaults (mostly to list them)
5858

59-
#ifndef PERMIT_NODES_TO_SHARE_GPU
60-
#define PERMIT_NODES_TO_SHARE_GPU 0
61-
#endif
62-
6359
#ifndef INCLUDE_DEPRECATED_FUNCTIONS
6460
#define INCLUDE_DEPRECATED_FUNCTIONS 0
6561
#endif
@@ -74,11 +70,6 @@
7470
#if 0
7571

7672

77-
/// @notyetdoced
78-
/// @macrodoc
79-
const int PERMIT_NODES_TO_SHARE_GPU = 0;
80-
81-
8273
/// @notyetdoced
8374
/// @macrodoc
8475
const int INCLUDE_DEPRECATED_FUNCTIONS = 0;
@@ -93,6 +84,73 @@
9384

9485

9586

87+
// document environment variables
88+
89+
// spoof env-vars as consts to doc (hackily and hopefully temporarily)
90+
#if 0
91+
92+
93+
/** @envvardoc
94+
*
95+
* Specifies whether to permit multiple MPI processes to deploy to the same GPU.
96+
*
97+
* @attention
98+
* This environment variable has no effect when either (or both) of distribution or
99+
* GPU-acceleration are disabled.
100+
*
101+
* In multi-GPU execution, which combines distribution with GPU-acceleration, it is
102+
* prudent to assign each GPU to at most one MPI process in order to avoid superfluous
103+
* slowdown. Hence by default, initQuESTEnv() will forbid assigning multiple MPI processes
104+
* to the same GPU. This environment variable can be set to `1` to disable this validation,
105+
* permitting sharing of a single GPU, as is often useful for debugging or unit testing
106+
* (for example, testing multi-GPU execution when only a single GPU is available).
107+
*
108+
* @warning
109+
* Permitting GPU sharing may cause unintended behaviour when additionally using cuQuantum.
110+
*
111+
* @envvarvalues
112+
* - forbid sharing: @p 0, @p '0', @p '', @p , (unspecified)
113+
* - permit sharing: @p 1, @p '1'
114+
*
115+
* @author Tyson Jones
116+
*/
117+
const int PERMIT_NODES_TO_SHARE_GPU = 0;
118+
119+
120+
/** @envvardoc
121+
*
122+
* Specifies the default validation epsilon.
123+
*
124+
* Specifying `DEFAULT_VALIDATION_EPSILON` to a positive, real number overrides the
125+
* precision-specific default (`1E-5`, `1E-12`, `1E-13` for single, double and quadruple
126+
* precision respectively). The specified epsilon is used by QuEST for numerical validation
127+
* unless overridden at runtime via setValidationEpsilon(), in which case it can be
128+
* restored to that specified by this environment variable using setValidationEpsilonToDefault().
129+
*
130+
* @envvarvalues
131+
* - setting @p DEFAULT_VALIDATION_EPSILON=0 disables numerical validation, as if the value
132+
* were instead infinity.
133+
* - setting @p DEFAULT_VALIDATION_EPSILON='' is equivalent to _not_ specifying the variable,
134+
* adopting instead the precision-specific default above.
135+
* - setting @p DEFAULT_VALIDATION_EPSILON=x where `x` is a positive, valid `qreal` in any
136+
* format accepted by `C` or `C++` (e.g. `0.01`, `1E-2`, `+1e-2`) will use `x` as the
137+
* default validation epsilon.
138+
*
139+
* @constraints
140+
* The function initQuESTEnv() will throw a validation error if any of the below are violated:
141+
* - The specified epsilon must be `0` or positive.
142+
* - The specified epsilon must not exceed the maximum or minimum value which can be stored
143+
* in a `qreal`, which is specific to its precision.
144+
*
145+
* @author Tyson Jones
146+
*/
147+
const qreal DEFAULT_VALIDATION_EPSILON = 0;
148+
149+
150+
#endif
151+
152+
153+
96154
// user flags for choosing automatic deployment; only accessible by C++
97155
// backend and C++ users; C users must hardcode -1
98156

quest/include/precision.h

Lines changed: 8 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -121,34 +121,19 @@
121121

122122

123123
/*
124-
* RE-CONFIGURABLE DEFAULT VALIDATION PRECISION
124+
* DEFAULT VALIDATION PRECISION
125125
*
126-
* which is compile-time overridable by pre-defining DEFAULT_VALIDATION_EPSILON (e.g.
127-
* in user code before importing QuEST, or passed as a preprocessor constant by the
128-
* compiler using argument -D), and runtime overridable using setValidationEpsilon()
126+
* which is pre-run-time overridable by specifying the corresponding environment variable.
129127
*/
130128

131-
#ifndef DEFAULT_VALIDATION_EPSILON
132-
133-
#if FLOAT_PRECISION == 1
134-
#define DEFAULT_VALIDATION_EPSILON 1E-5
135-
136-
#elif FLOAT_PRECISION == 2
137-
#define DEFAULT_VALIDATION_EPSILON 1E-12
138-
139-
#elif FLOAT_PRECISION == 4
140-
#define DEFAULT_VALIDATION_EPSILON 1E-13
141-
142-
#endif
143-
144-
#endif
129+
#if FLOAT_PRECISION == 1
130+
#define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-5
145131

146-
// spoofing above macros as typedefs and consts to doc
147-
#if 0
132+
#elif FLOAT_PRECISION == 2
133+
#define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-12
148134

149-
/// @notyetdoced
150-
/// @macrodoc
151-
const qreal DEFAULT_VALIDATION_EPSILON = 1E-12;
135+
#elif FLOAT_PRECISION == 4
136+
#define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-13
152137

153138
#endif
154139

quest/src/api/environment.cpp

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@
1111

1212
#include "quest/src/core/errors.hpp"
1313
#include "quest/src/core/memory.hpp"
14+
#include "quest/src/core/parser.hpp"
1415
#include "quest/src/core/printer.hpp"
16+
#include "quest/src/core/envvars.hpp"
1517
#include "quest/src/core/autodeployer.hpp"
1618
#include "quest/src/core/validation.hpp"
1719
#include "quest/src/core/randomiser.hpp"
@@ -75,6 +77,9 @@ void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMulti
7577
// this leads to undefined behaviour in distributed mode, as per the MPI
7678
validate_envNeverInit(globalEnvPtr != nullptr, hasEnvBeenFinalized, caller);
7779

80+
envvars_validateAndLoadEnvVars(caller);
81+
validateconfig_setEpsilonToDefault();
82+
7883
// ensure the chosen deployment is compiled and supported by hardware.
7984
// note that these error messages will be printed by every node because
8085
// validation occurs before comm_init() below, so all processes spawned
@@ -102,12 +107,17 @@ void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMulti
102107
if (useGpuAccel)
103108
gpu_bindLocalGPUsToNodes();
104109

105-
// each MPI process must use a unique GPU. This is critical when
106-
// initializing cuQuantum, so we don't re-init cuStateVec on any
107-
// paticular GPU (causing runtime error), but still ensures we
108-
// keep good performance in our custom backend GPU code; there is
109-
// no reason to use multi-nodes-per-GPU except for dev/debugging.
110-
if (useGpuAccel && useDistrib && ! PERMIT_NODES_TO_SHARE_GPU)
110+
// consult environment variable to decide whether to allow GPU sharing
111+
// (default = false) which informs whether below validation is triggered
112+
bool permitGpuSharing = envvars_getWhetherGpuSharingIsPermitted();
113+
114+
// each MPI process should ordinarily use a unique GPU. This is
115+
// critical when initializing cuQuantum so that we don't re-init
116+
// cuStateVec on any particular GPU (which can apparently cause a
117+
// so-far-unwitnessed runtime error), but is otherwise essential
118+
// for good performance. GPU sharing is useful for unit testing,
119+
// however, since it permits a single GPU to test CUDA+MPI deployment
120+
if (useGpuAccel && useDistrib && ! permitGpuSharing)
111121
validate_newEnvNodesEachHaveUniqueGpu(caller);
112122

113123
/// @todo
@@ -132,10 +142,11 @@ void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMulti
132142
error_allocOfQuESTEnvFailed();
133143

134144
// bind deployment info to global instance
135-
globalEnvPtr->isMultithreaded = useMultithread;
136-
globalEnvPtr->isGpuAccelerated = useGpuAccel;
137-
globalEnvPtr->isDistributed = useDistrib;
138-
globalEnvPtr->isCuQuantumEnabled = useCuQuantum;
145+
globalEnvPtr->isMultithreaded = useMultithread;
146+
globalEnvPtr->isGpuAccelerated = useGpuAccel;
147+
globalEnvPtr->isDistributed = useDistrib;
148+
globalEnvPtr->isCuQuantumEnabled = useCuQuantum;
149+
globalEnvPtr->isGpuSharingEnabled = permitGpuSharing;
139150

140151
// bind distributed info
141152
globalEnvPtr->rank = (useDistrib)? comm_getRank() : 0;
@@ -188,10 +199,11 @@ void printDeploymentInfo() {
188199

189200
print_table(
190201
"deployment", {
191-
{"isMpiEnabled", globalEnvPtr->isDistributed},
192-
{"isGpuEnabled", globalEnvPtr->isGpuAccelerated},
193-
{"isOmpEnabled", globalEnvPtr->isMultithreaded},
194-
{"isCuQuantumEnabled", globalEnvPtr->isCuQuantumEnabled},
202+
{"isMpiEnabled", globalEnvPtr->isDistributed},
203+
{"isGpuEnabled", globalEnvPtr->isGpuAccelerated},
204+
{"isOmpEnabled", globalEnvPtr->isMultithreaded},
205+
{"isCuQuantumEnabled", globalEnvPtr->isCuQuantumEnabled},
206+
{"isGpuSharingEnabled", globalEnvPtr->isGpuSharingEnabled},
195207
});
196208
}
197209

quest/src/comm/comm_routines.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,12 @@ using std::vector;
7676
*
7777
* - look into UCX CUDA multi-rail:
7878
* https://docs.nvidia.com/networking/display/hpcxv215/unified+communication+-+x+framework+library#src-119764120_UnifiedCommunicationXFrameworkLibrary-Multi-RailMulti-Rail
79+
*
80+
* - by default, we validate to prevent sharing a GPU between multiple MPI processes since it is
81+
* easy to do unintentionally yet is rarely necessary (outside of unit testing) and can severely
82+
* degrade performance. If we motivated a strong non-testing use-case for this however, we could
83+
* improve performance through use of CUDA's Multi-Process Service (MPS) which will prevent
84+
* serialisation of memcpy to distinct memory partitions and improve kernel scheduling.
7985
*/
8086

8187

quest/src/core/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ target_sources(QuEST
44
PRIVATE
55
accelerator.cpp
66
autodeployer.cpp
7+
envvars.cpp
78
errors.cpp
89
localiser.cpp
910
memory.cpp

0 commit comments

Comments
 (0)