QuEST-Kit
diff --git a/‎.github/workflows/test_paid.yml
Lines changed: 4 additions & 1 deletion b/‎.github/workflows/test_paid.yml
Lines changed: 4 additions & 1 deletion
diff --git a/‎CMakeLists.txt
Lines changed: 1 addition & 18 deletions b/‎CMakeLists.txt
Lines changed: 1 addition & 18 deletions
diff --git a/‎docs/launch.md
Lines changed: 20 additions & 4 deletions b/‎docs/launch.md
Lines changed: 20 additions & 4 deletions
diff --git a/‎quest/include/environment.h
Lines changed: 3 additions & 0 deletions b/‎quest/include/environment.h
Lines changed: 3 additions & 0 deletions
diff --git a/‎quest/include/modes.h
Lines changed: 67 additions & 9 deletions b/‎quest/include/modes.h
Lines changed: 67 additions & 9 deletions
diff --git a/‎quest/include/precision.h
Lines changed: 8 additions & 23 deletions b/‎quest/include/precision.h
Lines changed: 8 additions & 23 deletions
diff --git a/‎quest/src/api/environment.cpp
Lines changed: 26 additions & 14 deletions b/‎quest/src/api/environment.cpp
Lines changed: 26 additions & 14 deletions
diff --git a/‎quest/src/comm/comm_routines.cpp
Lines changed: 6 additions & 0 deletions b/‎quest/src/comm/comm_routines.cpp
Lines changed: 6 additions & 0 deletions
diff --git a/‎quest/src/core/CMakeLists.txt
Lines changed: 1 addition & 0 deletions b/‎quest/src/core/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
@@ -257,12 +257,15 @@ jobs:
           -DCMAKE_CUDA_ARCHITECTURES=${{ env.cuda_arch }}
           -DTEST_ALL_DEPLOYMENTS=${{ env.test_all_deploys }}
           -DTEST_NUM_MIXED_DEPLOYMENT_REPETITIONS=${{ env.test_repetitions }}
-          -DPERMIT_NODES_TO_SHARE_GPU=${{ env.mpi_share_gpu }}
           -DCMAKE_CXX_FLAGS=${{ matrix.mpi == 'ON' && matrix.cuda == 'ON' && '-fno-lto' || '' }}
 
       - name: Compile
         run: cmake --build ${{ env.build_dir }} --parallel
 
+      # permit use of a single GPU by multiple MPI processes (degrades performance)
+      - name: Set env-var to permit GPU sharing
+        run: echo "PERMIT_NODES_TO_SHARE_GPU=${{ env.mpi_share_gpu }}" >> $GITHUB_ENV
+
       # cannot use ctests when distributed, grr!
       - name: Run GPU + distributed v4 mixed tests (4 nodes sharing 1 GPU)
         run: |
 
@@ -231,15 +231,6 @@ if ((ENABLE_CUDA OR ENABLE_HIP) AND FLOAT_PRECISION STREQUAL 4)
   message(FATAL_ERROR "Quad precision is not supported on GPU. Please disable GPU acceleration or lower precision.")
 endif()
 
-option(
-  PERMIT_NODES_TO_SHARE_GPU
-  "Whether to permit multiple distributed nodes to share a single GPU at the detriment of performance. Turned OFF by default."
-  OFF
-)
-if (ENABLE_DISTRIBUTION AND (ENABLE_CUDA OR ENABLE_HIP))
-  message(STATUS "Permitting nodes to share GPUs is turned ${PERMIT_NODES_TO_SHARE_GPU}. Set PERMIT_NODES_TO_SHARE_GPU to modify.")
-endif()
-
 # Deprecated API
 option(
   ENABLE_DEPRECATED_API
@@ -318,7 +309,7 @@ if (ENABLE_MULTITHREADING)
   if (NOT OpenMP_FOUND)
     set(ErrorMsg "Could not find OpenMP, necessary for enabling multithreading.")
     if (APPLE AND CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-      string(APPEND ErrorMsg " Try first calling `brew install libomp` then `export OpenMP_ROOT=$(brew --prefix)/opt/libomp`")
+      string(APPEND ErrorMsg " Try first calling \n\tbrew install libomp\nthen\n\texport OpenMP_ROOT=$(brew --prefix)/opt/libomp")
     endif()
     message(FATAL_ERROR ${ErrorMsg})
   endif()
@@ -434,14 +425,6 @@ else()
 endif()
 
 
-if (ENABLE_DISTRIBUTION AND (ENABLE_CUDA OR ENABLE_HIP))
-  target_compile_definitions(
-    QuEST PRIVATE 
-    PERMIT_NODES_TO_SHARE_GPU=$<IF:$<BOOL:${PERMIT_NODES_TO_SHARE_GPU}>,1,0>
-  )
-endif()
-
-
 # add math library
 if (NOT MSVC)
   target_link_libraries(QuEST PRIVATE ${MATH_LIBRARY})
 
@@ -22,18 +22,19 @@ Launching your [compiled](compile.md) QuEST application can be as straightforwar
 > - <a href="#launch_tests">Tests</a>
 >    * <a href="#launch_v4">v4</a>
 >    * <a href="#launch_v3">v3</a>
+> - <a href="#launch_configuring">Configuring</a>
 > - <a href="#launch_multithreading">Multithreading</a>
 >    * <a href="#launch_choosing-threads">Choosing threads</a>
 >    * <a href="#launch_monitoring-utilisation">Monitoring utilisation</a>
 >    * <a href="#launch_improving-performance">Improving performance</a>
 > - <a href="#launch_gpu-acceleration">GPU-acceleration</a>
 >    * <a href="#launch_launching">Launching</a>
 >    * <a href="#launch_monitoring">Monitoring</a>
->    * <a href="#launch_configuring">Configuring</a>
+>    * <a href="#launch_configuring-1">Configuring</a>
 >    * <a href="#launch_benchmarking">Benchmarking</a>
 > - <a href="#launch_distribution">Distribution</a>
 >    * <a href="#launch_launching-1">Launching</a>
->    * <a href="#launch_configuring-1">Configuring</a>
+>    * <a href="#launch_configuring-2">Configuring</a>
 >    * <a href="#launch_benchmarking-1">Benchmarking</a>
 > - <a href="#launch_multi-gpu">Multi-GPU</a>
 > - <a href="#launch_supercomputers">Supercomputers</a>
@@ -243,6 +244,21 @@ ctest
 
 
 
+---------------------
+
+<!-- permit doxygen to reference section -->
+<a id="launch_configuring"></a>
+
+## Configuring
+
+QuEST execution can be configured prior to runtime using the below [environment variables](https://en.wikipedia.org/wiki/Environment_variable).
+
+- [`PERMIT_NODES_TO_SHARE_GPU`](https://quest-kit.github.io/QuEST/group__modes.html#ga7e12922138caa68ddaa6221e40f62dda)
+- [`DEFAULT_VALIDATION_EPSILON`](https://quest-kit.github.io/QuEST/group__modes.html#ga55810d6f3d23de810cd9b12a2bbb8cc2)
+
+
+
+
 ---------------------
 
 
@@ -429,7 +445,7 @@ Usage of GPU-acceleration can be (inadvisably) forced using [`createForcedQureg(
 
 
 <!-- permit doxygen to reference section -->
-<a id="launch_configuring"></a>
+<a id="launch_configuring-1"></a>
 
 ### Configuring
 
@@ -514,7 +530,7 @@ mpirun -np 1024 --oversubscribe ./mytests
 
 
 <!-- permit doxygen to reference section -->
-<a id="launch_configuring-1"></a>
+<a id="launch_configuring-2"></a>
 
 ### Configuring
 
 
@@ -40,6 +40,9 @@ typedef struct {
     // deployment modes which cannot be directly changed after compilation
     int isCuQuantumEnabled;
 
+    // deployment configurations which can be changed via environment variables
+    int isGpuSharingEnabled;
+
     // distributed configuration
     int rank;
     int numNodes;
 
@@ -56,10 +56,6 @@
 
 // define optional-macro defaults (mostly to list them)
 
-#ifndef PERMIT_NODES_TO_SHARE_GPU
-#define PERMIT_NODES_TO_SHARE_GPU 0
-#endif
-
 #ifndef INCLUDE_DEPRECATED_FUNCTIONS
 #define INCLUDE_DEPRECATED_FUNCTIONS 0
 #endif
@@ -74,11 +70,6 @@
 #if 0
 
 
-    /// @notyetdoced
-    /// @macrodoc
-    const int PERMIT_NODES_TO_SHARE_GPU = 0;
-
-
     /// @notyetdoced
     /// @macrodoc
     const int INCLUDE_DEPRECATED_FUNCTIONS = 0;
@@ -93,6 +84,73 @@
 
 
 
+// document environment variables
+
+// spoof env-vars as consts to doc (hackily and hopefully temporarily)
+#if 0
+
+
+    /** @envvardoc
+     * 
+     * Specifies whether to permit multiple MPI processes to deploy to the same GPU.
+     * 
+     * @attention 
+     * This environment variable has no effect when either (or both) of distribution or 
+     * GPU-acceleration are disabled.
+     * 
+     * In multi-GPU execution, which combines distribution with GPU-acceleration, it is 
+     * prudent to assign each GPU to at most one MPI process in order to avoid superfluous 
+     * slowdown. Hence by default, initQuESTEnv() will forbid assigning multiple MPI processes 
+     * to the same GPU. This environment variable can be set to `1` to disable this validation, 
+     * permitting sharing of a single GPU, as is often useful for debugging or unit testing 
+     * (for example, testing multi-GPU execution when only a single GPU is available).
+     * 
+     * @warning
+     * Permitting GPU sharing may cause unintended behaviour when additionally using cuQuantum.
+     * 
+     * @envvarvalues
+     *  - forbid sharing: @p 0, @p '0', @p '', @p , (unspecified)
+     *  - permit sharing: @p 1, @p '1'
+     * 
+     * @author Tyson Jones
+     */
+    const int PERMIT_NODES_TO_SHARE_GPU = 0;
+
+
+    /** @envvardoc
+     * 
+     * Specifies the default validation epsilon. 
+     * 
+     * Specifying `DEFAULT_VALIDATION_EPSILON` to a positive, real number overrides the 
+     * precision-specific default (`1E-5`, `1E-12`, `1E-13` for single, double and quadruple 
+     * precision respectively). The specified epsilon is used by QuEST for numerical validation
+     * unless overridden at runtime via setValidationEpsilon(), in which case it can be
+     * restored to that specified by this environment variable using setValidationEpsilonToDefault().
+     * 
+     * @envvarvalues
+     *  - setting @p DEFAULT_VALIDATION_EPSILON=0 disables numerical validation, as if the value
+     *    were instead infinity.
+     *  - setting @p DEFAULT_VALIDATION_EPSILON='' is equivalent to _not_ specifying the variable,
+     *    adopting instead the precision-specific default above.
+     *  - setting @p DEFAULT_VALIDATION_EPSILON=x where `x` is a positive, valid `qreal` in any
+     *    format accepted by `C` or `C++` (e.g. `0.01`, `1E-2`, `+1e-2`) will use `x` as the
+     *    default validation epsilon.
+     * 
+     * @constraints
+     * The function initQuESTEnv() will throw a validation error unless:
+     *   - The specified epsilon is `0` or positive.
+     *   - The specified epsilon does not exceed the maximum or minimum value which can be stored
+     *     in a `qreal`, which is specific to its precision.
+     * 
+     * @author Tyson Jones
+     */
+    const qreal DEFAULT_VALIDATION_EPSILON = 0;
+
+
+#endif
+
+
+
 // user flags for choosing automatic deployment; only accessible by C++ 
 // backend and C++ users; C users must hardcode -1 
 
 
@@ -121,34 +121,19 @@
 
 
 /*
- * RE-CONFIGURABLE DEFAULT VALIDATION PRECISION
+ * DEFAULT VALIDATION PRECISION
  *
- * which is compile-time overridable by pre-defining DEFAULT_VALIDATION_EPSILON (e.g. 
- * in user code before importing QuEST, or passed as a preprocessor constant by the
- * compiler using argument -D), and runtime overridable using setValidationEpsilon()
+ * which is pre-run-time overridable by specifying the corresponding environment variable.
  */
 
-#ifndef DEFAULT_VALIDATION_EPSILON
-
-    #if FLOAT_PRECISION == 1
-        #define DEFAULT_VALIDATION_EPSILON 1E-5
-
-    #elif FLOAT_PRECISION == 2
-        #define DEFAULT_VALIDATION_EPSILON 1E-12
-
-    #elif FLOAT_PRECISION == 4
-        #define DEFAULT_VALIDATION_EPSILON 1E-13
-
-    #endif
-
-#endif
+#if FLOAT_PRECISION == 1
+    #define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-5
 
-// spoofing above macros as typedefs and consts to doc
-#if 0
+#elif FLOAT_PRECISION == 2
+    #define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-12
 
-    /// @notyetdoced
-    /// @macrodoc
-    const qreal DEFAULT_VALIDATION_EPSILON = 1E-12;
+#elif FLOAT_PRECISION == 4
+    #define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-13
 
 #endif
 
 
@@ -11,7 +11,9 @@
 
 #include "quest/src/core/errors.hpp"
 #include "quest/src/core/memory.hpp"
+#include "quest/src/core/parser.hpp"
 #include "quest/src/core/printer.hpp"
+#include "quest/src/core/envvars.hpp"
 #include "quest/src/core/autodeployer.hpp"
 #include "quest/src/core/validation.hpp"
 #include "quest/src/core/randomiser.hpp"
@@ -75,6 +77,9 @@ void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMulti
     // this leads to undefined behaviour in distributed mode, as per the MPI
     validate_envNeverInit(globalEnvPtr != nullptr, hasEnvBeenFinalized, caller);
 
+    envvars_validateAndLoadEnvVars(caller);
+    validateconfig_setEpsilonToDefault();
+
     // ensure the chosen deployment is compiled and supported by hardware.
     // note that these error messages will be printed by every node because
     // validation occurs before comm_init() below, so all processes spawned
@@ -102,12 +107,17 @@ void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMulti
     if (useGpuAccel)
         gpu_bindLocalGPUsToNodes();
 
-    // each MPI process must use a unique GPU. This is critical when
-    // initializing cuQuantum, so we don't re-init cuStateVec on any
-    // paticular GPU (causing runtime error), but still ensures we 
-    // keep good performance in our custom backend GPU code; there is
-    // no reason to use multi-nodes-per-GPU except for dev/debugging.
-    if (useGpuAccel && useDistrib && ! PERMIT_NODES_TO_SHARE_GPU)
+    // consult environment variable to decide whether to allow GPU sharing 
+    // (default = false) which informs whether below validation is triggered
+    bool permitGpuSharing = envvars_getWhetherGpuSharingIsPermitted();
+
+    // each MPI process should ordinarily use a unique GPU. This is 
+    // critical when initializing cuQuantum so that we don't re-init 
+    // cuStateVec on any particular GPU (which can apparently cause a
+    // so-far-unwitnessed runtime error), but is otherwise essential
+    // for good performance. GPU sharing is however useful for unit
+    // testing, since it permits a single GPU to test CUDA+MPI deployment
+    if (useGpuAccel && useDistrib && ! permitGpuSharing)
         validate_newEnvNodesEachHaveUniqueGpu(caller);
 
     /// @todo
@@ -132,10 +142,11 @@ void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMulti
         error_allocOfQuESTEnvFailed();
 
     // bind deployment info to global instance
-    globalEnvPtr->isMultithreaded    = useMultithread;
-    globalEnvPtr->isGpuAccelerated   = useGpuAccel;
-    globalEnvPtr->isDistributed      = useDistrib;
-    globalEnvPtr->isCuQuantumEnabled = useCuQuantum;
+    globalEnvPtr->isMultithreaded     = useMultithread;
+    globalEnvPtr->isGpuAccelerated    = useGpuAccel;
+    globalEnvPtr->isDistributed       = useDistrib;
+    globalEnvPtr->isCuQuantumEnabled  = useCuQuantum;
+    globalEnvPtr->isGpuSharingEnabled = permitGpuSharing;
 
     // bind distributed info
     globalEnvPtr->rank     = (useDistrib)? comm_getRank()     : 0;
@@ -188,10 +199,11 @@ void printDeploymentInfo() {
 
     print_table(
         "deployment", {
-        {"isMpiEnabled",       globalEnvPtr->isDistributed},
-        {"isGpuEnabled",       globalEnvPtr->isGpuAccelerated},
-        {"isOmpEnabled",       globalEnvPtr->isMultithreaded},
-        {"isCuQuantumEnabled", globalEnvPtr->isCuQuantumEnabled},
+        {"isMpiEnabled",        globalEnvPtr->isDistributed},
+        {"isGpuEnabled",        globalEnvPtr->isGpuAccelerated},
+        {"isOmpEnabled",        globalEnvPtr->isMultithreaded},
+        {"isCuQuantumEnabled",  globalEnvPtr->isCuQuantumEnabled},
+        {"isGpuSharingEnabled", globalEnvPtr->isGpuSharingEnabled},
     });
 }
 
 
@@ -76,6 +76,12 @@ using std::vector;
  * 
  * - look into UCX CUDA multi-rail:
  *   https://docs.nvidia.com/networking/display/hpcxv215/unified+communication+-+x+framework+library#src-119764120_UnifiedCommunicationXFrameworkLibrary-Multi-RailMulti-Rail 
+ * 
+ * - by default, we validate to prevent sharing a GPU between multiple MPI processes since it is
+ *   easy to do unintentionally yet is rarely necessary (outside of unit testing) and can severely 
+ *   degrade performance. If a strong non-testing use-case were to emerge, however, we could
+ *   improve performance through use of CUDA's Multi-Process Service (MPS), which will prevent
+ *   serialisation of memcpy to distinct memory partitions and improve kernel scheduling.  
  */
 
 
 
@@ -4,6 +4,7 @@ target_sources(QuEST
   PRIVATE
   accelerator.cpp
   autodeployer.cpp
+  envvars.cpp
   errors.cpp
   localiser.cpp
   memory.cpp