@@ -76,23 +76,31 @@ using random123_allocator = coreneuron::unified_allocator<coreneuron::nrnran123_
7676 * shutdown. If the destructor calls cudaFree and the CUDA runtime has already
7777 * been shut down then tools like cuda-memcheck reports errors.
7878 */
79+ #if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
80+ nrn_pragma_omp (declare target)
81+ philox4x32_key_t g_k{};
82+ nrn_pragma_omp (end declare target)
83+ #else
7984philox4x32_key_t * g_k{};
85+ #endif
8086
8187// In a GPU build we need a device-side global pointer to this global state.
8288// This is set to the same unified memory address as `g_k` in
8389// `setup_global_state()` if the GPU is enabled. It would be cleaner to use
8490// __managed__ here, but unfortunately that does not work on machines that do
8591// not have a GPU.
86- #ifdef __CUDACC__
92+ #if defined( __CUDACC__) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD)
8793CORENRN_DEVICE philox4x32_key_t * g_k_dev;
8894#endif
8995
9096OMP_Mutex g_instance_count_mutex;
97+
9198std::size_t g_instance_count{};
9299
93100constexpr double SHIFT32 = 1.0 / 4294967297.0 ; /* 1/(2^32 + 1) */
94101
95102void setup_global_state () {
103+ #if !defined(CORENEURON_PREFER_OPENMP_OFFLOAD)
96104 if (g_k) {
97105 // Already initialised, nothing to do
98106 return ;
@@ -107,15 +115,18 @@ void setup_global_state() {
107115 // there is no point initialising the device global to it.
108116 {
109117 auto const code = cudaMemcpyToSymbol (g_k_dev, &g_k, sizeof (g_k));
118+ static_cast <void >(code);
110119 assert (code == cudaSuccess);
111120 }
112121 // Make sure g_k_dev is updated.
113122 {
114123 auto const code = cudaDeviceSynchronize ();
124+ static_cast <void >(code);
115125 assert (code == cudaSuccess);
116126 }
117127 }
118128#endif
129+ #endif
119130}
120131
121132/* * @brief Get the Random123 global state from either host or device code.
@@ -125,13 +136,26 @@ CORENRN_HOST_DEVICE philox4x32_key_t& get_global_state() {
125136#ifdef __CUDA_ARCH__
126137 // Called from device code
127138 ret = g_k_dev;
139+ #else
140+ #if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
141+ ret = &g_k;
128142#else
129143 // Called from host code
130144 ret = g_k;
145+ #endif
131146#endif
132147 assert (ret);
133148 return *ret;
134149}
150+
151+ nrn_pragma_omp (declare target)
152+ /* * @brief Provide a helper function in global namespace that is declared target for OpenMP
153+ * offloading to function correctly with NVHPC
154+ */
155+ CORENRN_HOST_DEVICE philox4x32_ctr_t philox4x32_helper (coreneuron::nrnran123_State* s) {
156+ return philox4x32 (s->c , get_global_state ());
157+ }
158+ nrn_pragma_omp (end declare target)
135159} // namespace
136160
137161namespace coreneuron {
@@ -157,7 +181,7 @@ CORENRN_HOST_DEVICE void nrnran123_setseq(nrnran123_State* s, uint32_t seq, char
157181 s->which_ = which;
158182 }
159183 s->c .v [0 ] = seq;
160- s->r = philox4x32 (s-> c , get_global_state () );
184+ s->r = philox4x32_helper (s );
161185}
162186
163187CORENRN_HOST_DEVICE void nrnran123_getids (nrnran123_State* s, uint32_t * id1, uint32_t * id2) {
@@ -181,7 +205,7 @@ CORENRN_HOST_DEVICE uint32_t nrnran123_ipick(nrnran123_State* s) {
181205 if (which > 3 ) {
182206 which = 0 ;
183207 s->c .v [0 ]++;
184- s->r = philox4x32 (s-> c , get_global_state () );
208+ s->r = philox4x32_helper (s );
185209 }
186210 s->which_ = which;
187211 return rval;
0 commit comments