UWB-Biocomputing · AndrewBMadison · Mar 13, 2025 · Mar 14, 2025 · Apr 30, 2025 · Apr 30, 2025
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -48,6 +48,7 @@ if(ENABLE_CUDA)
         project(Graphitti LANGUAGES CXX CUDA C)
 #Verify CUDA package is present
         find_Package(CUDA REQUIRED)
+        include_directories(${CUDA_INCLUDE_DIRS})
 #Set the USE_GPU preprocessor macro so that GPU code will be compiled.
         add_compile_definitions(USE_GPU)
 #Specify the CUDA architecture / gencode that will be targeted
@@ -361,13 +362,15 @@ add_library(Matrix ${Matrix_Source})
 file(GLOB RNG_Source Simulator/Utils/RNG/*.cpp Simulator/Utils/RNG/*.h Simulator/Utils/RNG/*.cu)
 # Remove demo from file list as it contains a main and it will cause compilation errors
 list(REMOVE_ITEM RNG_Source "${CMAKE_CURRENT_SOURCE_DIR}/Simulator/Utils/RNG/MersenneTwister_demo.cu")
+if(NOT ENABLE_CUDA)
+        list(REMOVE_ITEM Utils_Source "${CMAKE_CURRENT_SOURCE_DIR}/Simulator/Utils/AsyncPhilox_d.cu")
+endif()
 add_library(RNG STATIC ${RNG_Source})
 
 
 # Create Utils library
 file(GLOB Utils_Source Simulator/Utils/*.cpp Simulator/Utils/*.h)
 list(REMOVE_ITEM Utils_Source "${CMAKE_CURRENT_SOURCE_DIR}/Simulator/Utils/Factory.cpp")
-
 if(CMAKE_BUILD_TYPE STREQUAL "Profiling")
         if(ENABLE_CUDA)
 # Find NVTX Library
@@ -390,9 +393,13 @@ add_library(Utils ${Utils_Source})
 
 # Only link NVTX if it was found
 if(NVTX_LIBRARY)
+
+        message(STATUS "Adding NVTX: to Utils lib")
         target_link_libraries(Utils PRIVATE ${NVTX_LIBRARY})
 endif()
 
+message(STATUS "Linking RNG against Utils")
+target_link_libraries(RNG PRIVATE Utils)
 
 # Used to locate and run other CMakeLists.txt files from Third Party resources for further compilation of the project.
 add_subdirectory(ThirdParty)

diff --git a/Simulator/Connections/Connections.cpp b/Simulator/Connections/Connections.cpp
@@ -91,7 +91,8 @@ bool Connections::updateConnections(AllVertices &vertices)
 #if defined(USE_GPU)
 void Connections::updateEdgesWeights(int numVertices, AllVertices &vertices, AllEdges &edges,
                                      AllVerticesDeviceProperties *allVerticesDevice,
-                                     AllEdgesDeviceProperties *allEdgesDevice, Layout &layout)
+                                     AllEdgesDeviceProperties *allEdgesDevice, Layout &layout,
+                                     cudaStream_t simulationStream)
 {
 }
 #else

diff --git a/Simulator/Connections/Connections.h b/Simulator/Connections/Connections.h
@@ -33,6 +33,11 @@
 // cereal
 #include <cereal/types/memory.hpp>
 
+#ifdef USE_GPU
+   #include <cuda_runtime.h>
+#endif
+
+
 using namespace std;
 
 class Connections {
@@ -85,9 +90,11 @@ class Connections {
    ///  @param  allVerticesDevice    GPU address of the allVertices struct on device memory.
    ///  @param  allEdgesDevice       GPU address of the allEdges struct on device memory.
    ///  @param  layout               Layout information of the graph network.
+   ///  @param  simulationStream     The cuda stream for all synchronous kernels.
    virtual void updateEdgesWeights(int numVertices, AllVertices &vertices, AllEdges &edges,
                                    AllVerticesDeviceProperties *allVerticesDevice,
-                                   AllEdgesDeviceProperties *allEdgesDevice, Layout &layout);
+                                   AllEdgesDeviceProperties *allEdgesDevice, Layout &layout,
+                                   cudaStream_t simulationStream);
 #else
 public:
    ///  Update the weight of the edges in the simulation.

diff --git a/Simulator/Connections/Neuro/ConnGrowth.h b/Simulator/Connections/Neuro/ConnGrowth.h
@@ -125,10 +125,12 @@ class ConnGrowth : public Connections {
    ///  @param  allVerticesDevice    GPU address of the AllVertices struct in device memory.
    ///  @param  allEdgesDevice       GPU address of the AllEdges struct in device memory.
    ///  @param  layout               The Layout object.
+   ///  @param  simulationStream     The cuda stream for all synchronous kernels.
    virtual void updateEdgesWeights(int numVertices, AllVertices &vertices, AllEdges &edges,
                                    AllVerticesDeviceProperties *allVerticesDevice,
-                                   AllEdgesDeviceProperties *allEdgesDevice,
-                                   Layout &layout) override;
+                                   AllEdgesDeviceProperties *allEdgesDevice, Layout &layout,
+                                   cudaStream_t simulationStream) override;
+
 #else
    ///  Update the weights of the Synapses in the simulation. To be clear,
    ///  iterates through all source and destination neurons and updates their

diff --git a/Simulator/Connections/Neuro/ConnGrowth_d.cpp b/Simulator/Connections/Neuro/ConnGrowth_d.cpp
@@ -26,10 +26,12 @@
  *  @param  allVerticesDevice   GPU address to the AllVertices struct in device memory.
  *  @param  allEdgesDevice      GPU address to the AllEdges struct in device memory.
  *  @param  layout              The Layout object.
+ *  @param  simulationStream    The cuda stream for all synchronous kernels.
  */
 void ConnGrowth::updateEdgesWeights(int numVertices, AllVertices &vertices, AllEdges &edges,
                                     AllVerticesDeviceProperties *allVerticesDevice,
-                                    AllEdgesDeviceProperties *allEdgesDevice, Layout &layout)
+                                    AllEdgesDeviceProperties *allEdgesDevice, Layout &layout,
+                                    cudaStream_t simulationStream)
 {
    Simulator &simulator = Simulator::getInstance();
    // For now, we just set the weights to equal the areas. We will later
@@ -64,7 +66,7 @@ void ConnGrowth::updateEdgesWeights(int numVertices, AllVertices &vertices, AllE
                            cudaMemcpyHostToDevice));
 
    blocksPerGrid = (simulator.getTotalVertices() + threadsPerBlock - 1) / threadsPerBlock;
-   updateSynapsesWeightsDevice<<<blocksPerGrid, threadsPerBlock>>>(
+   updateSynapsesWeightsDevice<<<blocksPerGrid, threadsPerBlock, 0, simulationStream>>>(
       simulator.getTotalVertices(), deltaT, W_d, simulator.getMaxEdgesPerVertex(),
       (AllSpikingNeuronsDeviceProperties *)allVerticesDevice,
       (AllSpikingSynapsesDeviceProperties *)allEdgesDevice, neuronTypeMapD);

diff --git a/Simulator/Core/GPUModel.cpp b/Simulator/Core/GPUModel.cpp
@@ -58,6 +58,12 @@ void GPUModel::allocDeviceStruct()
 
    // Allocate synapse inverse map in device memory
    allocEdgeIndexMap(numVertices);
+
+   // Create the CUDA stream used to launch synchronous GPU kernels during the simulation.
+   // This stream is passed to components like AllEdges and used consistently for kernel launches.
+   // For stream behavior and management, see:
+   // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html
+   HANDLE_ERROR(cudaStreamCreate(&simulationStream_));
 }
 
 /// Copies device memories to host memories and deallocates them.
@@ -74,7 +80,8 @@ void GPUModel::deleteDeviceStruct()
    HANDLE_ERROR(cudaFree(synapseIMapDevice.incomingEdgeCount_));
    HANDLE_ERROR(cudaFree(synapseIMapDevice.incomingEdgeIndexMap_));
    HANDLE_ERROR(cudaFree(edgeIndexMapDevice_));
-   HANDLE_ERROR(cudaFree(randNoise_d));
+   HANDLE_ERROR(cudaStreamDestroy(simulationStream_));
+   AsyncGenerator_.deleteDeviceStruct();
 }
 
 /// Sets up the Simulation.
@@ -88,14 +95,17 @@ void GPUModel::setupSim()
 
    //initialize Mersenne Twister
    //assuming numVertices >= 100 and is a multiple of 100. Note rng_mt_rng_count must be <= MT_RNG_COUNT
-   int rng_blocks = 25;   //# of blocks the kernel will use
-   int rng_nPerRng
-      = 4;   //# of iterations per thread (thread granularity, # of rands generated per thread)
-   int rng_mt_rng_count = Simulator::getInstance().getTotalVertices()
-                          / rng_nPerRng;   //# of threads to generate for numVertices rand #s
-   int rng_threads = rng_mt_rng_count / rng_blocks;   //# threads per block needed
-   initMTGPU(Simulator::getInstance().getNoiseRngSeed(), rng_blocks, rng_threads, rng_nPerRng,
-             rng_mt_rng_count);
+   // int rng_blocks = 25;   //# of blocks the kernel will use
+   // int rng_nPerRng
+   //    = 4;   //# of iterations per thread (thread granularity, # of rands generated per thread)
+   // int rng_mt_rng_count = Simulator::getInstance().getTotalVertices()
+   //                        / rng_nPerRng;   //# of threads to generate for numVertices rand #s
+   // int rng_threads = rng_mt_rng_count / rng_blocks;   //# threads per block needed
+   // initMTGPU(Simulator::getInstance().getNoiseRngSeed(), rng_blocks, rng_threads, rng_nPerRng,
+   //           rng_mt_rng_count);
+   //cout << "blocks, threads, nPerRng, rng_rng_count: " << rng_blocks << " " << rng_threads << " " << rng_nPerRng << " " << rng_mt_rng_count << endl;
+   AsyncGenerator_.loadAsyncPhilox(Simulator::getInstance().getTotalVertices(),
+                                   Simulator::getInstance().getNoiseRngSeed());
 
 #ifdef PERFORMANCE_METRICS
    cudaEventCreate(&start);
@@ -116,6 +126,9 @@ void GPUModel::setupSim()
 
    // set some parameters used for advanceEdgesDevice
    edges.setAdvanceEdgesDeviceParams();
+   AllVertices &vertices = layout_->getVertices();
+   vertices.SetStream(simulationStream_);
+   edges.SetStream(simulationStream_);
 }
 
 /// Performs any finalization tasks on network following a simulation.
@@ -158,7 +171,8 @@ void GPUModel::advance()
    // }
    cudaMemcpy(randNoise_d, randNoise_h.data(), verts * sizeof(float), cudaMemcpyHostToDevice);
 #else
-   normalMTGPU(randNoise_d);
+   // normalMTGPU(randNoise_d);
+   randNoise_d = AsyncGenerator_.requestSegment();
 #endif
 //LOG4CPLUS_DEBUG(vertexLogger_, "Index: " << index << " Vm: " << Vm);
 #ifdef PERFORMANCE_METRICS
@@ -238,7 +252,8 @@ void GPUModel::updateConnections()
    // Update Connections data
    if (connections_->updateConnections(vertices)) {
       connections_->updateEdgesWeights(Simulator::getInstance().getTotalVertices(), vertices, edges,
-                                       allVerticesDevice_, allEdgesDevice_, getLayout());
+                                       allVerticesDevice_, allEdgesDevice_, getLayout(),
+                                       simulationStream_);
       // create edge index map
       connections_->createEdgeIndexMap();
       // copy index map to the device memory

diff --git a/Simulator/Core/GPUModel.h b/Simulator/Core/GPUModel.h
@@ -25,7 +25,11 @@
 #include "AllSpikingNeurons.h"
 #include "AllSpikingSynapses.h"
 #include "AllVertices.h"
+#include "AsyncPhilox_d.h"
 #include "OperationManager.h"
+#ifdef USE_GPU
+   #include <cuda_runtime.h>
+#endif
 
 #ifdef VALIDATION_MODE
    #include <fstream>
@@ -115,6 +119,15 @@ class GPUModel : public Model {
    /// Pointer to device random noise array.
    float *randNoise_d;
 
+   /// Async RNG class instance used to load generator with a seed and request noise device pointers.
+   AsyncPhilox_d AsyncGenerator_;
+#ifdef VALIDATION_MODE
+   /// Buffer used in the validation mode to copy cgraphitti's noise generation into the device noise buffer.
+   float *randNoise_h;
+#endif
+   /// Cuda Stream for kernel use
+   cudaStream_t simulationStream_;
+
 #if defined(USE_GPU)
    /// Pointer to edge index map in device memory.
    EdgeIndexMapDevice *edgeIndexMapDevice_;
@@ -144,10 +157,10 @@ class GPUModel : public Model {
                    Coordinate dest, BGFLOAT deltaT, edgeType type);
 };
 
-#if defined(__CUDACC__)
-extern "C" {
-void normalMTGPU(float *randNoise_d);
-void initMTGPU(unsigned int seed, unsigned int blocks, unsigned int threads, unsigned int nPerRng,
-               unsigned int mt_rng_count);
-}
-#endif
+// #if defined(__CUDACC__)
+// extern "C" {
+// void normalMTGPU(float *randNoise_d);
+// void initMTGPU(unsigned int seed, unsigned int blocks, unsigned int threads, unsigned int nPerRng,
+//                unsigned int mt_rng_count);
+// }
+// #endif
diff --git a/Simulator/Core/Serializer.cpp b/Simulator/Core/Serializer.cpp
@@ -25,7 +25,9 @@
 
 #include "Serializer.h"
 #include "ConnGrowth.h"
-#include "GPUModel.h"
+#if defined(USE_GPU)
+   #include "GPUModel.h"
+#endif
 #include <fstream>
 
 // About CEREAL_XML_STRING_VALUE

diff --git a/Simulator/Core/Simulator.cpp b/Simulator/Core/Simulator.cpp
@@ -9,7 +9,9 @@
 
 #include "Simulator.h"
 #include "CPUModel.h"
-#include "GPUModel.h"
+#if defined(USE_GPU)
+   #include "GPUModel.h"
+#endif
 #include "OperationManager.h"
 #include "ParameterManager.h"
 #include "Utils/Factory.h"
@@ -173,7 +175,7 @@ void Simulator::simulate()
       double total_time = timer.lap() / 1000000.0;
 
       cout << "\ntotal_time: " << total_time << " seconds" << endl;
-      printPerformanceMetrics(total_time, currentEpoch);
+      printPerformanceMetrics(total_time, currentEpoch_);
       cout << endl;
 #endif
    }

diff --git a/Simulator/Edges/AllEdges.cpp b/Simulator/Edges/AllEdges.cpp
@@ -239,6 +239,20 @@ void AllEdges::createEdgeIndexMap(EdgeIndexMap &edgeIndexMap)
    }
 }
 
+#if defined(USE_GPU)
+/// Set the CUDA stream to be used by GPU edge kernels in derived classes.
+///
+/// This assigns a CUDA stream to the base class, allowing subclasses
+/// (e.g., AllSpikingSynapses_d, AllSTDPSynapses_d) to launch kernels on
+/// the correct stream. The stream is typically created by GPUModel and
+/// passed down during simulation setup.
+///
+/// @param simulationStream A valid CUDA stream (`cudaStream_t`) managed by the caller.
+void AllEdges::SetStream(cudaStream_t simulationStream)
+{
+   simulationStream_ = simulationStream;
+}
+#endif
 
 #if !defined(USE_GPU)
 

diff --git a/Simulator/Edges/AllEdges.h b/Simulator/Edges/AllEdges.h
@@ -15,6 +15,9 @@
 #include <vector>
 // cereal
 #include "cereal/types/vector.hpp"
+#ifdef USE_GPU
+   #include <cuda_runtime.h>
+#endif
 
 class AllVertices;
 struct AllEdgesDeviceProperties;
@@ -92,7 +95,20 @@ class AllEdges {
    log4cplus::Logger edgeLogger_;
 
 #if defined(USE_GPU)
+   ///  Cuda Stream for Edge Kernels
+   cudaStream_t simulationStream_;
+
 public:
+   /// Set the CUDA stream to be used by GPU edge kernels in derived classes.
+   ///
+   /// This assigns a CUDA stream to the base class, allowing subclasses
+   /// (e.g., AllSpikingSynapses_d, AllSTDPSynapses_d) to launch kernels on
+   /// the correct stream. The stream is typically created by GPUModel and
+   /// passed down during simulation setup.
+   ///
+   /// @param simulationStream A valid CUDA stream (`cudaStream_t`) managed by the caller.
+   void SetStream(cudaStream_t simulationStream);
+
    ///  Allocate GPU memories to store all edges' states,
    ///  and copy them from host to GPU memory.
    virtual void allocEdgeDeviceStruct() = 0;

diff --git a/Simulator/Edges/Neuro/AllSTDPSynapses_d.cpp b/Simulator/Edges/Neuro/AllSTDPSynapses_d.cpp
@@ -264,7 +264,7 @@ void AllSTDPSynapses::advanceEdges(void *allEdgesDevice, void *allVerticesDevice
    const int threadsPerBlock = 256;
    int blocksPerGrid = (totalEdgeCount_ + threadsPerBlock - 1) / threadsPerBlock;
    // Advance synapses ------------->
-   advanceSTDPSynapsesDevice<<<blocksPerGrid, threadsPerBlock>>>(
+   advanceSTDPSynapsesDevice<<<blocksPerGrid, threadsPerBlock, 0, simulationStream_>>>(
       totalEdgeCount_, (EdgeIndexMapDevice *)edgeIndexMapDevice, g_simulationStep,
       Simulator::getInstance().getDeltaT(), (AllSTDPSynapsesDeviceProperties *)allEdgesDevice,
       (AllSpikingNeuronsDeviceProperties *)allVerticesDevice, maxSpikes);

diff --git a/Simulator/Edges/Neuro/AllSpikingSynapses_d.cpp b/Simulator/Edges/Neuro/AllSpikingSynapses_d.cpp
@@ -331,7 +331,7 @@ void AllSpikingSynapses::advanceEdges(void *allEdgesDevice, void *allVerticesDev
    const int threadsPerBlock = 256;
    int blocksPerGrid = (totalEdgeCount_ + threadsPerBlock - 1) / threadsPerBlock;
    // Advance synapses ------------->
-   advanceSpikingSynapsesDevice<<<blocksPerGrid, threadsPerBlock>>>(
+   advanceSpikingSynapsesDevice<<<blocksPerGrid, threadsPerBlock, 0, simulationStream_>>>(
       totalEdgeCount_, (EdgeIndexMapDevice *)edgeIndexMapDevice, g_simulationStep,
       Simulator::getInstance().getDeltaT(), (AllSpikingSynapsesDeviceProperties *)allEdgesDevice);
 }

diff --git a/Simulator/Utils/Book.h b/Simulator/Utils/Book.h
@@ -22,6 +22,7 @@
 
 #pragma once
 
+#include <cuda_runtime.h>
 #include <stdio.h>
 //! CUDA Exception handler
 static void HandleError(cudaError_t err, const char *file, int line)