diff --git a/CMakeLists.txt b/CMakeLists.txt index f30e63b58..62c8ecf1b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,6 +48,7 @@ if(ENABLE_CUDA) project(Graphitti LANGUAGES CXX CUDA C) #Verify CUDA package is present find_Package(CUDA REQUIRED) + include_directories(${CUDA_INCLUDE_DIRS}) #Set the USE_GPU preprocessor macro so that GPU code will be compiled. add_compile_definitions(USE_GPU) #Specify the CUDA architecture / gencode that will be targeted @@ -361,13 +362,15 @@ add_library(Matrix ${Matrix_Source}) file(GLOB RNG_Source Simulator/Utils/RNG/*.cpp Simulator/Utils/RNG/*.h Simulator/Utils/RNG/*.cu) # Remove demo from file list as it contains a main and it will cause compilation errors list(REMOVE_ITEM RNG_Source "${CMAKE_CURRENT_SOURCE_DIR}/Simulator/Utils/RNG/MersenneTwister_demo.cu") +if(NOT ENABLE_CUDA) + list(REMOVE_ITEM Utils_Source "${CMAKE_CURRENT_SOURCE_DIR}/Simulator/Utils/AsyncPhilox_d.cu") +endif() add_library(RNG STATIC ${RNG_Source}) # Create Utils library file(GLOB Utils_Source Simulator/Utils/*.cpp Simulator/Utils/*.h) list(REMOVE_ITEM Utils_Source "${CMAKE_CURRENT_SOURCE_DIR}/Simulator/Utils/Factory.cpp") - if(CMAKE_BUILD_TYPE STREQUAL "Profiling") if(ENABLE_CUDA) # Find NVTX Library @@ -390,9 +393,13 @@ add_library(Utils ${Utils_Source}) # Only link NVTX if it was found if(NVTX_LIBRARY) + + message(STATUS "Adding NVTX: to Utils lib") target_link_libraries(Utils PRIVATE ${NVTX_LIBRARY}) endif() +message(STATUS "Linking RNG against Utils") +target_link_libraries(RNG PRIVATE Utils) # Used to locate and run other CMakeLists.txt files from Third Party resources for further compilation of the project. add_subdirectory(ThirdParty) diff --git a/Simulator/Connections/Connections.cpp b/Simulator/Connections/Connections.cpp index a8a63fc0e..37f4803e8 100644 --- a/Simulator/Connections/Connections.cpp +++ b/Simulator/Connections/Connections.cpp @@ -91,7 +91,8 @@ bool Connections::updateConnections(AllVertices &vertices) #if defined(USE_GPU) void Connections::updateEdgesWeights(int numVertices, AllVertices &vertices, AllEdges &edges, AllVerticesDeviceProperties *allVerticesDevice, - AllEdgesDeviceProperties *allEdgesDevice, Layout &layout) + AllEdgesDeviceProperties *allEdgesDevice, Layout &layout, + cudaStream_t simulationStream) { } #else diff --git a/Simulator/Connections/Connections.h b/Simulator/Connections/Connections.h index d8bcc8596..99f2694e0 100644 --- a/Simulator/Connections/Connections.h +++ b/Simulator/Connections/Connections.h @@ -33,6 +33,11 @@ // cereal #include +#ifdef USE_GPU + #include +#endif + + using namespace std; class Connections { @@ -85,9 +90,11 @@ class Connections { /// @param allVerticesDevice GPU address of the allVertices struct on device memory. /// @param allEdgesDevice GPU address of the allEdges struct on device memory. /// @param layout Layout information of the graph network. + /// @param simulationStream The cuda stream for all synchronous kernels. virtual void updateEdgesWeights(int numVertices, AllVertices &vertices, AllEdges &edges, AllVerticesDeviceProperties *allVerticesDevice, - AllEdgesDeviceProperties *allEdgesDevice, Layout &layout); + AllEdgesDeviceProperties *allEdgesDevice, Layout &layout, + cudaStream_t simulationStream); #else public: /// Update the weight of the edges in the simulation. diff --git a/Simulator/Connections/Neuro/ConnGrowth.h b/Simulator/Connections/Neuro/ConnGrowth.h index e6a88078c..399a2665c 100644 --- a/Simulator/Connections/Neuro/ConnGrowth.h +++ b/Simulator/Connections/Neuro/ConnGrowth.h @@ -125,10 +125,12 @@ class ConnGrowth : public Connections { /// @param allVerticesDevice GPU address of the AllVertices struct in device memory. /// @param allEdgesDevice GPU address of the AllEdges struct in device memory. /// @param layout The Layout object. + /// @param simulationStream The cuda stream for all synchronous kernels. virtual void updateEdgesWeights(int numVertices, AllVertices &vertices, AllEdges &edges, AllVerticesDeviceProperties *allVerticesDevice, - AllEdgesDeviceProperties *allEdgesDevice, - Layout &layout) override; + AllEdgesDeviceProperties *allEdgesDevice, Layout &layout, + cudaStream_t simulationStream) override; + #else /// Update the weights of the Synapses in the simulation. To be clear, /// iterates through all source and destination neurons and updates their diff --git a/Simulator/Connections/Neuro/ConnGrowth_d.cpp b/Simulator/Connections/Neuro/ConnGrowth_d.cpp index 148834fea..19a6d9395 100644 --- a/Simulator/Connections/Neuro/ConnGrowth_d.cpp +++ b/Simulator/Connections/Neuro/ConnGrowth_d.cpp @@ -26,10 +26,12 @@ * @param allVerticesDevice GPU address to the AllVertices struct in device memory. * @param allEdgesDevice GPU address to the AllEdges struct in device memory. * @param layout The Layout object. + * @param simulationStream The cuda stream for all synchronous kernels. */ void ConnGrowth::updateEdgesWeights(int numVertices, AllVertices &vertices, AllEdges &edges, AllVerticesDeviceProperties *allVerticesDevice, - AllEdgesDeviceProperties *allEdgesDevice, Layout &layout) + AllEdgesDeviceProperties *allEdgesDevice, Layout &layout, + cudaStream_t simulationStream) { Simulator &simulator = Simulator::getInstance(); // For now, we just set the weights to equal the areas. We will later @@ -64,7 +66,7 @@ void ConnGrowth::updateEdgesWeights(int numVertices, AllVertices &vertices, AllE cudaMemcpyHostToDevice)); blocksPerGrid = (simulator.getTotalVertices() + threadsPerBlock - 1) / threadsPerBlock; - updateSynapsesWeightsDevice<<>>( + updateSynapsesWeightsDevice<<>>( simulator.getTotalVertices(), deltaT, W_d, simulator.getMaxEdgesPerVertex(), (AllSpikingNeuronsDeviceProperties *)allVerticesDevice, (AllSpikingSynapsesDeviceProperties *)allEdgesDevice, neuronTypeMapD); diff --git a/Simulator/Core/GPUModel.cpp b/Simulator/Core/GPUModel.cpp index 97453ac1c..a421c0193 100644 --- a/Simulator/Core/GPUModel.cpp +++ b/Simulator/Core/GPUModel.cpp @@ -58,6 +58,12 @@ void GPUModel::allocDeviceStruct() // Allocate synapse inverse map in device memory allocEdgeIndexMap(numVertices); + + // Create the CUDA stream used to launch synchronous GPU kernels during the simulation. + // This stream is passed to components like AllEdges and used consistently for kernel launches. + // For stream behavior and management, see: + // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html + HANDLE_ERROR(cudaStreamCreate(&simulationStream_)); } /// Copies device memories to host memories and deallocates them. @@ -74,7 +80,8 @@ void GPUModel::deleteDeviceStruct() HANDLE_ERROR(cudaFree(synapseIMapDevice.incomingEdgeCount_)); HANDLE_ERROR(cudaFree(synapseIMapDevice.incomingEdgeIndexMap_)); HANDLE_ERROR(cudaFree(edgeIndexMapDevice_)); - HANDLE_ERROR(cudaFree(randNoise_d)); + HANDLE_ERROR(cudaStreamDestroy(simulationStream_)); + AsyncGenerator_.deleteDeviceStruct(); } /// Sets up the Simulation. @@ -88,14 +95,17 @@ void GPUModel::setupSim() //initialize Mersenne Twister //assuming numVertices >= 100 and is a multiple of 100. Note rng_mt_rng_count must be <= MT_RNG_COUNT - int rng_blocks = 25; //# of blocks the kernel will use - int rng_nPerRng - = 4; //# of iterations per thread (thread granularity, # of rands generated per thread) - int rng_mt_rng_count = Simulator::getInstance().getTotalVertices() - / rng_nPerRng; //# of threads to generate for numVertices rand #s - int rng_threads = rng_mt_rng_count / rng_blocks; //# threads per block needed - initMTGPU(Simulator::getInstance().getNoiseRngSeed(), rng_blocks, rng_threads, rng_nPerRng, - rng_mt_rng_count); + // int rng_blocks = 25; //# of blocks the kernel will use + // int rng_nPerRng + // = 4; //# of iterations per thread (thread granularity, # of rands generated per thread) + // int rng_mt_rng_count = Simulator::getInstance().getTotalVertices() + // / rng_nPerRng; //# of threads to generate for numVertices rand #s + // int rng_threads = rng_mt_rng_count / rng_blocks; //# threads per block needed + // initMTGPU(Simulator::getInstance().getNoiseRngSeed(), rng_blocks, rng_threads, rng_nPerRng, + // rng_mt_rng_count); + //cout << "blocks, threads, nPerRng, rng_rng_count: " << rng_blocks << " " << rng_threads << " " << rng_nPerRng << " " << rng_mt_rng_count << endl; + AsyncGenerator_.loadAsyncPhilox(Simulator::getInstance().getTotalVertices(), + Simulator::getInstance().getNoiseRngSeed()); #ifdef PERFORMANCE_METRICS cudaEventCreate(&start); @@ -116,6 +126,9 @@ void GPUModel::setupSim() // set some parameters used for advanceEdgesDevice edges.setAdvanceEdgesDeviceParams(); + AllVertices &vertices = layout_->getVertices(); + vertices.SetStream(simulationStream_); + edges.SetStream(simulationStream_); } /// Performs any finalization tasks on network following a simulation. @@ -158,7 +171,8 @@ void GPUModel::advance() // } cudaMemcpy(randNoise_d, randNoise_h.data(), verts * sizeof(float), cudaMemcpyHostToDevice); #else - normalMTGPU(randNoise_d); + // normalMTGPU(randNoise_d); + randNoise_d = AsyncGenerator_.requestSegment(); #endif //LOG4CPLUS_DEBUG(vertexLogger_, "Index: " << index << " Vm: " << Vm); #ifdef PERFORMANCE_METRICS @@ -238,7 +252,8 @@ void GPUModel::updateConnections() // Update Connections data if (connections_->updateConnections(vertices)) { connections_->updateEdgesWeights(Simulator::getInstance().getTotalVertices(), vertices, edges, - allVerticesDevice_, allEdgesDevice_, getLayout()); + allVerticesDevice_, allEdgesDevice_, getLayout(), + simulationStream_); // create edge index map connections_->createEdgeIndexMap(); // copy index map to the device memory diff --git a/Simulator/Core/GPUModel.h b/Simulator/Core/GPUModel.h index 0a9562e40..4dd20ca70 100644 --- a/Simulator/Core/GPUModel.h +++ b/Simulator/Core/GPUModel.h @@ -25,7 +25,11 @@ #include "AllSpikingNeurons.h" #include "AllSpikingSynapses.h" #include "AllVertices.h" +#include "AsyncPhilox_d.h" #include "OperationManager.h" +#ifdef USE_GPU + #include +#endif #ifdef VALIDATION_MODE #include @@ -115,6 +119,15 @@ class GPUModel : public Model { /// Pointer to device random noise array. float *randNoise_d; + /// Async RNG class instance used to load generator with a seed and request noise device pointers. + AsyncPhilox_d AsyncGenerator_; +#ifdef VALIDATION_MODE + /// Buffer used in the validation mode to copy cgraphitti's noise generation into the device noise buffer. + float *randNoise_h; +#endif + /// Cuda Stream for kernel use + cudaStream_t simulationStream_; + #if defined(USE_GPU) /// Pointer to edge index map in device memory. EdgeIndexMapDevice *edgeIndexMapDevice_; @@ -144,10 +157,10 @@ class GPUModel : public Model { Coordinate dest, BGFLOAT deltaT, edgeType type); }; -#if defined(__CUDACC__) -extern "C" { -void normalMTGPU(float *randNoise_d); -void initMTGPU(unsigned int seed, unsigned int blocks, unsigned int threads, unsigned int nPerRng, - unsigned int mt_rng_count); -} -#endif \ No newline at end of file +// #if defined(__CUDACC__) +// extern "C" { +// void normalMTGPU(float *randNoise_d); +// void initMTGPU(unsigned int seed, unsigned int blocks, unsigned int threads, unsigned int nPerRng, +// unsigned int mt_rng_count); +// } +// #endif \ No newline at end of file diff --git a/Simulator/Core/Serializer.cpp b/Simulator/Core/Serializer.cpp index 2911ffab8..d1b2c0f7f 100644 --- a/Simulator/Core/Serializer.cpp +++ b/Simulator/Core/Serializer.cpp @@ -25,7 +25,9 @@ #include "Serializer.h" #include "ConnGrowth.h" -#include "GPUModel.h" +#if defined(USE_GPU) + #include "GPUModel.h" +#endif #include // About CEREAL_XML_STRING_VALUE diff --git a/Simulator/Core/Simulator.cpp b/Simulator/Core/Simulator.cpp index e4a42d9df..593cd5c48 100644 --- a/Simulator/Core/Simulator.cpp +++ b/Simulator/Core/Simulator.cpp @@ -9,7 +9,9 @@ #include "Simulator.h" #include "CPUModel.h" -#include "GPUModel.h" +#if defined(USE_GPU) + #include "GPUModel.h" +#endif #include "OperationManager.h" #include "ParameterManager.h" #include "Utils/Factory.h" @@ -173,7 +175,7 @@ void Simulator::simulate() double total_time = timer.lap() / 1000000.0; cout << "\ntotal_time: " << total_time << " seconds" << endl; - printPerformanceMetrics(total_time, currentEpoch); + printPerformanceMetrics(total_time, currentEpoch_); cout << endl; #endif } diff --git a/Simulator/Edges/AllEdges.cpp b/Simulator/Edges/AllEdges.cpp index 802e4c183..27dd694f7 100644 --- a/Simulator/Edges/AllEdges.cpp +++ b/Simulator/Edges/AllEdges.cpp @@ -239,6 +239,20 @@ void AllEdges::createEdgeIndexMap(EdgeIndexMap &edgeIndexMap) } } +#if defined(USE_GPU) +/// Set the CUDA stream to be used by GPU edge kernels in derived classes. +/// +/// This assigns a CUDA stream to the base class, allowing subclasses +/// (e.g., AllSpikingSynapses_d, AllSTDPSynapses_d) to launch kernels on +/// the correct stream. The stream is typically created by GPUModel and +/// passed down during simulation setup. +/// +/// @param simulationStream A valid CUDA stream (`cudaStream_t`) managed by the caller. +void AllEdges::SetStream(cudaStream_t simulationStream) +{ + simulationStream_ = simulationStream; +} +#endif #if !defined(USE_GPU) diff --git a/Simulator/Edges/AllEdges.h b/Simulator/Edges/AllEdges.h index 69f9e8f7c..cdf0c2b6f 100644 --- a/Simulator/Edges/AllEdges.h +++ b/Simulator/Edges/AllEdges.h @@ -15,6 +15,9 @@ #include // cereal #include "cereal/types/vector.hpp" +#ifdef USE_GPU + #include +#endif class AllVertices; struct AllEdgesDeviceProperties; @@ -92,7 +95,20 @@ class AllEdges { log4cplus::Logger edgeLogger_; #if defined(USE_GPU) + /// Cuda Stream for Edge Kernels + cudaStream_t simulationStream_; + public: + /// Set the CUDA stream to be used by GPU edge kernels in derived classes. + /// + /// This assigns a CUDA stream to the base class, allowing subclasses + /// (e.g., AllSpikingSynapses_d, AllSTDPSynapses_d) to launch kernels on + /// the correct stream. The stream is typically created by GPUModel and + /// passed down during simulation setup. + /// + /// @param simulationStream A valid CUDA stream (`cudaStream_t`) managed by the caller. + void SetStream(cudaStream_t simulationStream); + /// Allocate GPU memories to store all edges' states, /// and copy them from host to GPU memory. virtual void allocEdgeDeviceStruct() = 0; diff --git a/Simulator/Edges/Neuro/AllSTDPSynapses_d.cpp b/Simulator/Edges/Neuro/AllSTDPSynapses_d.cpp index 2d91d1f11..a7c3804fc 100644 --- a/Simulator/Edges/Neuro/AllSTDPSynapses_d.cpp +++ b/Simulator/Edges/Neuro/AllSTDPSynapses_d.cpp @@ -264,7 +264,7 @@ void AllSTDPSynapses::advanceEdges(void *allEdgesDevice, void *allVerticesDevice const int threadsPerBlock = 256; int blocksPerGrid = (totalEdgeCount_ + threadsPerBlock - 1) / threadsPerBlock; // Advance synapses -------------> - advanceSTDPSynapsesDevice<<>>( + advanceSTDPSynapsesDevice<<>>( totalEdgeCount_, (EdgeIndexMapDevice *)edgeIndexMapDevice, g_simulationStep, Simulator::getInstance().getDeltaT(), (AllSTDPSynapsesDeviceProperties *)allEdgesDevice, (AllSpikingNeuronsDeviceProperties *)allVerticesDevice, maxSpikes); diff --git a/Simulator/Edges/Neuro/AllSpikingSynapses_d.cpp b/Simulator/Edges/Neuro/AllSpikingSynapses_d.cpp index e6bb33be0..a6954cf82 100644 --- a/Simulator/Edges/Neuro/AllSpikingSynapses_d.cpp +++ b/Simulator/Edges/Neuro/AllSpikingSynapses_d.cpp @@ -331,7 +331,7 @@ void AllSpikingSynapses::advanceEdges(void *allEdgesDevice, void *allVerticesDev const int threadsPerBlock = 256; int blocksPerGrid = (totalEdgeCount_ + threadsPerBlock - 1) / threadsPerBlock; // Advance synapses -------------> - advanceSpikingSynapsesDevice<<>>( + advanceSpikingSynapsesDevice<<>>( totalEdgeCount_, (EdgeIndexMapDevice *)edgeIndexMapDevice, g_simulationStep, Simulator::getInstance().getDeltaT(), (AllSpikingSynapsesDeviceProperties *)allEdgesDevice); } diff --git a/Simulator/Utils/Book.h b/Simulator/Utils/Book.h index f71b1e952..c0dbb6919 100644 --- a/Simulator/Utils/Book.h +++ b/Simulator/Utils/Book.h @@ -22,6 +22,7 @@ #pragma once +#include #include //! CUDA Exception handler static void HandleError(cudaError_t err, const char *file, int line) diff --git a/Simulator/Utils/RNG/AsyncPhilox_d.cu b/Simulator/Utils/RNG/AsyncPhilox_d.cu new file mode 100644 index 000000000..a9b61c2fb --- /dev/null +++ b/Simulator/Utils/RNG/AsyncPhilox_d.cu @@ -0,0 +1,183 @@ +/** + * @file AsyncPhilox_d.cu + * + * @ingroup Simulator/Utils/RNG + * + * @brief Asynchronous Philox RNG using curand to fill GPU buffers + * + * AsyncPhilox_d class maintains two large GPU buffers for noise. + * GPUModel calls loadAsyncPhilox to initialize states and + * fill the buffers, then, each advance requestSegment + * returns a float* slice of a buffer for use in + * advanceVertices + */ + +#include "AsyncPhilox_d.h" +#include "NvtxHelper.h" +#include +#include +#include + +/// @brief Kernel to generate Gaussian (normal) random numbers using Philox. +/// +/// Each thread loads its own Philox RNG state, generates one or more +/// random floats using a strided loop, and writes them into the output buffer. +/// The updated RNG state is written back to global device memory. +/// +/// @param states Array of Philox RNG states, one per thread. +/// @param output Output buffer for generated random floats. +/// @param bufferSize Total number of floats to generate (length of output buffer). +__global__ void generatePhilox(curandStatePhilox4_32_10_t *states, float *output, int bufferSize) +{ + // Compute a unique global index for this thread + int threadId = threadIdx.x; + int blockId = blockIdx.x; + int threadsPerBlock = blockDim.x; + int totalThreads = gridDim.x * threadsPerBlock; + int gid = blockId * threadsPerBlock + threadId; + + // Load this thread’s Philox state + curandStatePhilox4_32_10_t local = states[gid]; + + // Stride‐loop: write one random per iteration until we cover bufferSize + for (int idx = gid; idx < bufferSize; idx += totalThreads) { + output[idx] = curand_normal(&local); + } + + // Store back the updated state + states[gid] = local; +} + +/// @brief Kernel to initialize Philox RNG states for each thread. +/// +/// Each thread initializes its entry in the RNG state array using a fixed seed. +/// This is typically called once before generating random numbers. +/// +/// @param states Array to hold initialized Philox RNG states. +/// @param seed Seed value used to initialize curand. +/// @param totalThreads Total number of threads that will use RNG states. +__global__ void initPhilox(curandStatePhilox4_32_10_t *states, unsigned long seed, int totalThreads) +{ + int gid = blockIdx.x * blockDim.x + threadIdx.x; + if (gid >= totalThreads) + return; + curand_init(seed, gid, 0, &states[gid]); +} + +/// Initializes generator and allocates device memory +/// @param samplesPerSegment Number of total vertices +/// @param seed RNG seed. +void AsyncPhilox_d::loadAsyncPhilox(int samplesPerSegment, unsigned long seed) +{ + // hostBuffer = nullptr; + // cudaHostAlloc(&hostBuffer, samplesPerSegment * sizeof(float), cudaHostAllocDefault); + // logfile = std::fopen("philox_output_32_10.bin", "wb"); + //consoleLogger_ = log4cplus::Logger::getInstance(LOG4CPLUS_TEXT("console")); + segmentSize_ = samplesPerSegment; + seed_ = seed; + currentBuffer_ = 0; + segmentIndex_ = 0; + + totalSegments_ = 10; + +#ifdef ENABLE_NVTX + nvtxMarker_ = 10000 / totalSegments; // make a marker every nvtxMarker buffer fills; + nvtxCurrentMarker_ = nvtxMarker_; // count down to color flip +#endif + bufferSize_ = segmentSize_ * totalSegments_; + numBlocks_ = 64; //placeholder num of blocks + numThreads_ = 64; + + totalThreads_ = numThreads_ * numBlocks_; + + int leastPriority, greatestPriority; + HANDLE_ERROR(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority)); + // └─ leastPriority is the numerically largest value → lowest actual priority + // └─ greatestPriority is the numerically smallest value → highest actual priority + + // Create internal stream + HANDLE_ERROR(cudaStreamCreateWithPriority(&RNG_stream_, cudaStreamNonBlocking, leastPriority)); + + // Allocate two large buffers + HANDLE_ERROR(cudaMalloc(&buffers_d[0], bufferSize_ * sizeof(float))); + HANDLE_ERROR(cudaMalloc(&buffers_d[1], bufferSize_ * sizeof(float))); + + HANDLE_ERROR(cudaMalloc(&spStates_d, totalThreads_ * sizeof(curandStatePhilox4_32_10_t))); + + initPhilox<<>>(spStates_d, seed_, totalThreads_); + + // Pre-fill both buffers + fillBuffer(0); + fillBuffer(1); + HANDLE_ERROR(cudaStreamSynchronize( + RNG_stream_)); //wait for both buffers to be filled before the first request +} + +/// Free device memory +void AsyncPhilox_d::deleteDeviceStruct() +{ + // std::fclose(logfile); + // cudaFree(hostBuffer); + HANDLE_ERROR(cudaFree(buffers_d[0])); + HANDLE_ERROR(cudaFree(buffers_d[1])); + HANDLE_ERROR(cudaFree(spStates_d)); + + HANDLE_ERROR(cudaStreamDestroy(RNG_stream_)); +} + +AsyncPhilox_d::~AsyncPhilox_d() +{ +} + +/// Request a new segment of generated noise. +/// @return Pointer to a slice of device memory containing noise. +float *AsyncPhilox_d::requestSegment() +{ + //LOG4CPLUS_TRACE(consoleLogger_, "request segment"); + //auto start = std::chrono::high_resolution_clock::now(); +#ifdef ENABLE_NVTX + static bool flipColor; +#endif + if (segmentIndex_ >= totalSegments_) { + // Switch buffer and launch async refill on the now-unused one + +#ifdef ENABLE_NVTX + if (nvtxCurrentMarker_ <= 0) { + nvtxPop(); + if (flipColor == true) + nvtxPushColor("10,000 time steps", Color::RED); + else + nvtxPushColor("10,000 time steps", Color::BLUE); + + flipColor = !flipColor; + nvtxCurrentMarker_ = nvtxMarker_; + } else + --nvtxCurrentMarker_; +#endif + + int refillBuffer = currentBuffer_; + currentBuffer_ = 1 - currentBuffer_; + segmentIndex_ = 0; + cudaStreamSynchronize(RNG_stream_); // Ensure refillBuffer is done + fillBuffer(refillBuffer); + } + + float *segmentPtr = buffers_d[currentBuffer_] + segmentIndex_ * segmentSize_; + segmentIndex_ += 1; + + // auto end = std::chrono::high_resolution_clock::now(); + // std::cout << "Segment: " << segmentIndex << ", Launch time: " << (end - start).count() << " ns\n"; + // cudaMemcpy(hostBuffer, segmentPtr, segmentSize * sizeof(float), cudaMemcpyDeviceToHost); + // std::fwrite(hostBuffer, sizeof(float), segmentSize, logfile); + + return segmentPtr; +} + +/// Internal helper to fill a specified buffer with random floats. +/// @param bufferIndex Index (0 or 1) of the buffer to fill. +void AsyncPhilox_d::fillBuffer(int bufferIndex) +{ + //LOG4CPLUS_TRACE(consoleLogger_, "filling buffer:"); + generatePhilox<<>>(spStates_d, buffers_d[bufferIndex], + bufferSize_); +} diff --git a/Simulator/Utils/RNG/AsyncPhilox_d.h b/Simulator/Utils/RNG/AsyncPhilox_d.h new file mode 100644 index 000000000..91c2a5b4c --- /dev/null +++ b/Simulator/Utils/RNG/AsyncPhilox_d.h @@ -0,0 +1,93 @@ +/** + * @file AsyncPhilox_d.h + * + * @ingroup Simulator/Utils/RNG + * + * @brief Asynchronous Philox RNG using curand to fill GPU buffers + * + * AsyncPhilox_d class maintains two large GPU buffers for noise. + * GPUModel calls loadAsyncPhilox to initialize states and + * fill the buffers, then, each advance requestSegment + * returns a float* slice of a buffer for use in + * advanceVertices + */ + +#pragma once +#include "Book.h" +#include +#include +#include +#include +#include +class AsyncPhilox_d { +public: + AsyncPhilox_d() = default; + + ~AsyncPhilox_d(); + + /// Initializes generator and allocates device memory + /// @param samplesPerSegment Number of total vertices + /// @param seed RNG seed. + void loadAsyncPhilox(int samplesPerSegment, unsigned long seed); + + /// Free device memory + void deleteDeviceStruct(); + + /// Request a new segment of generated noise. + /// @return Pointer to a slice of device memory containing noise. + float *requestSegment(); + +private: + /// Number of CUDA blocks to launch per kernel call. + int numBlocks_; + + /// Number of threads per CUDA block. + int numThreads_; + + /// Total number of threads = numBlocks × numThreads. + int totalThreads_; + + /// Number of random floats per segment. + int segmentSize_; + + /// Number of total segments in each buffer. + int totalSegments_; + + /// Number of random floats per buffer. + int bufferSize_; + + /// RNG seed. + unsigned long seed_; + +#ifdef ENABLE_NVTX + /// Marker index for NVTX profiling (if enabled). + int nvtxMarker_; + + /// Tracks current NVTX marker for alternating regions. + int nvtxCurrentMarker_; +#endif + + /// CUDA stream used for asynchronous kernel launches. + cudaStream_t RNG_stream_; + + /// Double-buffered random number output on device. + float *buffers_d[2]; + + /// Index of currently active buffer. + int currentBuffer_; + + /// Index of the next segment to serve. + int segmentIndex_; + + /// Device-side array of Philox curand RNG states. + curandStatePhilox4_32_10_t *spStates_d; + + // FILE* logfile; + // float* hostBuffer; + /// Logger for printing to the console as well as the logging file + log4cplus::Logger consoleLogger_; + + /// Internal helper to fill a specified buffer with random floats. + /// @param bufferIndex Index (0 or 1) of the buffer to fill. + void fillBuffer(int bufferIndex); +}; diff --git a/Simulator/Utils/RNG/MersenneTwister_d.cu b/Simulator/Utils/RNG/MersenneTwister_d.cu index e90151495..2c5c60345 100644 --- a/Simulator/Utils/RNG/MersenneTwister_d.cu +++ b/Simulator/Utils/RNG/MersenneTwister_d.cu @@ -260,5 +260,4 @@ extern "C" void initMTGPU(unsigned int seed, unsigned int blocks, unsigned int t loadMTGPU(MT_DATAFILE); seedMTGPU(seed); -} - +} \ No newline at end of file diff --git a/Simulator/Utils/RNG/MersenneTwister_d.h b/Simulator/Utils/RNG/MersenneTwister_d.h index 04f1cd42b..7f7c888e8 100644 --- a/Simulator/Utils/RNG/MersenneTwister_d.h +++ b/Simulator/Utils/RNG/MersenneTwister_d.h @@ -50,4 +50,4 @@ struct mt_struct_stripped { #define MT_SHIFTC 15 #define MT_SHIFT1 18 -//#endif +//#endif \ No newline at end of file diff --git a/Simulator/Vertices/AllVertices.cpp b/Simulator/Vertices/AllVertices.cpp index 8c20235f0..55b2ba160 100644 --- a/Simulator/Vertices/AllVertices.cpp +++ b/Simulator/Vertices/AllVertices.cpp @@ -89,4 +89,19 @@ void AllVertices::loadEpochInputs(uint64_t currentStep, uint64_t endStep) { // This is an empty implementation so that Neural Network simulation works // normally -} \ No newline at end of file +} + + +#ifdef USE_GPU +/// Set the CUDA stream to be used by GPU vertices kernels in derived classes. +/// +/// This assigns a CUDA stream to the base class, allowing subclasses +/// to launch kernels on the correct stream. The stream is typically +/// created by GPUModel and passed down during simulation setup. +/// +/// @param simulationStream A valid CUDA stream (`cudaStream_t`) managed by the caller. +void AllVertices::SetStream(cudaStream_t simulationStream) +{ + simulationStream_ = simulationStream; +} +#endif \ No newline at end of file diff --git a/Simulator/Vertices/AllVertices.h b/Simulator/Vertices/AllVertices.h index 73c013318..7182adfa7 100644 --- a/Simulator/Vertices/AllVertices.h +++ b/Simulator/Vertices/AllVertices.h @@ -32,6 +32,9 @@ using namespace std; #include // cereal #include "cereal/types/vector.hpp" +#if defined(USE_GPU) + #include +#endif // Utility function to convert a vertexType into a string. string vertexTypeToString(vertexType t); @@ -95,7 +98,19 @@ class AllVertices { log4cplus::Logger vertexLogger_; // Logs to Output/Debug/neurons.txt #if defined(USE_GPU) + /// Cuda Stream for Edge Kernels + cudaStream_t simulationStream_; + public: + /// Set the CUDA stream to be used by GPU vertices kernels in derived classes. + /// + /// This assigns a CUDA stream to the base class, allowing subclasses + /// to launch kernels on the correct stream. The stream is typically + /// created by GPUModel and passed down during simulation setup. + /// + /// @param simulationStream A valid CUDA stream (`cudaStream_t`) managed by the caller. + void SetStream(cudaStream_t simulationStream); + /// Allocate GPU memories to store all vertices' states, /// and copy them from host to GPU memory. virtual void allocVerticesDeviceStruct() = 0; diff --git a/Simulator/Vertices/Neuro/AllIZHNeurons_d.cpp b/Simulator/Vertices/Neuro/AllIZHNeurons_d.cpp index faf05169e..f5325c54c 100644 --- a/Simulator/Vertices/Neuro/AllIZHNeurons_d.cpp +++ b/Simulator/Vertices/Neuro/AllIZHNeurons_d.cpp @@ -166,7 +166,7 @@ void AllIZHNeurons::advanceVertices(AllEdges &synapses, void *allVerticesDevice, int blocksPerGrid = (vertex_count + threadsPerBlock - 1) / threadsPerBlock; // Advance neurons -------------> - advanceIZHNeuronsDevice<<>>( + advanceIZHNeuronsDevice<<>>( vertex_count, Simulator::getInstance().getMaxEdgesPerVertex(), maxSpikes, Simulator::getInstance().getDeltaT(), g_simulationStep, randNoise, hasFired_, summationPoints_, Vm_, Aconst_, Bconst_, u_, numStepsInRefractoryPeriod_, Vthresh_, Trefract_, diff --git a/Simulator/Vertices/Neuro/AllLIFNeurons_d.cpp b/Simulator/Vertices/Neuro/AllLIFNeurons_d.cpp index 1af6c6b2f..62648ef16 100644 --- a/Simulator/Vertices/Neuro/AllLIFNeurons_d.cpp +++ b/Simulator/Vertices/Neuro/AllLIFNeurons_d.cpp @@ -56,7 +56,7 @@ void AllLIFNeurons::advanceVertices(AllEdges &synapses, void *allVerticesDevice, int blocksPerGrid = (vertex_count + threadsPerBlock - 1) / threadsPerBlock; // Advance neurons -------------> - advanceLIFNeuronsDevice<<>>( + advanceLIFNeuronsDevice<<>>( vertex_count, Simulator::getInstance().getMaxEdgesPerVertex(), maxSpikes, Simulator::getInstance().getDeltaT(), g_simulationStep, randNoise, hasFired_, summationPoints_, Vm_, Trefract_, numStepsInRefractoryPeriod_, Vthresh_, Vreset_, I0_, diff --git a/Simulator/Vertices/Neuro/AllSpikingNeurons_d.cpp b/Simulator/Vertices/Neuro/AllSpikingNeurons_d.cpp index 91edc75f1..9b0af580e 100644 --- a/Simulator/Vertices/Neuro/AllSpikingNeurons_d.cpp +++ b/Simulator/Vertices/Neuro/AllSpikingNeurons_d.cpp @@ -200,7 +200,7 @@ void AllSpikingNeurons::integrateVertexInputs(void *allVerticesDevice, = (Simulator::getInstance().getTotalVertices() + threadsPerBlock - 1) / threadsPerBlock; int vertex_count = Simulator::getInstance().getTotalVertices(); - calcSummationPointDevice<<>>( + calcSummationPointDevice<<>>( vertex_count, summationPoints_, edgeIndexMapDevice, (AllSpikingSynapsesDeviceProperties *)allEdgesDevice); } diff --git a/docs/Developer/AsyncPhilox.md b/docs/Developer/AsyncPhilox.md new file mode 100644 index 000000000..8117c8c97 --- /dev/null +++ b/docs/Developer/AsyncPhilox.md @@ -0,0 +1,40 @@ +# AsyncPhilox_d Class + +## Overview + +`AsyncPhilox_d` is a GPU-based random number generator class that uses NVIDIA's [CURAND](https://docs.nvidia.com/cuda/curand/index.html) library and the **Philox** counter-based RNG engine. It is designed for high-throughput simulations and supports asynchronous random number generation via an internal CUDA stream. + +The class provides a **double-buffered**, asynchronous mechanism to produce random floating-point numbers (normally distributed) directly on the GPU. This enables overlapping random number generation with compute or memory transfer tasks in other streams. + +--- + +## Purpose + +`AsyncPhilox_d` enables: + +- **Per-thread RNG state initialization** on the GPU +- **Segmented random number generation** using `curand_normal` +- **Double-buffering** for non-blocking buffer filling +- **Asynchronous execution** using a CUDA stream created and managed internally + +This design improves parallelism and simulation throughput by decoupling RNG generation from synchronous host and device execution. + +--- + +## CURAND Philox Generator + +Philox is a **counter-based** RNG suitable for parallel applications. It is: + +- **Stateless** across launches (state only encodes seed, counter, and thread ID) +- **Efficient** on GPUs due to its low register and instruction overhead +- **Deterministic**, producing reproducible sequences across threads + +NVIDIA’s CURAND provides the Philox generator via the `curandStatePhilox4_32_10_t` type, which is initialized on a per-thread basis using `curand_init`. + +You can find more details in the [CURAND Device API Overview](https://docs.nvidia.com/cuda/curand/device-api-overview.html). + + +## CUDA Documentation Links + +- [CURAND API Reference](https://docs.nvidia.com/cuda/curand/index.html) +- [Philox Generator in CURAND](https://docs.nvidia.com/cuda/curand/device-api-overview.html#bit-generation-3) diff --git a/docs/Developer/index.md b/docs/Developer/index.md index 770612418..a48dc6c7f 100644 --- a/docs/Developer/index.md +++ b/docs/Developer/index.md @@ -46,7 +46,7 @@ Students, use this [quickstart guide](StudentSetup.md) to help setup, use, and d - [Neuro Implementation](NeuroImplementation.md) - [GraphManager and InputManager classes](GraphAndEventInputs.md) - [Configuration](../User/configuration.md) - +- [AsyncPhilox RNG class](AsyncPhilox.md) --------- [<< Go back to the Graphitti home page](../index.md)