Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
22cd237
building a new class for gpu rng
AndrewBMadison Mar 13, 2025
0e70083
refactor
AndrewBMadison Mar 14, 2025
69b86c9
curand rng class
AndrewBMadison Apr 30, 2025
c740f2a
Merge remote-tracking branch 'origin/SharedDevelopment' into issue-81…
AndrewBMadison Apr 30, 2025
8f537d0
revert change
AndrewBMadison Apr 30, 2025
4014610
fix performance metrics
AndrewBMadison May 12, 2025
6058b88
Testing refactor
AndrewBMadison May 27, 2025
b0fa9ec
fix performance metrics bug
AndrewBMadison May 27, 2025
4ea4b41
added logs and increased size of buffers
AndrewBMadison May 27, 2025
e67a65e
testing various gpu memory allocation sizes
AndrewBMadison May 27, 2025
2fa2f4c
added include paths for CUDA include in cmakelists to fix a bug tryin…
AndrewBMadison May 27, 2025
7278cb2
testing
AndrewBMadison Jun 3, 2025
92bf648
testing full simulation and adding in error handling
AndrewBMadison Jun 3, 2025
8f8f4ac
changed rng to philox and performed an analysis on the distribution
AndrewBMadison Jun 3, 2025
f56d31a
turned off file logging and fixed double free bug
AndrewBMadison Jun 3, 2025
d30ca17
fixed non-blocking stream
AndrewBMadison Jun 5, 2025
e40cdf1
improved nvtx support
AndrewBMadison Jun 5, 2025
a3dd0f2
clang-format
AndrewBMadison Jun 5, 2025
a6cdf91
Merge branch 'SharedDevelopment' into issue-815-async-rng
AndrewBMadison Jun 5, 2025
339bca4
comments and renaming
AndrewBMadison Jun 10, 2025
9219836
Merge branch 'issue-815-async-rng' into AndrewDevelopment
AndrewBMadison Jun 10, 2025
00c418e
clang
AndrewBMadison Jun 10, 2025
51da776
cleaned up changes
AndrewBMadison Jun 10, 2025
383e054
clang
AndrewBMadison Jun 10, 2025
8719559
format
AndrewBMadison Jun 10, 2025
74104c8
format
AndrewBMadison Jun 10, 2025
6d6b56f
fixed cpp version of utils library and added appropriate include for …
AndrewBMadison Jun 10, 2025
f0a60a2
revert clang changes on MersenneTwister
AndrewBMadison Jun 10, 2025
2872d15
fixing cpu errors with cuda includes
AndrewBMadison Jun 10, 2025
522c85f
cleaned up linking and unnecessary includes
AndrewBMadison Jun 11, 2025
55ba665
clang format issues
AndrewBMadison Jun 11, 2025
13af9d7
clang format issues
AndrewBMadison Jun 11, 2025
935fe1f
removed unnecessary file removal
AndrewBMadison Jun 11, 2025
5d092ba
Wrote better comments with doxygen in mind and renamed variables acco…
AndrewBMadison Jun 23, 2025
4fdd643
clang
AndrewBMadison Jun 23, 2025
66a97fa
Added documentation
AndrewBMadison Jun 23, 2025
7aa77eb
fix nvtx variable names
AndrewBMadison Jun 23, 2025
5d7ee8c
Merge branch 'SharedDevelopment' into AndrewDevelopment
stiber Jun 30, 2025
ca319a9
Revert "Merge branch 'SharedDevelopment' into AndrewDevelopment"
AndrewBMadison Jul 2, 2025
21f9063
Manually remerged SharedDevelopment into AndrewDevelopment by applyin…
AndrewBMadison Jul 2, 2025
f269c67
moved deallocation of AsyncGenerator to deleteDeviceStruct
AndrewBMadison Jul 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ if(ENABLE_CUDA)
project(Graphitti LANGUAGES CXX CUDA C)
#Verify CUDA package is present
find_Package(CUDA REQUIRED)
include_directories(${CUDA_INCLUDE_DIRS})
#Set the USE_GPU preprocessor macro so that GPU code will be compiled.
add_compile_definitions(USE_GPU)
#Specify the CUDA architecture / gencode that will be targeted
Expand Down Expand Up @@ -361,13 +362,15 @@ add_library(Matrix ${Matrix_Source})
file(GLOB RNG_Source Simulator/Utils/RNG/*.cpp Simulator/Utils/RNG/*.h Simulator/Utils/RNG/*.cu)
# Remove demo from file list as it contains a main and it will cause compilation errors
list(REMOVE_ITEM RNG_Source "${CMAKE_CURRENT_SOURCE_DIR}/Simulator/Utils/RNG/MersenneTwister_demo.cu")
if(NOT ENABLE_CUDA)
list(REMOVE_ITEM Utils_Source "${CMAKE_CURRENT_SOURCE_DIR}/Simulator/Utils/AsyncPhilox_d.cu")
endif()
add_library(RNG STATIC ${RNG_Source})


# Create Utils library
file(GLOB Utils_Source Simulator/Utils/*.cpp Simulator/Utils/*.h)
list(REMOVE_ITEM Utils_Source "${CMAKE_CURRENT_SOURCE_DIR}/Simulator/Utils/Factory.cpp")

if(CMAKE_BUILD_TYPE STREQUAL "Profiling")
if(ENABLE_CUDA)
# Find NVTX Library
Expand All @@ -390,9 +393,13 @@ add_library(Utils ${Utils_Source})

# Only link NVTX if it was found
if(NVTX_LIBRARY)

message(STATUS "Adding NVTX: to Utils lib")
target_link_libraries(Utils PRIVATE ${NVTX_LIBRARY})
endif()

message(STATUS "Linking RNG against Utils")
target_link_libraries(RNG PRIVATE Utils)

# Used to locate and run other CMakeLists.txt files from Third Party resources for further compilation of the project.
add_subdirectory(ThirdParty)
Expand Down
3 changes: 2 additions & 1 deletion Simulator/Connections/Connections.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@ bool Connections::updateConnections(AllVertices &vertices)
#if defined(USE_GPU)
void Connections::updateEdgesWeights(int numVertices, AllVertices &vertices, AllEdges &edges,
AllVerticesDeviceProperties *allVerticesDevice,
AllEdgesDeviceProperties *allEdgesDevice, Layout &layout)
AllEdgesDeviceProperties *allEdgesDevice, Layout &layout,
cudaStream_t simulationStream)
{
}
#else
Expand Down
9 changes: 8 additions & 1 deletion Simulator/Connections/Connections.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@
// cereal
#include <cereal/types/memory.hpp>

#ifdef USE_GPU
#include <cuda_runtime.h>
#endif


using namespace std;

class Connections {
Expand Down Expand Up @@ -85,9 +90,11 @@ class Connections {
/// @param allVerticesDevice GPU address of the allVertices struct on device memory.
/// @param allEdgesDevice GPU address of the allEdges struct on device memory.
/// @param layout Layout information of the graph network.
/// @param simulationStream The cuda stream for all synchronous kernels.
virtual void updateEdgesWeights(int numVertices, AllVertices &vertices, AllEdges &edges,
AllVerticesDeviceProperties *allVerticesDevice,
AllEdgesDeviceProperties *allEdgesDevice, Layout &layout);
AllEdgesDeviceProperties *allEdgesDevice, Layout &layout,
cudaStream_t simulationStream);
#else
public:
/// Update the weight of the edges in the simulation.
Expand Down
6 changes: 4 additions & 2 deletions Simulator/Connections/Neuro/ConnGrowth.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,10 +125,12 @@ class ConnGrowth : public Connections {
/// @param allVerticesDevice GPU address of the AllVertices struct in device memory.
/// @param allEdgesDevice GPU address of the AllEdges struct in device memory.
/// @param layout The Layout object.
/// @param simulationStream The cuda stream for all synchronous kernels.
virtual void updateEdgesWeights(int numVertices, AllVertices &vertices, AllEdges &edges,
AllVerticesDeviceProperties *allVerticesDevice,
AllEdgesDeviceProperties *allEdgesDevice,
Layout &layout) override;
AllEdgesDeviceProperties *allEdgesDevice, Layout &layout,
cudaStream_t simulationStream) override;

#else
/// Update the weights of the Synapses in the simulation. To be clear,
/// iterates through all source and destination neurons and updates their
Expand Down
6 changes: 4 additions & 2 deletions Simulator/Connections/Neuro/ConnGrowth_d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,12 @@
* @param allVerticesDevice GPU address to the AllVertices struct in device memory.
* @param allEdgesDevice GPU address to the AllEdges struct in device memory.
* @param layout The Layout object.
* @param simulationStream The cuda stream for all synchronous kernels.
*/
void ConnGrowth::updateEdgesWeights(int numVertices, AllVertices &vertices, AllEdges &edges,
AllVerticesDeviceProperties *allVerticesDevice,
AllEdgesDeviceProperties *allEdgesDevice, Layout &layout)
AllEdgesDeviceProperties *allEdgesDevice, Layout &layout,
cudaStream_t simulationStream)
{
Simulator &simulator = Simulator::getInstance();
// For now, we just set the weights to equal the areas. We will later
Expand Down Expand Up @@ -64,7 +66,7 @@ void ConnGrowth::updateEdgesWeights(int numVertices, AllVertices &vertices, AllE
cudaMemcpyHostToDevice));

blocksPerGrid = (simulator.getTotalVertices() + threadsPerBlock - 1) / threadsPerBlock;
updateSynapsesWeightsDevice<<<blocksPerGrid, threadsPerBlock>>>(
updateSynapsesWeightsDevice<<<blocksPerGrid, threadsPerBlock, 0, simulationStream>>>(
simulator.getTotalVertices(), deltaT, W_d, simulator.getMaxEdgesPerVertex(),
(AllSpikingNeuronsDeviceProperties *)allVerticesDevice,
(AllSpikingSynapsesDeviceProperties *)allEdgesDevice, neuronTypeMapD);
Expand Down
37 changes: 26 additions & 11 deletions Simulator/Core/GPUModel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,12 @@ void GPUModel::allocDeviceStruct()

// Allocate synapse inverse map in device memory
allocEdgeIndexMap(numVertices);

// Create the CUDA stream used to launch synchronous GPU kernels during the simulation.
// This stream is passed to components like AllEdges and used consistently for kernel launches.
// For stream behavior and management, see:
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html
HANDLE_ERROR(cudaStreamCreate(&simulationStream_));
}

/// Copies device memories to host memories and deallocates them.
Expand All @@ -74,7 +80,8 @@ void GPUModel::deleteDeviceStruct()
HANDLE_ERROR(cudaFree(synapseIMapDevice.incomingEdgeCount_));
HANDLE_ERROR(cudaFree(synapseIMapDevice.incomingEdgeIndexMap_));
HANDLE_ERROR(cudaFree(edgeIndexMapDevice_));
HANDLE_ERROR(cudaFree(randNoise_d));
HANDLE_ERROR(cudaStreamDestroy(simulationStream_));
AsyncGenerator_.deleteDeviceStruct();
}

/// Sets up the Simulation.
Expand All @@ -88,14 +95,17 @@ void GPUModel::setupSim()

//initialize Mersenne Twister
//assuming numVertices >= 100 and is a multiple of 100. Note rng_mt_rng_count must be <= MT_RNG_COUNT
int rng_blocks = 25; //# of blocks the kernel will use
int rng_nPerRng
= 4; //# of iterations per thread (thread granularity, # of rands generated per thread)
int rng_mt_rng_count = Simulator::getInstance().getTotalVertices()
/ rng_nPerRng; //# of threads to generate for numVertices rand #s
int rng_threads = rng_mt_rng_count / rng_blocks; //# threads per block needed
initMTGPU(Simulator::getInstance().getNoiseRngSeed(), rng_blocks, rng_threads, rng_nPerRng,
rng_mt_rng_count);
// int rng_blocks = 25; //# of blocks the kernel will use
// int rng_nPerRng
// = 4; //# of iterations per thread (thread granularity, # of rands generated per thread)
// int rng_mt_rng_count = Simulator::getInstance().getTotalVertices()
// / rng_nPerRng; //# of threads to generate for numVertices rand #s
// int rng_threads = rng_mt_rng_count / rng_blocks; //# threads per block needed
// initMTGPU(Simulator::getInstance().getNoiseRngSeed(), rng_blocks, rng_threads, rng_nPerRng,
// rng_mt_rng_count);
//cout << "blocks, threads, nPerRng, rng_rng_count: " << rng_blocks << " " << rng_threads << " " << rng_nPerRng << " " << rng_mt_rng_count << endl;
AsyncGenerator_.loadAsyncPhilox(Simulator::getInstance().getTotalVertices(),
Simulator::getInstance().getNoiseRngSeed());

#ifdef PERFORMANCE_METRICS
cudaEventCreate(&start);
Expand All @@ -116,6 +126,9 @@ void GPUModel::setupSim()

// set some parameters used for advanceEdgesDevice
edges.setAdvanceEdgesDeviceParams();
AllVertices &vertices = layout_->getVertices();
vertices.SetStream(simulationStream_);
edges.SetStream(simulationStream_);
}

/// Performs any finalization tasks on network following a simulation.
Expand Down Expand Up @@ -158,7 +171,8 @@ void GPUModel::advance()
// }
cudaMemcpy(randNoise_d, randNoise_h.data(), verts * sizeof(float), cudaMemcpyHostToDevice);
#else
normalMTGPU(randNoise_d);
// normalMTGPU(randNoise_d);
randNoise_d = AsyncGenerator_.requestSegment();
#endif
//LOG4CPLUS_DEBUG(vertexLogger_, "Index: " << index << " Vm: " << Vm);
#ifdef PERFORMANCE_METRICS
Expand Down Expand Up @@ -238,7 +252,8 @@ void GPUModel::updateConnections()
// Update Connections data
if (connections_->updateConnections(vertices)) {
connections_->updateEdgesWeights(Simulator::getInstance().getTotalVertices(), vertices, edges,
allVerticesDevice_, allEdgesDevice_, getLayout());
allVerticesDevice_, allEdgesDevice_, getLayout(),
simulationStream_);
// create edge index map
connections_->createEdgeIndexMap();
// copy index map to the device memory
Expand Down
27 changes: 20 additions & 7 deletions Simulator/Core/GPUModel.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,11 @@
#include "AllSpikingNeurons.h"
#include "AllSpikingSynapses.h"
#include "AllVertices.h"
#include "AsyncPhilox_d.h"
#include "OperationManager.h"
#ifdef USE_GPU
#include <cuda_runtime.h>
#endif

#ifdef VALIDATION_MODE
#include <fstream>
Expand Down Expand Up @@ -115,6 +119,15 @@ class GPUModel : public Model {
/// Pointer to device random noise array.
float *randNoise_d;

/// Async RNG class instance used to load generator with a seed and request noise device pointers.
AsyncPhilox_d AsyncGenerator_;
#ifdef VALIDATION_MODE
/// Buffer used in the validation mode to copy cgraphitti's noise generation into the device noise buffer.
float *randNoise_h;
#endif
/// Cuda Stream for kernel use
cudaStream_t simulationStream_;

#if defined(USE_GPU)
/// Pointer to edge index map in device memory.
EdgeIndexMapDevice *edgeIndexMapDevice_;
Expand Down Expand Up @@ -144,10 +157,10 @@ class GPUModel : public Model {
Coordinate dest, BGFLOAT deltaT, edgeType type);
};

#if defined(__CUDACC__)
extern "C" {
void normalMTGPU(float *randNoise_d);
void initMTGPU(unsigned int seed, unsigned int blocks, unsigned int threads, unsigned int nPerRng,
unsigned int mt_rng_count);
}
#endif
// #if defined(__CUDACC__)
// extern "C" {
// void normalMTGPU(float *randNoise_d);
// void initMTGPU(unsigned int seed, unsigned int blocks, unsigned int threads, unsigned int nPerRng,
// unsigned int mt_rng_count);
// }
// #endif
4 changes: 3 additions & 1 deletion Simulator/Core/Serializer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@

#include "Serializer.h"
#include "ConnGrowth.h"
#include "GPUModel.h"
#if defined(USE_GPU)
#include "GPUModel.h"
#endif
#include <fstream>

// About CEREAL_XML_STRING_VALUE
Expand Down
6 changes: 4 additions & 2 deletions Simulator/Core/Simulator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@

#include "Simulator.h"
#include "CPUModel.h"
#include "GPUModel.h"
#if defined(USE_GPU)
#include "GPUModel.h"
#endif
#include "OperationManager.h"
#include "ParameterManager.h"
#include "Utils/Factory.h"
Expand Down Expand Up @@ -173,7 +175,7 @@ void Simulator::simulate()
double total_time = timer.lap() / 1000000.0;

cout << "\ntotal_time: " << total_time << " seconds" << endl;
printPerformanceMetrics(total_time, currentEpoch);
printPerformanceMetrics(total_time, currentEpoch_);
cout << endl;
#endif
}
Expand Down
14 changes: 14 additions & 0 deletions Simulator/Edges/AllEdges.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,20 @@ void AllEdges::createEdgeIndexMap(EdgeIndexMap &edgeIndexMap)
}
}

#if defined(USE_GPU)
/// Set the CUDA stream to be used by GPU edge kernels in derived classes.
///
/// This assigns a CUDA stream to the base class, allowing subclasses
/// (e.g., AllSpikingSynapses_d, AllSTDPSynapses_d) to launch kernels on
/// the correct stream. The stream is typically created by GPUModel and
/// passed down during simulation setup.
///
/// @param simulationStream A valid CUDA stream (`cudaStream_t`) managed by the caller.
void AllEdges::SetStream(cudaStream_t simulationStream)
{
simulationStream_ = simulationStream;
}
#endif

#if !defined(USE_GPU)

Expand Down
16 changes: 16 additions & 0 deletions Simulator/Edges/AllEdges.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
#include <vector>
// cereal
#include "cereal/types/vector.hpp"
#ifdef USE_GPU
#include <cuda_runtime.h>
#endif

class AllVertices;
struct AllEdgesDeviceProperties;
Expand Down Expand Up @@ -92,7 +95,20 @@ class AllEdges {
log4cplus::Logger edgeLogger_;

#if defined(USE_GPU)
/// Cuda Stream for Edge Kernels
cudaStream_t simulationStream_;

public:
/// Set the CUDA stream to be used by GPU edge kernels in derived classes.
///
/// This assigns a CUDA stream to the base class, allowing subclasses
/// (e.g., AllSpikingSynapses_d, AllSTDPSynapses_d) to launch kernels on
/// the correct stream. The stream is typically created by GPUModel and
/// passed down during simulation setup.
///
/// @param simulationStream A valid CUDA stream (`cudaStream_t`) managed by the caller.
void SetStream(cudaStream_t simulationStream);

/// Allocate GPU memories to store all edges' states,
/// and copy them from host to GPU memory.
virtual void allocEdgeDeviceStruct() = 0;
Expand Down
2 changes: 1 addition & 1 deletion Simulator/Edges/Neuro/AllSTDPSynapses_d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ void AllSTDPSynapses::advanceEdges(void *allEdgesDevice, void *allVerticesDevice
const int threadsPerBlock = 256;
int blocksPerGrid = (totalEdgeCount_ + threadsPerBlock - 1) / threadsPerBlock;
// Advance synapses ------------->
advanceSTDPSynapsesDevice<<<blocksPerGrid, threadsPerBlock>>>(
advanceSTDPSynapsesDevice<<<blocksPerGrid, threadsPerBlock, 0, simulationStream_>>>(
totalEdgeCount_, (EdgeIndexMapDevice *)edgeIndexMapDevice, g_simulationStep,
Simulator::getInstance().getDeltaT(), (AllSTDPSynapsesDeviceProperties *)allEdgesDevice,
(AllSpikingNeuronsDeviceProperties *)allVerticesDevice, maxSpikes);
Expand Down
2 changes: 1 addition & 1 deletion Simulator/Edges/Neuro/AllSpikingSynapses_d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ void AllSpikingSynapses::advanceEdges(void *allEdgesDevice, void *allVerticesDev
const int threadsPerBlock = 256;
int blocksPerGrid = (totalEdgeCount_ + threadsPerBlock - 1) / threadsPerBlock;
// Advance synapses ------------->
advanceSpikingSynapsesDevice<<<blocksPerGrid, threadsPerBlock>>>(
advanceSpikingSynapsesDevice<<<blocksPerGrid, threadsPerBlock, 0, simulationStream_>>>(
totalEdgeCount_, (EdgeIndexMapDevice *)edgeIndexMapDevice, g_simulationStep,
Simulator::getInstance().getDeltaT(), (AllSpikingSynapsesDeviceProperties *)allEdgesDevice);
}
Expand Down
1 change: 1 addition & 0 deletions Simulator/Utils/Book.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

#pragma once

#include <cuda_runtime.h>
#include <stdio.h>
//! CUDA Exception handler
static void HandleError(cudaError_t err, const char *file, int line)
Expand Down
Loading