diff --git a/CMakeLists.txt b/CMakeLists.txt
index f30e63b58..62c8ecf1b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -48,6 +48,7 @@ if(ENABLE_CUDA)
         project(Graphitti LANGUAGES CXX CUDA C)
 #Verify CUDA package is present
         find_Package(CUDA REQUIRED)
+        include_directories(${CUDA_INCLUDE_DIRS})
 #Set the USE_GPU preprocessor macro so that GPU code will be compiled.
         add_compile_definitions(USE_GPU)
 #Specify the CUDA architecture / gencode that will be targeted
@@ -361,13 +362,15 @@ add_library(Matrix ${Matrix_Source})
 file(GLOB RNG_Source Simulator/Utils/RNG/*.cpp Simulator/Utils/RNG/*.h Simulator/Utils/RNG/*.cu)
 # Remove demo from file list as it contains a main and it will cause compilation errors
 list(REMOVE_ITEM RNG_Source "${CMAKE_CURRENT_SOURCE_DIR}/Simulator/Utils/RNG/MersenneTwister_demo.cu")
+if(NOT ENABLE_CUDA)
+        list(REMOVE_ITEM Utils_Source "${CMAKE_CURRENT_SOURCE_DIR}/Simulator/Utils/AsyncPhilox_d.cu")
+endif()
 add_library(RNG STATIC ${RNG_Source})
 
 
 # Create Utils library
 file(GLOB Utils_Source Simulator/Utils/*.cpp Simulator/Utils/*.h)
 list(REMOVE_ITEM Utils_Source "${CMAKE_CURRENT_SOURCE_DIR}/Simulator/Utils/Factory.cpp")
-
 if(CMAKE_BUILD_TYPE STREQUAL "Profiling")
         if(ENABLE_CUDA)
 # Find NVTX Library
@@ -390,9 +393,13 @@ add_library(Utils ${Utils_Source})
 
 # Only link NVTX if it was found
 if(NVTX_LIBRARY)
+        
+        message(STATUS "Adding NVTX: to Utils lib")
         target_link_libraries(Utils PRIVATE ${NVTX_LIBRARY})
 endif()
 
+message(STATUS "Linking RNG against Utils")
+target_link_libraries(RNG PRIVATE Utils)
 
 # Used to locate and run other CMakeLists.txt files from Third Party resources for further compilation of the project.
 add_subdirectory(ThirdParty)
diff --git a/Simulator/Connections/Connections.cpp b/Simulator/Connections/Connections.cpp
index a8a63fc0e..37f4803e8 100644
--- a/Simulator/Connections/Connections.cpp
+++ b/Simulator/Connections/Connections.cpp
@@ -91,7 +91,8 @@ bool Connections::updateConnections(AllVertices &vertices)
 #if defined(USE_GPU)
 void Connections::updateEdgesWeights(int numVertices, AllVertices &vertices, AllEdges &edges,
                                      AllVerticesDeviceProperties *allVerticesDevice,
-                                     AllEdgesDeviceProperties *allEdgesDevice, Layout &layout)
+                                     AllEdgesDeviceProperties *allEdgesDevice, Layout &layout,
+                                     cudaStream_t simulationStream)
 {
 }
 #else
diff --git a/Simulator/Connections/Connections.h b/Simulator/Connections/Connections.h
index d8bcc8596..99f2694e0 100644
--- a/Simulator/Connections/Connections.h
+++ b/Simulator/Connections/Connections.h
@@ -33,6 +33,11 @@
 // cereal
 #include <cereal/types/memory.hpp>
 
+#ifdef USE_GPU
+   #include <cuda_runtime.h>
+#endif
+
+
 using namespace std;
 
 class Connections {
@@ -85,9 +90,11 @@ class Connections {
    ///  @param  allVerticesDevice    GPU address of the allVertices struct on device memory.
    ///  @param  allEdgesDevice       GPU address of the allEdges struct on device memory.
    ///  @param  layout               Layout information of the graph network.
+   ///  @param  simulationStream     The cuda stream for all synchronous kernels.
    virtual void updateEdgesWeights(int numVertices, AllVertices &vertices, AllEdges &edges,
                                    AllVerticesDeviceProperties *allVerticesDevice,
-                                   AllEdgesDeviceProperties *allEdgesDevice, Layout &layout);
+                                   AllEdgesDeviceProperties *allEdgesDevice, Layout &layout,
+                                   cudaStream_t simulationStream);
 #else
 public:
    ///  Update the weight of the edges in the simulation.
diff --git a/Simulator/Connections/Neuro/ConnGrowth.h b/Simulator/Connections/Neuro/ConnGrowth.h
index e6a88078c..399a2665c 100644
--- a/Simulator/Connections/Neuro/ConnGrowth.h
+++ b/Simulator/Connections/Neuro/ConnGrowth.h
@@ -125,10 +125,12 @@ class ConnGrowth : public Connections {
    ///  @param  allVerticesDevice    GPU address of the AllVertices struct in device memory.
    ///  @param  allEdgesDevice       GPU address of the AllEdges struct in device memory.
    ///  @param  layout               The Layout object.
+   ///  @param  simulationStream     The cuda stream for all synchronous kernels.
    virtual void updateEdgesWeights(int numVertices, AllVertices &vertices, AllEdges &edges,
                                    AllVerticesDeviceProperties *allVerticesDevice,
-                                   AllEdgesDeviceProperties *allEdgesDevice,
-                                   Layout &layout) override;
+                                   AllEdgesDeviceProperties *allEdgesDevice, Layout &layout,
+                                   cudaStream_t simulationStream) override;
+
 #else
    ///  Update the weights of the Synapses in the simulation. To be clear,
    ///  iterates through all source and destination neurons and updates their
diff --git a/Simulator/Connections/Neuro/ConnGrowth_d.cpp b/Simulator/Connections/Neuro/ConnGrowth_d.cpp
index 148834fea..19a6d9395 100644
--- a/Simulator/Connections/Neuro/ConnGrowth_d.cpp
+++ b/Simulator/Connections/Neuro/ConnGrowth_d.cpp
@@ -26,10 +26,12 @@
  *  @param  allVerticesDevice   GPU address to the AllVertices struct in device memory.
  *  @param  allEdgesDevice      GPU address to the AllEdges struct in device memory.
  *  @param  layout              The Layout object.
+ *  @param  simulationStream    The cuda stream for all synchronous kernels.
  */
 void ConnGrowth::updateEdgesWeights(int numVertices, AllVertices &vertices, AllEdges &edges,
                                     AllVerticesDeviceProperties *allVerticesDevice,
-                                    AllEdgesDeviceProperties *allEdgesDevice, Layout &layout)
+                                    AllEdgesDeviceProperties *allEdgesDevice, Layout &layout,
+                                    cudaStream_t simulationStream)
 {
    Simulator &simulator = Simulator::getInstance();
    // For now, we just set the weights to equal the areas. We will later
@@ -64,7 +66,7 @@ void ConnGrowth::updateEdgesWeights(int numVertices, AllVertices &vertices, AllE
                            cudaMemcpyHostToDevice));
 
    blocksPerGrid = (simulator.getTotalVertices() + threadsPerBlock - 1) / threadsPerBlock;
-   updateSynapsesWeightsDevice<<<blocksPerGrid, threadsPerBlock>>>(
+   updateSynapsesWeightsDevice<<<blocksPerGrid, threadsPerBlock, 0, simulationStream>>>(
       simulator.getTotalVertices(), deltaT, W_d, simulator.getMaxEdgesPerVertex(),
       (AllSpikingNeuronsDeviceProperties *)allVerticesDevice,
       (AllSpikingSynapsesDeviceProperties *)allEdgesDevice, neuronTypeMapD);
diff --git a/Simulator/Core/GPUModel.cpp b/Simulator/Core/GPUModel.cpp
index 97453ac1c..a421c0193 100644
--- a/Simulator/Core/GPUModel.cpp
+++ b/Simulator/Core/GPUModel.cpp
@@ -58,6 +58,12 @@ void GPUModel::allocDeviceStruct()
 
    // Allocate synapse inverse map in device memory
    allocEdgeIndexMap(numVertices);
+
+   // Create the CUDA stream used to launch synchronous GPU kernels during the simulation.
+   // This stream is passed to components like AllEdges and used consistently for kernel launches.
+   // For stream behavior and management, see:
+   // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html
+   HANDLE_ERROR(cudaStreamCreate(&simulationStream_));
 }
 
 /// Copies device memories to host memories and deallocates them.
@@ -74,7 +80,8 @@ void GPUModel::deleteDeviceStruct()
    HANDLE_ERROR(cudaFree(synapseIMapDevice.incomingEdgeCount_));
    HANDLE_ERROR(cudaFree(synapseIMapDevice.incomingEdgeIndexMap_));
    HANDLE_ERROR(cudaFree(edgeIndexMapDevice_));
-   HANDLE_ERROR(cudaFree(randNoise_d));
+   HANDLE_ERROR(cudaStreamDestroy(simulationStream_));
+   AsyncGenerator_.deleteDeviceStruct();
 }
 
 /// Sets up the Simulation.
@@ -88,14 +95,17 @@ void GPUModel::setupSim()
 
    //initialize Mersenne Twister
    //assuming numVertices >= 100 and is a multiple of 100. Note rng_mt_rng_count must be <= MT_RNG_COUNT
-   int rng_blocks = 25;   //# of blocks the kernel will use
-   int rng_nPerRng
-      = 4;   //# of iterations per thread (thread granularity, # of rands generated per thread)
-   int rng_mt_rng_count = Simulator::getInstance().getTotalVertices()
-                          / rng_nPerRng;   //# of threads to generate for numVertices rand #s
-   int rng_threads = rng_mt_rng_count / rng_blocks;   //# threads per block needed
-   initMTGPU(Simulator::getInstance().getNoiseRngSeed(), rng_blocks, rng_threads, rng_nPerRng,
-             rng_mt_rng_count);
+   // int rng_blocks = 25;   //# of blocks the kernel will use
+   // int rng_nPerRng
+   //    = 4;   //# of iterations per thread (thread granularity, # of rands generated per thread)
+   // int rng_mt_rng_count = Simulator::getInstance().getTotalVertices()
+   //                        / rng_nPerRng;   //# of threads to generate for numVertices rand #s
+   // int rng_threads = rng_mt_rng_count / rng_blocks;   //# threads per block needed
+   // initMTGPU(Simulator::getInstance().getNoiseRngSeed(), rng_blocks, rng_threads, rng_nPerRng,
+   //           rng_mt_rng_count);
+   //cout << "blocks, threads, nPerRng, rng_rng_count: " << rng_blocks << " " << rng_threads << " " << rng_nPerRng << " " << rng_mt_rng_count << endl;
+   AsyncGenerator_.loadAsyncPhilox(Simulator::getInstance().getTotalVertices(),
+                                   Simulator::getInstance().getNoiseRngSeed());
 
 #ifdef PERFORMANCE_METRICS
    cudaEventCreate(&start);
@@ -116,6 +126,9 @@ void GPUModel::setupSim()
 
    // set some parameters used for advanceEdgesDevice
    edges.setAdvanceEdgesDeviceParams();
+   AllVertices &vertices = layout_->getVertices();
+   vertices.SetStream(simulationStream_);
+   edges.SetStream(simulationStream_);
 }
 
 /// Performs any finalization tasks on network following a simulation.
@@ -158,7 +171,8 @@ void GPUModel::advance()
    // }
    cudaMemcpy(randNoise_d, randNoise_h.data(), verts * sizeof(float), cudaMemcpyHostToDevice);
 #else
-   normalMTGPU(randNoise_d);
+   // normalMTGPU(randNoise_d);
+   randNoise_d = AsyncGenerator_.requestSegment();
 #endif
 //LOG4CPLUS_DEBUG(vertexLogger_, "Index: " << index << " Vm: " << Vm);
 #ifdef PERFORMANCE_METRICS
@@ -238,7 +252,8 @@ void GPUModel::updateConnections()
    // Update Connections data
    if (connections_->updateConnections(vertices)) {
       connections_->updateEdgesWeights(Simulator::getInstance().getTotalVertices(), vertices, edges,
-                                       allVerticesDevice_, allEdgesDevice_, getLayout());
+                                       allVerticesDevice_, allEdgesDevice_, getLayout(),
+                                       simulationStream_);
       // create edge index map
       connections_->createEdgeIndexMap();
       // copy index map to the device memory
diff --git a/Simulator/Core/GPUModel.h b/Simulator/Core/GPUModel.h
index 0a9562e40..4dd20ca70 100644
--- a/Simulator/Core/GPUModel.h
+++ b/Simulator/Core/GPUModel.h
@@ -25,7 +25,11 @@
 #include "AllSpikingNeurons.h"
 #include "AllSpikingSynapses.h"
 #include "AllVertices.h"
+#include "AsyncPhilox_d.h"
 #include "OperationManager.h"
+#ifdef USE_GPU
+   #include <cuda_runtime.h>
+#endif
 
 #ifdef VALIDATION_MODE
    #include <fstream>
@@ -115,6 +119,15 @@ class GPUModel : public Model {
    /// Pointer to device random noise array.
    float *randNoise_d;
 
+   /// Async RNG class instance used to load generator with a seed and request noise device pointers.
+   AsyncPhilox_d AsyncGenerator_;
+#ifdef VALIDATION_MODE
+   /// Buffer used in the validation mode to copy cgraphitti's noise generation into the device noise buffer.
+   float *randNoise_h;
+#endif
+   /// Cuda Stream for kernel use
+   cudaStream_t simulationStream_;
+
 #if defined(USE_GPU)
    /// Pointer to edge index map in device memory.
    EdgeIndexMapDevice *edgeIndexMapDevice_;
@@ -144,10 +157,10 @@ class GPUModel : public Model {
                    Coordinate dest, BGFLOAT deltaT, edgeType type);
 };
 
-#if defined(__CUDACC__)
-extern "C" {
-void normalMTGPU(float *randNoise_d);
-void initMTGPU(unsigned int seed, unsigned int blocks, unsigned int threads, unsigned int nPerRng,
-               unsigned int mt_rng_count);
-}
-#endif
\ No newline at end of file
+// #if defined(__CUDACC__)
+// extern "C" {
+// void normalMTGPU(float *randNoise_d);
+// void initMTGPU(unsigned int seed, unsigned int blocks, unsigned int threads, unsigned int nPerRng,
+//                unsigned int mt_rng_count);
+// }
+// #endif
\ No newline at end of file
diff --git a/Simulator/Core/Serializer.cpp b/Simulator/Core/Serializer.cpp
index 2911ffab8..d1b2c0f7f 100644
--- a/Simulator/Core/Serializer.cpp
+++ b/Simulator/Core/Serializer.cpp
@@ -25,7 +25,9 @@
 
 #include "Serializer.h"
 #include "ConnGrowth.h"
-#include "GPUModel.h"
+#if defined(USE_GPU)
+   #include "GPUModel.h"
+#endif
 #include <fstream>
 
 // About CEREAL_XML_STRING_VALUE
diff --git a/Simulator/Core/Simulator.cpp b/Simulator/Core/Simulator.cpp
index e4a42d9df..593cd5c48 100644
--- a/Simulator/Core/Simulator.cpp
+++ b/Simulator/Core/Simulator.cpp
@@ -9,7 +9,9 @@
 
 #include "Simulator.h"
 #include "CPUModel.h"
-#include "GPUModel.h"
+#if defined(USE_GPU)
+   #include "GPUModel.h"
+#endif
 #include "OperationManager.h"
 #include "ParameterManager.h"
 #include "Utils/Factory.h"
@@ -173,7 +175,7 @@ void Simulator::simulate()
       double total_time = timer.lap() / 1000000.0;
 
       cout << "\ntotal_time: " << total_time << " seconds" << endl;
-      printPerformanceMetrics(total_time, currentEpoch);
+      printPerformanceMetrics(total_time, currentEpoch_);
       cout << endl;
 #endif
    }
diff --git a/Simulator/Edges/AllEdges.cpp b/Simulator/Edges/AllEdges.cpp
index 802e4c183..27dd694f7 100644
--- a/Simulator/Edges/AllEdges.cpp
+++ b/Simulator/Edges/AllEdges.cpp
@@ -239,6 +239,20 @@ void AllEdges::createEdgeIndexMap(EdgeIndexMap &edgeIndexMap)
    }
 }
 
+#if defined(USE_GPU)
+/// Set the CUDA stream to be used by GPU edge kernels in derived classes.
+///
+/// This assigns a CUDA stream to the base class, allowing subclasses
+/// (e.g., AllSpikingSynapses_d, AllSTDPSynapses_d) to launch kernels on
+/// the correct stream. The stream is typically created by GPUModel and
+/// passed down during simulation setup.
+///
+/// @param simulationStream A valid CUDA stream (`cudaStream_t`) managed by the caller.
+void AllEdges::SetStream(cudaStream_t simulationStream)
+{
+   simulationStream_ = simulationStream;
+}
+#endif
 
 #if !defined(USE_GPU)
 
diff --git a/Simulator/Edges/AllEdges.h b/Simulator/Edges/AllEdges.h
index 69f9e8f7c..cdf0c2b6f 100644
--- a/Simulator/Edges/AllEdges.h
+++ b/Simulator/Edges/AllEdges.h
@@ -15,6 +15,9 @@
 #include <vector>
 // cereal
 #include "cereal/types/vector.hpp"
+#ifdef USE_GPU
+   #include <cuda_runtime.h>
+#endif
 
 class AllVertices;
 struct AllEdgesDeviceProperties;
@@ -92,7 +95,20 @@ class AllEdges {
    log4cplus::Logger edgeLogger_;
 
 #if defined(USE_GPU)
+   ///  Cuda Stream for Edge Kernels
+   cudaStream_t simulationStream_;
+
 public:
+   /// Set the CUDA stream to be used by GPU edge kernels in derived classes.
+   ///
+   /// This assigns a CUDA stream to the base class, allowing subclasses
+   /// (e.g., AllSpikingSynapses_d, AllSTDPSynapses_d) to launch kernels on
+   /// the correct stream. The stream is typically created by GPUModel and
+   /// passed down during simulation setup.
+   ///
+   /// @param simulationStream A valid CUDA stream (`cudaStream_t`) managed by the caller.
+   void SetStream(cudaStream_t simulationStream);
+
    ///  Allocate GPU memories to store all edges' states,
    ///  and copy them from host to GPU memory.
    virtual void allocEdgeDeviceStruct() = 0;
diff --git a/Simulator/Edges/Neuro/AllSTDPSynapses_d.cpp b/Simulator/Edges/Neuro/AllSTDPSynapses_d.cpp
index 2d91d1f11..a7c3804fc 100644
--- a/Simulator/Edges/Neuro/AllSTDPSynapses_d.cpp
+++ b/Simulator/Edges/Neuro/AllSTDPSynapses_d.cpp
@@ -264,7 +264,7 @@ void AllSTDPSynapses::advanceEdges(void *allEdgesDevice, void *allVerticesDevice
    const int threadsPerBlock = 256;
    int blocksPerGrid = (totalEdgeCount_ + threadsPerBlock - 1) / threadsPerBlock;
    // Advance synapses ------------->
-   advanceSTDPSynapsesDevice<<<blocksPerGrid, threadsPerBlock>>>(
+   advanceSTDPSynapsesDevice<<<blocksPerGrid, threadsPerBlock, 0, simulationStream_>>>(
       totalEdgeCount_, (EdgeIndexMapDevice *)edgeIndexMapDevice, g_simulationStep,
       Simulator::getInstance().getDeltaT(), (AllSTDPSynapsesDeviceProperties *)allEdgesDevice,
       (AllSpikingNeuronsDeviceProperties *)allVerticesDevice, maxSpikes);
diff --git a/Simulator/Edges/Neuro/AllSpikingSynapses_d.cpp b/Simulator/Edges/Neuro/AllSpikingSynapses_d.cpp
index e6bb33be0..a6954cf82 100644
--- a/Simulator/Edges/Neuro/AllSpikingSynapses_d.cpp
+++ b/Simulator/Edges/Neuro/AllSpikingSynapses_d.cpp
@@ -331,7 +331,7 @@ void AllSpikingSynapses::advanceEdges(void *allEdgesDevice, void *allVerticesDev
    const int threadsPerBlock = 256;
    int blocksPerGrid = (totalEdgeCount_ + threadsPerBlock - 1) / threadsPerBlock;
    // Advance synapses ------------->
-   advanceSpikingSynapsesDevice<<<blocksPerGrid, threadsPerBlock>>>(
+   advanceSpikingSynapsesDevice<<<blocksPerGrid, threadsPerBlock, 0, simulationStream_>>>(
       totalEdgeCount_, (EdgeIndexMapDevice *)edgeIndexMapDevice, g_simulationStep,
       Simulator::getInstance().getDeltaT(), (AllSpikingSynapsesDeviceProperties *)allEdgesDevice);
 }
diff --git a/Simulator/Utils/Book.h b/Simulator/Utils/Book.h
index f71b1e952..c0dbb6919 100644
--- a/Simulator/Utils/Book.h
+++ b/Simulator/Utils/Book.h
@@ -22,6 +22,7 @@
 
 #pragma once
 
+#include <cuda_runtime.h>
 #include <stdio.h>
 //! CUDA Exception handler
 static void HandleError(cudaError_t err, const char *file, int line)
diff --git a/Simulator/Utils/RNG/AsyncPhilox_d.cu b/Simulator/Utils/RNG/AsyncPhilox_d.cu
new file mode 100644
index 000000000..a9b61c2fb
--- /dev/null
+++ b/Simulator/Utils/RNG/AsyncPhilox_d.cu
@@ -0,0 +1,183 @@
+/**
+ * @file AsyncPhilox_d.cu
+ * 
+ * @ingroup Simulator/Utils/RNG
+ * 
+ * @brief Asynchronous Philox RNG using curand to fill GPU buffers
+ * 
+ * AsyncPhilox_d class maintains two large GPU buffers for noise.
+ * GPUModel calls loadAsyncPhilox to initialize states and
+ * fill the buffers, then, each advance requestSegment
+ * returns a float* slice of a buffer for use in
+ * advanceVertices
+ */
+
+#include "AsyncPhilox_d.h"
+#include "NvtxHelper.h"
+#include <cassert>
+#include <chrono>
+#include <iostream>
+
+/// @brief Kernel to generate Gaussian (normal) random numbers using Philox.
+///
+/// Each thread loads its own Philox RNG state, generates one or more
+/// random floats using a strided loop, and writes them into the output buffer.
+/// The updated RNG state is written back to global device memory.
+///
+/// @param states      Array of Philox RNG states, one per thread.
+/// @param output      Output buffer for generated random floats.
+/// @param bufferSize  Total number of floats to generate (length of output buffer).
+__global__ void generatePhilox(curandStatePhilox4_32_10_t *states, float *output, int bufferSize)
+{
+   // Compute a unique global index for this thread
+   int threadId = threadIdx.x;
+   int blockId = blockIdx.x;
+   int threadsPerBlock = blockDim.x;
+   int totalThreads = gridDim.x * threadsPerBlock;
+   int gid = blockId * threadsPerBlock + threadId;
+
+   // Load this thread’s Philox state
+   curandStatePhilox4_32_10_t local = states[gid];
+
+   // Stride‐loop: write one random per iteration until we cover bufferSize
+   for (int idx = gid; idx < bufferSize; idx += totalThreads) {
+      output[idx] = curand_normal(&local);
+   }
+
+   // Store back the updated state
+   states[gid] = local;
+}
+
+/// @brief Kernel to initialize Philox RNG states for each thread.
+///
+/// Each thread initializes its entry in the RNG state array using a fixed seed.
+/// This is typically called once before generating random numbers.
+///
+/// @param states        Array to hold initialized Philox RNG states.
+/// @param seed          Seed value used to initialize curand.
+/// @param totalThreads  Total number of threads that will use RNG states.
+__global__ void initPhilox(curandStatePhilox4_32_10_t *states, unsigned long seed, int totalThreads)
+{
+   int gid = blockIdx.x * blockDim.x + threadIdx.x;
+   if (gid >= totalThreads)
+      return;
+   curand_init(seed, gid, 0, &states[gid]);
+}
+
+/// Initializes generator and allocates device memory
+/// @param samplesPerSegment Number of total vertices
+/// @param seed RNG seed.
+void AsyncPhilox_d::loadAsyncPhilox(int samplesPerSegment, unsigned long seed)
+{
+   // hostBuffer = nullptr;
+   // cudaHostAlloc(&hostBuffer, samplesPerSegment * sizeof(float), cudaHostAllocDefault);
+   // logfile = std::fopen("philox_output_32_10.bin", "wb");
+   //consoleLogger_ = log4cplus::Logger::getInstance(LOG4CPLUS_TEXT("console"));
+   segmentSize_ = samplesPerSegment;
+   seed_ = seed;
+   currentBuffer_ = 0;
+   segmentIndex_ = 0;
+
+   totalSegments_ = 10;
+
+#ifdef ENABLE_NVTX
+   nvtxMarker_ = 10000 / totalSegments;   // make a marker every nvtxMarker buffer fills;
+   nvtxCurrentMarker_ = nvtxMarker_;      // count down to color flip
+#endif
+   bufferSize_ = segmentSize_ * totalSegments_;
+   numBlocks_ = 64;   //placeholder num of blocks
+   numThreads_ = 64;
+
+   totalThreads_ = numThreads_ * numBlocks_;
+
+   int leastPriority, greatestPriority;
+   HANDLE_ERROR(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
+   // └─ leastPriority is the numerically largest value → lowest actual priority
+   // └─ greatestPriority is the numerically smallest value → highest actual priority
+
+   // Create internal stream
+   HANDLE_ERROR(cudaStreamCreateWithPriority(&RNG_stream_, cudaStreamNonBlocking, leastPriority));
+
+   // Allocate two large buffers
+   HANDLE_ERROR(cudaMalloc(&buffers_d[0], bufferSize_ * sizeof(float)));
+   HANDLE_ERROR(cudaMalloc(&buffers_d[1], bufferSize_ * sizeof(float)));
+
+   HANDLE_ERROR(cudaMalloc(&spStates_d, totalThreads_ * sizeof(curandStatePhilox4_32_10_t)));
+
+   initPhilox<<<totalThreads_ + 255 / 256, 256, 0, RNG_stream_>>>(spStates_d, seed_, totalThreads_);
+
+   // Pre-fill both buffers
+   fillBuffer(0);
+   fillBuffer(1);
+   HANDLE_ERROR(cudaStreamSynchronize(
+      RNG_stream_));   //wait for both buffers to be filled before the first request
+}
+
+/// Free device memory
+void AsyncPhilox_d::deleteDeviceStruct()
+{
+   // std::fclose(logfile);
+   // cudaFree(hostBuffer);
+   HANDLE_ERROR(cudaFree(buffers_d[0]));
+   HANDLE_ERROR(cudaFree(buffers_d[1]));
+   HANDLE_ERROR(cudaFree(spStates_d));
+
+   HANDLE_ERROR(cudaStreamDestroy(RNG_stream_));
+}
+
+AsyncPhilox_d::~AsyncPhilox_d()
+{
+}
+
+/// Request a new segment of generated noise.
+/// @return Pointer to a slice of device memory containing noise.
+float *AsyncPhilox_d::requestSegment()
+{
+   //LOG4CPLUS_TRACE(consoleLogger_, "request segment");
+   //auto start = std::chrono::high_resolution_clock::now();
+#ifdef ENABLE_NVTX
+   static bool flipColor;
+#endif
+   if (segmentIndex_ >= totalSegments_) {
+      // Switch buffer and launch async refill on the now-unused one
+
+#ifdef ENABLE_NVTX
+      if (nvtxCurrentMarker_ <= 0) {
+         nvtxPop();
+         if (flipColor == true)
+            nvtxPushColor("10,000 time steps", Color::RED);
+         else
+            nvtxPushColor("10,000 time steps", Color::BLUE);
+
+         flipColor = !flipColor;
+         nvtxCurrentMarker_ = nvtxMarker_;
+      } else
+         --nvtxCurrentMarker_;
+#endif
+
+      int refillBuffer = currentBuffer_;
+      currentBuffer_ = 1 - currentBuffer_;
+      segmentIndex_ = 0;
+      cudaStreamSynchronize(RNG_stream_);   // Ensure refillBuffer is done
+      fillBuffer(refillBuffer);
+   }
+
+   float *segmentPtr = buffers_d[currentBuffer_] + segmentIndex_ * segmentSize_;
+   segmentIndex_ += 1;
+
+   // auto end = std::chrono::high_resolution_clock::now();
+   // std::cout << "Segment: " << segmentIndex << ", Launch time: " << (end - start).count() << " ns\n";
+   // cudaMemcpy(hostBuffer, segmentPtr, segmentSize * sizeof(float), cudaMemcpyDeviceToHost);
+   // std::fwrite(hostBuffer, sizeof(float), segmentSize, logfile);
+
+   return segmentPtr;
+}
+
+/// Internal helper to fill a specified buffer with random floats.
+/// @param bufferIndex Index (0 or 1) of the buffer to fill.
+void AsyncPhilox_d::fillBuffer(int bufferIndex)
+{
+   //LOG4CPLUS_TRACE(consoleLogger_, "filling buffer:");
+   generatePhilox<<<numBlocks_, numThreads_, 0, RNG_stream_>>>(spStates_d, buffers_d[bufferIndex],
+                                                               bufferSize_);
+}
diff --git a/Simulator/Utils/RNG/AsyncPhilox_d.h b/Simulator/Utils/RNG/AsyncPhilox_d.h
new file mode 100644
index 000000000..91c2a5b4c
--- /dev/null
+++ b/Simulator/Utils/RNG/AsyncPhilox_d.h
@@ -0,0 +1,93 @@
+/**
+ * @file AsyncPhilox_d.h
+ * 
+ * @ingroup Simulator/Utils/RNG
+ * 
+ * @brief Asynchronous Philox RNG using curand to fill GPU buffers
+ * 
+ * AsyncPhilox_d class maintains two large GPU buffers for noise.
+ * GPUModel calls loadAsyncPhilox to initialize states and
+ * fill the buffers, then, each advance requestSegment
+ * returns a float* slice of a buffer for use in
+ * advanceVertices
+ */
+
+#pragma once
+#include "Book.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cuda_runtime.h>
+#include <curand_kernel.h>
+#include <log4cplus/loggingmacros.h>
+class AsyncPhilox_d {
+public:
+   AsyncPhilox_d() = default;
+
+   ~AsyncPhilox_d();
+
+   /// Initializes generator and allocates device memory
+   /// @param samplesPerSegment Number of total vertices
+   /// @param seed RNG seed.
+   void loadAsyncPhilox(int samplesPerSegment, unsigned long seed);
+
+   /// Free device memory
+   void deleteDeviceStruct();
+
+   /// Request a new segment of generated noise.
+   /// @return Pointer to a slice of device memory containing noise.
+   float *requestSegment();
+
+private:
+   /// Number of CUDA blocks to launch per kernel call.
+   int numBlocks_;
+
+   /// Number of threads per CUDA block.
+   int numThreads_;
+
+   /// Total number of threads = numBlocks × numThreads.
+   int totalThreads_;
+
+   /// Number of random floats per segment.
+   int segmentSize_;
+
+   /// Number of total segments in each buffer.
+   int totalSegments_;
+
+   /// Number of random floats per buffer.
+   int bufferSize_;
+
+   /// RNG seed.
+   unsigned long seed_;
+
+#ifdef ENABLE_NVTX
+   /// Marker index for NVTX profiling (if enabled).
+   int nvtxMarker_;
+
+   /// Tracks current NVTX marker for alternating regions.
+   int nvtxCurrentMarker_;
+#endif
+
+   /// CUDA stream used for asynchronous kernel launches.
+   cudaStream_t RNG_stream_;
+
+   /// Double-buffered random number output on device.
+   float *buffers_d[2];
+
+   /// Index of currently active buffer.
+   int currentBuffer_;
+
+   /// Index of the next segment to serve.
+   int segmentIndex_;
+
+   /// Device-side array of Philox curand RNG states.
+   curandStatePhilox4_32_10_t *spStates_d;
+
+   // FILE* logfile;
+   // float* hostBuffer;
+   /// Logger for printing to the console as well as the logging file
+   log4cplus::Logger consoleLogger_;
+
+   /// Internal helper to fill a specified buffer with random floats.
+   /// @param bufferIndex Index (0 or 1) of the buffer to fill.
+   void fillBuffer(int bufferIndex);
+};
diff --git a/Simulator/Utils/RNG/MersenneTwister_d.cu b/Simulator/Utils/RNG/MersenneTwister_d.cu
index e90151495..2c5c60345 100644
--- a/Simulator/Utils/RNG/MersenneTwister_d.cu
+++ b/Simulator/Utils/RNG/MersenneTwister_d.cu
@@ -260,5 +260,4 @@ extern "C" void initMTGPU(unsigned int seed, unsigned int blocks, unsigned int t
 
 	loadMTGPU(MT_DATAFILE);
 	seedMTGPU(seed);
-}
-
+}
\ No newline at end of file
diff --git a/Simulator/Utils/RNG/MersenneTwister_d.h b/Simulator/Utils/RNG/MersenneTwister_d.h
index 04f1cd42b..7f7c888e8 100644
--- a/Simulator/Utils/RNG/MersenneTwister_d.h
+++ b/Simulator/Utils/RNG/MersenneTwister_d.h
@@ -50,4 +50,4 @@ struct mt_struct_stripped {
 #define MT_SHIFTC 15
 #define MT_SHIFT1 18
 
-//#endif
+//#endif
\ No newline at end of file
diff --git a/Simulator/Vertices/AllVertices.cpp b/Simulator/Vertices/AllVertices.cpp
index 8c20235f0..55b2ba160 100644
--- a/Simulator/Vertices/AllVertices.cpp
+++ b/Simulator/Vertices/AllVertices.cpp
@@ -89,4 +89,19 @@ void AllVertices::loadEpochInputs(uint64_t currentStep, uint64_t endStep)
 {
    // This is an empty implementation so that Neural Network simulation works
    // normally
-}
\ No newline at end of file
+}
+
+
+#ifdef USE_GPU
+/// Set the CUDA stream to be used by GPU vertices kernels in derived classes.
+///
+/// This assigns a CUDA stream to the base class, allowing subclasses
+/// to launch kernels on the correct stream. The stream is typically
+/// created by GPUModel and passed down during simulation setup.
+///
+/// @param simulationStream A valid CUDA stream (`cudaStream_t`) managed by the caller.
+void AllVertices::SetStream(cudaStream_t simulationStream)
+{
+   simulationStream_ = simulationStream;
+}
+#endif
\ No newline at end of file
diff --git a/Simulator/Vertices/AllVertices.h b/Simulator/Vertices/AllVertices.h
index 73c013318..7182adfa7 100644
--- a/Simulator/Vertices/AllVertices.h
+++ b/Simulator/Vertices/AllVertices.h
@@ -32,6 +32,9 @@ using namespace std;
 #include <log4cplus/loggingmacros.h>
 // cereal
 #include "cereal/types/vector.hpp"
+#if defined(USE_GPU)
+   #include <cuda_runtime.h>
+#endif
 
 // Utility function to convert a vertexType into a string.
 string vertexTypeToString(vertexType t);
@@ -95,7 +98,19 @@ class AllVertices {
    log4cplus::Logger vertexLogger_;   // Logs to Output/Debug/neurons.txt
 
 #if defined(USE_GPU)
+   ///  Cuda Stream for Edge Kernels
+   cudaStream_t simulationStream_;
+
 public:
+   /// Set the CUDA stream to be used by GPU vertices kernels in derived classes.
+   ///
+   /// This assigns a CUDA stream to the base class, allowing subclasses
+   /// to launch kernels on the correct stream. The stream is typically
+   /// created by GPUModel and passed down during simulation setup.
+   ///
+   /// @param simulationStream A valid CUDA stream (`cudaStream_t`) managed by the caller.
+   void SetStream(cudaStream_t simulationStream);
+
    ///  Allocate GPU memories to store all vertices' states,
    ///  and copy them from host to GPU memory.
    virtual void allocVerticesDeviceStruct() = 0;
diff --git a/Simulator/Vertices/Neuro/AllIZHNeurons_d.cpp b/Simulator/Vertices/Neuro/AllIZHNeurons_d.cpp
index faf05169e..f5325c54c 100644
--- a/Simulator/Vertices/Neuro/AllIZHNeurons_d.cpp
+++ b/Simulator/Vertices/Neuro/AllIZHNeurons_d.cpp
@@ -166,7 +166,7 @@ void AllIZHNeurons::advanceVertices(AllEdges &synapses, void *allVerticesDevice,
    int blocksPerGrid = (vertex_count + threadsPerBlock - 1) / threadsPerBlock;
 
    // Advance neurons ------------->
-   advanceIZHNeuronsDevice<<<blocksPerGrid, threadsPerBlock>>>(
+   advanceIZHNeuronsDevice<<<blocksPerGrid, threadsPerBlock, 0, simulationStream_>>>(
       vertex_count, Simulator::getInstance().getMaxEdgesPerVertex(), maxSpikes,
       Simulator::getInstance().getDeltaT(), g_simulationStep, randNoise, hasFired_,
       summationPoints_, Vm_, Aconst_, Bconst_, u_, numStepsInRefractoryPeriod_, Vthresh_, Trefract_,
diff --git a/Simulator/Vertices/Neuro/AllLIFNeurons_d.cpp b/Simulator/Vertices/Neuro/AllLIFNeurons_d.cpp
index 1af6c6b2f..62648ef16 100644
--- a/Simulator/Vertices/Neuro/AllLIFNeurons_d.cpp
+++ b/Simulator/Vertices/Neuro/AllLIFNeurons_d.cpp
@@ -56,7 +56,7 @@ void AllLIFNeurons::advanceVertices(AllEdges &synapses, void *allVerticesDevice,
    int blocksPerGrid = (vertex_count + threadsPerBlock - 1) / threadsPerBlock;
 
    // Advance neurons ------------->
-   advanceLIFNeuronsDevice<<<blocksPerGrid, threadsPerBlock>>>(
+   advanceLIFNeuronsDevice<<<blocksPerGrid, threadsPerBlock, 0, simulationStream_>>>(
       vertex_count, Simulator::getInstance().getMaxEdgesPerVertex(), maxSpikes,
       Simulator::getInstance().getDeltaT(), g_simulationStep, randNoise, hasFired_,
       summationPoints_, Vm_, Trefract_, numStepsInRefractoryPeriod_, Vthresh_, Vreset_, I0_,
diff --git a/Simulator/Vertices/Neuro/AllSpikingNeurons_d.cpp b/Simulator/Vertices/Neuro/AllSpikingNeurons_d.cpp
index 91edc75f1..9b0af580e 100644
--- a/Simulator/Vertices/Neuro/AllSpikingNeurons_d.cpp
+++ b/Simulator/Vertices/Neuro/AllSpikingNeurons_d.cpp
@@ -200,7 +200,7 @@ void AllSpikingNeurons::integrateVertexInputs(void *allVerticesDevice,
       = (Simulator::getInstance().getTotalVertices() + threadsPerBlock - 1) / threadsPerBlock;
    int vertex_count = Simulator::getInstance().getTotalVertices();
 
-   calcSummationPointDevice<<<blocksPerGrid, threadsPerBlock>>>(
+   calcSummationPointDevice<<<blocksPerGrid, threadsPerBlock, 0, simulationStream_>>>(
       vertex_count, summationPoints_, edgeIndexMapDevice,
       (AllSpikingSynapsesDeviceProperties *)allEdgesDevice);
 }
diff --git a/docs/Developer/AsyncPhilox.md b/docs/Developer/AsyncPhilox.md
new file mode 100644
index 000000000..8117c8c97
--- /dev/null
+++ b/docs/Developer/AsyncPhilox.md
@@ -0,0 +1,40 @@
+# AsyncPhilox_d Class
+
+## Overview
+
+`AsyncPhilox_d` is a GPU-based random number generator class that uses NVIDIA's [CURAND](https://docs.nvidia.com/cuda/curand/index.html) library and the **Philox** counter-based RNG engine. It is designed for high-throughput simulations and supports asynchronous random number generation via an internal CUDA stream.
+
+The class provides a **double-buffered**, asynchronous mechanism to produce random floating-point numbers (normally distributed) directly on the GPU. This enables overlapping random number generation with compute or memory transfer tasks in other streams.
+
+---
+
+## Purpose
+
+`AsyncPhilox_d` enables:
+
+- **Per-thread RNG state initialization** on the GPU
+- **Segmented random number generation** using `curand_normal`
+- **Double-buffering** for non-blocking buffer filling
+- **Asynchronous execution** using a CUDA stream created and managed internally
+
+This design improves parallelism and simulation throughput by decoupling RNG generation from synchronous host and device execution.
+
+---
+
+## CURAND Philox Generator
+
+Philox is a **counter-based** RNG suitable for parallel applications. It is:
+
+- **Stateless** across launches (state only encodes seed, counter, and thread ID)
+- **Efficient** on GPUs due to its low register and instruction overhead
+- **Deterministic**, producing reproducible sequences across threads
+
+NVIDIA’s CURAND provides the Philox generator via the `curandStatePhilox4_32_10_t` type, which is initialized on a per-thread basis using `curand_init`.
+
+You can find more details in the [CURAND Device API Overview](https://docs.nvidia.com/cuda/curand/device-api-overview.html).
+
+
+## CUDA Documentation Links
+
+- [CURAND API Reference](https://docs.nvidia.com/cuda/curand/index.html)
+- [Philox Generator in CURAND](https://docs.nvidia.com/cuda/curand/device-api-overview.html#bit-generation-3)
diff --git a/docs/Developer/index.md b/docs/Developer/index.md
index 770612418..a48dc6c7f 100644
--- a/docs/Developer/index.md
+++ b/docs/Developer/index.md
@@ -46,7 +46,7 @@ Students, use this [quickstart guide](StudentSetup.md) to help setup, use, and d
 - [Neuro Implementation](NeuroImplementation.md)
 - [GraphManager and InputManager classes](GraphAndEventInputs.md)
 - [Configuration](../User/configuration.md)
-
+- [AsyncPhilox RNG class](AsyncPhilox.md)
 
 ---------
 [<< Go back to the Graphitti home page](../index.md)