diff --git a/PYTHON/cuda_version_check.cpp b/PYTHON/cuda_version_check.cpp
new file mode 100644
index 000000000..caef05890
--- /dev/null
+++ b/PYTHON/cuda_version_check.cpp
@@ -0,0 +1,84 @@
+#include <iostream>
+#include <cuda_runtime.h>
+#include <cuda.h>
+
+int main() {
+    std::cout << "=== CUDA Version Information ===" << std::endl;
+
+    // Get CUDA Runtime Version
+    int runtimeVersion;
+    cudaError_t runtimeResult = cudaRuntimeGetVersion(&runtimeVersion);
+    if (runtimeResult == cudaSuccess) {
+        int runtimeMajor = runtimeVersion / 1000;
+        int runtimeMinor = (runtimeVersion % 1000) / 10;
+        std::cout << "CUDA Runtime Version: " << runtimeMajor << "." << runtimeMinor
+                  << " (raw: " << runtimeVersion << ")" << std::endl;
+    } else {
+        std::cout << "Error getting CUDA Runtime version: "
+                  << cudaGetErrorString(runtimeResult) << std::endl;
+    }
+
+    // Get CUDA Driver Version
+    int driverVersion;
+    cudaError_t driverResult = cudaDriverGetVersion(&driverVersion);
+    if (driverResult == cudaSuccess) {
+        int driverMajor = driverVersion / 1000;
+        int driverMinor = (driverVersion % 1000) / 10;
+        std::cout << "CUDA Driver Version: " << driverMajor << "." << driverMinor
+                  << " (raw: " << driverVersion << ")" << std::endl;
+    } else {
+        std::cout << "Error getting CUDA Driver version: "
+                  << cudaGetErrorString(driverResult) << std::endl;
+    }
+
+    // Check compatibility
+    if (driverResult == cudaSuccess && runtimeResult == cudaSuccess) {
+        std::cout << "\nCompatibility Check:" << std::endl;
+        if (driverVersion >= runtimeVersion) {
+            std::cout << "✓ Driver and runtime versions are compatible" << std::endl;
+        } else {
+            std::cout << "✗ WARNING: Driver version is older than runtime!" << std::endl;
+            std::cout << "  This may cause cudaErrorInsufficientDriver errors" << std::endl;
+        }
+    }
+
+    // Get device information
+    int deviceCount;
+    cudaError_t deviceResult = cudaGetDeviceCount(&deviceCount);
+    if (deviceResult == cudaSuccess) {
+        std::cout << "\n=== Device Information ===" << std::endl;
+        std::cout << "Number of CUDA devices: " << deviceCount << std::endl;
+
+        for (int i = 0; i < deviceCount; i++) {
+            cudaDeviceProp prop;
+            cudaError_t propResult = cudaGetDeviceProperties(&prop, i);
+            if (propResult == cudaSuccess) {
+                std::cout << "Device " << i << ": " << prop.name << std::endl;
+                std::cout << "  Compute Capability: " << prop.major << "." << prop.minor << std::endl;
+                std::cout << "  Total Memory: " << prop.totalGlobalMem / (1024*1024*1024) << " GB" << std::endl;
+                std::cout << "  Multiprocessors: " << prop.multiProcessorCount << std::endl;
+            }
+        }
+    } else {
+        std::cout << "Error getting device count: "
+                  << cudaGetErrorString(deviceResult) << std::endl;
+    }
+
+    // Alternative method using CUDA Driver API directly
+    std::cout << "\n=== Alternative Driver API Check ===" << std::endl;
+    CUresult cuResult = cuInit(0);
+    if (cuResult == CUDA_SUCCESS) {
+        int cuDriverVersion;
+        cuResult = cuDriverGetVersion(&cuDriverVersion);
+        if (cuResult == CUDA_SUCCESS) {
+            int cuDriverMajor = cuDriverVersion / 1000;
+            int cuDriverMinor = (cuDriverVersion % 1000) / 10;
+            std::cout << "CUDA Driver Version (Driver API): " << cuDriverMajor << "." << cuDriverMinor
+                      << " (raw: " << cuDriverVersion << ")" << std::endl;
+        }
+    } else {
+        std::cout << "Failed to initialize CUDA Driver API" << std::endl;
+    }
+
+    return 0;
+}
diff --git a/PYTHON/hello-nvshmem.py b/PYTHON/hello-nvshmem.py
new file mode 100644
index 000000000..d4b10dbb8
--- /dev/null
+++ b/PYTHON/hello-nvshmem.py
@@ -0,0 +1,55 @@
+import numpy
+from mpi4py import MPI
+from cuda.core.experimental import Device
+from cuda.core.experimental import system
+import nvshmem.core as nvshmem
+
+# Initialize MPI
+comm = MPI.COMM_WORLD
+me = comm.Get_rank()
+np = comm.Get_size()
+
+# Initialize NVSHMEM with MPI
+dev = Device(me % system.num_devices)
+dev.set_current()
+nvshmem.init(device=dev, mpi_comm=comm, initializer_method="mpi")
+
+#uid = nvshmem.get_unique_id(empty=(me != 0))
+#comm.Bcast(uid._data.view(numpy.int8), root=0)
+#dev = Device()
+#dev.set_current()
+#nvshmem.init(device=dev, uid=uid, rank=me, nranks=np, initializer_method="uid")
+
+#dev = Device(me % system.num_devices)
+#dev.set_current()
+#nvshmem.init(device=dev, mpi_comm=comm, initializer_method="emulated_mpi")
+
+stream = dev.create_stream()
+
+# Get information about the current PE
+my_pe = nvshmem.my_pe()
+n_pes = nvshmem.n_pes()
+
+# Allocate symmetric memory
+# array() returns a CuPy NDArray object
+x = nvshmem.array((1024,), dtype="float32")
+y = nvshmem.array((1024,), dtype="float32")
+
+#if my_pe == 0:
+#    y[:] = 1.0
+
+# Perform communication operations
+# Put y from PE 0 into x on PE 1
+if my_pe == 0:
+    nvshmem.put(x, y, 1, stream=stream)
+
+# Synchronize PEs
+nvshmem.barrier(nvshmem.Teams.TEAM_WORLD,stream=stream)
+stream.sync()
+
+# Clean up
+nvshmem.free_array(x)
+nvshmem.free_array(y)
+nvshmem.finalize()
+print('OK')
+
diff --git a/PYTHON/nstream-cupy-nvshmem.py b/PYTHON/nstream-cupy-nvshmem.py
new file mode 100755
index 000000000..a5335999e
--- /dev/null
+++ b/PYTHON/nstream-cupy-nvshmem.py
@@ -0,0 +1,262 @@
+#!/usr/bin/env python3
+#
+# Copyright (c) 2020, Intel Corporation
+# Copyright (c) 2023, NVIDIA
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above
+#       copyright notice, this list of conditions and the following
+#       disclaimer in the documentation and/or other materials provided
+#       with the distribution.
+# * Neither the name of Intel Corporation nor the names of its
+#       contributors may be used to endorse or promote products
+#       derived from this software without specific prior written
+#       permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
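+
+# A typical invocation, shown for illustration only (the MPI launcher, PE
+# count, and problem size below are placeholders; any launcher that maps
+# one rank per GPU works):
+#
+#     mpirun -np 4 python3 nstream-cupy-nvshmem.py 10 100000000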
+
+#*******************************************************************
+#
+# NAME:    nstream
+#
+# PURPOSE: To compute memory bandwidth when adding a vector of a given
+#          number of double precision values to the scalar multiple of
+#          another vector of the same length, and storing the result in
+#          a third vector.
+#
+# USAGE:   The program takes as input the number
+#          of iterations to loop over the triad vectors, the length of the
+#          vectors, and the offset between vectors
+#
+#          <# iterations> <vector length>
+#
+#          The output consists of diagnostics to make sure the
+#          algorithm worked, and of timing statistics.
+#
+# NOTES:   Bandwidth is determined as the number of words read, plus the
+#          number of words written, times the size of the words, divided
+#          by the execution time. For a vector length of N, the total
+#          number of words read and written is 4*N*sizeof(double).
+#
+#
+# HISTORY: This code is loosely based on the Stream benchmark by John
+#          McCalpin, but does not follow all the Stream rules. Hence,
+#          reported results should not be associated with Stream in
+#          external publications
+#
+#          Converted to Python by Jeff Hammond, October 2017.
+#          Adapted for CuPy+NVSHMEM4Py, December 2024.
+#
+# *******************************************************************
+
+import sys
+if sys.version_info >= (3, 3):
+    from time import process_time as timer
+else:
+    from timeit import default_timer as timer
+
+from mpi4py import MPI
+
+import cupy
+
+if False:
+    print('=== CUDA Version Information ===')
+
+    try:
+        # Get CUDA runtime version
+        runtime_version = cupy.cuda.runtime.runtimeGetVersion()
+        runtime_major = runtime_version // 1000
+        runtime_minor = (runtime_version % 1000) // 10
+        print(f'CUDA Runtime Version: {runtime_major}.{runtime_minor} (raw: {runtime_version})')
+
+        # Get CUDA driver version
+        driver_version = cupy.cuda.runtime.driverGetVersion()
+        driver_major = driver_version // 1000
+        driver_minor = (driver_version % 1000) // 10
+        print(f'CUDA Driver Version: {driver_major}.{driver_minor} (raw: {driver_version})')
+
+        print(f'Version compatibility: Driver {driver_major}.{driver_minor} vs Runtime {runtime_major}.{runtime_minor}')
+
+        if driver_version < runtime_version:
+            print('WARNING: Driver version is older than runtime version!')
+            print('This can cause \"cudaErrorInsufficientDriver\" errors.')
+            print('Consider updating your NVIDIA drivers.')
+        else:
+            print('Driver and runtime versions are compatible.')
+
+    except Exception as e:
+        print(f'Error: {e}')
+        print('This usually indicates CUDA driver/runtime compatibility issues.')
+
+from cuda.core.experimental import Device
+from cuda.core.experimental import system
+
+import nvshmem.core as nvshmem
+
+def main():
+
+    # Initialize MPI and CUDA device
+    comm = MPI.COMM_WORLD
+    local_rank = comm.Get_rank() % system.num_devices
+    device = Device(local_rank)
+    device.set_current()
+    stream = device.create_stream()
+
+    # Initialize NVSHMEM with MPI
+    nvshmem.init(device=device, mpi_comm=comm, initializer_method="mpi")
+
+    me = nvshmem.my_pe()
+    np = nvshmem.n_pes()
+
+    # ********************************************************************
+    # read and test input parameters
+    # ********************************************************************
+
+    if (me==0):
+        print('Parallel Research Kernels version ') #, PRKVERSION
+        print('Python CuPy/NVSHMEM STREAM triad: A = B + scalar * C')
+
+    if len(sys.argv) != 3:
+        if (me==0):
+            print('argument count = ', len(sys.argv))
+            print("Usage: python nstream-cupy-nvshmem.py <# iterations> <vector length>")
+        nvshmem.finalize()
+        sys.exit()
+
+    iterations = int(sys.argv[1])
+    if iterations < 1:
+        if (me==0):
+            print("ERROR: iterations must be >= 1")
+        nvshmem.finalize()
+        sys.exit()
+
+    total_length = int(sys.argv[2])
+    if total_length < 1:
+        if (me==0):
+            print("ERROR: length must be positive")
+        nvshmem.finalize()
+        sys.exit()
+
+    # Distribute work across GPUs/PEs
+    length = int(total_length / np)
+    remainder = total_length % np
+    if (remainder > 0):
+        if (me < remainder):
+            length += 1
+
+    if (me==0):
+        print('Number of PEs = ', np)
+        print('Number of iterations = ', iterations)
+        print('Vector length = ', total_length)
+        print('Local vector length = ', length)
+
+    # Barrier using NVSHMEM
+    nvshmem.barrier(nvshmem.Teams.TEAM_WORLD,stream=stream)
+    stream.sync()
+
+    # ********************************************************************
+    # ** Allocate space for the input and execute STREAM triad
+    # ********************************************************************
+
+    # Allocate symmetric GPU arrays using NVSHMEM4Py interoperability with CuPy
+    A = nvshmem.interop.cupy.array((length,), dtype="float64")
+    B = nvshmem.interop.cupy.array((length,), dtype="float64")
+    C = nvshmem.interop.cupy.array((length,), dtype="float64")
+
+    # Initialize arrays
+    A[:] = 0.0
+    B[:] = 2.0
+    C[:] = 2.0
+
+    scalar = 3.0
+
+    # Timing loop
+    for k in range(0, iterations+1):
+
+        if k < 1:
+            nvshmem.barrier(nvshmem.Teams.TEAM_WORLD,stream=stream)
+            stream.sync()
+            t0 = timer()
+
+        # STREAM triad operation on GPU using CuPy operations
+        A += B + scalar * C
+        # it seems like this is required to get proper timings - maybe some weird JiT thing happening
+        nvshmem.barrier(nvshmem.Teams.TEAM_WORLD,stream=stream)
+
+    # Final synchronization
+    nvshmem.barrier(nvshmem.Teams.TEAM_WORLD,stream=stream)
+    stream.sync()
+    t1 = timer()
+    nstream_time = t1 - t0
+
+    # ********************************************************************
+    # ** Analyze and output results.
+    # ********************************************************************
+
+    # Calculate expected result
+    ar = 0.0
+    br = 2.0
+    cr = 2.0
+    for k in range(0, iterations+1):
+        ar += br + scalar * cr
+
+    ar *= total_length
+
+    # Calculate local checksum
+    asum_local = cupy.linalg.norm(A, ord=1)
+
+    # Create source and destination arrays for reduction
+    src = nvshmem.interop.cupy.array((1,), dtype="float64")
+    dst = nvshmem.interop.cupy.array((1,), dtype="float64")
+    src[0] = asum_local
+    dst[0] = 0.0
+
+    # Reduce across all PEs using NVSHMEM collective
+    nvshmem.reduce(nvshmem.Teams.TEAM_WORLD, dst, src, op="sum", stream=stream)
+    stream.sync()
+
+    asum_global = float(dst[0])
+
+    epsilon = 1.e-8
+    if abs(ar - asum_global) / asum_global > epsilon:
+        if (me == 0):
+            print('Failed Validation on output array')
+            print(' Expected checksum: ', ar)
+            print(' Observed checksum: ', asum_global)
+            print("ERROR: solution did not validate")
+    else:
+        if (me == 0):
+            print('Solution validates')
+            avgtime = nstream_time / iterations
+            nbytes = 4.0 * total_length * 8 # 8 bytes per double
+            print('Rate (GB/s): ', 1.e-9 * nbytes / avgtime, ' Avg time (s): ', avgtime)
+
+    # Free NVSHMEM arrays
+    nvshmem.free_array(A)
+    nvshmem.free_array(B)
+    nvshmem.free_array(C)
+    nvshmem.free_array(src)
+    nvshmem.free_array(dst)
+
+    # Finalize NVSHMEM
+    nvshmem.finalize()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/common/make.defs.cuda b/common/make.defs.cuda
index 27633dc01..169b4280c 100644
--- a/common/make.defs.cuda
+++ b/common/make.defs.cuda
@@ -1,7 +1,7 @@
 #
 # This file shows the CUDA toolchain options
 # for both NVHPC and GCC.
-NVHPC_PATH=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/24.9
+NVHPC_PATH=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/25.7
 #NVHPC_PATH=/proj/nv/Linux_$$(uname -m)/21.11
 #NVHPC_PATH=${HOME}/NVIDIA/hpc_sdk/Linux_$$(uname -m)/2021
 NVHPC_CBIN=${NVHPC_PATH}/compilers/bin/
@@ -163,14 +163,16 @@ CBLASFLAG=${BLASFLAG}
 # Use appropriate arch or code is compiled to ancient features.
 #NVCC=${NVHPC_CBIN}nvc++
 #NVCC=${NVHPC_CBIN}nvcc
-NVCC=/usr/local/cuda-12.6/bin/nvcc
+#NVCC=/usr/local/cuda-12/bin/nvcc
+NVCC=nvcc
 CUDAFLAGS=-g -O3 -std=c++20
 CUDAFLAGS+=--extended-lambda
-CUDAFLAGS+=--gpu-architecture=sm_89
+CUDAFLAGS+=--gpu-architecture=sm_90
 #CUDAFLAGS+=-allow-unsupported-compiler
 #CUDAFLAGS+=-ccbin=g++-13 -lm #-lstdc++
 #CUDAFLAGS+=--compiler-bindir=/opt/gcc/12.3.0/bin/
 #CUDAFLAGS+=-forward-unknown-to-host-compiler -fopenmp
+CUDAFLAGS+=--forward-unknown-to-host-compiler # rpath
 CUDAFLAGS+=-rdc=true # FIXES ptxas fatal : Unresolved extern function 'cudaCGGetIntrinsicHandle'
 #CUDAFLAGS+=-I${NVHPC_PATH}/math_libs/12.6/targets/$$(uname -m)-linux/include
 #CUDAFLAGS+=-L${NVHPC_PATH}/math_libs/12.6/targets/$$(uname -m)-linux/lib
@@ -196,7 +198,7 @@ CUDAFLAGS+=-D_X86INTRIN_H_INCLUDED
 #CUDAFLAGS+=-D_AVX512IFMAINTRIN_H_INCLUDED
 #CUDAFLAGS+=-D_AVX512IFMAVLINTRIN_H_INCLUDED
 #CUDAFLAGS+=-D_AVX512ERINTRIN_H_INCLUDED
-CUDAFLAGS += -I/usr/local/cuda-12.6/targets/x86_64-linux/include/cub/detail
+CUDAFLAGS += -I/usr/local/cuda-12/targets/x86_64-linux/include/cub/detail
 #
 # NCCL
 #
@@ -207,12 +209,15 @@ NCCLLIB=-L${NCCLDIR}/lib -lnccl
 # NVSHMEM (Apt packages not reliable...)
 #
 NVSHMEMFLAGS=-rdc=true -diag-suppress 3012,3013
-#NVSHMEMFLAGS+=-I/usr/include/nvshmem_12
-NVSHMEM_DIR=${HOME}/NVSHMEM/nvshmem/build/src
+#NVSHMEM_DIR=${HOME}/NVSHMEM/nvshmem/build/src
 #NVSHMEM_DIR=/opt/nvidia/hpc_sdk/Linux_x86_64/24.11/comm_libs/12.6/nvshmem
-NVSHMEMFLAGS+=-I${NVSHMEM_DIR}/include
-NVSHMEMFLAGS+=-L${NVSHMEM_DIR}/lib
-NVSHMEMFLAGS+=-Wl,-rpath=${NVSHMEM_DIR}/lib
+#NVSHMEMFLAGS+=-I${NVSHMEM_DIR}/include
+#NVSHMEMFLAGS+=-L${NVSHMEM_DIR}/lib
+#NVSHMEMFLAGS+=-Wl,-rpath=${NVSHMEM_DIR}/lib
+# apt or pip installs like this
+NVSHMEMFLAGS+=-I/usr/include/nvshmem_12
+NVSHMEMFLAGS+=-L/usr/lib/x86_64-linux-gnu/nvshmem/12
+NVSHMEMFLAGS+=-Wl,-rpath=/usr/lib/x86_64-linux-gnu/nvshmem/12
 NVSHMEMFLAGS+=-lnvshmem_device -lnvshmem_host
 #
 # CUDASTF
@@ -227,14 +232,15 @@ CUDASTF_CFLAGS+=-lcuda
 #
 # mpiicc wraps icc. mpicc and mpigcc wrap gcc.
 #MPIDIR=${NVHPC_PATH}/comm_libs/hpcx
-MPIDIR=${NVHPC_PATH}/comm_libs/12.6/openmpi4/latest
+#MPIDIR=${NVHPC_PATH}/comm_libs/12.9/openmpi4/latest
+MPIDIR=/usr/local
 MPICC=${MPIDIR}/bin/mpicc
 MPICXX=${MPIDIR}/bin/mpicxx
 MPIFORT=${MPIDIR}/bin/mpifort
 MPIINC=-I${MPIDIR}/include
 MPILIB=-L${MPIDIR}/lib -lmpi
 #MPILIB+=-Wl,-rpath -Wl,${MPIDIR}/lib -Wl,--enable-new-dtags # NVCC chokes on -Wl
-MPILIB+=-lopen-pal -lopen-rte
+#MPILIB+=-lopen-pal -lopen-rte
 #MPILIB=-L/usr/local/opt/libevent/lib -L${MPIDIR}/lib -lmpi
 #MPIINC=-I/usr/include/mpich-3.2-$$(uname -m)
 #MPILIB=-L/usr/lib64/mpich-3.2/lib -lmpi
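+#
+# To check which MPI installation the wrappers above actually resolve to,
+# the wrappers can print their underlying command line (illustrative check;
+# the flag spelling differs by implementation):
+#   ${MPIDIR}/bin/mpicxx -show      # MPICH and derivatives
+#   ${MPIDIR}/bin/mpicxx --showme   # Open MPI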