diff --git a/PYTHON/cuda_version_check.cpp b/PYTHON/cuda_version_check.cpp
new file mode 100644
index 000000000..caef05890
--- /dev/null
+++ b/PYTHON/cuda_version_check.cpp
@@ -0,0 +1,84 @@
+#include <iostream>
+#include <cuda_runtime.h>
+#include <cuda.h>
+
+int main() {
+    std::cout << "=== CUDA Version Information ===" << std::endl;
+
+    // Get CUDA Runtime Version
+    int runtimeVersion;
+    cudaError_t runtimeResult = cudaRuntimeGetVersion(&runtimeVersion);
+    if (runtimeResult == cudaSuccess) {
+        int runtimeMajor = runtimeVersion / 1000;
+        int runtimeMinor = (runtimeVersion % 1000) / 10;
+        std::cout << "CUDA Runtime Version: " << runtimeMajor << "." << runtimeMinor
+                  << " (raw: " << runtimeVersion << ")" << std::endl;
+    } else {
+        std::cout << "Error getting CUDA Runtime version: "
+                  << cudaGetErrorString(runtimeResult) << std::endl;
+    }
+
+    // Get CUDA Driver Version
+    int driverVersion;
+    cudaError_t driverResult = cudaDriverGetVersion(&driverVersion);
+    if (driverResult == cudaSuccess) {
+        int driverMajor = driverVersion / 1000;
+        int driverMinor = (driverVersion % 1000) / 10;
+        std::cout << "CUDA Driver Version: " << driverMajor << "." << driverMinor
+                  << " (raw: " << driverVersion << ")" << std::endl;
+    } else {
+        std::cout << "Error getting CUDA Driver version: "
+                  << cudaGetErrorString(driverResult) << std::endl;
+    }
+
+    // Check compatibility
+    if (driverResult == cudaSuccess && runtimeResult == cudaSuccess) {
+        std::cout << "\nCompatibility Check:" << std::endl;
+        if (driverVersion >= runtimeVersion) {
+            std::cout << "✓ Driver and runtime versions are compatible" << std::endl;
+        } else {
+            std::cout << "✗ WARNING: Driver version is older than runtime!" << std::endl;
+            std::cout << "  This may cause cudaErrorInsufficientDriver errors" << std::endl;
+        }
+    }
+
+    // Get device information
+    int deviceCount;
+    cudaError_t deviceResult = cudaGetDeviceCount(&deviceCount);
+    if (deviceResult == cudaSuccess) {
+        std::cout << "\n=== Device Information ===" << std::endl;
+        std::cout << "Number of CUDA devices: " << deviceCount << std::endl;
+
+        for (int i = 0; i < deviceCount; i++) {
+            cudaDeviceProp prop;
+            cudaError_t propResult = cudaGetDeviceProperties(&prop, i);
+            if (propResult == cudaSuccess) {
+                std::cout << "Device " << i << ": " << prop.name << std::endl;
+                std::cout << "  Compute Capability: " << prop.major << "." << prop.minor << std::endl;
+                std::cout << "  Total Memory: " << prop.totalGlobalMem / (1024*1024*1024) << " GB" << std::endl;
+                std::cout << "  Multiprocessors: " << prop.multiProcessorCount << std::endl;
+            }
+        }
+    } else {
+        std::cout << "Error getting device count: "
+                  << cudaGetErrorString(deviceResult) << std::endl;
+    }
+
+    // Alternative method using CUDA Driver API directly
+    std::cout << "\n=== Alternative Driver API Check ===" << std::endl;
+    CUresult cuResult = cuInit(0);
+    if (cuResult == CUDA_SUCCESS) {
+        int cuDriverVersion;
+        cuResult = cuDriverGetVersion(&cuDriverVersion);
+        if (cuResult == CUDA_SUCCESS) {
+            int cuDriverMajor = cuDriverVersion / 1000;
+            int cuDriverMinor = (cuDriverVersion % 1000) / 10;
+            std::cout << "CUDA Driver Version (Driver API): " << cuDriverMajor << "." << cuDriverMinor
+                      << " (raw: " << cuDriverVersion << ")" << std::endl;
+        }
+    } else {
+        std::cout << "Failed to initialize CUDA Driver API" << std::endl;
+    }
+
+    return 0;
+}
diff --git a/PYTHON/hello-nvshmem.py b/PYTHON/hello-nvshmem.py
new file mode 100644
index 000000000..d4b10dbb8
--- /dev/null
+++ b/PYTHON/hello-nvshmem.py
@@ -0,0 +1,55 @@
+import numpy
+from mpi4py import MPI
+from cuda.core.experimental import Device
+from cuda.core.experimental import system
+import nvshmem.core as nvshmem
+
+# Initialize MPI
+comm = MPI.COMM_WORLD
+me = comm.Get_rank()
+np = comm.Get_size()
+
+# Initialize NVSHMEM with MPI
+dev = Device(me % system.num_devices)
+dev.set_current()
+nvshmem.init(device=dev, mpi_comm=comm, initializer_method="mpi")
+
+#uid = nvshmem.get_unique_id(empty=(me != 0))
+#comm.Bcast(uid._data.view(numpy.int8), root=0)
+#dev = Device()
+#dev.set_current()
+#nvshmem.init(device=dev, uid=uid, rank=me, nranks=np, initializer_method="uid")
+
+#dev = Device(me % system.num_devices)
+#dev.set_current()
+#nvshmem.init(device=dev, mpi_comm=comm, initializer_method="emulated_mpi")
+
+stream = dev.create_stream()
+
+# Get information about the current PE
+my_pe = nvshmem.my_pe()
+n_pes = nvshmem.n_pes()
+
+# Allocate symmetric memory
+# array() returns a CuPy NDArray object
+x = nvshmem.array((1024,), dtype="float32")
+y = nvshmem.array((1024,), dtype="float32")
+
+#if my_pe == 0:
+#    y[:] = 1.0
+
+# Perform communication operations
+# Put y from PE 0 into x on PE 1
+if my_pe == 0:
+    nvshmem.put(x, y, 1, stream=stream)
+
+# Synchronize PEs
+nvshmem.barrier(nvshmem.Teams.TEAM_WORLD,stream=stream)
+stream.sync()
+
+# Clean up
+nvshmem.free_array(x)
+nvshmem.free_array(y)
+nvshmem.finalize()
+print('OK')
+
diff --git a/PYTHON/nstream-cupy-nvshmem.py b/PYTHON/nstream-cupy-nvshmem.py
new file mode 100755
index 000000000..a5335999e
--- /dev/null
+++ b/PYTHON/nstream-cupy-nvshmem.py
@@ -0,0 +1,262 @@
+#!/usr/bin/env python3
+#
+# Copyright (c) 2020, Intel Corporation
+# Copyright (c) 2023, NVIDIA
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above
+#       copyright notice, this list of conditions and the following
+#       disclaimer in the documentation and/or other materials provided
+#       with the distribution.
+# * Neither the name of Intel Corporation nor the names of its
+#       contributors may be used to endorse or promote products
+#       derived from this software without specific prior written
+#       permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
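+
+# A typical invocation, shown for illustration only (the MPI launcher, PE
+# count, and problem size below are placeholders; any launcher that maps
+# one rank per GPU works):
+#
+#     mpirun -np 4 python3 nstream-cupy-nvshmem.py 10 100000000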
+
+#*******************************************************************
+#
+# NAME:    nstream
+#
+# PURPOSE: To compute memory bandwidth when adding a vector of a given
+#          number of double precision values to the scalar multiple of
+#          another vector of the same length, and storing the result in
+#          a third vector.
+#
+# USAGE:   The program takes as input the number
+#          of iterations to loop over the triad vectors, the length of the
+#          vectors, and the offset between vectors
+#
+#          <# iterations> <vector length>
+#
+#          The output consists of diagnostics to make sure the
+#          algorithm worked, and of timing statistics.
+#
+# NOTES:   Bandwidth is determined as the number of words read, plus the
+#          number of words written, times the size of the words, divided
+#          by the execution time. For a vector length of N, the total
+#          number of words read and written is 4*N*sizeof(double).
+#
+#
+# HISTORY: This code is loosely based on the Stream benchmark by John
+#          McCalpin, but does not follow all the Stream rules. Hence,
+#          reported results should not be associated with Stream in
+#          external publications
+#
+#          Converted to Python by Jeff Hammond, October 2017.
+#          Adapted for CuPy+NVSHMEM4Py, December 2024.
+#
+# *******************************************************************
+
+import sys
+if sys.version_info >= (3, 3):
+    from time import process_time as timer
+else:
+    from timeit import default_timer as timer
+
+from mpi4py import MPI
+
+import cupy
+
+if False:
+    print('=== CUDA Version Information ===')
+
+    try:
+        # Get CUDA runtime version
+        runtime_version = cupy.cuda.runtime.runtimeGetVersion()
+        runtime_major = runtime_version // 1000
+        runtime_minor = (runtime_version % 1000) // 10
+        print(f'CUDA Runtime Version: {runtime_major}.{runtime_minor} (raw: {runtime_version})')
+
+        # Get CUDA driver version
+        driver_version = cupy.cuda.runtime.driverGetVersion()
+        driver_major = driver_version // 1000
+        driver_minor = (driver_version % 1000) // 10
+        print(f'CUDA Driver Version: {driver_major}.{driver_minor} (raw: {driver_version})')
+
+        print(f'Version compatibility: Driver {driver_major}.{driver_minor} vs Runtime {runtime_major}.{runtime_minor}')
+
+        if driver_version < runtime_version:
+            print('WARNING: Driver version is older than runtime version!')
+            print('This can cause \"cudaErrorInsufficientDriver\" errors.')
+            print('Consider updating your NVIDIA drivers.')
+        else:
+            print('Driver and runtime versions are compatible.')
+
+    except Exception as e:
+        print(f'Error: {e}')
+        print('This usually indicates CUDA driver/runtime compatibility issues.')
+
+from cuda.core.experimental import Device
+from cuda.core.experimental import system
+
+import nvshmem.core as nvshmem
+
+def main():
+
+    # Initialize MPI and CUDA device
+    comm = MPI.COMM_WORLD
+    local_rank = comm.Get_rank() % system.num_devices
+    device = Device(local_rank)
+    device.set_current()
+    stream = device.create_stream()
+
+    # Initialize NVSHMEM with MPI
+    nvshmem.init(device=device, mpi_comm=comm, initializer_method="mpi")
+
+    me = nvshmem.my_pe()
+    np = nvshmem.n_pes()
+
+    # ********************************************************************
+    # read and test input parameters
+    # ********************************************************************
+
+    if (me==0):
+        print('Parallel Research Kernels version ') #, PRKVERSION
+        print('Python CuPy/NVSHMEM STREAM triad: A = B + scalar * C')
+
+    if len(sys.argv) != 3:
+        if (me==0):
+            print('argument count = ', len(sys.argv))
+            print("Usage: python nstream-cupy-nvshmem.py <# iterations> <vector length>")
+        nvshmem.finalize()
+        sys.exit()
+
+    iterations = int(sys.argv[1])
+    if iterations < 1:
+        if (me==0):
+            print("ERROR: iterations must be >= 1")
+        nvshmem.finalize()
+        sys.exit()
+
+    total_length = int(sys.argv[2])
+    if total_length < 1:
+        if (me==0):
+            print("ERROR: length must be positive")
+        nvshmem.finalize()
+        sys.exit()
+
+    # Distribute work across GPUs/PEs
+    length = int(total_length / np)
+    remainder = total_length % np
+    if (remainder > 0):
+        if (me < remainder):
+            length += 1
+
+    if (me==0):
+        print('Number of PEs = ', np)
+        print('Number of iterations = ', iterations)
+        print('Vector length = ', total_length)
+        print('Local vector length = ', length)
+
+    # Barrier using NVSHMEM
+    nvshmem.barrier(nvshmem.Teams.TEAM_WORLD,stream=stream)
+    stream.sync()
+
+    # ********************************************************************
+    # ** Allocate space for the input and execute STREAM triad
+    # ********************************************************************
+
+    # Allocate symmetric GPU arrays using NVSHMEM4Py interoperability with CuPy
+    A = nvshmem.interop.cupy.array((length,), dtype="float64")
+    B = nvshmem.interop.cupy.array((length,), dtype="float64")
+    C = nvshmem.interop.cupy.array((length,), dtype="float64")
+
+    # Initialize arrays
+    A[:] = 0.0
+    B[:] = 2.0
+    C[:] = 2.0
+
+    scalar = 3.0
+
+    # Timing loop
+    for k in range(0, iterations+1):
+
+        if k < 1:
+            nvshmem.barrier(nvshmem.Teams.TEAM_WORLD,stream=stream)
+            stream.sync()
+            t0 = timer()
+
+        # STREAM triad operation on GPU using CuPy operations
+        A += B + scalar * C
+        # it seems like this is required to get proper timings - maybe some weird JiT thing happening
+        nvshmem.barrier(nvshmem.Teams.TEAM_WORLD,stream=stream)
+
+    # Final synchronization
+    nvshmem.barrier(nvshmem.Teams.TEAM_WORLD,stream=stream)
+    stream.sync()
+    t1 = timer()
+    nstream_time = t1 - t0
+
+    # ********************************************************************
+    # ** Analyze and output results.
+    # ********************************************************************
+
+    # Calculate expected result
+    ar = 0.0
+    br = 2.0
+    cr = 2.0
+    for k in range(0, iterations+1):
+        ar += br + scalar * cr
+
+    ar *= total_length
+
+    # Calculate local checksum
+    asum_local = cupy.linalg.norm(A, ord=1)
+
+    # Create source and destination arrays for reduction
+    src = nvshmem.interop.cupy.array((1,), dtype="float64")
+    dst = nvshmem.interop.cupy.array((1,), dtype="float64")
+    src[0] = asum_local
+    dst[0] = 0.0
+
+    # Reduce across all PEs using NVSHMEM collective
+    nvshmem.reduce(nvshmem.Teams.TEAM_WORLD, dst, src, op="sum", stream=stream)
+    stream.sync()
+
+    asum_global = float(dst[0])
+
+    epsilon = 1.e-8
+    if abs(ar - asum_global) / asum_global > epsilon:
+        if (me == 0):
+            print('Failed Validation on output array')
+            print(' Expected checksum: ', ar)
+            print(' Observed checksum: ', asum_global)
+            print("ERROR: solution did not validate")
+    else:
+        if (me == 0):
+            print('Solution validates')
+            avgtime = nstream_time / iterations
+            nbytes = 4.0 * total_length * 8 # 8 bytes per double
+            print('Rate (GB/s): ', 1.e-9 * nbytes / avgtime, ' Avg time (s): ', avgtime)
+
+    # Free NVSHMEM arrays
+    nvshmem.free_array(A)
+    nvshmem.free_array(B)
+    nvshmem.free_array(C)
+    nvshmem.free_array(src)
+    nvshmem.free_array(dst)
+
+    # Finalize NVSHMEM
+    nvshmem.finalize()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/common/make.defs.cuda b/common/make.defs.cuda
index 27633dc01..169b4280c 100644
--- a/common/make.defs.cuda
+++ b/common/make.defs.cuda
@@ -1,7 +1,7 @@
 #
 # This file shows the CUDA toolchain options
 # for both NVHPC and GCC.
-NVHPC_PATH=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/24.9
+NVHPC_PATH=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/25.7
 #NVHPC_PATH=/proj/nv/Linux_$$(uname -m)/21.11
 #NVHPC_PATH=${HOME}/NVIDIA/hpc_sdk/Linux_$$(uname -m)/2021
 NVHPC_CBIN=${NVHPC_PATH}/compilers/bin/
@@ -163,14 +163,16 @@ CBLASFLAG=${BLASFLAG}
 # Use appropriate arch or code is compiled to ancient features.
 #NVCC=${NVHPC_CBIN}nvc++
 #NVCC=${NVHPC_CBIN}nvcc
-NVCC=/usr/local/cuda-12.6/bin/nvcc
+#NVCC=/usr/local/cuda-12/bin/nvcc
+NVCC=nvcc
 CUDAFLAGS=-g -O3 -std=c++20
 CUDAFLAGS+=--extended-lambda
-CUDAFLAGS+=--gpu-architecture=sm_89
+CUDAFLAGS+=--gpu-architecture=sm_90
 #CUDAFLAGS+=-allow-unsupported-compiler
 #CUDAFLAGS+=-ccbin=g++-13 -lm #-lstdc++
 #CUDAFLAGS+=--compiler-bindir=/opt/gcc/12.3.0/bin/
 #CUDAFLAGS+=-forward-unknown-to-host-compiler -fopenmp
+CUDAFLAGS+=--forward-unknown-to-host-compiler # rpath
 CUDAFLAGS+=-rdc=true # FIXES ptxas fatal : Unresolved extern function 'cudaCGGetIntrinsicHandle'
 #CUDAFLAGS+=-I${NVHPC_PATH}/math_libs/12.6/targets/$$(uname -m)-linux/include
 #CUDAFLAGS+=-L${NVHPC_PATH}/math_libs/12.6/targets/$$(uname -m)-linux/lib
@@ -196,7 +198,7 @@ CUDAFLAGS+=-D_X86INTRIN_H_INCLUDED
 #CUDAFLAGS+=-D_AVX512IFMAINTRIN_H_INCLUDED
 #CUDAFLAGS+=-D_AVX512IFMAVLINTRIN_H_INCLUDED
 #CUDAFLAGS+=-D_AVX512ERINTRIN_H_INCLUDED
-CUDAFLAGS += -I/usr/local/cuda-12.6/targets/x86_64-linux/include/cub/detail
+CUDAFLAGS += -I/usr/local/cuda-12/targets/x86_64-linux/include/cub/detail
 #
 # NCCL
 #
@@ -207,12 +209,15 @@ NCCLLIB=-L${NCCLDIR}/lib -lnccl
 # NVSHMEM (Apt packages not reliable...)
 #
 NVSHMEMFLAGS=-rdc=true -diag-suppress 3012,3013
-#NVSHMEMFLAGS+=-I/usr/include/nvshmem_12
-NVSHMEM_DIR=${HOME}/NVSHMEM/nvshmem/build/src
+#NVSHMEM_DIR=${HOME}/NVSHMEM/nvshmem/build/src
 #NVSHMEM_DIR=/opt/nvidia/hpc_sdk/Linux_x86_64/24.11/comm_libs/12.6/nvshmem
-NVSHMEMFLAGS+=-I${NVSHMEM_DIR}/include
-NVSHMEMFLAGS+=-L${NVSHMEM_DIR}/lib
-NVSHMEMFLAGS+=-Wl,-rpath=${NVSHMEM_DIR}/lib
+#NVSHMEMFLAGS+=-I${NVSHMEM_DIR}/include
+#NVSHMEMFLAGS+=-L${NVSHMEM_DIR}/lib
+#NVSHMEMFLAGS+=-Wl,-rpath=${NVSHMEM_DIR}/lib
+# apt or pip installs like this
+NVSHMEMFLAGS+=-I/usr/include/nvshmem_12
+NVSHMEMFLAGS+=-L/usr/lib/x86_64-linux-gnu/nvshmem/12
+NVSHMEMFLAGS+=-Wl,-rpath=/usr/lib/x86_64-linux-gnu/nvshmem/12
 NVSHMEMFLAGS+=-lnvshmem_device -lnvshmem_host
 #
 # CUDASTF
@@ -227,14 +232,15 @@ CUDASTF_CFLAGS+=-lcuda
 #
 # mpiicc wraps icc. mpicc and mpigcc wrap gcc.
 #MPIDIR=${NVHPC_PATH}/comm_libs/hpcx
-MPIDIR=${NVHPC_PATH}/comm_libs/12.6/openmpi4/latest
+#MPIDIR=${NVHPC_PATH}/comm_libs/12.9/openmpi4/latest
+MPIDIR=/usr/local
 MPICC=${MPIDIR}/bin/mpicc
 MPICXX=${MPIDIR}/bin/mpicxx
 MPIFORT=${MPIDIR}/bin/mpifort
 MPIINC=-I${MPIDIR}/include
 MPILIB=-L${MPIDIR}/lib -lmpi
 #MPILIB+=-Wl,-rpath -Wl,${MPIDIR}/lib -Wl,--enable-new-dtags # NVCC chokes on -Wl
-MPILIB+=-lopen-pal -lopen-rte
+#MPILIB+=-lopen-pal -lopen-rte
 #MPILIB=-L/usr/local/opt/libevent/lib -L${MPIDIR}/lib -lmpi
 #MPIINC=-I/usr/include/mpich-3.2-$$(uname -m)
 #MPILIB=-L/usr/lib64/mpich-3.2/lib -lmpi
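+#
+# To check which MPI installation the wrappers above actually resolve to,
+# the wrappers can print their underlying command line (illustrative check;
+# the flag spelling differs by implementation):
+#   ${MPIDIR}/bin/mpicxx -show      # MPICH and derivatives
+#   ${MPIDIR}/bin/mpicxx --showme   # Open MPI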