Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions PYTHON/cuda_version_check.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>

int main() {
std::cout << "=== CUDA Version Information ===" << std::endl;

// Get CUDA Runtime Version
int runtimeVersion;
cudaError_t runtimeResult = cudaRuntimeGetVersion(&runtimeVersion);
if (runtimeResult == cudaSuccess) {
int runtimeMajor = runtimeVersion / 1000;
int runtimeMinor = (runtimeVersion % 1000) / 10;
std::cout << "CUDA Runtime Version: " << runtimeMajor << "." << runtimeMinor
<< " (raw: " << runtimeVersion << ")" << std::endl;
} else {
std::cout << "Error getting CUDA Runtime version: "
<< cudaGetErrorString(runtimeResult) << std::endl;
}

// Get CUDA Driver Version
int driverVersion;
cudaError_t driverResult = cudaDriverGetVersion(&driverVersion);
if (driverResult == cudaSuccess) {
int driverMajor = driverVersion / 1000;
int driverMinor = (driverVersion % 1000) / 10;
std::cout << "CUDA Driver Version: " << driverMajor << "." << driverMinor
<< " (raw: " << driverVersion << ")" << std::endl;
} else {
std::cout << "Error getting CUDA Driver version: "
<< cudaGetErrorString(driverResult) << std::endl;
}

// Check compatibility
if (driverResult == cudaSuccess && runtimeResult == cudaSuccess) {
std::cout << "\nCompatibility Check:" << std::endl;
if (driverVersion >= runtimeVersion) {
std::cout << "✓ Driver and runtime versions are compatible" << std::endl;
} else {
std::cout << "✗ WARNING: Driver version is older than runtime!" << std::endl;
std::cout << " This may cause cudaErrorInsufficientDriver errors" << std::endl;
}
}

// Get device information
int deviceCount;
cudaError_t deviceResult = cudaGetDeviceCount(&deviceCount);
if (deviceResult == cudaSuccess) {
std::cout << "\n=== Device Information ===" << std::endl;
std::cout << "Number of CUDA devices: " << deviceCount << std::endl;

for (int i = 0; i < deviceCount; i++) {
cudaDeviceProp prop;
cudaError_t propResult = cudaGetDeviceProperties(&prop, i);
if (propResult == cudaSuccess) {
std::cout << "Device " << i << ": " << prop.name << std::endl;
std::cout << " Compute Capability: " << prop.major << "." << prop.minor << std::endl;
std::cout << " Total Memory: " << prop.totalGlobalMem / (1024*1024*1024) << " GB" << std::endl;
std::cout << " Multiprocessors: " << prop.multiProcessorCount << std::endl;
}
}
} else {
std::cout << "Error getting device count: "
<< cudaGetErrorString(deviceResult) << std::endl;
}

// Alternative method using CUDA Driver API directly
std::cout << "\n=== Alternative Driver API Check ===" << std::endl;
CUresult cuResult = cuInit(0);
if (cuResult == CUDA_SUCCESS) {
int cuDriverVersion;
cuResult = cuDriverGetVersion(&cuDriverVersion);
if (cuResult == CUDA_SUCCESS) {
int cuDriverMajor = cuDriverVersion / 1000;
int cuDriverMinor = (cuDriverVersion % 1000) / 10;
std::cout << "CUDA Driver Version (Driver API): " << cuDriverMajor << "." << cuDriverMinor
<< " (raw: " << cuDriverVersion << ")" << std::endl;
}
} else {
std::cout << "Failed to initialize CUDA Driver API" << std::endl;
}

return 0;
}
55 changes: 55 additions & 0 deletions PYTHON/hello-nvshmem.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import numpy
from mpi4py import MPI
from cuda.core.experimental import Device
from cuda.core.experimental import system
import nvshmem.core as nvshmem

# Initialize MPI: one process ("PE" in NVSHMEM terms) per rank.
comm = MPI.COMM_WORLD
me = comm.Get_rank()
np = comm.Get_size()

# Bind each rank to a GPU round-robin, then initialize NVSHMEM on top of MPI.
dev = Device(me % system.num_devices)
dev.set_current()
nvshmem.init(device=dev, mpi_comm=comm, initializer_method="mpi")

# Alternative bootstrap methods, kept for reference:
#uid = nvshmem.get_unique_id(empty=(me != 0))
#comm.Bcast(uid._data.view(numpy.int8), root=0)
#dev = Device()
#dev.set_current()
#nvshmem.init(device=dev, uid=uid, rank=me, nranks=np, initializer_method="uid")

#dev = Device(me % system.num_devices)
#dev.set_current()
#nvshmem.init(device=dev, mpi_comm=comm, initializer_method="emulated_mpi")

stream = dev.create_stream()

# Get information about the current PE
my_pe = nvshmem.my_pe()
n_pes = nvshmem.n_pes()

# Allocate symmetric memory (same size on every PE).
# array() returns a CuPy NDArray object
x = nvshmem.array((1024,), dtype="float32")
y = nvshmem.array((1024,), dtype="float32")

# Initialize the source buffer so the transfer carries known data rather
# than uninitialized memory.
if my_pe == 0:
    y[:] = 1.0

# Perform communication operations:
# put local y from PE 0 into x on PE 1.
# Guard on n_pes > 1 — with a single-rank launch, PE 1 does not exist and
# the put would target an invalid PE.
if my_pe == 0 and n_pes > 1:
    nvshmem.put(x, y, 1, stream=stream)

# Synchronize all PEs, then wait for the stream-ordered barrier to complete.
nvshmem.barrier(nvshmem.Teams.TEAM_WORLD, stream=stream)
stream.sync()

# Clean up: symmetric buffers must be freed before finalize.
nvshmem.free_array(x)
nvshmem.free_array(y)
nvshmem.finalize()
print('OK')

Loading