Universal CUDA Runtime instrumentation library with hook API for intercepting CUDA Runtime functions.
A drop-in replacement for libcudart.so that provides a universal hook API for intercepting any CUDA Runtime API function. Enables before/after/instead-of callbacks without modifying application code.
- Auto-generated wrappers for ~300 CUDA Runtime functions using libclang
- Hook API with before/after/instead semantics
- Hook chaining - multiple libraries can register hooks
- Pattern matching - use wildcards like `cuda*` or `cudaMemcpy*`
- Thread-safe implementation with recursion guards
- Dynamic loading of real libcudart.so via dlopen/dlsym
- Strict mode - forbid real CUDA calls for full CPU emulation scenarios
┌─────────────────────────────────────────────────────────────────────┐
│                             Application                             │
│                        (links with -lcudart)                        │
└─────────────────────────────────────────────────────────────────────┘
                                   │
                                   ▼
┌─────────────────────────────────────────────────────────────────────┐
│                       cudart_instrumented.so                        │
│                   (LD_PRELOAD or direct linking)                    │
├─────────────────────────────────────────────────────────────────────┤
│  ┌─────────────┐  ┌─────────────┐  ┌─────────────┐                  │
│  │ Hook Chain  │  │ Hook Chain  │  │ Hook Chain  │  ...             │
│  │cudaLaunch.. │  │cudaStream.. │  │cudaMalloc   │                  │
│  └──────┬──────┘  └──────┬──────┘  └──────┬──────┘                  │
│         │                │                │                         │
│         ▼                ▼                ▼                         │
│  ┌───────────────────────────────────────────────────────────────┐  │
│  │                  Generated Wrapper Functions                  │  │
│  └───────────────────────────────────────────────────────────────┘  │
│                                  │                                  │
└──────────────────────────────────┼──────────────────────────────────┘
                                   │ dlsym(handle, "cudaXxx")
                                   ▼
┌─────────────────────────────────────────────────────────────────────┐
│                          Real libcudart.so                          │
│                   (loaded via dlopen at runtime)                    │
└─────────────────────────────────────────────────────────────────────┘
- CMake 3.18+
- C++17 compiler
- libclang development files (`libclang-dev` or `llvm-dev`)
- CUDA Toolkit
sudo apt install cmake build-essential libclang-dev

mkdir build && cd build
cmake ..
make -j$(nproc)

The build process:
- Builds the C++ code generator (`cudart_gen`)
- Runs the generator to create wrapper code from CUDA headers
- Builds `libcudart_instrumented.so`
LD_PRELOAD=/path/to/libcudart_instrumented.so ./your_cuda_app

target_link_libraries(your_app PRIVATE cudart_instrumented)

#include <cudart_instrumented.h>
// Hook types
typedef void (*CudartBeforeHook)(CudartHookContext ctx, CudartCallInfo* info);
typedef void (*CudartAfterHook)(CudartHookContext ctx, CudartCallInfo* info, cudaError_t result);
typedef cudaError_t (*CudartInsteadHook)(CudartHookContext ctx, CudartCallInfo* info);
// Register hooks (pattern supports wildcards: "cuda*", "cudaMemcpy*")
CudartHookHandle cudart_hook_before(const char* pattern, CudartBeforeHook hook,
CudartHookContext ctx, int priority);
CudartHookHandle cudart_hook_after(const char* pattern, CudartAfterHook hook,
CudartHookContext ctx, int priority);
CudartHookHandle cudart_hook_instead(const char* pattern, CudartInsteadHook hook,
CudartHookContext ctx, int priority);
// Remove hook
void cudart_unhook(CudartHookHandle handle);
// Call original function from within "instead" hook
cudaError_t cudart_call_next(CudartCallInfo* info);
// Strict mode API
void cudart_set_strict_mode(int enabled);
int cudart_is_strict_mode(void);

Strict mode is designed for full CPU emulation scenarios (e.g., OpenMP-based CUDA emulation) where the real CUDA runtime should never be invoked. When enabled:
- The real `libcudart.so` is not loaded at all
- Any CUDA function call without an `INSTEAD` hook will abort the program
- Nested CUDA calls from within hooks also require their own `INSTEAD` hooks
#include <cudart_instrumented.h>
__attribute__((constructor))
void init() {
// Enable strict mode BEFORE any CUDA calls
cudart_set_strict_mode(1);
// Register INSTEAD hooks for all CUDA functions you need
cudart_hook_instead("cudaMalloc", my_malloc_emulation, NULL, 0);
cudart_hook_instead("cudaFree", my_free_emulation, NULL, 0);
cudart_hook_instead("cudaMemcpy", my_memcpy_emulation, NULL, 0);
// ... etc
}

Strict mode ensures that no CUDA function accidentally falls through to the real GPU runtime. If any CUDA API call is made without a corresponding emulation hook, the program will abort with a clear error message:
[cudart_instrumented] STRICT MODE VIOLATION: attempted to call real CUDA function 'cudaStreamSynchronize' but no INSTEAD hook is registered!
This usually means a CUDA API call is missing emulation in the OpenMP emulation layer.
This helps identify missing emulation implementations during development.
if (cudart_is_strict_mode()) {
printf("Running in strict mode - real CUDA forbidden\n");
}

Set the `CUDART_INSTR_DEBUG=1` environment variable to see debug messages:
CUDART_INSTR_DEBUG=1 ./my_app
# Output: [cudart_instrumented] STRICT MODE ENABLED - real CUDA calls forbidden
# Output: [cudart_instrumented] Strict mode: NOT loading real libcudart.so

#include <cudart_instrumented.h>
#include <cstdio>
void log_before(CudartHookContext ctx, CudartCallInfo* info) {
printf("[CUDA] Calling %s\n", info->function_name);
}
void log_after(CudartHookContext ctx, CudartCallInfo* info, cudaError_t result) {
printf("[CUDA] %s returned %d\n", info->function_name, result);
}
__attribute__((constructor))
void init_logging() {
cudart_hook_before("cuda*", log_before, NULL, 0);
cudart_hook_after("cuda*", log_after, NULL, 0);
}

#include <cudart_instrumented.h>
#include <cuda_arg_types.h>
#include <map>
#include <mutex>
static std::map<void*, size_t> allocations;
static std::mutex alloc_mutex;
void track_malloc_after(CudartHookContext ctx, CudartCallInfo* info, cudaError_t result) {
if (result != cudaSuccess) return;
auto* args = (CudartArgs_cudaMalloc*)info->args_struct;
std::lock_guard<std::mutex> lock(alloc_mutex);
allocations[*args->devPtr] = args->size;
}
void track_free_before(CudartHookContext ctx, CudartCallInfo* info) {
auto* args = (CudartArgs_cudaFree*)info->args_struct;
std::lock_guard<std::mutex> lock(alloc_mutex);
allocations.erase(args->devPtr);
}
__attribute__((constructor))
void init_tracking() {
cudart_hook_after("cudaMalloc", track_malloc_after, NULL, 0);
cudart_hook_before("cudaFree", track_free_before, NULL, 0);
}

#include <cudart_instrumented.h>
#include <cuda_arg_types.h>
cudaError_t my_malloc(CudartHookContext ctx, CudartCallInfo* info) {
auto* args = (CudartArgs_cudaMalloc*)info->args_struct;
// Add custom logic before
printf("Allocating %zu bytes\n", args->size);
// Call the real function
cudaError_t result = cudart_call_next(info);
// Add custom logic after
if (result == cudaSuccess) {
printf("Allocated at %p\n", *args->devPtr);
}
return result;
}
__attribute__((constructor))
void init() {
cudart_hook_instead("cudaMalloc", my_malloc, NULL, 0);
}

For each CUDA function, a corresponding argument structure is generated:
// In cuda_arg_types.h (auto-generated)
struct CudartArgs_cudaMalloc {
void** devPtr;
size_t size;
};
struct CudartArgs_cudaMemcpy {
void* dst;
const void* src;
size_t count;
cudaMemcpyKind kind;
};
struct CudartArgs_cudaLaunchKernel {
const void* func;
dim3 gridDim;
dim3 blockDim;
void** args;
size_t sharedMem;
cudaStream_t stream;
};
// ... ~300 more structures

cd build
ctest --output-on-failure

cudart_instrumented/
├── CMakeLists.txt
├── README.md
├── include/
│ └── cudart_instrumented.h # Public API
├── src/
│ ├── cudart_core.cpp # dlopen/dlsym management
│ ├── hook_registry.cpp # Hook storage and matching
│ └── internal.h # Internal declarations
├── generator/ # C++ code generator
│ ├── CMakeLists.txt
│ ├── main.cpp # CLI entry point
│ ├── cuda_parser.cpp # libclang header parsing
│ ├── symbol_reader.cpp # ELF symbol extraction
│ └── code_gen.cpp # Wrapper code generation
├── generated/ # (created at build time)
│ ├── cuda_wrappers.cpp
│ ├── cuda_arg_types.h
│ └── cuda_func_table.h
└── tests/
├── test_basic_hook.cpp
├── test_hook_chain.cpp
└── test_instead_hook.cpp
- Hook registration/unregistration is thread-safe (uses `std::shared_mutex`)
- Hook invocation is thread-safe and supports concurrent calls
- Recursive CUDA calls from within hooks are detected and bypass hook processing
MIT License