diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 30f749d5f..f5f0f7404 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -106,7 +106,7 @@ jobs:
             clang-runtime: '19'
             coverage: true
             cuda: true
-            extra_cmake_options: '-DCLAD_ENABLE_ENZYME_BACKEND=On'
+            extra_cmake_options: '-DCLAD_ENABLE_ENZYME_BACKEND=On -DCLAD_CUDA_TEST_USE_SANITIZER=On'
             extra_packages: ' libzstd-dev '
             #clang-format: true
 
@@ -127,6 +127,7 @@ jobs:
             os: [self-hosted, cuda, heavy]
             runs-on: cuda
             compiler: clang-16
+            extra_cmake_options: '-DCLAD_CUDA_TEST_USE_SANITIZER=On'
             clang-runtime: '18'
             cuda: true
 
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 71faa478e..6d773df0e 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -84,11 +84,15 @@ if(NOT LLVM_MAIN_SRC_DIR)
   set(LLVM_MAIN_SRC_DIR ${LLVM_BUILD_MAIN_SRC_DIR})
 endif()
 
+option(CLAD_CUDA_TEST_USE_SANITIZER "Run Clang cuda tests with compute-sanitizer" OFF)
 option(CLAD_TEST_USE_VG "Run Clang tests under Valgrind" OFF)
 set(CLAD_TEST_EXTRA_ARGS --verbose --show-skipped --show-unsupported)
 if(CLAD_TEST_USE_VG)
   set(CLAD_TEST_EXTRA_ARGS ${CLAD_TEST_EXTRA_ARGS} "--vg --vg-arg=-q")
 endif ()
+if(CLAD_CUDA_TEST_USE_SANITIZER)
+  set(CLAD_TEST_EXTRA_ARGS ${CLAD_TEST_EXTRA_ARGS} "--param" "cuda_sanitizer=1")
+endif ()
 
 add_lit_testsuite(check-clad "Running the Clad regression tests"
   ${CMAKE_CURRENT_BINARY_DIR}
diff --git a/test/CUDA/ForwardMode.cu b/test/CUDA/ForwardMode.cu
index 7ad89ad0b..05e692a44 100644
--- a/test/CUDA/ForwardMode.cu
+++ b/test/CUDA/ForwardMode.cu
@@ -2,7 +2,7 @@
 // RUN:     --cuda-gpu-arch=%cudaarch %cudaldflags -oForwardMode.out \
 // RUN:     -Xclang -verify %s 2>&1 | %filecheck %s
 //
-// RUN: ./ForwardMode.out | %filecheck_exec %s
+// RUN: %cudarun ./ForwardMode.out | %filecheck_exec %s
 //
 // REQUIRES: cuda-runtime
 //
diff --git a/test/CUDA/GradientCuda.cu b/test/CUDA/GradientCuda.cu
index 6d15d9b7a..218eda8fa 100644
--- a/test/CUDA/GradientCuda.cu
+++ b/test/CUDA/GradientCuda.cu
@@ -8,7 +8,7 @@
 // RUN: %cladclang_cuda -I%S/../../include --cuda-gpu-arch=%cudaarch \
 // RUN:      --cuda-path=%cudapath %cudaldflags -oGradientCuda.out %s
 //
-// RUN: ./GradientCuda.out | %filecheck_exec %s
+// RUN: %cudarun ./GradientCuda.out | %filecheck_exec %s
 //
 // REQUIRES: cuda-runtime
 //
diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu
index a0efd1b74..a9b95f6f2 100644
--- a/test/CUDA/GradientKernels.cu
+++ b/test/CUDA/GradientKernels.cu
@@ -5,7 +5,7 @@
 // RUN: %cladclang_cuda -Xclang -plugin-arg-clad -Xclang -disable-tbr -I%S/../../include --cuda-path=%cudapath \
 // RUN:     --cuda-gpu-arch=%cudaarch %cudaldflags -oGradientKernels.out %s
 //
-// RUN: ./GradientKernels.out | %filecheck_exec %s
+// RUN: %cudarun ./GradientKernels.out | %filecheck_exec %s
 //
 // REQUIRES: cuda-runtime
 
@@ -1019,11 +1019,11 @@ __global__ void injective_reassignment_loop(int *a) {
 
 int main(void) {
   int *a, *d_a;
-  cudaMalloc(&a, sizeof(int));
-  cudaMalloc(&d_a, sizeof(int));
+  cudaMalloc(&a, 2 * sizeof(int));
+  cudaMalloc(&d_a, 2 * sizeof(int));
 
-  TEST(kernel, dim3(1), dim3(1), 0, false, a, d_a, 1); // CHECK-EXEC: 10
-  TEST(kernel, dim3(1), dim3(1), 0, true, a, d_a, 1); // CHECK-EXEC: 10
+  TEST(kernel, dim3(1), dim3(1), 0, false, a, d_a, 2); // CHECK-EXEC: 10
+  TEST(kernel, dim3(1), dim3(1), 0, true, a, d_a, 2); // CHECK-EXEC: 10
 
   auto error = clad::gradient(fake_kernel); 
   error.execute_kernel(dim3(1), dim3(1), a, d_a); // CHECK-EXEC: Use execute() for non-global CUDA kernels
@@ -1146,12 +1146,12 @@ int main(void) {
   TEST_2(indices_lin_comb, dim3(1), dim3(5, 1, 1), 0, false, "out, in", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 20, 25, 45, 15, 15
 
   int *n, *d_n;
-  cudaMalloc(&n, sizeof(int));
-  cudaMalloc(&d_n, sizeof(int));
+  cudaMalloc(&n, 2 * sizeof(int));
+  cudaMalloc(&d_n, 2 * sizeof(int));
 
-  TEST(kernel_device_injective, dim3(1), dim3(1), 0, false, n, d_n, 1); // CHECK-EXEC: 4
-  TEST(injective_reassignment, dim3(1), dim3(1), 0, false, n, d_n, 1); // CHECK-EXEC: 1
-  TEST(injective_reassignment_loop, dim3(1), dim3(1), 0, false, n, d_n, 1); // CHECK-EXEC: 1
+  TEST(kernel_device_injective, dim3(1), dim3(1), 0, false, n, d_n, 2); // CHECK-EXEC: 4
+  TEST(injective_reassignment, dim3(1), dim3(1), 0, false, n, d_n, 2); // CHECK-EXEC: 1
+  TEST(injective_reassignment_loop, dim3(1), dim3(1), 0, false, n, d_n,2); // CHECK-EXEC: 1
 
   cudaFree(dummy_in);
   cudaFree(dummy_out);
diff --git a/test/CUDA/RunCudaDemos.C b/test/CUDA/RunCudaDemos.C
index d784c712f..5e2fd2d79 100644
--- a/test/CUDA/RunCudaDemos.C
+++ b/test/CUDA/RunCudaDemos.C
@@ -5,7 +5,7 @@
 // CHECK_VECTOR_ADDITION: clad::custom_derivatives::thrust::reduce_pullback
 // CHECK_VECTOR_ADDITION: clad::custom_derivatives::thrust::transform_pullback
 
-// RUN: ./VectorAddition.out | FileCheck -check-prefix CHECK_VECTOR_ADDITION_EXEC %s
+// RUN: %cudarun ./VectorAddition.out | FileCheck -check-prefix CHECK_VECTOR_ADDITION_EXEC %s
 // CHECK_VECTOR_ADDITION_EXEC: Running vector addition demo.
 // CHECK_VECTOR_ADDITION_EXEC: Gradients of sum wrt initial x: 1 1 1 1 1 1 1 1 1 1 
 
@@ -17,7 +17,7 @@
 // CHECK_PARTICLE_SIMULATION: clad::custom_derivatives::thrust::copy_pullback
 // CHECK_PARTICLE_SIMULATION: clad::custom_derivatives::thrust::transform_pullback
 
-// RUN: ./ParticleSimulation.out | FileCheck -check-prefix CHECK_PARTICLE_SIMULATION_EXEC %s
+// RUN: %cudarun ./ParticleSimulation.out | FileCheck -check-prefix CHECK_PARTICLE_SIMULATION_EXEC %s
 // CHECK_PARTICLE_SIMULATION_EXEC: Running particle simulation demo.
 // CHECK_PARTICLE_SIMULATION_EXEC: Gradients of final x-pos sum wrt initial vx: 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 
@@ -44,7 +44,7 @@
 // CHECK_LINEAR_REGRESSION-NEXT:     }
 // CHECK_LINEAR_REGRESSION-NEXT: }
 
-// RUN: ./LinearRegression.out | FileCheck -check-prefix CHECK_LINEAR_REGRESSION_EXEC %s
+// RUN: %cudarun ./LinearRegression.out | FileCheck -check-prefix CHECK_LINEAR_REGRESSION_EXEC %s
 // CHECK_LINEAR_REGRESSION_EXEC: Running linear regression demo.
 // CHECK_LINEAR_REGRESSION_EXEC: Gradients of loss wrt weights (w): -9 -18 -27 -36 -45 -54 -63 -72 -81 -90
 
@@ -53,7 +53,7 @@
 // CHECK_BOW_LOGREG: void logistic_loss_batch2_prepared_l2_grad(
 // CHECK_BOW_LOGREG: clad::custom_derivatives::thrust::inner_product_pullback
 
-// RUN: ./BoWLogisticRegression.out | FileCheck -check-prefix CHECK_BOW_LOGREG_EXEC %s
+// RUN: %cudarun ./BoWLogisticRegression.out | FileCheck -check-prefix CHECK_BOW_LOGREG_EXEC %s
 // CHECK_BOW_LOGREG_EXEC: Running minimal logistic regression demo.
 // CHECK_BOW_LOGREG_EXEC: Loss:
 // CHECK_BOW_LOGREG_EXEC: Gradient wrt w:
diff --git a/test/CUDA/ThrustAdjacentDifference.cu b/test/CUDA/ThrustAdjacentDifference.cu
index 4131b0b7a..5c89aedc8 100644
--- a/test/CUDA/ThrustAdjacentDifference.cu
+++ b/test/CUDA/ThrustAdjacentDifference.cu
@@ -2,7 +2,7 @@
 // RUN:     --cuda-gpu-arch=%cudaarch %cudaldflags -oThrustAdjacentDifference.out \
 // RUN:     -Xclang -verify %s 2>&1 | %filecheck %s
 //
-// RUN: ./ThrustAdjacentDifference.out | %filecheck_exec %s
+// RUN: %cudarun ./ThrustAdjacentDifference.out | %filecheck_exec %s
 //
 // REQUIRES: cuda-runtime
 //
diff --git a/test/CUDA/ThrustCopy.cu b/test/CUDA/ThrustCopy.cu
index dd8b16645..940b42a21 100644
--- a/test/CUDA/ThrustCopy.cu
+++ b/test/CUDA/ThrustCopy.cu
@@ -2,7 +2,7 @@
 // RUN:     --cuda-gpu-arch=%cudaarch %cudaldflags -oThrustCopy.out \
 // RUN:     -Xclang -verify %s 2>&1 | %filecheck %s
 //
-// RUN: ./ThrustCopy.out | %filecheck_exec %s
+// RUN: %cudarun ./ThrustCopy.out | %filecheck_exec %s
 //
 // REQUIRES: cuda-runtime
 //
diff --git a/test/CUDA/ThrustReduce.cu b/test/CUDA/ThrustReduce.cu
index 7db6dac92..d5d48dbbc 100644
--- a/test/CUDA/ThrustReduce.cu
+++ b/test/CUDA/ThrustReduce.cu
@@ -2,7 +2,7 @@
 // RUN:     --cuda-gpu-arch=%cudaarch %cudaldflags -oThrustReduce.out \
 // RUN:     -Xclang -verify %s 2>&1 | %filecheck %s
 //
-// RUN: ./ThrustReduce.out | %filecheck_exec %s
+// RUN: %cudarun ./ThrustReduce.out | %filecheck_exec %s
 //
 // REQUIRES: cuda-runtime
 //
diff --git a/test/CUDA/ThrustReduceByKey.cu b/test/CUDA/ThrustReduceByKey.cu
index a8d4e2903..9d616cdd9 100644
--- a/test/CUDA/ThrustReduceByKey.cu
+++ b/test/CUDA/ThrustReduceByKey.cu
@@ -2,7 +2,7 @@
 // RUN:     --cuda-gpu-arch=%cudaarch %cudaldflags -oThrustReduceByKey.out \
 // RUN:     -Xclang -verify %s 2>&1 | %filecheck %s
 //
-// RUN: ./ThrustReduceByKey.out | %filecheck_exec %s
+// RUN: %cudarun ./ThrustReduceByKey.out | %filecheck_exec %s
 //
 // REQUIRES: cuda-runtime
 //
diff --git a/test/CUDA/ThrustScan.cu b/test/CUDA/ThrustScan.cu
index e84e0ce2c..c0bb0388a 100644
--- a/test/CUDA/ThrustScan.cu
+++ b/test/CUDA/ThrustScan.cu
@@ -2,7 +2,7 @@
 // RUN:     --cuda-gpu-arch=%cudaarch %cudaldflags -oThrustScan.out \
 // RUN:     -Xclang -verify %s 2>&1 | %filecheck %s
 //
-// RUN: ./ThrustScan.out | %filecheck_exec %s
+// RUN: %cudarun ./ThrustScan.out | %filecheck_exec %s
 //
 // REQUIRES: cuda-runtime
 //
diff --git a/test/CUDA/ThrustScanByKey.cu b/test/CUDA/ThrustScanByKey.cu
index 9ef0e31a1..e95f1e0dd 100644
--- a/test/CUDA/ThrustScanByKey.cu
+++ b/test/CUDA/ThrustScanByKey.cu
@@ -2,7 +2,7 @@
 // RUN:     --cuda-gpu-arch=%cudaarch %cudaldflags -oThrustScanByKey.out \
 // RUN:     -Xclang -verify %s 2>&1 | %filecheck %s
 //
-// RUN: ./ThrustScanByKey.out | %filecheck_exec %s
+// RUN: %cudarun ./ThrustScanByKey.out | %filecheck_exec %s
 //
 // REQUIRES: cuda-runtime
 //
diff --git a/test/CUDA/ThrustSortByKey.cu b/test/CUDA/ThrustSortByKey.cu
index 8eff9b700..c452ae4e2 100644
--- a/test/CUDA/ThrustSortByKey.cu
+++ b/test/CUDA/ThrustSortByKey.cu
@@ -2,7 +2,7 @@
 // RUN:     --cuda-gpu-arch=%cudaarch %cudaldflags -oThrustSortByKey.out \
 // RUN:     -Xclang -verify %s 2>&1 | %filecheck %s
 //
-// RUN: ./ThrustSortByKey.out | %filecheck_exec %s
+// RUN: %cudarun ./ThrustSortByKey.out | %filecheck_exec %s
 //
 // REQUIRES: cuda-runtime
 //
diff --git a/test/CUDA/ThrustTransform.cu b/test/CUDA/ThrustTransform.cu
index 402b054b3..2dd632d47 100644
--- a/test/CUDA/ThrustTransform.cu
+++ b/test/CUDA/ThrustTransform.cu
@@ -2,7 +2,7 @@
 // RUN:     --cuda-gpu-arch=%cudaarch %cudaldflags -oThrustTransform.out \
 // RUN:     -Xclang -verify %s 2>&1 | %filecheck %s
 //
-// RUN: ./ThrustTransform.out | %filecheck_exec %s
+// RUN: %cudarun ./ThrustTransform.out | %filecheck_exec %s
 //
 // REQUIRES: cuda-runtime
 //
diff --git a/test/CUDA/ThrustTransformReduce.cu b/test/CUDA/ThrustTransformReduce.cu
index 7518c677e..a0a417c0e 100644
--- a/test/CUDA/ThrustTransformReduce.cu
+++ b/test/CUDA/ThrustTransformReduce.cu
@@ -2,7 +2,7 @@
 // RUN:     --cuda-gpu-arch=%cudaarch %cudaldflags -oThrustTransformReduce.out \
 // RUN:     -Xclang -verify %s 2>&1 | %filecheck %s
 //
-// RUN: ./ThrustTransformReduce.out | %filecheck_exec %s
+// RUN: %cudarun ./ThrustTransformReduce.out | %filecheck_exec %s
 //
 // REQUIRES: cuda-runtime
 //
diff --git a/test/lit.cfg b/test/lit.cfg
index 8df893a8f..11ae202a4 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg
@@ -323,6 +323,15 @@ if clang_repl_path:
 #if loadable_module:
 #    config.available_features.add('loadable_module')
 
+cuda_run_prefix = ''
+if lit_config.params.get('cuda_sanitizer'):
+    cuda_tool_path = config.environment['PATH']
+    san_path = lit.util.which('compute-sanitizer', os.path.join(config.cuda_path, 'bin'))
+    if san_path is None:
+        lit_config.fatal("cuda_sanitizer is enabled but compute-sanitizer was not found")
+    cuda_run_prefix = san_path + ' --tool memcheck --error-exitcode=1 --print-limit=0'
+config.substitutions.append(('%cudarun', cuda_run_prefix))
+
 libcudart_path = lit.util.which('libcudart.so', config.cuda_libdir)
 if libcudart_path is not None:
     config.available_features.add('cuda-runtime')
@@ -337,6 +346,7 @@ if libcudart_path is not None:
         config.environment['CUDA_VISIBLE_DEVICES'] = os.environ['CUDA_VISIBLE_DEVICES']
     config.substitutions.append(('%cudaarch', config.cuda_test_arch))
 
+
 if(config.have_enzyme):
     config.available_features.add('Enzyme')