diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 30f749d5f..f5f0f7404 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -106,7 +106,7 @@ jobs: clang-runtime: '19' coverage: true cuda: true - extra_cmake_options: '-DCLAD_ENABLE_ENZYME_BACKEND=On' + extra_cmake_options: '-DCLAD_ENABLE_ENZYME_BACKEND=On -DCLAD_CUDA_TEST_USE_SANITIZER=On' extra_packages: ' libzstd-dev ' #clang-format: true @@ -127,6 +127,7 @@ jobs: os: [self-hosted, cuda, heavy] runs-on: cuda compiler: clang-16 + extra_cmake_options: '-DCLAD_CUDA_TEST_USE_SANITIZER=On' clang-runtime: '18' cuda: true diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 71faa478e..6d773df0e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -84,11 +84,15 @@ if(NOT LLVM_MAIN_SRC_DIR) set(LLVM_MAIN_SRC_DIR ${LLVM_BUILD_MAIN_SRC_DIR}) endif() +option(CLAD_CUDA_TEST_USE_SANITIZER "Run Clang cuda tests with compute-sanitizer" OFF) option(CLAD_TEST_USE_VG "Run Clang tests under Valgrind" OFF) set(CLAD_TEST_EXTRA_ARGS --verbose --show-skipped --show-unsupported) if(CLAD_TEST_USE_VG) set(CLAD_TEST_EXTRA_ARGS ${CLAD_TEST_EXTRA_ARGS} "--vg --vg-arg=-q") endif () +if(CLAD_CUDA_TEST_USE_SANITIZER) + set(CLAD_TEST_EXTRA_ARGS ${CLAD_TEST_EXTRA_ARGS} "--param" "cuda_sanitizer=1") +endif () add_lit_testsuite(check-clad "Running the Clad regression tests" ${CMAKE_CURRENT_BINARY_DIR} diff --git a/test/CUDA/ForwardMode.cu b/test/CUDA/ForwardMode.cu index 7ad89ad0b..05e692a44 100644 --- a/test/CUDA/ForwardMode.cu +++ b/test/CUDA/ForwardMode.cu @@ -2,7 +2,7 @@ // RUN: --cuda-gpu-arch=%cudaarch %cudaldflags -oForwardMode.out \ // RUN: -Xclang -verify %s 2>&1 | %filecheck %s // -// RUN: ./ForwardMode.out | %filecheck_exec %s +// RUN: %cudarun ./ForwardMode.out | %filecheck_exec %s // // REQUIRES: cuda-runtime // diff --git a/test/CUDA/GradientCuda.cu b/test/CUDA/GradientCuda.cu index 6d15d9b7a..218eda8fa 100644 --- a/test/CUDA/GradientCuda.cu +++ b/test/CUDA/GradientCuda.cu @@ -8,7 +8,7 @@ // RUN: %cladclang_cuda -I%S/../../include --cuda-gpu-arch=%cudaarch \ // RUN: --cuda-path=%cudapath %cudaldflags -oGradientCuda.out %s // -// RUN: ./GradientCuda.out | %filecheck_exec %s +// RUN: %cudarun ./GradientCuda.out | %filecheck_exec %s // // REQUIRES: cuda-runtime // diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu index a0efd1b74..a9b95f6f2 100644 --- a/test/CUDA/GradientKernels.cu +++ b/test/CUDA/GradientKernels.cu @@ -5,7 +5,7 @@ // RUN: %cladclang_cuda -Xclang -plugin-arg-clad -Xclang -disable-tbr -I%S/../../include --cuda-path=%cudapath \ // RUN: --cuda-gpu-arch=%cudaarch %cudaldflags -oGradientKernels.out %s // -// RUN: ./GradientKernels.out | %filecheck_exec %s +// RUN: %cudarun ./GradientKernels.out | %filecheck_exec %s // // REQUIRES: cuda-runtime @@ -1019,11 +1019,11 @@ __global__ void injective_reassignment_loop(int *a) { int main(void) { int *a, *d_a; - cudaMalloc(&a, sizeof(int)); - cudaMalloc(&d_a, sizeof(int)); + cudaMalloc(&a, 2 * sizeof(int)); + cudaMalloc(&d_a, 2 * sizeof(int)); - TEST(kernel, dim3(1), dim3(1), 0, false, a, d_a, 1); // CHECK-EXEC: 10 - TEST(kernel, dim3(1), dim3(1), 0, true, a, d_a, 1); // CHECK-EXEC: 10 + TEST(kernel, dim3(1), dim3(1), 0, false, a, d_a, 2); // CHECK-EXEC: 10 + TEST(kernel, dim3(1), dim3(1), 0, true, a, d_a, 2); // CHECK-EXEC: 10 auto error = clad::gradient(fake_kernel); error.execute_kernel(dim3(1), dim3(1), a, d_a); // CHECK-EXEC: Use execute() for non-global CUDA kernels @@ -1146,12 +1146,12 @@ int main(void) { TEST_2(indices_lin_comb, dim3(1), dim3(5, 1, 1), 0, false, "out, in", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 20, 25, 45, 15, 15 int *n, *d_n; - cudaMalloc(&n, sizeof(int)); - cudaMalloc(&d_n, sizeof(int)); + cudaMalloc(&n, 2 * sizeof(int)); + cudaMalloc(&d_n, 2 * sizeof(int)); - TEST(kernel_device_injective, dim3(1), dim3(1), 0, false, n, d_n, 1); // CHECK-EXEC: 4 - TEST(injective_reassignment, dim3(1), dim3(1), 0, false, n, d_n, 1); // CHECK-EXEC: 1 - TEST(injective_reassignment_loop, dim3(1), dim3(1), 0, false, n, d_n, 1); // CHECK-EXEC: 1 + TEST(kernel_device_injective, dim3(1), dim3(1), 0, false, n, d_n, 2); // CHECK-EXEC: 4 + TEST(injective_reassignment, dim3(1), dim3(1), 0, false, n, d_n, 2); // CHECK-EXEC: 1 + TEST(injective_reassignment_loop, dim3(1), dim3(1), 0, false, n, d_n,2); // CHECK-EXEC: 1 cudaFree(dummy_in); cudaFree(dummy_out); diff --git a/test/CUDA/RunCudaDemos.C b/test/CUDA/RunCudaDemos.C index d784c712f..5e2fd2d79 100644 --- a/test/CUDA/RunCudaDemos.C +++ b/test/CUDA/RunCudaDemos.C @@ -5,7 +5,7 @@ // CHECK_VECTOR_ADDITION: clad::custom_derivatives::thrust::reduce_pullback // CHECK_VECTOR_ADDITION: clad::custom_derivatives::thrust::transform_pullback -// RUN: ./VectorAddition.out | FileCheck -check-prefix CHECK_VECTOR_ADDITION_EXEC %s +// RUN: %cudarun ./VectorAddition.out | FileCheck -check-prefix CHECK_VECTOR_ADDITION_EXEC %s // CHECK_VECTOR_ADDITION_EXEC: Running vector addition demo. // CHECK_VECTOR_ADDITION_EXEC: Gradients of sum wrt initial x: 1 1 1 1 1 1 1 1 1 1 @@ -17,7 +17,7 @@ // CHECK_PARTICLE_SIMULATION: clad::custom_derivatives::thrust::copy_pullback // CHECK_PARTICLE_SIMULATION: clad::custom_derivatives::thrust::transform_pullback -// RUN: ./ParticleSimulation.out | FileCheck -check-prefix CHECK_PARTICLE_SIMULATION_EXEC %s +// RUN: %cudarun ./ParticleSimulation.out | FileCheck -check-prefix CHECK_PARTICLE_SIMULATION_EXEC %s // CHECK_PARTICLE_SIMULATION_EXEC: Running particle simulation demo. // CHECK_PARTICLE_SIMULATION_EXEC: Gradients of final x-pos sum wrt initial vx: 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 @@ -44,7 +44,7 @@ // CHECK_LINEAR_REGRESSION-NEXT: } // CHECK_LINEAR_REGRESSION-NEXT: } -// RUN: ./LinearRegression.out | FileCheck -check-prefix CHECK_LINEAR_REGRESSION_EXEC %s +// RUN: %cudarun ./LinearRegression.out | FileCheck -check-prefix CHECK_LINEAR_REGRESSION_EXEC %s // CHECK_LINEAR_REGRESSION_EXEC: Running linear regression demo. // CHECK_LINEAR_REGRESSION_EXEC: Gradients of loss wrt weights (w): -9 -18 -27 -36 -45 -54 -63 -72 -81 -90 @@ -53,7 +53,7 @@ // CHECK_BOW_LOGREG: void logistic_loss_batch2_prepared_l2_grad( // CHECK_BOW_LOGREG: clad::custom_derivatives::thrust::inner_product_pullback -// RUN: ./BoWLogisticRegression.out | FileCheck -check-prefix CHECK_BOW_LOGREG_EXEC %s +// RUN: %cudarun ./BoWLogisticRegression.out | FileCheck -check-prefix CHECK_BOW_LOGREG_EXEC %s // CHECK_BOW_LOGREG_EXEC: Running minimal logistic regression demo. // CHECK_BOW_LOGREG_EXEC: Loss: // CHECK_BOW_LOGREG_EXEC: Gradient wrt w: diff --git a/test/CUDA/ThrustAdjacentDifference.cu b/test/CUDA/ThrustAdjacentDifference.cu index 4131b0b7a..5c89aedc8 100644 --- a/test/CUDA/ThrustAdjacentDifference.cu +++ b/test/CUDA/ThrustAdjacentDifference.cu @@ -2,7 +2,7 @@ // RUN: --cuda-gpu-arch=%cudaarch %cudaldflags -oThrustAdjacentDifference.out \ // RUN: -Xclang -verify %s 2>&1 | %filecheck %s // -// RUN: ./ThrustAdjacentDifference.out | %filecheck_exec %s +// RUN: %cudarun ./ThrustAdjacentDifference.out | %filecheck_exec %s // // REQUIRES: cuda-runtime // diff --git a/test/CUDA/ThrustCopy.cu b/test/CUDA/ThrustCopy.cu index dd8b16645..940b42a21 100644 --- a/test/CUDA/ThrustCopy.cu +++ b/test/CUDA/ThrustCopy.cu @@ -2,7 +2,7 @@ // RUN: --cuda-gpu-arch=%cudaarch %cudaldflags -oThrustCopy.out \ // RUN: -Xclang -verify %s 2>&1 | %filecheck %s // -// RUN: ./ThrustCopy.out | %filecheck_exec %s +// RUN: %cudarun ./ThrustCopy.out | %filecheck_exec %s // // REQUIRES: cuda-runtime // diff --git a/test/CUDA/ThrustReduce.cu b/test/CUDA/ThrustReduce.cu index 7db6dac92..d5d48dbbc 100644 --- a/test/CUDA/ThrustReduce.cu +++ b/test/CUDA/ThrustReduce.cu @@ -2,7 +2,7 @@ // RUN: --cuda-gpu-arch=%cudaarch %cudaldflags -oThrustReduce.out \ // RUN: -Xclang -verify %s 2>&1 | %filecheck %s // -// RUN: ./ThrustReduce.out | %filecheck_exec %s +// RUN: %cudarun ./ThrustReduce.out | %filecheck_exec %s // // REQUIRES: cuda-runtime // diff --git a/test/CUDA/ThrustReduceByKey.cu b/test/CUDA/ThrustReduceByKey.cu index a8d4e2903..9d616cdd9 100644 --- a/test/CUDA/ThrustReduceByKey.cu +++ b/test/CUDA/ThrustReduceByKey.cu @@ -2,7 +2,7 @@ // RUN: --cuda-gpu-arch=%cudaarch %cudaldflags -oThrustReduceByKey.out \ // RUN: -Xclang -verify %s 2>&1 | %filecheck %s // -// RUN: ./ThrustReduceByKey.out | %filecheck_exec %s +// RUN: %cudarun ./ThrustReduceByKey.out | %filecheck_exec %s // // REQUIRES: cuda-runtime // diff --git a/test/CUDA/ThrustScan.cu b/test/CUDA/ThrustScan.cu index e84e0ce2c..c0bb0388a 100644 --- a/test/CUDA/ThrustScan.cu +++ b/test/CUDA/ThrustScan.cu @@ -2,7 +2,7 @@ // RUN: --cuda-gpu-arch=%cudaarch %cudaldflags -oThrustScan.out \ // RUN: -Xclang -verify %s 2>&1 | %filecheck %s // -// RUN: ./ThrustScan.out | %filecheck_exec %s +// RUN: %cudarun ./ThrustScan.out | %filecheck_exec %s // // REQUIRES: cuda-runtime // diff --git a/test/CUDA/ThrustScanByKey.cu b/test/CUDA/ThrustScanByKey.cu index 9ef0e31a1..e95f1e0dd 100644 --- a/test/CUDA/ThrustScanByKey.cu +++ b/test/CUDA/ThrustScanByKey.cu @@ -2,7 +2,7 @@ // RUN: --cuda-gpu-arch=%cudaarch %cudaldflags -oThrustScanByKey.out \ // RUN: -Xclang -verify %s 2>&1 | %filecheck %s // -// RUN: ./ThrustScanByKey.out | %filecheck_exec %s +// RUN: %cudarun ./ThrustScanByKey.out | %filecheck_exec %s // // REQUIRES: cuda-runtime // diff --git a/test/CUDA/ThrustSortByKey.cu b/test/CUDA/ThrustSortByKey.cu index 8eff9b700..c452ae4e2 100644 --- a/test/CUDA/ThrustSortByKey.cu +++ b/test/CUDA/ThrustSortByKey.cu @@ -2,7 +2,7 @@ // RUN: --cuda-gpu-arch=%cudaarch %cudaldflags -oThrustSortByKey.out \ // RUN: -Xclang -verify %s 2>&1 | %filecheck %s // -// RUN: ./ThrustSortByKey.out | %filecheck_exec %s +// RUN: %cudarun ./ThrustSortByKey.out | %filecheck_exec %s // // REQUIRES: cuda-runtime // diff --git a/test/CUDA/ThrustTransform.cu b/test/CUDA/ThrustTransform.cu index 402b054b3..2dd632d47 100644 --- a/test/CUDA/ThrustTransform.cu +++ b/test/CUDA/ThrustTransform.cu @@ -2,7 +2,7 @@ // RUN: --cuda-gpu-arch=%cudaarch %cudaldflags -oThrustTransform.out \ // RUN: -Xclang -verify %s 2>&1 | %filecheck %s // -// RUN: ./ThrustTransform.out | %filecheck_exec %s +// RUN: %cudarun ./ThrustTransform.out | %filecheck_exec %s // // REQUIRES: cuda-runtime // diff --git a/test/CUDA/ThrustTransformReduce.cu b/test/CUDA/ThrustTransformReduce.cu index 7518c677e..a0a417c0e 100644 --- a/test/CUDA/ThrustTransformReduce.cu +++ b/test/CUDA/ThrustTransformReduce.cu @@ -2,7 +2,7 @@ // RUN: --cuda-gpu-arch=%cudaarch %cudaldflags -oThrustTransformReduce.out \ // RUN: -Xclang -verify %s 2>&1 | %filecheck %s // -// RUN: ./ThrustTransformReduce.out | %filecheck_exec %s +// RUN: %cudarun ./ThrustTransformReduce.out | %filecheck_exec %s // // REQUIRES: cuda-runtime // diff --git a/test/lit.cfg b/test/lit.cfg index 8df893a8f..11ae202a4 100644 --- a/test/lit.cfg +++ b/test/lit.cfg @@ -323,6 +323,15 @@ if clang_repl_path: #if loadable_module: # config.available_features.add('loadable_module') +cuda_run_prefix = '' +if lit_config.params.get('cuda_sanitizer'): + cuda_tool_path = config.environment['PATH'] + san_path = lit.util.which('compute-sanitizer', os.path.join(config.cuda_path, 'bin')) + if san_path is None: + lit_config.fatal("cuda_sanitizer is enabled but compute-sanitizer was not found") + cuda_run_prefix = san_path + ' --tool memcheck --error-exitcode=1 --print-limit=0' +config.substitutions.append(('%cudarun', cuda_run_prefix)) + libcudart_path = lit.util.which('libcudart.so', config.cuda_libdir) if libcudart_path is not None: config.available_features.add('cuda-runtime') @@ -337,6 +346,7 @@ if libcudart_path is not None: config.environment['CUDA_VISIBLE_DEVICES'] = os.environ['CUDA_VISIBLE_DEVICES'] config.substitutions.append(('%cudaarch', config.cuda_test_arch)) + if(config.have_enzyme): config.available_features.add('Enzyme')