Remove explicit eager function loading from the CUDA runtime wrappers and replace it with CUDA_MODULE_LOADING=EAGER in the test.

chsigg · chsigg · commit d38f98f20120 · 2025-04-13T08:07:05.000+02:00
Format test file.
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -15,7 +15,6 @@
 #include "mlir/ExecutionEngine/CRunnerUtils.h"
 
 #include <cstdio>
-#include <vector>
 
 #include "cuda.h"
 #include "cuda_bf16.h"
@@ -122,16 +121,6 @@ mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) {
   ScopedContext scopedContext;
   CUmodule module = nullptr;
   CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data));
-  // Preload functions in the module so that the first call to
-  // cuModuleGetFunction below doesn't synchronize context.
-  unsigned numFunctions = 0;
-  CUDA_REPORT_IF_ERROR(cuModuleGetFunctionCount(&numFunctions, module));
-  std::vector<CUfunction> functions(numFunctions);
-  CUDA_REPORT_IF_ERROR(
-      cuModuleEnumerateFunctions(functions.data(), numFunctions, module));
-  for (CUfunction function : functions) {
-    CUDA_REPORT_IF_ERROR(cuFuncLoad(function));
-  }
   return module;
 }
 
diff --git a/mlir/test/Integration/GPU/CUDA/concurrent-kernels.mlir b/mlir/test/Integration/GPU/CUDA/concurrent-kernels.mlir
@@ -1,48 +1,53 @@
-// Tests that we can run multiple kernels concurrently. Runs two kernels, which
-// increment a global atomic counter, then wait for the counter to reach 2.
+// Tests multiple kernels running concurrently. Runs two kernels, which
+// increment a global atomic counter and wait for the counter to reach 2.
 //
 // RUN: mlir-opt %s \
 // RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
-// RUN: | mlir-runner \
+// RUN: | CUDA_MODULE_LOADING=EAGER mlir-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
 // RUN:   --entry-point-result=void
 
+// CUDA_MODULE_LOADING=EAGER avoids an implicit context synchronization on first
+// use of each kernel. It is technically not needed for this test, because
+// there is only one kernel.
+
 module attributes {gpu.container_module} {
-    gpu.module @kernels {
-        gpu.func @kernel(%memref: memref<i32>) kernel {
-            %c0 = arith.constant 0 : i32
-            %c1 = arith.constant 1 : i32
-            %c2 = arith.constant 2 : i32
-            %block = memref.atomic_rmw addi %c1, %memref[] : (i32, memref<i32>) -> i32
-            scf.while: () -> () {
-                %value = memref.atomic_rmw addi %c0, %memref[] : (i32, memref<i32>) -> i32
-                %cond = arith.cmpi slt, %value, %c2 : i32
-                scf.condition(%cond)
-            } do {
-                scf.yield
-            }
-            gpu.return
-        }
+
+gpu.module @kernels {
+  gpu.func @kernel(%memref: memref<i32>) kernel {
+    %c0 = arith.constant 0 : i32
+    %c1 = arith.constant 1 : i32
+    %c2 = arith.constant 2 : i32
+    %block = memref.atomic_rmw addi %c1, %memref[] : (i32, memref<i32>) -> i32
+    scf.while: () -> () {
+      %value = memref.atomic_rmw addi %c0, %memref[] : (i32, memref<i32>) -> i32
+      %cond = arith.cmpi slt, %value, %c2 : i32
+      scf.condition(%cond)
+    } do {
+      scf.yield
     }
+    gpu.return
+  }
+}
 
-    func.func @main() {
-        %memref = gpu.alloc host_shared () : memref<i32>
-        %c0 = arith.constant 0 : i32
-        memref.store %c0, %memref[] : memref<i32>
+func.func @main() {
+  %c0 = arith.constant 0 : i32
+  %c1 = arith.constant 1 : index
+  %memref = gpu.alloc host_shared () : memref<i32>
+  memref.store %c0, %memref[] : memref<i32>
+  %0 = gpu.wait async
+  %1 = gpu.wait async
+  %2 = gpu.launch_func async [%0] @kernels::@kernel
+      blocks in (%c1, %c1, %c1)
+      threads in (%c1, %c1, %c1)
+      args(%memref: memref<i32>)
+  %3 = gpu.launch_func async [%1] @kernels::@kernel
+      blocks in (%c1, %c1, %c1)
+      threads in (%c1, %c1, %c1)
+      args(%memref: memref<i32>)
+  gpu.wait [%2, %3]
+  return
+}
 
-        %0 = gpu.wait async
-        %1 = gpu.wait async
-        %c1 = arith.constant 1 : index
-        %2 = gpu.launch_func async [%0] @kernels::@kernel
-            blocks in (%c1, %c1, %c1)
-            threads in (%c1, %c1, %c1)
-            args(%memref: memref<i32>)
-        %3 = gpu.launch_func async [%1] @kernels::@kernel
-            blocks in (%c1, %c1, %c1)
-            threads in (%c1, %c1, %c1)
-            args(%memref: memref<i32>)
-        gpu.wait [%2, %3]
-        return
-    }
 }