Commit d38f98f

Remove explicit eager loading from the runtime wrappers and replace it with CUDA_MODULE_LOADING=EAGER in the test.
Format the test file.
1 parent: 473482c

2 files changed: +41 −47 lines

mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp

Lines changed: 0 additions & 11 deletions
@@ -15,7 +15,6 @@
 #include "mlir/ExecutionEngine/CRunnerUtils.h"

 #include <cstdio>
-#include <vector>

 #include "cuda.h"
 #include "cuda_bf16.h"
@@ -122,16 +121,6 @@ mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) {
   ScopedContext scopedContext;
   CUmodule module = nullptr;
   CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data));
-  // Preload functions in the module so that the first call to
-  // cuModuleGetFunction below doesn't synchronize context.
-  unsigned numFunctions = 0;
-  CUDA_REPORT_IF_ERROR(cuModuleGetFunctionCount(&numFunctions, module));
-  std::vector<CUfunction> functions(numFunctions);
-  CUDA_REPORT_IF_ERROR(
-      cuModuleEnumerateFunctions(functions.data(), numFunctions, module));
-  for (CUfunction function : functions) {
-    CUDA_REPORT_IF_ERROR(cuFuncLoad(function));
-  }
   return module;
 }

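The deleted block preloaded every function in the module through the CUDA driver API (cuModuleGetFunctionCount, cuModuleEnumerateFunctions, cuFuncLoad). The test below gets the same effect from the driver's CUDA_MODULE_LOADING environment variable, which selects lazy or eager module loading and is read once during cuInit. A minimal standalone sketch of that mechanism, assuming the POSIX setenv and a plain driver-API program; this is an illustration of the variable's semantics, not the MLIR runtime code:

// Minimal sketch, not MLIR runtime code: CUDA_MODULE_LOADING must be set
// before cuInit(), because the driver samples the variable only once.
#include <cstdlib>
#include "cuda.h"

int main() {
  // Request eager module loading instead of the default lazy loading
  // (CUDA 11.7+). With EAGER, kernels are fully loaded at module-load
  // time, so the first cuModuleGetFunction/cuLaunchKernel does not
  // trigger a deferred load that synchronizes the context.
  setenv("CUDA_MODULE_LOADING", "EAGER", /*overwrite=*/1);
  cuInit(0);

  CUdevice device;
  CUcontext context;
  cuDeviceGet(&device, 0);
  cuCtxCreate(&context, 0, device);

  // CUmodule module;
  // cuModuleLoadData(&module, cubinData);  // kernels already loaded here

  cuCtxDestroy(context);
  return 0;
}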
Lines changed: 41 additions & 36 deletions
@@ -1,48 +1,53 @@
-// Tests that we can run multiple kernels concurrently. Runs two kernels, which
-// increment a global atomic counter, then wait for the counter to reach 2.
+// Tests multiple kernels running concurrently. Runs two kernels, which
+// increment a global atomic counter and wait for the counter to reach 2.
 //
 // RUN: mlir-opt %s \
 // RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
-// RUN: | mlir-runner \
+// RUN: | CUDA_MODULE_LOADING=EAGER mlir-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
 // RUN:   --entry-point-result=void

+// CUDA_MODULE_LOADING=EAGER avoids an implicit context synchronization on first
+// use of each kernel. It is technically not needed for this test, because
+// there is only one kernel.
+
 module attributes {gpu.container_module} {
-gpu.module @kernels {
-  gpu.func @kernel(%memref: memref<i32>) kernel {
-    %c0 = arith.constant 0 : i32
-    %c1 = arith.constant 1 : i32
-    %c2 = arith.constant 2 : i32
-    %block = memref.atomic_rmw addi %c1, %memref[] : (i32, memref<i32>) -> i32
-    scf.while: () -> () {
-      %value = memref.atomic_rmw addi %c0, %memref[] : (i32, memref<i32>) -> i32
-      %cond = arith.cmpi slt, %value, %c2 : i32
-      scf.condition(%cond)
-    } do {
-      scf.yield
-    }
-    gpu.return
-  }
+
+  gpu.module @kernels {
+    gpu.func @kernel(%memref: memref<i32>) kernel {
+      %c0 = arith.constant 0 : i32
+      %c1 = arith.constant 1 : i32
+      %c2 = arith.constant 2 : i32
+      %block = memref.atomic_rmw addi %c1, %memref[] : (i32, memref<i32>) -> i32
+      scf.while: () -> () {
+        %value = memref.atomic_rmw addi %c0, %memref[] : (i32, memref<i32>) -> i32
+        %cond = arith.cmpi slt, %value, %c2 : i32
+        scf.condition(%cond)
+      } do {
+        scf.yield
       }
+      gpu.return
+    }
+  }

-func.func @main() {
-  %memref = gpu.alloc host_shared () : memref<i32>
-  %c0 = arith.constant 0 : i32
-  memref.store %c0, %memref[] : memref<i32>
+  func.func @main() {
+    %c0 = arith.constant 0 : i32
+    %c1 = arith.constant 1 : index
+    %memref = gpu.alloc host_shared () : memref<i32>
+    memref.store %c0, %memref[] : memref<i32>
+    %0 = gpu.wait async
+    %1 = gpu.wait async
+    %2 = gpu.launch_func async [%0] @kernels::@kernel
+        blocks in (%c1, %c1, %c1)
+        threads in (%c1, %c1, %c1)
+        args(%memref: memref<i32>)
+    %3 = gpu.launch_func async [%1] @kernels::@kernel
+        blocks in (%c1, %c1, %c1)
+        threads in (%c1, %c1, %c1)
+        args(%memref: memref<i32>)
+    gpu.wait [%2, %3]
+    return
+  }

-  %0 = gpu.wait async
-  %1 = gpu.wait async
-  %c1 = arith.constant 1 : index
-  %2 = gpu.launch_func async [%0] @kernels::@kernel
-      blocks in (%c1, %c1, %c1)
-      threads in (%c1, %c1, %c1)
-      args(%memref: memref<i32>)
-  %3 = gpu.launch_func async [%1] @kernels::@kernel
-      blocks in (%c1, %c1, %c1)
-      threads in (%c1, %c1, %c1)
-      args(%memref: memref<i32>)
-  gpu.wait [%2, %3]
-  return
-}
 }
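To make the concurrency requirement of the test explicit: the two gpu.wait async tokens give each launch its own stream, and each launch of @kernels::@kernel adds 1 to the shared counter and then spins until the counter reaches 2, so the final gpu.wait in @main returns only if both kernels run at the same time. A hypothetical CUDA C++ analogue of the kernel body, as an illustration only, not code from this commit:

// Hypothetical CUDA analogue of @kernels::@kernel; not part of this commit.
// Each of the two launches increments the shared counter once, then spins
// until it sees both increments, so a run deadlocks unless the kernels
// execute concurrently.
__global__ void kernel(int *counter) {
  // %block = memref.atomic_rmw addi %c1, %memref[] ...
  atomicAdd(counter, 1);
  // scf.while loop: atomicAdd(counter, 0) is an atomic read of the value.
  while (atomicAdd(counter, 0) < 2) {
    // busy-wait for the other kernel's increment
  }
}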
