This repository was archived by the owner on Mar 25, 2025. It is now read-only.

Commit 95782bc

[LLVM][GPU] Added CUDADriver to execute benchmark on GPU (#829)
- Added `CUDADriver` to compile the LLVM IR string generated by `CodegenLLVMVisitor` to a PTX string and then execute it using the CUDA API
- Added the ability to select the GPU architecture to compile for, and to set the proper architecture based on the GPU that is going to be used
- Link the `libdevice` math library with the GPU LLVM module
- Handle kernel and wrapper function attributes properly for GPU execution (the wrapper function is annotated as `kernel` and the kernel as `device`)
- Small fixes in the InstanceStruct declaration and setup to allocate the pointer variables properly, including the shadow variables
- Added CI tests that run small benchmarks on CPU and GPU on BB5
- Added replacement of the `log` math function for SLEEF and libdevice, and of `pow` and `fabs` for libdevice
- Added GPU execution ability in PyJIT
- Small improvement in the PyJIT benchmark Python script to handle arguments and GPU execution
- Separated benchmark info from the benchmark driver
- Added `hh` and `expsyn` mod files in the benchmarking tests
1 parent 5952a87 commit 95782bc

29 files changed: +1045 −125 lines
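For context on the new `CUDADriver`: the flow it implements is the standard CUDA driver API sequence of loading the PTX string produced from the LLVM module, resolving the `kernel`-annotated wrapper by name, and launching it with the chosen grid/block dimensions. The sketch below only illustrates that sequence and is not the commit's actual driver; the wrapper name `__nrn_state_hh_wrapper`, the `instance_data` argument and the launch dimensions are hypothetical placeholders.

```cpp
// Minimal sketch of the PTX -> load -> launch flow, assuming `ptx` holds the
// PTX emitted from the LLVM module. Kernel name, launch dimensions and the
// instance pointer are illustrative placeholders, not NMODL's real values.
#include <cuda.h>
#include <stdexcept>
#include <string>

void run_wrapper_on_gpu(const std::string& ptx, void* instance_data) {
    auto check = [](CUresult res, const char* what) {
        if (res != CUDA_SUCCESS)
            throw std::runtime_error(std::string("CUDA error in ") + what);
    };

    CUdevice device;
    CUcontext context;
    check(cuInit(0), "cuInit");
    check(cuDeviceGet(&device, /*ordinal=*/0), "cuDeviceGet");
    check(cuCtxCreate(&context, 0, device), "cuCtxCreate");

    // Load the PTX; the driver JIT-compiles it for the current GPU.
    CUmodule module;
    check(cuModuleLoadData(&module, ptx.c_str()), "cuModuleLoadData");

    // Look up the "kernel"-annotated wrapper, which takes a single void* argument.
    CUfunction wrapper;
    check(cuModuleGetFunction(&wrapper, module, "__nrn_state_hh_wrapper"),
          "cuModuleGetFunction");

    void* args[] = {&instance_data};
    check(cuLaunchKernel(wrapper,
                         /*gridDimX=*/4096, 1, 1,
                         /*blockDimX=*/256, 1, 1,
                         /*sharedMemBytes=*/0, /*stream=*/nullptr,
                         args, nullptr),
          "cuLaunchKernel");
    check(cuCtxSynchronize(), "cuCtxSynchronize");

    cuModuleUnload(module);
    cuCtxDestroy(context);
}
```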

.gitlab-ci.yml

Lines changed: 25 additions & 12 deletions
@@ -26,7 +26,6 @@ trigger cvf:
   variables:
     SPACK_PACKAGE: nmodl
     SPACK_PACKAGE_SPEC: ~legacy-unit+python+llvm
-    SPACK_EXTRA_MODULES: llvm
     SPACK_INSTALL_EXTRA_FLAGS: -v
 
 spack_setup:
@@ -44,14 +43,6 @@ build:intel:
   variables:
     SPACK_PACKAGE_COMPILER: intel
 
-build:gcc:
-  extends:
-    - .spack_build
-    - .spack_nmodl
-  variables:
-    SPACK_PACKAGE_COMPILER: gcc
-    SPACK_PACKAGE_DEPENDENCIES: ^bison%gcc^flex%gcc^py-jinja2%gcc^py-sympy%gcc^py-pyyaml%gcc
-
 .nmodl_tests:
   variables:
     # https://github.com/BlueBrain/nmodl/issues/737
@@ -63,8 +54,30 @@ test:intel:
     - .nmodl_tests
   needs: ["build:intel"]
 
-test:gcc:
+.benchmark_config:
+  variables:
+    bb5_ntasks: 1
+    bb5_cpus_per_task: 1
+    bb5_memory: 16G
+    bb5_exclusive: full
+    bb5_constraint: gpu_32g # CascadeLake CPU & V100 GPU node
+
+.build_allocation:
+  variables:
+    bb5_ntasks: 2 # so we block 16 cores
+    bb5_cpus_per_task: 8 # ninja -j {this}
+    bb5_memory: 76G # ~16*384/80
+
+build_cuda:gcc:
+  extends: [.spack_build, .build_allocation]
+  variables:
+    SPACK_PACKAGE: nmodl
+    SPACK_PACKAGE_SPEC: ~legacy-unit+python+llvm+llvm_cuda
+    SPACK_INSTALL_EXTRA_FLAGS: -v
+    SPACK_PACKAGE_COMPILER: gcc
+
+test_benchmark:gcc:
   extends:
+    - .benchmark_config
     - .ctest
-    - .nmodl_tests
-  needs: ["build:gcc"]
+  needs: ["build_cuda:gcc"]

CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,7 @@ option(NMODL_ENABLE_PYTHON_BINDINGS "Enable pybind11 based python bindings" ON)
 option(NMODL_ENABLE_LEGACY_UNITS "Use original faraday, R, etc. instead of 2019 nist constants" OFF)
 option(NMODL_ENABLE_LLVM "Enable LLVM based code generation" ON)
 option(NMODL_ENABLE_LLVM_GPU "Enable LLVM based GPU code generation" ON)
+option(NMODL_ENABLE_LLVM_CUDA "Enable LLVM CUDA backend to run GPU benchmark" OFF)
 option(NMODL_ENABLE_JIT_EVENT_LISTENERS "Enable JITEventListener for Perf and Vtune" OFF)
 
 if(NMODL_ENABLE_LEGACY_UNITS)
@@ -162,6 +163,7 @@ if(NMODL_ENABLE_LLVM)
   if(NMODL_ENABLE_LLVM_CUDA)
     enable_language(CUDA)
     find_package(CUDAToolkit)
+    include_directories(${CUDAToolkit_INCLUDE_DIRS})
     add_definitions(-DNMODL_LLVM_CUDA_BACKEND)
   endif()
 endif()

INSTALL.md

Lines changed: 24 additions & 1 deletion
@@ -21,7 +21,7 @@ To build the project from source, a modern C++ compiler with C++14 support is ne
 
 - flex (>=2.6)
 - bison (>=3.0)
-- CMake (>=3.15)
+- CMake (>=3.17)
 - Python (>=3.6)
 - Python packages : jinja2 (>=2.10), pyyaml (>=3.13), pytest (>=4.0.0), sympy (>=1.3), textwrap
 
@@ -141,6 +141,29 @@ export NMODL_WRAPLIB=/opt/nmodl/lib/libpywrapper.so
 **Note**: In order for all unit tests to function correctly when building without linking against libpython we must
 set `NMODL_PYLIB` before running cmake!
 
+### Using the CUDA backend to run benchmarks
+
+`NMODL` supports generating code for, and executing benchmarks on, an `NVIDIA` GPU via its benchmark infrastructure using the `LLVM` backend. To enable the `CUDA` backend that compiles and executes the GPU code, set the following `CMake` flag when building `NMODL`:
+```
+-DNMODL_ENABLE_LLVM_CUDA=ON
+```
+
+To find the needed `CUDA` libraries (`cudart` and `nvrtc`), the CUDA Toolkit must be installed on your system.
+This can be done by installing the CUDA Toolkit from the [CUDA Toolkit website](https://developer.nvidia.com/cuda-downloads) or by installing the `CUDA` spack package and loading the corresponding module.
+
+Then, given a supported MOD file, you can execute the benchmark on your supported NVIDIA GPU by running the following command:
+```
+./bin/nmodl <file>.mod llvm --no-debug --ir --opt-level-ir 3 gpu --target-arch "sm_80" --name "nvptx64" --math-library libdevice benchmark --run --libs "${CUDA_ROOT}/nvvm/libdevice/libdevice.10.bc" --opt-level-codegen 3 --instance-size 10000000 --repeat 2 --grid-dim-x 4096 --block-dim-x 256
+```
+The above command executes the benchmark on a GPU with `Compute Architecture` `sm_80` and links the generated code to the `libdevice` optimized math library provided by `NVIDIA`.
+With the same command you can also select the optimization level of the generated code, the instance size of the generated data, the number of repetitions, and the grid and block dimensions for the GPU execution.
+
+**Note**: For the CUDA backend to compile and execute the generated code on the GPU, the installed CUDA Toolkit must match the `CUDA` version provided by the NVIDIA driver on the system that will run the benchmark.
+You can find that CUDA version by running the following command:
+```
+nvidia-smi
+```
+and noting the `CUDA Version` reported there. For example, if the `CUDA Version` reported by `nvidia-smi` is CUDA 11.4, you need to install `CUDA Toolkit 11.4.*` to be able to compile and execute the GPU code.
 
 ## Testing the Installed Module
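A side note on the toolkit/driver version requirement above: besides `nvidia-smi`, the versions can also be queried programmatically through the CUDA runtime API (`cudart` is already one of the required libraries). A minimal sketch, assuming `cudart` is linked:

```cpp
// Queries the CUDA version supported by the installed driver and the version
// of the CUDA runtime (toolkit) this program was built against.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    int driver_version = 0;
    int runtime_version = 0;
    cudaDriverGetVersion(&driver_version);    // e.g. 11040 for CUDA 11.4
    cudaRuntimeGetVersion(&runtime_version);  // version of the linked cudart
    std::printf("driver supports CUDA %d.%d, runtime is CUDA %d.%d\n",
                driver_version / 1000, (driver_version % 1000) / 10,
                runtime_version / 1000, (runtime_version % 1000) / 10);
    return 0;
}
```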

src/codegen/codegen_driver.hpp

Lines changed: 0 additions & 3 deletions
@@ -33,9 +33,6 @@ struct CodeGenConfig {
     /// true if cuda code to be generated
     bool cuda_backend = false;
 
-    /// true if llvm code to be generated
-    bool llvm_backend = false;
-
     /// true if sympy should be used for solving ODEs analytically
     bool sympy_analytic = false;

src/codegen/llvm/codegen_llvm_helper_visitor.cpp

Lines changed: 11 additions & 7 deletions
@@ -239,13 +239,6 @@ std::shared_ptr<ast::InstanceStruct> CodegenLLVMHelperVisitor::create_instance_s
     add_var_with_type(VOLTAGE_VAR, FLOAT_TYPE, /*is_pointer=*/1);
     add_var_with_type(NODE_INDEX_VAR, INTEGER_TYPE, /*is_pointer=*/1);
 
-    // add dt, t, celsius
-    add_var_with_type(naming::NTHREAD_T_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0);
-    add_var_with_type(naming::NTHREAD_DT_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0);
-    add_var_with_type(naming::CELSIUS_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0);
-    add_var_with_type(naming::SECOND_ORDER_VARIABLE, INTEGER_TYPE, /*is_pointer=*/0);
-    add_var_with_type(naming::MECH_NODECOUNT_VAR, INTEGER_TYPE, /*is_pointer=*/0);
-
     // As we do not have `NrnThread` object as an argument, we store points to rhs
     // and d to in the instance struct as well. Also need their respective shadow variables
     // in case of point process mechanism.
@@ -256,6 +249,17 @@ std::shared_ptr<ast::InstanceStruct> CodegenLLVMHelperVisitor::create_instance_s
     add_var_with_type(naming::NTHREAD_RHS_SHADOW, FLOAT_TYPE, /*is_pointer=*/1);
     add_var_with_type(naming::NTHREAD_D_SHADOW, FLOAT_TYPE, /*is_pointer=*/1);
 
+    // NOTE: All the pointer variables should be declared before the scalar variables otherwise
+    // the allocation of memory for the variables in the InstanceStruct and their offsets will be
+    // wrong
+
+    // add dt, t, celsius
+    add_var_with_type(naming::NTHREAD_T_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0);
+    add_var_with_type(naming::NTHREAD_DT_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0);
+    add_var_with_type(naming::CELSIUS_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0);
+    add_var_with_type(naming::SECOND_ORDER_VARIABLE, INTEGER_TYPE, /*is_pointer=*/0);
+    add_var_with_type(naming::MECH_NODECOUNT_VAR, INTEGER_TYPE, /*is_pointer=*/0);
+
     return std::make_shared<ast::InstanceStruct>(codegen_vars);
 }
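The `NOTE` added in this hunk is, as far as the diff shows, a layout concern: if 8-byte pointer members are interleaved with 4-byte scalars, padding shifts the member offsets, so offsets computed under the assumption "all pointers first, then scalars" no longer match the actual struct layout. A small standalone C++ illustration of that effect on a typical 64-bit target (not NMODL code; the member names are made up):

```cpp
// Shows why grouping pointer members before scalar members keeps offsets
// predictable: interleaving a 4-byte scalar between 8-byte pointers
// introduces padding. Standalone example, not NMODL's InstanceStruct.
#include <cstddef>
#include <cstdio>

struct PointersFirst {
    double* rhs;
    double* d;
    double t;
    int second_order;
    int node_count;
};

struct Interleaved {
    double* rhs;
    int second_order;  // 4-byte scalar followed by an 8-byte pointer ...
    double* d;         // ... so padding is inserted before `d` on typical 64-bit ABIs
    double t;
    int node_count;
};

int main() {
    std::printf("PointersFirst: d at offset %zu, sizeof %zu\n",
                offsetof(PointersFirst, d), sizeof(PointersFirst));
    std::printf("Interleaved:   d at offset %zu, sizeof %zu\n",
                offsetof(Interleaved, d), sizeof(Interleaved));
    return 0;
}
```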

src/codegen/llvm/codegen_llvm_visitor.cpp

Lines changed: 61 additions & 17 deletions
@@ -25,6 +25,10 @@ namespace codegen {
 /* Helper routines */
 /****************************************************************************************/
 
+static std::string get_wrapper_name(const std::string& kernel_name) {
+    return "__" + kernel_name + "_wrapper";
+}
+
 /// A utility to check for supported Statement AST nodes.
 static bool is_supported_statement(const ast::Statement& statement) {
     return statement.is_codegen_atomic_statement() || statement.is_codegen_for_statement() ||
@@ -55,15 +59,36 @@ static bool can_vectorize(const ast::CodegenForStatement& statement, symtab::Sym
     return unsupported.empty() && supported.size() <= 1;
 }
 
-void CodegenLLVMVisitor::annotate_kernel_with_nvvm(llvm::Function* kernel) {
+void CodegenLLVMVisitor::annotate_kernel_with_nvvm(llvm::Function* kernel,
+                                                   const std::string& annotation = "kernel") {
     llvm::Metadata* metadata[] = {llvm::ValueAsMetadata::get(kernel),
-                                  llvm::MDString::get(*context, "kernel"),
+                                  llvm::MDString::get(*context, annotation),
                                   llvm::ValueAsMetadata::get(
                                       llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), 1))};
     llvm::MDNode* node = llvm::MDNode::get(*context, metadata);
     module->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(node);
 }
 
+void CodegenLLVMVisitor::annotate_wrapper_kernels_with_nvvm() {
+    // First clear all the nvvm annotations from the module
+    auto module_named_metadata = module->getNamedMetadata("nvvm.annotations");
+    module->eraseNamedMetadata(module_named_metadata);
+
+    // Then each kernel should be annotated as "device" function and wrappers should be annotated as
+    // "kernel" functions
+    std::vector<std::string> kernel_names;
+    find_kernel_names(kernel_names);
+
+    for (const auto& kernel_name: kernel_names) {
+        // Get the kernel function.
+        auto kernel = module->getFunction(kernel_name);
+        // Get the kernel wrapper function.
+        auto kernel_wrapper = module->getFunction(get_wrapper_name(kernel_name));
+        annotate_kernel_with_nvvm(kernel, "device");
+        annotate_kernel_with_nvvm(kernel_wrapper, "kernel");
+    }
+}
+
 llvm::Value* CodegenLLVMVisitor::accept_and_get(const std::shared_ptr<ast::Node>& node) {
     node->accept(*this);
     return ir_builder.pop_last_value();
@@ -402,12 +427,17 @@ void CodegenLLVMVisitor::wrap_kernel_functions() {
         auto kernel = module->getFunction(kernel_name);
 
         // Create a wrapper void function that takes a void pointer as a single argument.
-        llvm::Type* i32_type = ir_builder.get_i32_type();
+        llvm::Type* return_type;
+        if (platform.is_gpu()) {
+            return_type = ir_builder.get_void_type();
+        } else {
+            return_type = ir_builder.get_i32_type();
+        }
         llvm::Type* void_ptr_type = ir_builder.get_i8_ptr_type();
         llvm::Function* wrapper_func = llvm::Function::Create(
-            llvm::FunctionType::get(i32_type, {void_ptr_type}, /*isVarArg=*/false),
+            llvm::FunctionType::get(return_type, {void_ptr_type}, /*isVarArg=*/false),
             llvm::Function::ExternalLinkage,
-            "__" + kernel_name + "_wrapper",
+            get_wrapper_name(kernel_name),
             *module);
 
         // Optionally, add debug information for the wrapper function.
@@ -425,9 +455,23 @@ void CodegenLLVMVisitor::wrap_kernel_functions() {
         args.push_back(bitcasted);
         ir_builder.create_function_call(kernel, args, /*use_result=*/false);
 
-        // Create a 0 return value and a return instruction.
-        ir_builder.create_i32_constant(0);
-        ir_builder.create_return(ir_builder.pop_last_value());
+        // create return instructions and annotate wrapper with certain attributes depending on
+        // the backend type
+        if (platform.is_gpu()) {
+            // return void
+            ir_builder.create_return();
+        } else {
+            // Create a 0 return value and a return instruction.
+            ir_builder.create_i32_constant(0);
+            ir_builder.create_return(ir_builder.pop_last_value());
+            ir_builder.set_function(wrapper_func);
+            ir_builder.set_kernel_attributes();
+        }
+        ir_builder.clear_function();
+    }
+    // for GPU we need to first clear all the annotations and then reapply them
+    if (platform.is_gpu()) {
+        annotate_wrapper_kernels_with_nvvm();
     }
 }
@@ -823,9 +867,6 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) {
 
     // Handle GPU optimizations (CUDA platfroms only for now).
     if (platform.is_gpu()) {
-        if (!platform.is_CUDA_gpu())
-            throw std::runtime_error("Error: unsupported GPU architecture!\n");
-
         // We only support CUDA backends anyway, so this works for now.
         utils::initialise_nvptx_passes();
 
@@ -839,15 +880,12 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) {
         logger->debug("Dumping generated IR...\n" + dump_module());
     }
 
-    // If the output directory is specified, save the IR to .ll file.
-    if (output_dir != ".") {
-        utils::save_ir_to_ll_file(*module, output_dir + "/" + mod_filename);
-    }
-
     // Setup CodegenHelper for C++ wrapper file
     setup(node);
+    // Print C++ wrapper file
     print_wrapper_routines();
-    print_target_file();
+    // Print LLVM IR module to <mod_filename>.ll file
+    utils::save_ir_to_ll_file(*module, output_dir + "/" + mod_filename);
 }
 
 void CodegenLLVMVisitor::print_mechanism_range_var_structure() {
@@ -960,6 +998,12 @@ void CodegenLLVMVisitor::print_instance_variable_setup() {
     // Pass ml->nodeindices pointer to node_index
     printer->add_line("inst->node_index = ml->nodeindices;");
 
+    // Setup rhs, d and their shadow vectors
+    printer->add_line(fmt::format("inst->{} = nt->_actual_rhs;", naming::NTHREAD_RHS));
+    printer->add_line(fmt::format("inst->{} = nt->_actual_d;", naming::NTHREAD_D));
+    printer->add_line(fmt::format("inst->{} = nt->_shadow_rhs;", naming::NTHREAD_RHS_SHADOW));
+    printer->add_line(fmt::format("inst->{} = nt->_shadow_d;", naming::NTHREAD_D_SHADOW));
+
     // Setup global variables
     printer->add_line("inst->{0} = nt->{0};"_format(naming::NTHREAD_T_VARIABLE));
     printer->add_line("inst->{0} = nt->{0};"_format(naming::NTHREAD_DT_VARIABLE));
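For reference, the `nvvm.annotations` named metadata used above is how the NVPTX backend identifies kernel entry points: a function tagged `kernel` becomes a launchable PTX `.entry`, while this commit additionally tags the original compute functions with a `device` annotation and calls them from the wrapper. A minimal standalone sketch of the tagging step using the LLVM C++ API (a distillation of what the visitor does, not a drop-in replacement for it):

```cpp
// Attaches an nvvm.annotations entry to a function, mirroring the mechanism
// used by annotate_kernel_with_nvvm in the diff above.
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"

#include <string>

void mark_with_nvvm_annotation(llvm::Module& module,
                               llvm::Function* function,
                               const std::string& annotation /* "kernel" or "device" */) {
    llvm::LLVMContext& context = module.getContext();
    llvm::Metadata* fields[] = {
        llvm::ValueAsMetadata::get(function),
        llvm::MDString::get(context, annotation),
        llvm::ValueAsMetadata::get(
            llvm::ConstantInt::get(llvm::Type::getInt32Ty(context), 1))};
    // Append the (function, annotation, 1) tuple to the module-level metadata.
    module.getOrInsertNamedMetadata("nvvm.annotations")
        ->addOperand(llvm::MDNode::get(context, fields));
}
```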

src/codegen/llvm/codegen_llvm_visitor.hpp

Lines changed: 6 additions & 6 deletions
@@ -139,10 +139,6 @@ class CodegenLLVMVisitor: public CodegenCVisitor {
         return str;
     }
 
-    void print_target_file() const {
-        target_printer->add_multi_line(dump_module());
-    }
-
     /// Fills the container with the names of kernel functions from the MOD file.
     void find_kernel_names(std::vector<std::string>& container);
 
@@ -303,8 +299,12 @@ class CodegenLLVMVisitor: public CodegenCVisitor {
     void print_compute_functions() override;
 
   private:
-    // Annotates kernel function with NVVM metadata.
-    void annotate_kernel_with_nvvm(llvm::Function* kernel);
+    /// Annotates kernel function with NVVM metadata.
+    void annotate_kernel_with_nvvm(llvm::Function* kernel, const std::string& annotation);
+
+    /// Handles NVVM function annotations when we create the wrapper functions. All original kernels
+    /// should be "device" functions and wrappers "kernel" functions
+    void annotate_wrapper_kernels_with_nvvm();
 
     /// Accepts the given AST node and returns the processed value.
     llvm::Value* accept_and_get(const std::shared_ptr<ast::Node>& node);

src/codegen/llvm/llvm_utils.cpp

Lines changed: 24 additions & 11 deletions
@@ -75,12 +75,9 @@ void initialise_nvptx_passes() {
     initialise_optimisation_passes();
 }
 
-void optimise_module_for_nvptx(codegen::Platform& platform,
-                               llvm::Module& module,
-                               int opt_level,
-                               std::string& target_asm) {
+std::unique_ptr<llvm::TargetMachine> create_CUDA_target_machine(const codegen::Platform& platform,
+                                                                llvm::Module& module) {
     // CUDA target machine we generating code for.
-    std::unique_ptr<llvm::TargetMachine> tm;
     std::string platform_name = platform.get_name();
 
     // Target and layout information.
@@ -111,9 +108,30 @@ void optimise_module_for_nvptx(codegen::Platform& platform,
     if (!target)
         throw std::runtime_error("Error: " + error_msg + "\n");
 
+    std::unique_ptr<llvm::TargetMachine> tm;
     tm.reset(target->createTargetMachine(triple, subtarget, features, {}, {}));
     if (!tm)
         throw std::runtime_error("Error: creating target machine failed! Aborting.");
+    return tm;
+}
+
+std::string get_module_ptx(llvm::TargetMachine& tm, llvm::Module& module) {
+    std::string target_asm;
+    llvm::raw_string_ostream stream(target_asm);
+    llvm::buffer_ostream pstream(stream);
+    llvm::legacy::PassManager codegen_pm;
+
+    tm.addPassesToEmitFile(codegen_pm, pstream, nullptr, llvm::CGFT_AssemblyFile);
+    codegen_pm.run(module);
+    return target_asm;
+}
+
+void optimise_module_for_nvptx(const codegen::Platform& platform,
+                               llvm::Module& module,
+                               int opt_level,
+                               std::string& target_asm) {
+    // Create target machine for CUDA GPU
+    auto tm = create_CUDA_target_machine(platform, module);
 
     // Create pass managers.
     llvm::legacy::FunctionPassManager func_pm(&module);
@@ -137,12 +155,7 @@ void optimise_module_for_nvptx(const codegen::Platform& platform,
 
     // Now, we want to run target-specific (e.g. NVPTX) passes. In LLVM, this
     // is done via `addPassesToEmitFile`.
-    llvm::raw_string_ostream stream(target_asm);
-    llvm::buffer_ostream pstream(stream);
-    llvm::legacy::PassManager codegen_pm;
-
-    tm->addPassesToEmitFile(codegen_pm, pstream, nullptr, llvm::CGFT_AssemblyFile);
-    codegen_pm.run(module);
+    target_asm = get_module_ptx(*tm, module);
 }
 
 void initialise_optimisation_passes() {
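Since PTX emission is now split into `create_CUDA_target_machine` and `get_module_ptx`, the two pieces can be used independently of the full NVPTX optimisation pipeline. A rough usage sketch based only on the declarations shown in this diff; the include path is assumed from the repository layout, the `utils` namespace from how the visitor calls these helpers, and `platform`/`module` come from the surrounding code generation:

```cpp
// Sketch of how the refactored helpers fit together. `platform` must describe
// a CUDA GPU target and `module` holds the generated kernels.
#include "codegen/llvm/llvm_utils.hpp"  // include path assumed from the repo layout

#include <string>

std::string emit_ptx(const codegen::Platform& platform, llvm::Module& module, int opt_level) {
    // Full pipeline: NVPTX optimisation passes plus PTX emission.
    std::string ptx;
    utils::optimise_module_for_nvptx(platform, module, opt_level, ptx);
    return ptx;
}

std::string emit_ptx_unoptimised(const codegen::Platform& platform, llvm::Module& module) {
    // PTX emission only, reusing the target machine helper directly.
    auto tm = utils::create_CUDA_target_machine(platform, module);
    return utils::get_module_ptx(*tm, module);
}
```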

src/codegen/llvm/llvm_utils.hpp

Lines changed: 8 additions & 1 deletion
@@ -21,11 +21,18 @@ void initialise_optimisation_passes();
 /// Initialises NVPTX-specific optimisation passes.
 void initialise_nvptx_passes();
 
+/// Initialises a CUDA target machine
+std::unique_ptr<llvm::TargetMachine> create_CUDA_target_machine(const codegen::Platform& platform,
+                                                                llvm::Module& module);
+
+/// Generate PTX code given a CUDA target machine and the module
+std::string get_module_ptx(llvm::TargetMachine& tm, llvm::Module& module);
+
 /// Replaces calls to LLVM intrinsics with appropriate library calls.
 void replace_with_lib_functions(codegen::Platform& platform, llvm::Module& module);
 
 /// Optimises the given LLVM IR module for NVPTX targets.
-void optimise_module_for_nvptx(codegen::Platform& platform,
+void optimise_module_for_nvptx(const codegen::Platform& platform,
                                llvm::Module& module,
                                int opt_level,
                                std::string& target_asm);
