Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/pythonapp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ jobs:

- name: Install FFCx (Linux, with optional dependencies)
if: runner.os == 'Linux'
run: pip install .[ci,optional]
run: |
pip install .[ci,optional]
- name: Install FFCx (macOS, Windows)
if: runner.os != 'Linux'
run: pip install .[ci]
Expand Down
110 changes: 110 additions & 0 deletions demo/nvrtc_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#include "Components.h"
#include "FacetIntegrals.h"
#include "HyperElasticity.h"
#include "MathFunctions.h"
#include "StabilisedStokes.h"
#include "VectorPoisson.h"
#include "ufcx.h"
#include "nvrtc.h"
#include <iostream>
#include <stdexcept>
#include <sstream>
#include <string>
#include <vector>

/// Check that the CUDA-wrapped tabulate_tensor kernel of a form compiles
/// with NVRTC.
///
/// The NVRTC wrapper of the form's first integral (`form_integrals[0]`) is
/// called to obtain the CUDA C++ source, which is then handed to
/// nvrtcCreateProgram() / nvrtcCompileProgram(). No compile options are
/// passed. The NVRTC program is destroyed on every exit path.
///
/// @param form Form whose first integral provides the NVRTC wrapper.
/// @throws std::runtime_error if the form, its first integral or the
///   wrapper is null, if the NVRTC program cannot be created, or if
///   compilation fails (the message then contains the generated source
///   and the NVRTC compiler log).
void check_nvrtc_compilation(ufcx_form* form)
{
  // Builds "<call>() failed with <NVRTC error string> at <file>:<line>".
  // Review note: std::format was suggested for these messages, but the
  // compiler in the GitHub CI environment did not support C++20, so plain
  // string concatenation and std::stringstream are used instead.
  const auto describe_failure
      = [](const char* call, nvrtcResult err, int line)
  {
    return std::string(call) + "() failed with "
           + std::string(nvrtcGetErrorString(err)) + " at "
           + std::string(__FILE__) + ":" + std::to_string(line);
  };

  if (!form)
    throw std::runtime_error("Form is NULL!");

  // Extract the NVRTC wrapper of the first integral
  ufcx_integral* integral = form->form_integrals[0];
  if (!integral)
    throw std::runtime_error("Integral is NULL!");

  ufcx_tabulate_tensor_cuda_nvrtc* kernel
      = integral->tabulate_tensor_cuda_nvrtc;
  if (!kernel)
    throw std::runtime_error("NVRTC wrapper function is NULL!");

  // Call the wrapper to get the CUDA-wrapped source code. The outputs are
  // initialised so they never hold indeterminate values.
  int num_program_headers = 0;
  const char** program_headers = nullptr;
  const char** program_include_names = nullptr;
  const char* program_src = nullptr;
  const char* tabulate_tensor_function_name = nullptr;
  (*kernel)(&num_program_headers, &program_headers, &program_include_names,
            &program_src, &tabulate_tensor_function_name);

  // Create the NVRTC program from the generated source
  nvrtcProgram program;
  const nvrtcResult create_err = nvrtcCreateProgram(
      &program, program_src, tabulate_tensor_function_name,
      num_program_headers, program_headers, program_include_names);
  if (create_err != NVRTC_SUCCESS)
  {
    throw std::runtime_error(
        describe_failure("nvrtcCreateProgram", create_err, __LINE__));
  }

  // From here on the program exists: release it on every exit path,
  // including the successful one and when an exception is thrown.
  struct ProgramGuard
  {
    nvrtcProgram* program;
    ~ProgramGuard() { nvrtcDestroyProgram(program); }
  } program_guard{&program};

  // Compile the CUDA C++ program (no options)
  const int num_compile_options = 0;
  const char** compile_options = nullptr;
  const nvrtcResult compile_err
      = nvrtcCompileProgram(program, num_compile_options, compile_options);
  if (compile_err == NVRTC_SUCCESS)
    return;

  // The compiler failed: obtain the compiler log. If the log itself cannot
  // be retrieved, the log text is replaced by a description of that failure.
  std::string program_log;
  size_t log_size = 0;
  nvrtcResult log_err = nvrtcGetProgramLogSize(program, &log_size);
  if (log_err != NVRTC_SUCCESS)
  {
    program_log
        = describe_failure("nvrtcGetProgramLogSize", log_err, __LINE__);
  }
  else if (log_size > 0)
  {
    program_log.resize(log_size);
    log_err = nvrtcGetProgramLog(program, &program_log[0]);
    if (log_err != NVRTC_SUCCESS)
    {
      program_log
          = describe_failure("nvrtcGetProgramLog", log_err, __LINE__);
    }
    else
    {
      // The reported size includes the trailing NUL character; drop it.
      program_log.resize(log_size - 1);
    }
  }

  std::stringstream ss;
  ss << "nvrtcCompileProgram() failed with "
     << nvrtcGetErrorString(compile_err) << "\n"
     << "CUDA C++ source code:\n"
     << std::string(60, '-') << "\n"
     << program_src
     << std::string(60, '-') << "\n"
     << "NVRTC compiler log:\n"
     << std::string(60, '-') << "\n"
     << program_log << "\n"
     << std::string(60, '-') << "\n";
  throw std::runtime_error(ss.str());
}

int main()
{
std::vector<ufcx_form*> forms = {
form_Components_L,
form_FacetIntegrals_a,
form_HyperElasticity_a_F, form_HyperElasticity_a_J,
form_MathFunctions_a,
form_StabilisedStokes_a, form_StabilisedStokes_L,
form_VectorPoisson_a, form_VectorPoisson_L
};

for (ufcx_form* form : forms) check_nvrtc_compilation(form);

return 0;
}

47 changes: 47 additions & 0 deletions demo/test_demos.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,50 @@ def test_demo(file, scalar_type):
os.system(f"cd {demo_dir} && {cc} -I../ffcx/codegeneration {extra_flags} -c {file}.c")
== 0
)


@pytest.mark.parametrize("scalar_type", ["float64", "float32"])
def test_demo_nvrtc(scalar_type):
    """Test generated CUDA code with NVRTC.

    For each demo form file, run ffcx with ``--cuda_nvrtc`` and compile the
    generated C file. Then build ``nvrtc_test.cpp`` against the resulting
    object files and the NVRTC library shipped in the ``nvidia.cuda_nvrtc``
    pip package, and run the resulting executable.

    The test is skipped when the NVRTC pip package cannot be located and on
    Windows.
    """
    import importlib.util

    # find_spec raises ModuleNotFoundError when the parent package `nvidia`
    # is missing, but returns None when `nvidia` exists without the
    # `cuda_nvrtc` subpackage. Treat both as "not installed".
    try:
        spec = importlib.util.find_spec("nvidia.cuda_nvrtc")
    except ModuleNotFoundError:
        spec = None
    if spec is None or not spec.submodule_search_locations:
        # Positional message works across pytest versions.
        pytest.skip("Must have NVRTC pip package installed to run test.")

    if sys.platform.startswith("win32"):
        pytest.skip("NVRTC CUDA wrappers not currently supported for Windows.")

    files = [
        "Components",
        "FacetIntegrals",
        "HyperElasticity",
        "MathFunctions",
        "StabilisedStokes",
        "VectorPoisson",
    ]
    opts = f"--scalar_type {scalar_type} --cuda_nvrtc"
    nvrtc_dir = os.path.realpath(spec.submodule_search_locations[0])
    cc = os.environ.get("CC", "cc")
    extra_flags = (
        "-std=c17 -Wunused-variable -Werror -fPIC -Wno-error=implicit-function-declaration"
    )
    # Generate and compile the C code (including the NVRTC wrappers) per form file.
    for file in files:
        assert os.system(f"cd {demo_dir} && ffcx {opts} {file}.py") == 0
        assert (
            os.system(f"cd {demo_dir} && {cc} -I../ffcx/codegeneration {extra_flags} -c {file}.c")
            == 0
        )

    # Link the C++ driver against all generated objects and the NVRTC library.
    cxx = os.environ.get("CXX", "c++")
    assert (
        os.system(
            f"cd {demo_dir} && "
            f"{cxx} -I../ffcx/codegeneration -I{nvrtc_dir}/include -L{nvrtc_dir}/lib "
            f" -Werror -o nvrtc_test nvrtc_test.cpp "
            f"{' '.join([file + '.o' for file in files])} -l:libnvrtc.so.12"
        )
        == 0
    )
    # Run the driver with the NVRTC library directory on the loader path.
    assert os.system(f"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{nvrtc_dir}/lib {demo_dir}/nvrtc_test") == 0
13 changes: 12 additions & 1 deletion ffcx/codegeneration/C/integrals.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,17 +69,27 @@ def generator(ir: IntegralIR, domain: basix.CellType, options):
else:
code["tabulate_tensor_complex64"] = ".tabulate_tensor_complex64 = NULL,"
code["tabulate_tensor_complex128"] = ".tabulate_tensor_complex128 = NULL,"
if options.get("cuda_nvrtc"):
code["tabulate_tensor_cuda_nvrtc"] = (
f".tabulate_tensor_cuda_nvrtc = tabulate_tensor_cuda_nvrtc_{factory_name},"
)
code["tabulate_tensor_quoted"] = body.replace("\n", '\\n"\n "')
else:
code["tabulate_tensor_cuda_nvrtc"] = ""
code["tabulate_tensor_quoted"] = ""

np_scalar_type = np.dtype(options["scalar_type"]).name
code[f"tabulate_tensor_{np_scalar_type}"] = (
f".tabulate_tensor_{np_scalar_type} = tabulate_tensor_{factory_name},"
)

assert ir.expression.coordinate_element_hash is not None
implementation = ufcx_integrals.factory.format(
implementation = ufcx_integrals.get_factory(options).format(
factory_name=factory_name,
enabled_coefficients=code["enabled_coefficients"],
enabled_coefficients_init=code["enabled_coefficients_init"],
tabulate_tensor=code["tabulate_tensor"],
tabulate_tensor_quoted=code["tabulate_tensor_quoted"],
needs_facet_permutations="true" if ir.expression.needs_facet_permutations else "false",
scalar_type=dtype_to_c_type(options["scalar_type"]),
geom_type=dtype_to_c_type(dtype_to_scalar_dtype(options["scalar_type"])),
Expand All @@ -88,6 +98,7 @@ def generator(ir: IntegralIR, domain: basix.CellType, options):
tabulate_tensor_float64=code["tabulate_tensor_float64"],
tabulate_tensor_complex64=code["tabulate_tensor_complex64"],
tabulate_tensor_complex128=code["tabulate_tensor_complex128"],
tabulate_tensor_cuda_nvrtc=code["tabulate_tensor_cuda_nvrtc"],
domain=int(domain),
)

Expand Down
51 changes: 51 additions & 0 deletions ffcx/codegeneration/C/integrals_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,61 @@
{tabulate_tensor_float64}
{tabulate_tensor_complex64}
{tabulate_tensor_complex128}
{tabulate_tensor_cuda_nvrtc}
.needs_facet_permutations = {needs_facet_permutations},
.coordinate_element_hash = {coordinate_element_hash},
.domain = {domain},
}};

// End of code for integral {factory_name}
"""

# Template for the C source of the per-integral NVRTC wrapper function.
# get_factory() prepends it to `factory` when the "cuda_nvrtc" option is set,
# and the combined template is filled in with str.format. Consequently:
#   - `{{` / `}}` produce literal braces in the generated code;
#   - `\\n` and `\\"` produce the two-character escapes `\n` and `\"` inside
#     the generated C string literals.
# Placeholders used here: {factory_name}, {scalar_type}, {geom_type} and
# {tabulate_tensor_quoted}.
# NOTE(review): `ufc_scalar_t` is typedef'd to double regardless of
# {scalar_type} - confirm this is intended for float32 kernels.
cuda_wrapper = """

// Begin NVRTC CUDA wrapper for integral {factory_name}
// The wrapper is compiled with a standard C++ compiler, and is called at runtime to generate
// source code which is then compiled into a CUDA kernel at runtime via NVRTC.
void tabulate_tensor_cuda_nvrtc_{factory_name}(int* num_program_headers,
const char*** program_headers,
const char*** program_include_names,
const char** out_program_src,
const char** tabulate_tensor_function_name)
{{
// The below typedefs are needed due to issues with including stdint.h in NVRTC source code
const char* program_src = ""
"#define alignas(x)\\n"
"#define restrict __restrict__\\n"
"\\n"
"typedef unsigned char uint8_t;\\n"
"typedef unsigned int uint32_t;\\n"
"typedef double ufc_scalar_t;\\n"
"\\n"
"extern \\"C\\" __global__\\n"
"void tabulate_tensor_{factory_name}({scalar_type}* restrict A,\\n"
" const {scalar_type}* restrict w,\\n"
" const {scalar_type}* restrict c,\\n"
" const {geom_type}* restrict coordinate_dofs,\\n"
" const int* restrict entity_local_index,\\n"
" const uint8_t* restrict quadrature_permutation\\n"
" )\\n"
"{{\\n"
"{tabulate_tensor_quoted}\\n"
"}}";
*num_program_headers = 0;
*program_headers = NULL;
*program_include_names = NULL;
*out_program_src = program_src;
*tabulate_tensor_function_name = "tabulate_tensor_{factory_name}";
}}

// End NVRTC CUDA wrapper for integral {factory_name}

"""


def get_factory(options):
    """Select the template string used to generate form integral code.

    When the ``cuda_nvrtc`` option is set to a truthy value, the NVRTC CUDA
    wrapper template is placed in front of the regular integral factory
    template; otherwise the factory template alone is returned.
    """
    wrapper_prefix = cuda_wrapper if options.get("cuda_nvrtc") else ""
    return wrapper_prefix + factory
3 changes: 3 additions & 0 deletions ffcx/codegeneration/jit.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@
UFC_INTEGRAL_DECL += "\n".join(
re.findall(r"typedef void ?\(ufcx_tabulate_tensor_complex128\).*?\);", ufcx_h, re.DOTALL)
)
UFC_INTEGRAL_DECL += "\n".join(
re.findall(r"typedef void ?\(ufcx_tabulate_tensor_cuda_nvrtc\).*?\);", ufcx_h, re.DOTALL)
)

UFC_INTEGRAL_DECL += "\n".join(
re.findall("typedef struct ufcx_integral.*?ufcx_integral;", ufcx_h, re.DOTALL)
Expand Down
23 changes: 23 additions & 0 deletions ffcx/codegeneration/ufcx.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,28 @@ extern "C"
const uint8_t* restrict quadrature_permutation, void* custom_data);
#endif // __STDC_NO_COMPLEX__

/// Return CUDA C++ source code for the tabulate_tensor kernel.
/// The resulting source code is passed to NVRTC for runtime compilation.
///
/// The function takes no inputs; every argument is an output location
/// that the function fills in.
///
/// @param[out] num_program_headers
/// The number of headers required by the program
/// @param[out] program_headers
/// Entire contents of each header file
/// @param[out] program_include_names
/// Names of each header file
/// @param[out] program_src
/// CUDA C++ source code for the program containing the
/// tabulate_tensor function.
/// @param[out] tabulate_tensor_function_name
/// The name of the device-side function.
///
typedef void(ufcx_tabulate_tensor_cuda_nvrtc)(
int* num_program_headers,
const char*** program_headers,
const char*** program_include_names,
const char** program_src,
const char** tabulate_tensor_function_name);

typedef struct ufcx_integral
{
const bool* enabled_coefficients;
Expand All @@ -138,6 +160,7 @@ extern "C"
ufcx_tabulate_tensor_complex64* tabulate_tensor_complex64;
ufcx_tabulate_tensor_complex128* tabulate_tensor_complex128;
#endif // __STDC_NO_COMPLEX__
ufcx_tabulate_tensor_cuda_nvrtc* tabulate_tensor_cuda_nvrtc;
bool needs_facet_permutations;

/// Hash of the coordinate element associated with the geometry of the mesh.
Expand Down
6 changes: 6 additions & 0 deletions ffcx/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@
logger = logging.getLogger("ffcx")

FFCX_DEFAULT_OPTIONS = {
"cuda_nvrtc": (
bool,
False,
"generate CUDA wrapped versions of tabulate tensor functions for use with NVRTC",
None,
),
"epsilon": (float, 1e-14, "machine precision, used for dropping zero terms in tables.", None),
"scalar_type": (
str,
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ ffcx = "ffcx:__main__.main"
[project.optional-dependencies]
lint = ["ruff"]
docs = ["sphinx", "sphinx_rtd_theme"]
optional = ["numba", "pygraphviz==1.9"]
optional = ["numba", "pygraphviz", "nvidia-cuda-nvrtc-cu12>=12.9.86"]
test = ["pytest >= 6.0", "sympy", "numba"]
ci = [
"coveralls",
Expand Down