diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml index a6a8dd453..5a42fffd9 100644 --- a/.github/workflows/pythonapp.yml +++ b/.github/workflows/pythonapp.yml @@ -79,7 +79,8 @@ jobs: - name: Install FFCx (Linux, with optional dependencies) if: runner.os == 'Linux' - run: pip install .[ci,optional] + run: | + pip install .[ci,optional] - name: Install FFCx (macOS, Windows) if: runner.os != 'Linux' run: pip install .[ci] diff --git a/demo/nvrtc_test.cpp b/demo/nvrtc_test.cpp new file mode 100644 index 000000000..ed426baf3 --- /dev/null +++ b/demo/nvrtc_test.cpp @@ -0,0 +1,110 @@ +#include "Components.h" +#include "FacetIntegrals.h" +#include "HyperElasticity.h" +#include "MathFunctions.h" +#include "StabilisedStokes.h" +#include "VectorPoisson.h" +#include "ufcx.h" +#include "nvrtc.h" +#include +#include +#include +#include +#include + +void check_nvrtc_compilation(ufcx_form* form) +{ + // extract kernel + ufcx_integral* integral = form->form_integrals[0]; + ufcx_tabulate_tensor_cuda_nvrtc* kernel = integral->tabulate_tensor_cuda_nvrtc; + // call kernel to get CUDA-wrapped source code + int num_program_headers; + const char** program_headers; + const char** program_include_names; + const char* program_src; + const char* tabulate_tensor_function_name; + if (!kernel) { + throw std::runtime_error("NVRTC wrapper function is NULL!"); + } + (*kernel)( + &num_program_headers, &program_headers, + &program_include_names, &program_src, + &tabulate_tensor_function_name); + // compile CUDA-wrapped source code with NVRTC + // with proper error checking + + nvrtcResult nvrtc_err; + nvrtcProgram program; + nvrtc_err = nvrtcCreateProgram( + &program, program_src, tabulate_tensor_function_name, + num_program_headers, program_headers, + program_include_names); + + if (nvrtc_err != NVRTC_SUCCESS) { + throw std::runtime_error( + "nvrtcCreateProgram() failed with " + + std::string(nvrtcGetErrorString(nvrtc_err)) + " " + "at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); + } + + int num_compile_options = 0; + const char** compile_options; + // Compile the CUDA C++ program + nvrtcResult nvrtc_compile_err = nvrtcCompileProgram( + program, num_compile_options, compile_options); + if (nvrtc_compile_err != NVRTC_SUCCESS) { + // If the compiler failed, obtain the compiler log + std::string program_log; + size_t log_size; + nvrtc_err = nvrtcGetProgramLogSize(program, &log_size); + if (nvrtc_err != NVRTC_SUCCESS) { + program_log = std::string( + "nvrtcGetProgramLogSize() failed with " + + std::string(nvrtcGetErrorString(nvrtc_err)) + " " + "at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); + } else { + program_log.resize(log_size); + nvrtc_err = nvrtcGetProgramLog( + program, const_cast(program_log.c_str())); + if (nvrtc_err != NVRTC_SUCCESS) { + program_log = std::string( + "nvrtcGetProgramLog() failed with " + + std::string(nvrtcGetErrorString(nvrtc_err))) + " " + "at " + std::string(__FILE__) + ":" + std::to_string(__LINE__); + } + if (log_size > 0) + program_log.resize(log_size-1); + } + nvrtcDestroyProgram(&program); + + std::stringstream ss; + ss << "nvrtcCompileProgram() failed with " + << nvrtcGetErrorString(nvrtc_compile_err) << "\n" + << "CUDA C++ source code:\n" + << std::string(60, '-') << "\n" + << program_src + << std::string(60, '-') << "\n" + << "NVRTC compiler log:\n" + << std::string(60, '-') << "\n" + << program_log << "\n" + << std::string(60, '-') << "\n"; + throw std::runtime_error(ss.str()); + } +} + +int main() +{ + std::vector forms = { + form_Components_L, + form_FacetIntegrals_a, + form_HyperElasticity_a_F, form_HyperElasticity_a_J, + form_MathFunctions_a, + form_StabilisedStokes_a, form_StabilisedStokes_L, + form_VectorPoisson_a, form_VectorPoisson_L + }; + + for (ufcx_form* form : forms) check_nvrtc_compilation(form); + + return 0; +} + diff --git a/demo/test_demos.py b/demo/test_demos.py index bebfec2a2..eb8519230 100644 --- a/demo/test_demos.py +++ b/demo/test_demos.py @@ -58,3 +58,50 @@ def test_demo(file, scalar_type): os.system(f"cd {demo_dir} && {cc} -I../ffcx/codegeneration {extra_flags} -c {file}.c") == 0 ) + + +@pytest.mark.parametrize("scalar_type", ["float64", "float32"]) +def test_demo_nvrtc(scalar_type): + """Test generated CUDA code with NVRTC.""" + import importlib.util + + try: + spec = importlib.util.find_spec("nvidia.cuda_nvrtc") + except ModuleNotFoundError: + pytest.skip(reason="Must have NVRTC pip package installed to run test.") + + if sys.platform.startswith("win32"): + pytest.skip(reason="NVRTC CUDA wrappers not currently supported for Windows.") + + files = [ + "Components", + "FacetIntegrals", + "HyperElasticity", + "MathFunctions", + "StabilisedStokes", + "VectorPoisson", + ] + opts = f"--scalar_type {scalar_type} --cuda_nvrtc" + nvrtc_dir = os.path.realpath(spec.submodule_search_locations[0]) + cc = os.environ.get("CC", "cc") + extra_flags = ( + "-std=c17 -Wunused-variable -Werror -fPIC -Wno-error=implicit-function-declaration" + ) + for file in files: + assert os.system(f"cd {demo_dir} && ffcx {opts} {file}.py") == 0 + assert ( + os.system(f"cd {demo_dir} && {cc} -I../ffcx/codegeneration {extra_flags} -c {file}.c") + == 0 + ) + + cxx = os.environ.get("CXX", "c++") + assert ( + os.system( + f"cd {demo_dir} && " + f"{cxx} -I../ffcx/codegeneration -I{nvrtc_dir}/include -L{nvrtc_dir}/lib " + f" -Werror -o nvrtc_test nvrtc_test.cpp " + f"{' '.join([file + '.o' for file in files])} -l:libnvrtc.so.12" + ) + == 0 + ) + assert os.system(f"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{nvrtc_dir}/lib {demo_dir}/nvrtc_test") == 0 diff --git a/ffcx/codegeneration/C/integrals.py b/ffcx/codegeneration/C/integrals.py index 9fb64ccc7..e4b981493 100644 --- a/ffcx/codegeneration/C/integrals.py +++ b/ffcx/codegeneration/C/integrals.py @@ -69,17 +69,27 @@ def generator(ir: IntegralIR, domain: basix.CellType, options): else: code["tabulate_tensor_complex64"] = ".tabulate_tensor_complex64 = NULL," code["tabulate_tensor_complex128"] = ".tabulate_tensor_complex128 = NULL," + if options.get("cuda_nvrtc"): + code["tabulate_tensor_cuda_nvrtc"] = ( + f".tabulate_tensor_cuda_nvrtc = tabulate_tensor_cuda_nvrtc_{factory_name}," + ) + code["tabulate_tensor_quoted"] = body.replace("\n", '\\n"\n "') + else: + code["tabulate_tensor_cuda_nvrtc"] = "" + code["tabulate_tensor_quoted"] = "" + np_scalar_type = np.dtype(options["scalar_type"]).name code[f"tabulate_tensor_{np_scalar_type}"] = ( f".tabulate_tensor_{np_scalar_type} = tabulate_tensor_{factory_name}," ) assert ir.expression.coordinate_element_hash is not None - implementation = ufcx_integrals.factory.format( + implementation = ufcx_integrals.get_factory(options).format( factory_name=factory_name, enabled_coefficients=code["enabled_coefficients"], enabled_coefficients_init=code["enabled_coefficients_init"], tabulate_tensor=code["tabulate_tensor"], + tabulate_tensor_quoted=code["tabulate_tensor_quoted"], needs_facet_permutations="true" if ir.expression.needs_facet_permutations else "false", scalar_type=dtype_to_c_type(options["scalar_type"]), geom_type=dtype_to_c_type(dtype_to_scalar_dtype(options["scalar_type"])), @@ -88,6 +98,7 @@ def generator(ir: IntegralIR, domain: basix.CellType, options): tabulate_tensor_float64=code["tabulate_tensor_float64"], tabulate_tensor_complex64=code["tabulate_tensor_complex64"], tabulate_tensor_complex128=code["tabulate_tensor_complex128"], + tabulate_tensor_cuda_nvrtc=code["tabulate_tensor_cuda_nvrtc"], domain=int(domain), ) diff --git a/ffcx/codegeneration/C/integrals_template.py b/ffcx/codegeneration/C/integrals_template.py index e2c28887c..b7d5c78aa 100644 --- a/ffcx/codegeneration/C/integrals_template.py +++ b/ffcx/codegeneration/C/integrals_template.py @@ -31,6 +31,7 @@ {tabulate_tensor_float64} {tabulate_tensor_complex64} {tabulate_tensor_complex128} + {tabulate_tensor_cuda_nvrtc} .needs_facet_permutations = {needs_facet_permutations}, .coordinate_element_hash = {coordinate_element_hash}, .domain = {domain}, @@ -38,3 +39,53 @@ // End of code for integral {factory_name} """ + +cuda_wrapper = """ + +// Begin NVRTC CUDA wrapper for integral {factory_name} +// The wrapper is compiled with a standard C++ compiler, and is called at runtime to generate +// source code which is then compiled into a CUDA kernel at runtime via NVRTC. +void tabulate_tensor_cuda_nvrtc_{factory_name}(int* num_program_headers, + const char*** program_headers, + const char*** program_include_names, + const char** out_program_src, + const char** tabulate_tensor_function_name) +{{ + // The below typedefs are needed due to issues with including stdint.h in NVRTC source code + const char* program_src = "" + "#define alignas(x)\\n" + "#define restrict __restrict__\\n" + "\\n" + "typedef unsigned char uint8_t;\\n" + "typedef unsigned int uint32_t;\\n" + "typedef double ufc_scalar_t;\\n" + "\\n" + "extern \\"C\\" __global__\\n" + "void tabulate_tensor_{factory_name}({scalar_type}* restrict A,\\n" + " const {scalar_type}* restrict w,\\n" + " const {scalar_type}* restrict c,\\n" + " const {geom_type}* restrict coordinate_dofs,\\n" + " const int* restrict entity_local_index,\\n" + " const uint8_t* restrict quadrature_permutation\\n" + " )\\n" + "{{\\n" + "{tabulate_tensor_quoted}\\n" + "}}"; + *num_program_headers = 0; + *program_headers = NULL; + *program_include_names = NULL; + *out_program_src = program_src; + *tabulate_tensor_function_name = "tabulate_tensor_{factory_name}"; +}} + +// End NVRTC CUDA wrapper for integral {factory_name} + +""" + + +def get_factory(options): + """Return the template string for constructing form integrals.""" + if options.get("cuda_nvrtc"): + return cuda_wrapper + factory + else: + return factory diff --git a/ffcx/codegeneration/jit.py b/ffcx/codegeneration/jit.py index 52353946f..62dece1fc 100644 --- a/ffcx/codegeneration/jit.py +++ b/ffcx/codegeneration/jit.py @@ -69,6 +69,9 @@ UFC_INTEGRAL_DECL += "\n".join( re.findall(r"typedef void ?\(ufcx_tabulate_tensor_complex128\).*?\);", ufcx_h, re.DOTALL) ) +UFC_INTEGRAL_DECL += "\n".join( + re.findall(r"typedef void ?\(ufcx_tabulate_tensor_cuda_nvrtc\).*?\);", ufcx_h, re.DOTALL) +) UFC_INTEGRAL_DECL += "\n".join( re.findall("typedef struct ufcx_integral.*?ufcx_integral;", ufcx_h, re.DOTALL) diff --git a/ffcx/codegeneration/ufcx.h b/ffcx/codegeneration/ufcx.h index 98b403c90..6df37918a 100644 --- a/ffcx/codegeneration/ufcx.h +++ b/ffcx/codegeneration/ufcx.h @@ -129,6 +129,28 @@ extern "C" const uint8_t* restrict quadrature_permutation, void* custom_data); #endif // __STDC_NO_COMPLEX__ + /// Return CUDA C++ source code for the ufc_tabulate_tensor kernel + /// The resulting source code is passed to NVRTC for runtime compilation + /// + /// @param[out] num_program_headers + /// The number of headers required by the program + /// @param[out] program_headers + /// Entire contents of each header file + /// @param[out] program_include_names + /// Names of each header file + /// @param[out] program_src + /// CUDA C++ source code for the program containing the + /// tabulate_tensor function. + /// @param[out] tabulate_tensor_function_name + /// The name of the device-side function. + /// + typedef void(ufcx_tabulate_tensor_cuda_nvrtc)( + int* num_program_headers, + const char*** program_headers, + const char*** program_include_names, + const char** program_src, + const char** tabulate_tensor_function_name); + typedef struct ufcx_integral { const bool* enabled_coefficients; @@ -138,6 +160,7 @@ extern "C" ufcx_tabulate_tensor_complex64* tabulate_tensor_complex64; ufcx_tabulate_tensor_complex128* tabulate_tensor_complex128; #endif // __STDC_NO_COMPLEX__ + ufcx_tabulate_tensor_cuda_nvrtc* tabulate_tensor_cuda_nvrtc; bool needs_facet_permutations; /// Hash of the coordinate element associated with the geometry of the mesh. diff --git a/ffcx/options.py b/ffcx/options.py index 536f02a35..71783a3fe 100644 --- a/ffcx/options.py +++ b/ffcx/options.py @@ -20,6 +20,12 @@ logger = logging.getLogger("ffcx") FFCX_DEFAULT_OPTIONS = { + "cuda_nvrtc": ( + bool, + False, + "generate CUDA wrapped versions of tabulate tensor functions for use with NVRTC", + None, + ), "epsilon": (float, 1e-14, "machine precision, used for dropping zero terms in tables.", None), "scalar_type": ( str, diff --git a/pyproject.toml b/pyproject.toml index 9ce86e1f1..bf25f1dcc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ ffcx = "ffcx:__main__.main" [project.optional-dependencies] lint = ["ruff"] docs = ["sphinx", "sphinx_rtd_theme"] -optional = ["numba", "pygraphviz==1.9"] +optional = ["numba", "pygraphviz", "nvidia-cuda-nvrtc-cu12>=12.9.86"] test = ["pytest >= 6.0", "sympy", "numba"] ci = [ "coveralls",