FEniCS · bpachev · Sep 13, 2024 · Sep 19, 2024 · Nov 7, 2024 · Nov 11, 2024
diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml
@@ -79,7 +79,8 @@ jobs:
 
       - name: Install FFCx (Linux, with optional dependencies)
         if: runner.os == 'Linux'
-        run: pip install .[ci,optional]
+        run: |
+          pip install .[ci,optional]
       - name: Install FFCx (macOS, Windows)
         if: runner.os != 'Linux'
         run: pip install .[ci]

diff --git a/demo/nvrtc_test.cpp b/demo/nvrtc_test.cpp
@@ -0,0 +1,110 @@
+#include "Components.h"
+#include "FacetIntegrals.h"
+#include "HyperElasticity.h"
+#include "MathFunctions.h"
+#include "StabilisedStokes.h"
+#include "VectorPoisson.h"
+#include "ufcx.h"
+#include "nvrtc.h"
+#include <iostream>
+#include <stdexcept>
+#include <sstream>
+#include <string>
+#include <vector>
+
+/// Format an error message for the NVRTC call `call` that returned `err`.
+/// `line` is the caller's source line (pass __LINE__).
+static std::string nvrtc_failure(const char* call, nvrtcResult err, int line)
+{
+  return std::string(call) + "() failed with " + nvrtcGetErrorString(err)
+         + " at " + __FILE__ + ":" + std::to_string(line);
+}
+
+/// Return the NVRTC compiler log of `program`, or a message describing why
+/// the log could not be retrieved.
+static std::string nvrtc_program_log(nvrtcProgram program)
+{
+  size_t log_size = 0;
+  nvrtcResult err = nvrtcGetProgramLogSize(program, &log_size);
+  if (err != NVRTC_SUCCESS)
+    return nvrtc_failure("nvrtcGetProgramLogSize", err, __LINE__);
+  if (log_size == 0)
+    return std::string();
+
+  // The reported size includes the trailing NUL, so size the buffer to match.
+  std::string log(log_size, '\0');
+  err = nvrtcGetProgramLog(program, &log[0]);
+  if (err != NVRTC_SUCCESS)
+    return nvrtc_failure("nvrtcGetProgramLog", err, __LINE__);
+  log.resize(log_size - 1); // drop the terminator only when the log was read
+  return log;
+}
+
+/// Check that the CUDA source generated for the first integral of `form`
+/// compiles with NVRTC.
+///
+/// @param[in] form Form whose first integral supplies the NVRTC wrapper.
+/// @throws std::runtime_error if the wrapper is NULL, or if creating or
+///         compiling the NVRTC program fails.
+void check_nvrtc_compilation(ufcx_form* form)
+{
+  // extract kernel
+  ufcx_integral* integral = form->form_integrals[0];
+  ufcx_tabulate_tensor_cuda_nvrtc* kernel = integral->tabulate_tensor_cuda_nvrtc;
+  if (!kernel)
+    throw std::runtime_error("NVRTC wrapper function is NULL!");
+
+  // call kernel to get CUDA-wrapped source code
+  int num_program_headers = 0;
+  const char** program_headers = nullptr;
+  const char** program_include_names = nullptr;
+  const char* program_src = nullptr;
+  const char* tabulate_tensor_function_name = nullptr;
+  (*kernel)(&num_program_headers, &program_headers, &program_include_names,
+            &program_src, &tabulate_tensor_function_name);
+
+  // compile CUDA-wrapped source code with NVRTC
+  nvrtcProgram program;
+  nvrtcResult err = nvrtcCreateProgram(
+      &program, program_src, tabulate_tensor_function_name,
+      num_program_headers, program_headers, program_include_names);
+  if (err != NVRTC_SUCCESS)
+    throw std::runtime_error(nvrtc_failure("nvrtcCreateProgram", err, __LINE__));
+
+  // No extra compiler options: pass an explicit (0, nullptr) pair instead of
+  // an uninitialised pointer.
+  err = nvrtcCompileProgram(program, 0, nullptr);
+  // Fetch the log (needed only on failure) before the program is destroyed.
+  const std::string program_log
+      = (err == NVRTC_SUCCESS) ? std::string() : nvrtc_program_log(program);
+  // Release the program on the success path as well as on failure.
+  nvrtcDestroyProgram(&program);
+  if (err == NVRTC_SUCCESS)
+    return;
+
+  const std::string rule(60, '-');
+  std::stringstream ss;
+  ss << "nvrtcCompileProgram() failed with " << nvrtcGetErrorString(err) << "\n"
+     << "CUDA C++ source code:\n"
+     << rule << "\n" << program_src << "\n" << rule << "\n"
+     << "NVRTC compiler log:\n"
+     << rule << "\n" << program_log << "\n" << rule << "\n";
+  throw std::runtime_error(ss.str());
+}
+
+/// Run the NVRTC compilation check on every demo form linked into this binary.
+int main()
+{
+  const std::vector<ufcx_form*> demo_forms{
+      form_Components_L,        form_FacetIntegrals_a,
+      form_HyperElasticity_a_F, form_HyperElasticity_a_J,
+      form_MathFunctions_a,     form_StabilisedStokes_a,
+      form_StabilisedStokes_L,  form_VectorPoisson_a,
+      form_VectorPoisson_L};
+
+  // check_nvrtc_compilation throws on failure; nothing here catches it.
+  for (ufcx_form* current : demo_forms)
+    check_nvrtc_compilation(current);
+  return 0;
+}
+
diff --git a/demo/test_demos.py b/demo/test_demos.py
@@ -58,3 +58,50 @@ def test_demo(file, scalar_type):
             os.system(f"cd {demo_dir} && {cc} -I../ffcx/codegeneration {extra_flags} -c {file}.c")
             == 0
         )
+
+
+@pytest.mark.parametrize("scalar_type", ["float64", "float32"])
+def test_demo_nvrtc(scalar_type):
+    """Test generated CUDA code with NVRTC; skip where NVRTC is unavailable."""
+    import importlib.util
+    if sys.platform.startswith("win32"):
+        pytest.skip("NVRTC CUDA wrappers not currently supported for Windows.")
+
+    # find_spec raises if "nvidia" is absent but returns None if only "cuda_nvrtc" is missing.
+    try:
+        spec = importlib.util.find_spec("nvidia.cuda_nvrtc")
+    except ModuleNotFoundError:
+        spec = None
+    if spec is None or not spec.submodule_search_locations:
+        pytest.skip("Must have NVRTC pip package installed to run test.")
+
+    files = [
+        "Components",
+        "FacetIntegrals",
+        "HyperElasticity",
+        "MathFunctions",
+        "StabilisedStokes",
+        "VectorPoisson",
+    ]
+    opts = f"--scalar_type {scalar_type} --cuda_nvrtc"
+    nvrtc_dir = os.path.realpath(spec.submodule_search_locations[0])
+    cc = os.environ.get("CC", "cc")
+    cflags = "-std=c17 -Wunused-variable -Werror -fPIC -Wno-error=implicit-function-declaration"
+    for file in files:
+        assert os.system(f"cd {demo_dir} && ffcx {opts} {file}.py") == 0
+        assert (
+            os.system(f"cd {demo_dir} && {cc} -I../ffcx/codegeneration {cflags} -c {file}.c")
+            == 0
+        )
+
+    cxx = os.environ.get("CXX", "c++")
+    assert (
+        os.system(
+            f"cd {demo_dir} && "
+            f"{cxx} -I../ffcx/codegeneration -I{nvrtc_dir}/include -L{nvrtc_dir}/lib "
+            f" -Werror -o nvrtc_test nvrtc_test.cpp "
+            f"{' '.join([file + '.o' for file in files])} -l:libnvrtc.so.12"
+        )
+        == 0
+    )
+    assert os.system(f"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{nvrtc_dir}/lib {demo_dir}/nvrtc_test") == 0
diff --git a/ffcx/codegeneration/C/integrals.py b/ffcx/codegeneration/C/integrals.py
@@ -69,17 +69,27 @@ def generator(ir: IntegralIR, domain: basix.CellType, options):
     else:
         code["tabulate_tensor_complex64"] = ".tabulate_tensor_complex64 = NULL,"
         code["tabulate_tensor_complex128"] = ".tabulate_tensor_complex128 = NULL,"
+    if options.get("cuda_nvrtc"):
+        code["tabulate_tensor_cuda_nvrtc"] = (
+            f".tabulate_tensor_cuda_nvrtc = tabulate_tensor_cuda_nvrtc_{factory_name},"
+        )
+        code["tabulate_tensor_quoted"] = body.replace("\n", '\\n"\n    "')
+    else:
+        code["tabulate_tensor_cuda_nvrtc"] = ""
+        code["tabulate_tensor_quoted"] = ""
+
     np_scalar_type = np.dtype(options["scalar_type"]).name
     code[f"tabulate_tensor_{np_scalar_type}"] = (
         f".tabulate_tensor_{np_scalar_type} = tabulate_tensor_{factory_name},"
     )
 
     assert ir.expression.coordinate_element_hash is not None
-    implementation = ufcx_integrals.factory.format(
+    implementation = ufcx_integrals.get_factory(options).format(
         factory_name=factory_name,
         enabled_coefficients=code["enabled_coefficients"],
         enabled_coefficients_init=code["enabled_coefficients_init"],
         tabulate_tensor=code["tabulate_tensor"],
+        tabulate_tensor_quoted=code["tabulate_tensor_quoted"],
         needs_facet_permutations="true" if ir.expression.needs_facet_permutations else "false",
         scalar_type=dtype_to_c_type(options["scalar_type"]),
         geom_type=dtype_to_c_type(dtype_to_scalar_dtype(options["scalar_type"])),
@@ -88,6 +98,7 @@ def generator(ir: IntegralIR, domain: basix.CellType, options):
         tabulate_tensor_float64=code["tabulate_tensor_float64"],
         tabulate_tensor_complex64=code["tabulate_tensor_complex64"],
         tabulate_tensor_complex128=code["tabulate_tensor_complex128"],
+        tabulate_tensor_cuda_nvrtc=code["tabulate_tensor_cuda_nvrtc"],
         domain=int(domain),
     )
 

diff --git a/ffcx/codegeneration/C/integrals_template.py b/ffcx/codegeneration/C/integrals_template.py
@@ -31,10 +31,61 @@
   {tabulate_tensor_float64}
   {tabulate_tensor_complex64}
   {tabulate_tensor_complex128}
+  {tabulate_tensor_cuda_nvrtc}
   .needs_facet_permutations = {needs_facet_permutations},
   .coordinate_element_hash = {coordinate_element_hash},
   .domain = {domain},
 }};
 
 // End of code for integral {factory_name}
 """
+
+cuda_wrapper = """
+
+// Begin NVRTC CUDA wrapper for integral {factory_name}
+// The wrapper is compiled with a standard C compiler, and is called at runtime to generate
+// source code which is then compiled into a CUDA kernel at runtime via NVRTC.
+void tabulate_tensor_cuda_nvrtc_{factory_name}(int* num_program_headers,
+                                         const char*** program_headers,
+                                         const char*** program_include_names,
+                                         const char** out_program_src,
+                                         const char** tabulate_tensor_function_name)
+{{
+  // The below typedefs are needed due to issues with including stdint.h in NVRTC source code
+  const char* program_src = ""
+    "#define alignas(x)\\n"
+    "#define restrict __restrict__\\n"
+    "\\n"
+    "typedef unsigned char uint8_t;\\n"
+    "typedef unsigned int uint32_t;\\n"
+    "typedef {scalar_type} ufc_scalar_t;\\n"
+    "\\n"
+    "extern \\"C\\" __global__\\n"
+    "void tabulate_tensor_{factory_name}({scalar_type}* restrict A,\\n"
+    "                                    const {scalar_type}* restrict w,\\n"
+    "                                    const {scalar_type}* restrict c,\\n"
+    "                                    const {geom_type}* restrict coordinate_dofs,\\n"
+    "                                    const int* restrict entity_local_index,\\n"
+    "                                    const uint8_t* restrict quadrature_permutation\\n"
+    "                                    )\\n"
+    "{{\\n"
+    "{tabulate_tensor_quoted}\\n"
+    "}}";
+  *num_program_headers = 0;
+  *program_headers = NULL;
+  *program_include_names = NULL;
+  *out_program_src = program_src;
+  *tabulate_tensor_function_name = "tabulate_tensor_{factory_name}";
+}}
+
+// End NVRTC CUDA wrapper for integral {factory_name}
+
+"""
+
+
+def get_factory(options):
+    """Return the integral template, with the NVRTC wrapper prepended when requested."""
+    # A missing or falsy "cuda_nvrtc" option selects the plain template.
+    if not options.get("cuda_nvrtc"):
+        return factory
+    return cuda_wrapper + factory
diff --git a/ffcx/codegeneration/jit.py b/ffcx/codegeneration/jit.py
@@ -69,6 +69,9 @@
 UFC_INTEGRAL_DECL += "\n".join(
     re.findall(r"typedef void ?\(ufcx_tabulate_tensor_complex128\).*?\);", ufcx_h, re.DOTALL)
 )
+UFC_INTEGRAL_DECL += "\n".join(
+    re.findall(r"typedef void ?\(ufcx_tabulate_tensor_cuda_nvrtc\).*?\);", ufcx_h, re.DOTALL)
+)
 
 UFC_INTEGRAL_DECL += "\n".join(
     re.findall("typedef struct ufcx_integral.*?ufcx_integral;", ufcx_h, re.DOTALL)

diff --git a/ffcx/codegeneration/ufcx.h b/ffcx/codegeneration/ufcx.h
@@ -129,6 +129,28 @@ extern "C"
       const uint8_t* restrict quadrature_permutation, void* custom_data);
 #endif // __STDC_NO_COMPLEX__
 
+  /// Return CUDA C++ source code for the tabulate_tensor kernel.
+  /// The resulting source code is passed to NVRTC for runtime compilation.
+  ///
+  /// @param[out] num_program_headers
+  ///         The number of headers required by the program
+  /// @param[out] program_headers
+  ///         Entire contents of each header file
+  /// @param[out] program_include_names
+  ///         Names of each header file
+  /// @param[out] program_src
+  ///         CUDA C++ source code for the program containing the
+  ///         tabulate_tensor function.
+  /// @param[out] tabulate_tensor_function_name
+  ///         The name of the device-side function.
+  ///
+  typedef void(ufcx_tabulate_tensor_cuda_nvrtc)(
+      int* num_program_headers,
+      const char*** program_headers,
+      const char*** program_include_names,
+      const char** program_src,
+      const char** tabulate_tensor_function_name);
+
   typedef struct ufcx_integral
   {
     const bool* enabled_coefficients;
@@ -138,6 +160,7 @@ extern "C"
     ufcx_tabulate_tensor_complex64* tabulate_tensor_complex64;
     ufcx_tabulate_tensor_complex128* tabulate_tensor_complex128;
 #endif // __STDC_NO_COMPLEX__
+    ufcx_tabulate_tensor_cuda_nvrtc* tabulate_tensor_cuda_nvrtc;
     bool needs_facet_permutations;
 
     /// Hash of the coordinate element associated with the geometry of the mesh.

diff --git a/ffcx/options.py b/ffcx/options.py
@@ -20,6 +20,12 @@
 logger = logging.getLogger("ffcx")
 
 FFCX_DEFAULT_OPTIONS = {
+    "cuda_nvrtc": (
+        bool,
+        False,
+        "generate CUDA wrapped versions of tabulate tensor functions for use with NVRTC",
+        None,
+    ),
     "epsilon": (float, 1e-14, "machine precision, used for dropping zero terms in tables.", None),
     "scalar_type": (
         str,

diff --git a/pyproject.toml b/pyproject.toml
@@ -32,7 +32,7 @@ ffcx = "ffcx:__main__.main"
 [project.optional-dependencies]
 lint = ["ruff"]
 docs = ["sphinx", "sphinx_rtd_theme"]
-optional = ["numba", "pygraphviz==1.9"]
+optional = ["numba", "pygraphviz", "nvidia-cuda-nvrtc-cu12>=12.9.86"]
 test = ["pytest >= 6.0", "sympy", "numba"]
 ci = [
     "coveralls",