
Commit 49cb1b5

hzhangxyz and windy-pig committed
Add support for conditional compilation of cpu and cuda kernels.
Co-authored-by: Junkai Xu <[email protected]>
1 parent 4f01926 commit 49cb1b5
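
In outline: the cache of JIT-built extensions is now keyed by (device_type, n_qubytes, particle_cut) rather than by n_qubytes alone, and each build compiles only the source file that matches the requested device, so a CPU-only run never compiles _hamiltonian_cuda.cu. The default ("declaration", 0, 0) build is still returned as a Python module; every other build is loaded only for the operators it registers, which callers then fetch from torch.ops. A rough, hedged sketch of the two torch.utils.cpp_extension.load modes involved (placeholder source files, not the project's):

    # Sketch only: "example_decl.cpp" and "example_cpu.cpp" are hypothetical
    # sources used to illustrate the two loading modes.
    import torch
    import torch.utils.cpp_extension

    # is_python_module=True compiles the sources and returns an importable
    # Python extension module.
    decl = torch.utils.cpp_extension.load(
        name="example_decl",
        sources=["example_decl.cpp"],
        is_python_module=True,
    )

    # is_python_module=False only loads the compiled shared library into the
    # process; any operators it registers become visible under
    # torch.ops.<namespace> (in this commit the namespace matches the
    # extension name).
    torch.utils.cpp_extension.load(
        name="example_cpu",
        sources=["example_cpu.cpp"],
        is_python_module=False,
    )
    ops = getattr(torch.ops, "example_cpu")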

1 file changed: +27 −18

qmb/hamiltonian.py

Lines changed: 27 additions & 18 deletions
@@ -14,31 +14,40 @@ class Hamiltonian:
     The Hamiltonian type, which stores the Hamiltonian and processes iteration over each term in the Hamiltonian for given configurations.
     """

-    _hamiltonian_module: dict[int, object] = {}
+    _hamiltonian_module: dict[tuple[str, int, int], object] = {}

     @classmethod
-    def _load_module(cls, n_qubytes: int = 0, particle_cut: int = 0) -> object:
-        if n_qubytes not in cls._hamiltonian_module:
-            name = "qmb_hamiltonian" if n_qubytes == 0 else f"qmb_hamiltonian_{n_qubytes}_{particle_cut}"
-            build_directory = platformdirs.user_cache_path("qmb", "kclab") / name
+    def _load_module(cls, device_type: str = "declaration", n_qubytes: int = 0, particle_cut: int = 0) -> object:
+        if device_type != "declaration":
+            cls._load_module("declaration", n_qubytes, particle_cut)  # Ensure the declaration module is loaded first
+        key = (device_type, n_qubytes, particle_cut)
+        is_prepare = key == ("declaration", 0, 0)
+        name = "qmb_hamiltonian" if is_prepare else f"qmb_hamiltonian_{n_qubytes}_{particle_cut}"
+        if key not in cls._hamiltonian_module:
+            build_directory = platformdirs.user_cache_path("qmb", "kclab") / name / device_type
             build_directory.mkdir(parents=True, exist_ok=True)
             folder = os.path.dirname(__file__)
-            cls._hamiltonian_module[n_qubytes] = torch.utils.cpp_extension.load(
+            match device_type:
+                case "declaration":
+                    sources = [f"{folder}/_hamiltonian.cpp"]
+                case "cpu":
+                    sources = [f"{folder}/_hamiltonian_cpu.cpp"]
+                case "cuda":
+                    sources = [f"{folder}/_hamiltonian_cuda.cu"]
+                case _:
+                    raise ValueError("Unsupported device type")
+            cls._hamiltonian_module[key] = torch.utils.cpp_extension.load(
                 name=name,
-                sources=[
-                    f"{folder}/_hamiltonian.cpp",
-                    f"{folder}/_hamiltonian_cpu.cpp",
-                    f"{folder}/_hamiltonian_cuda.cu",
-                ],
-                is_python_module=n_qubytes == 0,
+                sources=sources,
+                is_python_module=is_prepare,
                 extra_cflags=["-O3", "-ffast-math", "-march=native", f"-DN_QUBYTES={n_qubytes}", f"-DPARTICLE_CUT={particle_cut}", "-std=c++20"],
                 extra_cuda_cflags=["-O3", "--use_fast_math", f"-DN_QUBYTES={n_qubytes}", f"-DPARTICLE_CUT={particle_cut}", "-std=c++20"],
                 build_directory=build_directory,
             )
-        if n_qubytes == 0:  # pylint: disable=no-else-return
-            return cls._hamiltonian_module[n_qubytes]
+        if is_prepare:  # pylint: disable=no-else-return
+            return cls._hamiltonian_module[key]
         else:
-            return getattr(torch.ops, f"qmb_hamiltonian_{n_qubytes}_{particle_cut}")
+            return getattr(torch.ops, name)

     @classmethod
     def _prepare(cls, hamiltonian: dict[tuple[tuple[int, int], ...], complex]) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:

@@ -98,7 +107,7 @@ def apply_within(
         """
         self._prepare_data(configs_i.device)
         _apply_within: typing.Callable[[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor]
-        _apply_within = getattr(self._load_module(configs_i.size(1), self.particle_cut), "apply_within")
+        _apply_within = getattr(self._load_module(configs_i.device.type, configs_i.size(1), self.particle_cut), "apply_within")
         psi_j = torch.view_as_complex(_apply_within(configs_i, torch.view_as_real(psi_i), configs_j, self.site, self.kind, self.coef))
         return psi_j

@@ -133,7 +142,7 @@ def find_relative(
         configs_exclude = configs_i
         self._prepare_data(configs_i.device)
         _find_relative: typing.Callable[[torch.Tensor, torch.Tensor, int, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor]
-        _find_relative = getattr(self._load_module(configs_i.size(1), self.particle_cut), "find_relative")
+        _find_relative = getattr(self._load_module(configs_i.device.type, configs_i.size(1), self.particle_cut), "find_relative")
         configs_j = _find_relative(configs_i, torch.view_as_real(psi_i), count_selected, self.site, self.kind, self.coef, configs_exclude)
         return configs_j

@@ -153,6 +162,6 @@ def single_relative(self, configs: torch.Tensor) -> torch.Tensor:
         """
         self._prepare_data(configs.device)
         _single_relative: typing.Callable[[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor]
-        _single_relative = getattr(self._load_module(configs.size(1), self.particle_cut), "single_relative")
+        _single_relative = getattr(self._load_module(configs.device.type, configs.size(1), self.particle_cut), "single_relative")
         configs_result = _single_relative(configs, self.site, self.kind, self.coef)
         return configs_result
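
Call sites now pass configs.device.type as the first argument, so the backend to build is inferred from where the configuration tensor lives. A hypothetical usage, not part of the commit (the tensor shape, dtype, and the hamiltonian/psi_i/configs_j names are illustrative), showing which extensions get compiled:

    import torch

    # Configurations on the CPU: only the declaration and "cpu" builds are
    # JIT-compiled; _hamiltonian_cuda.cu is never touched.
    configs = torch.zeros(128, 4, dtype=torch.uint8)  # device.type == "cpu", n_qubytes == 4
    # hamiltonian.apply_within(configs, psi_i, configs_j)
    #   -> Hamiltonian._load_module("cpu", 4, particle_cut)
    #      which first calls _load_module("declaration", 4, particle_cut)

    # The same call on CUDA tensors compiles the "cuda" variant instead.
    if torch.cuda.is_available():
        configs = configs.cuda()  # device.type == "cuda"
    #   -> Hamiltonian._load_module("cuda", 4, particle_cut)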
