@@ -674,6 +674,134 @@ auto find_relative_interface(
    return unique_nonzero_result_config;
}

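+ // Computes the contribution of one Hamiltonian term to the diagonal element
+ // <config|H|config> of a single batch configuration. The term is applied via
+ // hamiltonian_apply_kernel; only when the resulting configuration equals the original
+ // one is the term's coefficient (two doubles per term, presumably real and imaginary
+ // parts) accumulated into result_psi, signed by the parity returned from the apply.
+ // atomicAdd is needed because all terms acting on the same configuration update the
+ // same result_psi entry concurrently.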
+ template <std::int64_t max_op_number, std::int64_t n_qubytes, std::int64_t particle_cut>
+ __device__ void diagonal_term_kernel(
+     std::int64_t term_index,
+     std::int64_t batch_index,
+     std::int64_t term_number,
+     std::int64_t batch_size,
+     const std::array<std::int16_t, max_op_number>* site,   // term_number
+     const std::array<std::uint8_t, max_op_number>* kind,   // term_number
+     const std::array<double, 2>* coef,                     // term_number
+     const std::array<std::uint8_t, n_qubytes>* configs,    // batch_size
+     std::array<double, 2>* result_psi
+ ) {
+     std::array<std::uint8_t, n_qubytes> current_configs = configs[batch_index];
+     auto [success, parity] = hamiltonian_apply_kernel<max_op_number, n_qubytes, particle_cut>(
+         /*current_configs=*/current_configs,
+         /*term_index=*/term_index,
+         /*batch_index=*/batch_index,
+         /*site=*/site,
+         /*kind=*/kind
+     );
+
+     if (!success) {
+         return;
+     }
+     auto less = array_less<std::uint8_t, n_qubytes>();
+     if (less(current_configs, configs[batch_index]) || less(configs[batch_index], current_configs)) {
+         return; // The term maps this configuration to a different one, so it has no diagonal contribution
+     }
+     std::int8_t sign = parity ? -1 : +1;
+     atomicAdd(&result_psi[batch_index][0], sign * coef[term_index][0]);
+     atomicAdd(&result_psi[batch_index][1], sign * coef[term_index][1]);
+ }
+
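+ // Thin __global__ wrapper over diagonal_term_kernel: one thread per (term, configuration)
+ // pair, with the x index running over terms and the y index over batch entries;
+ // out-of-range threads return immediately.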
+ template <std::int64_t max_op_number, std::int64_t n_qubytes, std::int64_t particle_cut>
+ __global__ void diagonal_term_kernel_interface(
+     std::int64_t term_number,
+     std::int64_t batch_size,
+     const std::array<std::int16_t, max_op_number>* site,   // term_number
+     const std::array<std::uint8_t, max_op_number>* kind,   // term_number
+     const std::array<double, 2>* coef,                     // term_number
+     const std::array<std::uint8_t, n_qubytes>* configs,    // batch_size
+     std::array<double, 2>* result_psi
+ ) {
+     std::int64_t term_index = blockIdx.x * blockDim.x + threadIdx.x;
+     std::int64_t batch_index = blockIdx.y * blockDim.y + threadIdx.y;
+
+     if (term_index < term_number && batch_index < batch_size) {
+         diagonal_term_kernel<max_op_number, n_qubytes, particle_cut>(
+             /*term_index=*/term_index,
+             /*batch_index=*/batch_index,
+             /*term_number=*/term_number,
+             /*batch_size=*/batch_size,
+             /*site=*/site,
+             /*kind=*/kind,
+             /*coef=*/coef,
+             /*configs=*/configs,
+             /*result_psi=*/result_psi
+         );
+     }
+ }
+
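+ // Host-side entry point, registered below as the "diagonal_term" op. It validates that
+ // all inputs are contiguous CUDA tensors of the expected dtype and shape, allocates a
+ // zero-initialized result_psi of shape (batch_size, 2) in float64, launches the kernel
+ // on the current stream, and synchronizes before returning.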
+ template <std::int64_t max_op_number, std::int64_t n_qubytes, std::int64_t particle_cut>
+ auto diagonal_term_interface(const torch::Tensor& configs, const torch::Tensor& site, const torch::Tensor& kind, const torch::Tensor& coef)
+     -> torch::Tensor {
+     std::int64_t device_id = configs.device().index();
+     std::int64_t batch_size = configs.size(0);
+     std::int64_t term_number = site.size(0);
+     at::cuda::CUDAGuard cuda_device_guard(device_id);
+
+     TORCH_CHECK(configs.device().type() == torch::kCUDA, "configs must be on CUDA.");
+     TORCH_CHECK(configs.device().index() == device_id, "configs must be on the same device as others.");
+     TORCH_CHECK(configs.is_contiguous(), "configs must be contiguous.");
+     TORCH_CHECK(configs.dtype() == torch::kUInt8, "configs must be uint8.");
+     TORCH_CHECK(configs.dim() == 2, "configs must be 2D.");
+     TORCH_CHECK(configs.size(0) == batch_size, "configs batch size must match the provided batch_size.");
+     TORCH_CHECK(configs.size(1) == n_qubytes, "configs must have the same number of qubytes as the provided n_qubytes.");
+
+     TORCH_CHECK(site.device().type() == torch::kCUDA, "site must be on CUDA.");
+     TORCH_CHECK(site.device().index() == device_id, "site must be on the same device as others.");
+     TORCH_CHECK(site.is_contiguous(), "site must be contiguous.");
+     TORCH_CHECK(site.dtype() == torch::kInt16, "site must be int16.");
+     TORCH_CHECK(site.dim() == 2, "site must be 2D.");
+     TORCH_CHECK(site.size(0) == term_number, "site size must match the provided term_number.");
+     TORCH_CHECK(site.size(1) == max_op_number, "site must match the provided max_op_number.");
+
+     TORCH_CHECK(kind.device().type() == torch::kCUDA, "kind must be on CUDA.");
+     TORCH_CHECK(kind.device().index() == device_id, "kind must be on the same device as others.");
+     TORCH_CHECK(kind.is_contiguous(), "kind must be contiguous.");
+     TORCH_CHECK(kind.dtype() == torch::kUInt8, "kind must be uint8.");
+     TORCH_CHECK(kind.dim() == 2, "kind must be 2D.");
+     TORCH_CHECK(kind.size(0) == term_number, "kind size must match the provided term_number.");
+     TORCH_CHECK(kind.size(1) == max_op_number, "kind must match the provided max_op_number.");
+
+     TORCH_CHECK(coef.device().type() == torch::kCUDA, "coef must be on CUDA.");
+     TORCH_CHECK(coef.device().index() == device_id, "coef must be on the same device as others.");
+     TORCH_CHECK(coef.is_contiguous(), "coef must be contiguous.");
+     TORCH_CHECK(coef.dtype() == torch::kFloat64, "coef must be float64.");
+     TORCH_CHECK(coef.dim() == 2, "coef must be 2D.");
+     TORCH_CHECK(coef.size(0) == term_number, "coef size must match the provided term_number.");
+     TORCH_CHECK(coef.size(1) == 2, "coef must contain 2 elements for each term.");
+
+     auto stream = at::cuda::getCurrentCUDAStream(device_id);
+     auto policy = thrust::device.on(stream);
+
+     cudaDeviceProp prop;
+     AT_CUDA_CHECK(cudaGetDeviceProperties(&prop, device_id));
+     std::int64_t max_threads_per_block = prop.maxThreadsPerBlock;
+
+     auto result_psi = torch::zeros({batch_size, 2}, torch::TensorOptions().dtype(torch::kFloat64).device(torch::kCUDA, device_id));
+
+     // Launching with the full maxThreadsPerBlock can fail, likely due to a per-block resource limit, so halve it.
+     auto threads_per_block = dim3{1, max_threads_per_block >> 1};
+     auto num_blocks =
+         dim3{(term_number + threads_per_block.x - 1) / threads_per_block.x, (batch_size + threads_per_block.y - 1) / threads_per_block.y};
+
+     diagonal_term_kernel_interface<max_op_number, n_qubytes, particle_cut><<<num_blocks, threads_per_block, 0, stream>>>(
+         /*term_number=*/term_number,
+         /*batch_size=*/batch_size,
+         /*site=*/reinterpret_cast<const std::array<std::int16_t, max_op_number>*>(site.data_ptr()),
+         /*kind=*/reinterpret_cast<const std::array<std::uint8_t, max_op_number>*>(kind.data_ptr()),
+         /*coef=*/reinterpret_cast<const std::array<double, 2>*>(coef.data_ptr()),
+         /*configs=*/reinterpret_cast<const std::array<std::uint8_t, n_qubytes>*>(configs.data_ptr()),
+         /*result_psi=*/reinterpret_cast<std::array<double, 2>*>(result_psi.data_ptr())
+     );
+     AT_CUDA_CHECK(cudaStreamSynchronize(stream));
+
+     return result_psi;
+ }
+
template <std::int64_t max_op_number, std::int64_t n_qubytes, std::int64_t particle_cut>
__device__ void single_relative_kernel(
    std::int64_t term_index,
@@ -880,6 +1008,7 @@ auto single_relative_interface(const torch::Tensor& configs, const torch::Tensor
TORCH_LIBRARY_IMPL(QMB_LIBRARY(N_QUBYTES, PARTICLE_CUT), CUDA, m) {
    m.impl("apply_within", apply_within_interface</*max_op_number=*/4, /*n_qubytes=*/N_QUBYTES, /*particle_cut=*/PARTICLE_CUT>);
    m.impl("find_relative", find_relative_interface</*max_op_number=*/4, /*n_qubytes=*/N_QUBYTES, /*particle_cut=*/PARTICLE_CUT>);
+     m.impl("diagonal_term", diagonal_term_interface</*max_op_number=*/4, /*n_qubytes=*/N_QUBYTES, /*particle_cut=*/PARTICLE_CUT>);
    m.impl("single_relative", single_relative_interface</*max_op_number=*/4, /*n_qubytes=*/N_QUBYTES, /*particle_cut=*/PARTICLE_CUT>);
}
#undef QMB_LIBRARY