USTC-KnowledgeComputingLab
diff --git a/‎qmb/__main__.py‎
Lines changed: 1 addition & 0 deletions b/‎qmb/__main__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎qmb/_hamiltonian.cpp‎
Lines changed: 1 addition & 0 deletions b/‎qmb/_hamiltonian.cpp‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎qmb/_hamiltonian_cuda.cu‎
Lines changed: 194 additions & 0 deletions b/‎qmb/_hamiltonian_cuda.cu‎
Lines changed: 194 additions & 0 deletions
diff --git a/‎qmb/crossmlp.py‎
Lines changed: 163 additions & 0 deletions b/‎qmb/crossmlp.py‎
Lines changed: 163 additions & 0 deletions
@@ -11,6 +11,7 @@
 from . import ising as _  # type: ignore[no-redef]
 from . import vmc as _  # type: ignore[no-redef]
 from . import imag as _  # type: ignore[no-redef]
+from . import rldiag as _  # type: ignore[no-redef]
 from . import precompile as _  # type: ignore[no-redef]
 from . import list_loss as _  # type: ignore[no-redef]
 from . import chop_imag as _  # type: ignore[no-redef]
 
@@ -84,6 +84,7 @@ TORCH_LIBRARY_FRAGMENT(QMB_LIBRARY(N_QUBYTES, PARTICLE_CUT), m) {
     m.def("apply_within(Tensor configs_i, Tensor psi_i, Tensor configs_j, Tensor site, Tensor kind, Tensor coef) -> Tensor");
     m.def("find_relative(Tensor configs_i, Tensor psi_i, int count_selected, Tensor site, Tensor kind, Tensor coef, Tensor configs_exclude) -> Tensor"
     );
+    m.def("single_relative(Tensor configs, Tensor site, Tensor kind, Tensor coef) -> Tensor");
 }
 #undef QMB_LIBRARY
 #undef QMB_LIBRARY_HELPER
 
@@ -1,6 +1,7 @@
 #include <ATen/cuda/Exceptions.h>
 #include <c10/cuda/CUDAStream.h>
 #include <cuda_runtime.h>
+#include <curand_kernel.h>
 #include <thrust/sort.h>
 #include <torch/extension.h>
 
@@ -670,6 +671,198 @@ auto find_relative_interface(
     return unique_nonzero_result_config;
 }
 
+// Per-(term, batch row) worker of the single_relative operator.
+//
+// Applies Hamiltonian term `term_index` to `configs[batch_index]` through hamiltonian_apply_kernel
+// (presumably updating `current_configs` in place - confirm against its definition). When the
+// application succeeds and the resulting configuration is absent from `exclude_configs`, the thread
+// competes to become the single configuration reported for this batch row.
+//
+// The winner is chosen by weighted random sampling (Efraimidis-Spirakis): every candidate draws the
+// key u^(1/w) with u uniform and w = |coef[term_index]|, and the candidate with the largest key wins.
+//
+// Parameters:
+//   term_index, batch_index  - the (term, row) pair handled by this thread.
+//   term_number, batch_size  - total number of terms / rows.
+//   exclude_size             - number of entries in `exclude_configs`.
+//   seed                     - cuRAND seed shared by all threads of one launch.
+//   site, kind               - per-term operator description forwarded to hamiltonian_apply_kernel.
+//   coef                     - per-term coefficient, stored as two doubles (treated as real and imaginary part).
+//   configs                  - input configurations, one per row.
+//   exclude_configs          - configurations that must not be returned; MUST be sorted by array_less,
+//                              because it is probed with a binary search.
+//   result_configs, score    - per-row winner and its key; shared by all terms of the row.
+//   mutex                    - per-row lock guarding `result_configs` and `score`.
+template<std::int64_t max_op_number, std::int64_t n_qubytes, std::int64_t particle_cut>
+__device__ void single_relative_kernel(
+    std::int64_t term_index,
+    std::int64_t batch_index,
+    std::int64_t term_number,
+    std::int64_t batch_size,
+    std::int64_t exclude_size,
+    std::uint64_t seed,
+    const std::array<std::int16_t, max_op_number>* site, // term_number
+    const std::array<std::uint8_t, max_op_number>* kind, // term_number
+    const std::array<double, 2>* coef, // term_number
+    const std::array<std::uint8_t, n_qubytes>* configs, // batch_size
+    const std::array<std::uint8_t, n_qubytes>* exclude_configs, // exclude_size
+    std::array<std::uint8_t, n_qubytes>* result_configs, // batch_size
+    double* score, // batch_size
+    int* mutex // batch_size
+) {
+    std::array<std::uint8_t, n_qubytes> current_configs = configs[batch_index];
+    auto [success, parity] = hamiltonian_apply_kernel<max_op_number, n_qubytes, particle_cut>(
+        /*current_configs=*/current_configs,
+        /*term_index=*/term_index,
+        /*batch_index=*/batch_index,
+        /*site=*/site,
+        /*kind=*/kind
+    );
+    if (!success) {
+        return;
+    }
+
+    // Binary search in the sorted exclusion list; a hit means this candidate must be dropped.
+    auto compare = array_less<std::uint8_t, n_qubytes>();
+    std::int64_t low = 0;
+    std::int64_t high = exclude_size - 1;
+    while (low <= high) {
+        const std::int64_t mid = low + (high - low) / 2;
+        if (compare(current_configs, exclude_configs[mid])) {
+            high = mid - 1;
+        } else if (compare(exclude_configs[mid], current_configs)) {
+            low = mid + 1;
+        } else {
+            return;
+        }
+    }
+
+    // Efraimidis-Spirakis Algorithm is used here.
+    const double real_part = coef[term_index][0];
+    const double imag_part = coef[term_index][1];
+    const double weight = std::sqrt(real_part * real_part + imag_part * imag_part);
+    // The algorithm requires strictly positive weights. A zero weight would give the key u^(inf) = 0,
+    // which still beats the initial score of -infinity, so such a term could be selected by mistake.
+    // The negated comparison also rejects NaN.
+    if (!(weight > 0.0)) {
+        return;
+    }
+
+    // Every (row, term) pair needs its own random stream. Keying the subsequence on the term alone
+    // would hand all rows the same key for a given term and correlate their selections.
+    const auto sequence = static_cast<unsigned long long>(batch_index * term_number + term_index);
+    curandState state;
+    curand_init(seed, sequence, 0, &state);
+    const double key = std::pow(curand_uniform_double(&state), 1.0 / weight);
+
+    // Cheap unlocked pre-check first, then re-check under the lock before publishing.
+    if (score[batch_index] < key) {
+        mutex_lock(&mutex[batch_index]);
+        if (score[batch_index] < key) {
+            score[batch_index] = key;
+            result_configs[batch_index] = current_configs;
+        }
+        mutex_unlock(&mutex[batch_index]);
+    }
+}
+
+// CUDA entry point of the single_relative operator.
+//
+// The launch grid is two-dimensional: the x axis enumerates Hamiltonian terms and the y axis
+// enumerates batch rows. Threads that fall outside either range do nothing; every other thread
+// forwards its (term, row) pair to single_relative_kernel together with the unchanged arguments.
+template<std::int64_t max_op_number, std::int64_t n_qubytes, std::int64_t particle_cut>
+__global__ void single_relative_kernel_interface(
+    std::int64_t term_number,
+    std::int64_t batch_size,
+    std::int64_t exclude_size,
+    std::uint64_t seed,
+    const std::array<std::int16_t, max_op_number>* site, // term_number
+    const std::array<std::uint8_t, max_op_number>* kind, // term_number
+    const std::array<double, 2>* coef, // term_number
+    const std::array<std::uint8_t, n_qubytes>* configs, // batch_size
+    const std::array<std::uint8_t, n_qubytes>* exclude_configs, // exclude_size
+    std::array<std::uint8_t, n_qubytes>* result_configs, // batch_size
+    double* score, // batch_size
+    int* mutex // batch_size
+) {
+    const std::int64_t term = blockIdx.x * blockDim.x + threadIdx.x;
+    const std::int64_t row = blockIdx.y * blockDim.y + threadIdx.y;
+
+    // The grid is rounded up to whole blocks, so trailing threads have no work.
+    if (term >= term_number || row >= batch_size) {
+        return;
+    }
+
+    single_relative_kernel<max_op_number, n_qubytes, particle_cut>(
+        /*term_index=*/term,
+        /*batch_index=*/row,
+        /*term_number=*/term_number,
+        /*batch_size=*/batch_size,
+        /*exclude_size=*/exclude_size,
+        /*seed=*/seed,
+        /*site=*/site,
+        /*kind=*/kind,
+        /*coef=*/coef,
+        /*configs=*/configs,
+        /*exclude_configs=*/exclude_configs,
+        /*result_configs=*/result_configs,
+        /*score=*/score,
+        /*mutex=*/mutex
+    );
+}
+
+// Host-side implementation of the `single_relative` operator.
+//
+// For every row of `configs`, one configuration is picked at random among those produced by applying
+// the Hamiltonian terms to that row, excluding anything already present in `configs`. Terms are
+// weighted by the magnitude of their coefficient.
+//
+// Parameters:
+//   configs - uint8 CUDA tensor of shape [batch_size, n_qubytes], contiguous.
+//   site    - int16 CUDA tensor of shape [term_number, max_op_number], contiguous.
+//   kind    - uint8 CUDA tensor of shape [term_number, max_op_number], contiguous.
+//   coef    - float64 CUDA tensor of shape [term_number, 2], contiguous.
+// All tensors must live on the same device as `configs`.
+//
+// Returns a uint8 tensor of shape [batch_size, n_qubytes]. Rows for which no admissible
+// configuration exists keep their initial value of all zeros.
+//
+// Raises a c10::Error (through TORCH_CHECK / AT_CUDA_CHECK) when an argument is malformed or a CUDA
+// call fails.
+template<std::int64_t max_op_number, std::int64_t n_qubytes, std::int64_t particle_cut>
+auto single_relative_interface(const torch::Tensor& configs, const torch::Tensor& site, const torch::Tensor& kind, const torch::Tensor& coef)
+    -> torch::Tensor {
+    std::int64_t device_id = configs.device().index();
+    std::int64_t batch_size = configs.size(0);
+    std::int64_t term_number = site.size(0);
+
+    TORCH_CHECK(configs.device().type() == torch::kCUDA, "configs must be on CUDA.");
+    TORCH_CHECK(configs.device().index() == device_id, "configs must be on the same device as others.");
+    TORCH_CHECK(configs.is_contiguous(), "configs must be contiguous.");
+    TORCH_CHECK(configs.dtype() == torch::kUInt8, "configs must be uint8.");
+    TORCH_CHECK(configs.dim() == 2, "configs must be 2D.");
+    TORCH_CHECK(configs.size(0) == batch_size, "configs batch size must match the provided batch_size.");
+    TORCH_CHECK(configs.size(1) == n_qubytes, "configs must have the same number of qubits as the provided n_qubytes.");
+
+    TORCH_CHECK(site.device().type() == torch::kCUDA, "site must be on CUDA.");
+    TORCH_CHECK(site.device().index() == device_id, "site must be on the same device as others.");
+    TORCH_CHECK(site.is_contiguous(), "site must be contiguous.");
+    TORCH_CHECK(site.dtype() == torch::kInt16, "site must be int16.");
+    TORCH_CHECK(site.dim() == 2, "site must be 2D.");
+    TORCH_CHECK(site.size(0) == term_number, "site size must match the provided term_number.");
+    TORCH_CHECK(site.size(1) == max_op_number, "site must match the provided max_op_number.");
+
+    TORCH_CHECK(kind.device().type() == torch::kCUDA, "kind must be on CUDA.");
+    TORCH_CHECK(kind.device().index() == device_id, "kind must be on the same device as others.");
+    TORCH_CHECK(kind.is_contiguous(), "kind must be contiguous.");
+    TORCH_CHECK(kind.dtype() == torch::kUInt8, "kind must be uint8.");
+    TORCH_CHECK(kind.dim() == 2, "kind must be 2D.");
+    TORCH_CHECK(kind.size(0) == term_number, "kind size must match the provided term_number.");
+    TORCH_CHECK(kind.size(1) == max_op_number, "kind must match the provided max_op_number.");
+
+    TORCH_CHECK(coef.device().type() == torch::kCUDA, "coef must be on CUDA.");
+    TORCH_CHECK(coef.device().index() == device_id, "coef must be on the same device as others.");
+    TORCH_CHECK(coef.is_contiguous(), "coef must be contiguous.");
+    TORCH_CHECK(coef.dtype() == torch::kFloat64, "coef must be float64.");
+    TORCH_CHECK(coef.dim() == 2, "coef must be 2D.");
+    TORCH_CHECK(coef.size(0) == term_number, "coef size must match the provided term_number.");
+    TORCH_CHECK(coef.size(1) == 2, "coef must contain 2 elements for each term.");
+
+    // NOTE(review): `device` is not declared inside this function; presumably it is a file-level
+    // name for the CUDA device type - confirm.
+    auto result_configs = torch::zeros({batch_size, n_qubytes}, torch::TensorOptions().dtype(torch::kUInt8).device(device, device_id));
+
+    // A grid with a zero-sized dimension is an invalid launch configuration, and there is nothing
+    // to compute anyway.
+    if (batch_size == 0 || term_number == 0) {
+        return result_configs;
+    }
+
+    auto stream = at::cuda::getCurrentCUDAStream(device_id);
+    auto policy = thrust::device.on(stream);
+
+    cudaDeviceProp prop;
+    AT_CUDA_CHECK(cudaGetDeviceProperties(&prop, device_id));
+    std::int64_t max_threads_per_block = prop.maxThreadsPerBlock;
+
+    // The input itself is the exclusion list. The kernel binary-searches it, so it needs a sorted copy.
+    auto sorted_configs = configs.clone(torch::MemoryFormat::Contiguous);
+
+    thrust::sort(
+        policy,
+        reinterpret_cast<std::array<std::uint8_t, n_qubytes>*>(sorted_configs.data_ptr()),
+        reinterpret_cast<std::array<std::uint8_t, n_qubytes>*>(sorted_configs.data_ptr()) + batch_size,
+        array_less<std::uint8_t, n_qubytes>()
+    );
+
+    // Draw the kernel seed from the torch CPU generator so that torch.manual_seed controls it.
+    // max() must be *called*: without the parentheses the function's address is converted to an integer.
+    auto seed_tensor =
+        torch::randint(std::int64_t{0}, std::numeric_limits<std::int64_t>::max(), {}, torch::TensorOptions().dtype(torch::kInt64));
+    auto seed = static_cast<std::uint64_t>(seed_tensor.item<std::int64_t>());
+
+    auto score = torch::empty({batch_size}, torch::TensorOptions().dtype(torch::kFloat64).device(device, device_id))
+                     .fill_(-std::numeric_limits<double>::infinity());
+    // One lock per batch row. Holding it in a tensor (instead of raw cudaMalloc/cudaFree) places it on
+    // `device_id` and releases it automatically, even when one of the checks below throws.
+    auto mutex = torch::zeros({batch_size}, torch::TensorOptions().dtype(torch::kInt32).device(device, device_id));
+
+    // Half of the device-wide block limit is used. The original author found the full limit to fail;
+    // presumably the kernel's per-thread resource usage lowers its real limit - confirm, e.g. with
+    // cudaFuncGetAttributes.
+    const auto threads_y = static_cast<unsigned int>(max_threads_per_block >> 1);
+    auto threads_per_block = dim3{1, threads_y};
+    auto num_blocks = dim3{
+        static_cast<unsigned int>((term_number + threads_per_block.x - 1) / threads_per_block.x),
+        static_cast<unsigned int>((batch_size + threads_per_block.y - 1) / threads_per_block.y)};
+
+    single_relative_kernel_interface<max_op_number, n_qubytes, particle_cut><<<num_blocks, threads_per_block, 0, stream>>>(
+        /*term_number=*/term_number,
+        /*batch_size=*/batch_size,
+        /*exclude_size=*/batch_size,
+        /*seed=*/seed,
+        /*site=*/reinterpret_cast<const std::array<std::int16_t, max_op_number>*>(site.data_ptr()),
+        /*kind=*/reinterpret_cast<const std::array<std::uint8_t, max_op_number>*>(kind.data_ptr()),
+        /*coef=*/reinterpret_cast<const std::array<double, 2>*>(coef.data_ptr()),
+        /*configs=*/reinterpret_cast<const std::array<std::uint8_t, n_qubytes>*>(configs.data_ptr()),
+        /*exclude_configs=*/reinterpret_cast<const std::array<std::uint8_t, n_qubytes>*>(sorted_configs.data_ptr()),
+        /*result_configs=*/reinterpret_cast<std::array<std::uint8_t, n_qubytes>*>(result_configs.data_ptr()),
+        /*score=*/reinterpret_cast<double*>(score.data_ptr()),
+        /*mutex=*/mutex.data_ptr<int>()
+    );
+    // Report launch-configuration errors immediately, then wait so that execution errors surface here.
+    AT_CUDA_CHECK(cudaGetLastError());
+    AT_CUDA_CHECK(cudaStreamSynchronize(stream));
+
+    return result_configs;
+}
+
 #ifndef N_QUBYTES
 #define N_QUBYTES 0
 #endif
@@ -683,6 +876,7 @@ auto find_relative_interface(
 TORCH_LIBRARY_IMPL(QMB_LIBRARY(N_QUBYTES, PARTICLE_CUT), CUDA, m) {
     m.impl("apply_within", apply_within_interface</*max_op_number=*/4, /*n_qubytes=*/N_QUBYTES, /*particle_cut=*/PARTICLE_CUT>);
     m.impl("find_relative", find_relative_interface</*max_op_number=*/4, /*n_qubytes=*/N_QUBYTES, /*particle_cut=*/PARTICLE_CUT>);
+    m.impl("single_relative", single_relative_interface</*max_op_number=*/4, /*n_qubytes=*/N_QUBYTES, /*particle_cut=*/PARTICLE_CUT>); // CUDA implementation of the single_relative op
 }
 #undef QMB_LIBRARY
 #undef QMB_LIBRARY_HELPER
 
@@ -0,0 +1,163 @@
+"""
+This file implements a cross MLP network.
+"""
+
+import typing
+import torch
+from .bitspack import unpack_int
+
+
+class FakeLinear(torch.nn.Module):
+    """
+    A fake linear layer with zero input dimension to avoid PyTorch initialization warnings.
+    """
+
+    def __init__(self, dim_in: int, dim_out: int) -> None:
+        super().__init__()
+        assert dim_in == 0
+        self.bias: torch.nn.Parameter = torch.nn.Parameter(torch.zeros([dim_out]))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Forward pass for the fake linear layer.
+        """
+        batch, _ = x.shape
+        return self.bias.view([1, -1]).expand([batch, -1])
+
+
+def select_linear_layer(dim_in: int, dim_out: int) -> torch.nn.Module:
+    """
+    Selects between a fake linear layer and a standard one to avoid initialization warnings when dim_in is zero.
+    """
+    if dim_in == 0:  # pylint: disable=no-else-return
+        return FakeLinear(dim_in, dim_out)
+    else:
+        return torch.nn.Linear(dim_in, dim_out)
+
+
+class MLP(torch.nn.Module):
+    """
+    This module implements multiple layers MLP with given dim_input, dim_output and hidden_size.
+    """
+
+    def __init__(self, dim_input: int, dim_output: int, hidden_size: tuple[int, ...]) -> None:
+        super().__init__()
+        self.dim_input: int = dim_input
+        self.dim_output: int = dim_output
+        self.hidden_size: tuple[int, ...] = hidden_size
+        self.depth: int = len(hidden_size)
+
+        dimensions: list[int] = [dim_input] + list(hidden_size) + [dim_output]
+        linears: list[torch.nn.Module] = [select_linear_layer(i, j) for i, j in zip(dimensions[:-1], dimensions[1:])]
+        modules: list[torch.nn.Module] = [module for linear in linears for module in (linear, torch.nn.SiLU())][:-1]
+        self.model: torch.nn.Module = torch.nn.Sequential(*modules)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Forward pass for the MLP.
+        """
+        return self.model(x)
+
+
+class WaveFunction(torch.nn.Module):
+    """
+    The wave function for the cross MLP network.
+    """
+
+    # pylint: disable=too-many-instance-attributes
+
+    def __init__(  # pylint: disable=too-many-arguments
+            self,
+            *,
+            sites: int,  # Number of qubits
+            physical_dim: int,  # Dimension of the physical space, which is always 2 for MLP
+            is_complex: bool,  # Indicates whether the wave function is complex-valued, which is always true for MLP
+            embedding_hidden_size: tuple[int, ...],  # Hidden layer sizes for embedding part
+            embedding_size: int,  # The dimension of the embedding
+            momentum_hidden_size: tuple[int, ...],  # Hidden layer sizes for momentum part
+            momentum_count: int,  # The number of max momentum order
+            tail_hidden_size: tuple[int, ...],  # Hidden layer size for tail part
+            kind: typing.Literal[0, 1, 2],  # Kind of the crossmlp forward function
+            ordering: int | list[int],  # Ordering of sites: +1 for normal order, -1 for reversed order, or a custom order list
+    ) -> None:
+        super().__init__()
+        self.sites: int = sites
+        assert physical_dim == 2
+        # This module is only used in reinforcement learning, which expects real values for the weights.
+        assert is_complex == False  # pylint: disable=singleton-comparison
+        self.embedding_hidden_size: tuple[int, ...] = embedding_hidden_size
+        self.embedding_size: int = embedding_size
+        self.momentum_hidden_size: tuple[int, ...] = momentum_hidden_size
+        self.momentum_count: int = momentum_count
+        self.tail_hidden_size: tuple[int, ...] = tail_hidden_size
+        self.kind: typing.Literal[0, 1, 2] = kind
+
+        self.emb = MLP(self.sites, self.embedding_size, self.embedding_hidden_size)
+        self.momentum = torch.nn.ModuleList([MLP(self.embedding_size, self.embedding_size, momentum_hidden_size) for _ in range(self.momentum_count)])
+        self.tail = MLP(self.embedding_size, 1, tail_hidden_size)
+
+        # Site Ordering Configuration
+        # +1 for normal order, -1 for reversed order
+        if isinstance(ordering, int) and ordering == +1:
+            ordering = list(range(self.sites))
+        if isinstance(ordering, int) and ordering == -1:
+            ordering = list(reversed(range(self.sites)))
+        self.ordering: torch.Tensor
+        self.register_buffer('ordering', torch.tensor(ordering, dtype=torch.int64))
+        self.ordering_reversed: torch.Tensor
+        self.register_buffer('ordering_reversed', torch.scatter(torch.zeros(self.sites, dtype=torch.int64), 0, self.ordering, torch.arange(self.sites, dtype=torch.int64)))
+
+        # Dummy Parameter for Device and Dtype Retrieval
+        # This parameter is used to infer the device and dtype of the model.
+        self.dummy_param = torch.nn.Parameter(torch.empty(0))
+
+    @torch.jit.export
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Compute the wave function psi for the given configurations.
+        """
+        dtype = self.dummy_param.dtype
+        # x: batch_size * sites
+        x = unpack_int(x, size=1, last_dim=self.sites)
+        # Apply ordering
+        x = torch.index_select(x, 1, self.ordering_reversed)
+        # Dtype conversion
+        x = x.to(dtype=dtype)
+
+        # emb: batch_size * embedding_size
+        emb = self.emb(x)
+
+        if self.kind == 0:
+            # x' = F(x - E[x]) + x
+            for layer in self.momentum:
+                new_emb = emb - emb.mean(dim=0, keepdim=True)
+                new_emb = layer(new_emb)
+                emb = emb + new_emb
+                emb = emb / emb.norm(p=2, dim=1, keepdim=True)
+        elif self.kind == 1:
+            # x' = F(x) - E[F(x)] + x
+            for layer in self.momentum:
+                new_emb = layer(emb)
+                new_emb = new_emb - new_emb.mean(dim=0, keepdim=True)
+                emb = emb + new_emb
+                emb = emb / emb.norm(p=2, dim=1, keepdim=True)
+        elif self.kind == 2:
+            # x' = (F(x) + x) - E [F(x) + x]
+            for layer in self.momentum:
+                new_emb = layer(emb)
+                new_emb = new_emb + emb
+                emb = new_emb - new_emb.mean(dim=0, keepdim=True)
+                emb = emb / emb.norm(p=2, dim=1, keepdim=True)
+        else:
+            raise ValueError(f"Invalid kind: {self.kind}")
+
+        tail = self.tail(emb).squeeze(-1)
+        return tail
+
+    @torch.jit.export
+    def generate_unique(self, batch_size: int, block_num: int = 1) -> tuple[torch.Tensor, torch.Tensor, None, None]:
+        """
+        This module does not support generating unique configurations.
+        """
+        # This module is only used in reinforcement learning, which does not require configurations sampling.
+        raise NotImplementedError("The generate_unique method is not implemented for this class.")
Original file line number	Diff line number	Diff line change
`@@ -84,6 +84,7 @@ TORCH_LIBRARY_FRAGMENT(QMB_LIBRARY(N_QUBYTES, PARTICLE_CUT), m) {`
`84`	`84`	`m.def("apply_within(Tensor configs_i, Tensor psi_i, Tensor configs_j, Tensor site, Tensor kind, Tensor coef) -> Tensor");`
`85`	`85`	`m.def("find_relative(Tensor configs_i, Tensor psi_i, int count_selected, Tensor site, Tensor kind, Tensor coef, Tensor configs_exclude) -> Tensor"`
`86`	`86`	`);`
	`87`	`+ m.def("single_relative(Tensor configs, Tensor site, Tensor kind, Tensor coef) -> Tensor");`
`87`	`88`	`}`
`88`	`89`	`#undef QMB_LIBRARY`
`89`	`90`	`#undef QMB_LIBRARY_HELPER`