add ligrec

Intron7 · Intron7 · commit d45d6bff2d09 · 2025-09-16T12:24:02.000+02:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -55,6 +55,7 @@ if (RSC_BUILD_EXTENSIONS)
   add_nb_cuda_module(_cooc_cuda         src/rapids_singlecell/_cuda/cooc/cooc.cu)
   add_nb_cuda_module(_aggr_cuda         src/rapids_singlecell/_cuda/aggr/aggr.cu)
   add_nb_cuda_module(_spca_cuda         src/rapids_singlecell/_cuda/spca/spca.cu)
+  add_nb_cuda_module(_ligrec_cuda       src/rapids_singlecell/_cuda/ligrec/ligrec.cu)
   # Harmony CUDA modules
   add_nb_cuda_module(_harmony_scatter_cuda   src/rapids_singlecell/_cuda/harmony/scatter/scatter.cu)
   add_nb_cuda_module(_harmony_outer_cuda     src/rapids_singlecell/_cuda/harmony/outer/outer.cu)
diff --git a/src/rapids_singlecell/_cuda/ligrec/kernels_ligrec.cuh b/src/rapids_singlecell/_cuda/ligrec/kernels_ligrec.cuh
@@ -0,0 +1,127 @@
+#pragma once
+
+#include <cuda_runtime.h>
+
+// Accumulates, per (column, cluster) pair, the sum and the number of strictly positive entries
+// of a row-major dense matrix.
+//
+// Launch layout: 2D grid, one thread per matrix element. The x dimension indexes rows (i) and
+// the y dimension indexes columns (j); threads outside num_rows x num_cols exit immediately.
+//
+//   data      : read at data[i * num_cols + j]
+//   clusters  : per-row cluster id, read at clusters[i]; not range-checked against n_cls
+//   sum_gt0   : positive values are atomically added at [j * n_cls + cluster]
+//   count_gt0 : atomically incremented at the same offset for every positive value
+//
+// The kernel only adds to the outputs, so callers presumably zero them first (TODO confirm).
+// NOTE: the double instantiation relies on atomicAdd(double*), which requires SM60+.
+template <typename T>
+__global__ void sum_and_count_dense_kernel(const T* __restrict__ data,
+                                           const int* __restrict__ clusters,
+                                           T* __restrict__ sum_gt0, int* __restrict__ count_gt0,
+                                           int num_rows, int num_cols, int n_cls) {
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+  const int j = blockIdx.y * blockDim.y + threadIdx.y;
+  if (i >= num_rows || j >= num_cols) return;
+  const int cluster = clusters[i];
+  // Widen before multiplying: num_rows * num_cols may exceed INT_MAX for large matrices, and a
+  // 32-bit product would wrap into an invalid offset.
+  const size_t in_idx = (size_t)i * (size_t)num_cols + (size_t)j;
+  const T value = data[in_idx];
+  if (value > (T)0) {
+    const size_t out_idx = (size_t)j * (size_t)n_cls + (size_t)cluster;
+    atomicAdd(&sum_gt0[out_idx], value);
+    atomicAdd(&count_gt0[out_idx], 1);
+  }
+}
+
+// Sparse counterpart of the dense sum/count kernel: one thread walks one row of a matrix given
+// as (indptr, index, data) and, for every stored value that is strictly positive, atomically
+// adds the value to sum_gt0 and 1 to count_gt0 at [index[k] * n_cls + clusters[row]].
+//
+// Launch layout: 1D grid over rows; threads with row >= nrows exit immediately.
+// NOTE: the double instantiation relies on atomicAdd(double*), which requires SM60+.
+template <typename T>
+__global__ void sum_and_count_sparse_kernel(const int* __restrict__ indptr,
+                                            const int* __restrict__ index,
+                                            const T* __restrict__ data,
+                                            const int* __restrict__ clusters,
+                                            T* __restrict__ sum_gt0, int* __restrict__ count_gt0,
+                                            int nrows, int n_cls) {
+  const int row = blockDim.x * blockIdx.x + threadIdx.x;
+  if (row >= nrows) return;
+  const int row_end = indptr[row + 1];
+  const int cls = clusters[row];
+  for (int nz = indptr[row]; nz < row_end; ++nz) {
+    const T v = data[nz];
+    // Written as a negated '>' so that NaN entries are skipped as well.
+    if (!(v > (T)0)) continue;
+    const int slot = index[nz] * n_cls + cls;
+    atomicAdd(&sum_gt0[slot], v);
+    atomicAdd(&count_gt0[slot], 1);
+  }
+}
+
+// Adds every entry of a row-major dense matrix into a per-(column, cluster) accumulator:
+// g_cluster[j * n_cls + clusters[i]] += data[i * num_cols + j].
+//
+// Launch layout: 2D grid, one thread per matrix element; x indexes rows (i), y indexes
+// columns (j). Threads outside num_rows x num_cols exit immediately. Unlike the sum/count
+// kernels, all values are accumulated regardless of sign. Cluster ids are not range-checked.
+// NOTE: the double instantiation relies on atomicAdd(double*), which requires SM60+.
+template <typename T>
+__global__ void mean_dense_kernel(const T* __restrict__ data, const int* __restrict__ clusters,
+                                  T* __restrict__ g_cluster, int num_rows, int num_cols,
+                                  int n_cls) {
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+  const int j = blockIdx.y * blockDim.y + threadIdx.y;
+  if (i >= num_rows || j >= num_cols) return;
+  // Widen before multiplying: num_rows * num_cols may exceed INT_MAX for large matrices, and a
+  // 32-bit product would wrap into an invalid offset.
+  const size_t in_idx = (size_t)i * (size_t)num_cols + (size_t)j;
+  const size_t out_idx = (size_t)j * (size_t)n_cls + (size_t)clusters[i];
+  atomicAdd(&g_cluster[out_idx], data[in_idx]);
+}
+
+// Sparse accumulation kernel: one thread walks one row of a matrix given as
+// (indptr, index, data) and atomically adds each strictly positive stored value to
+// sum_gt0[index[k] * n_cls + clusters[row]]. Non-positive stored values are skipped.
+//
+// Launch layout: 1D grid over rows; threads with row >= nrows exit immediately.
+// NOTE: the double instantiation relies on atomicAdd(double*), which requires SM60+.
+template <typename T>
+__global__ void mean_sparse_kernel(const int* __restrict__ indptr, const int* __restrict__ index,
+                                   const T* __restrict__ data, const int* __restrict__ clusters,
+                                   T* __restrict__ sum_gt0, int nrows, int n_cls) {
+  const int row = blockDim.x * blockIdx.x + threadIdx.x;
+  if (row >= nrows) return;
+  const int row_end = indptr[row + 1];
+  const int cls = clusters[row];
+  for (int nz = indptr[row]; nz < row_end; ++nz) {
+    const T v = data[nz];
+    // Written as a negated '>' so that NaN entries are skipped as well.
+    if (!(v > (T)0)) continue;
+    atomicAdd(&sum_gt0[index[nz] * n_cls + cls], v);
+  }
+}
+
+// Divides, in place, every entry of a num_genes x num_clusters row-major array by the
+// per-cluster value total_counts[j]. (Despite the name, the operation is a division.)
+//
+// Launch layout: 2D grid, one thread per entry; x indexes genes (i), y indexes clusters (j).
+// No check is made for total_counts[j] == 0.
+template <typename T>
+__global__ void elementwise_diff_kernel(T* __restrict__ g_cluster,
+                                        const T* __restrict__ total_counts, int num_genes,
+                                        int num_clusters) {
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+  const int j = blockIdx.y * blockDim.y + threadIdx.y;
+  const bool in_range = (i < num_genes) && (j < num_clusters);
+  if (!in_range) return;
+  const int slot = i * num_clusters + j;
+  g_cluster[slot] /= total_counts[j];
+}
+
+// Updates res[i * n_inter_clust + j] for every (interaction i, cluster pair j).
+//
+// interactions holds pairs (rec, lig) at [2*i], [2*i + 1]; interaction_clusters holds pairs
+// (c1, c2) at [2*j], [2*j + 1]. mean, mask and g are all indexed as [gene * n_cls + cluster].
+//
+// Per entry:
+//   - an entry that is already NaN is left untouched;
+//   - if either mean is not strictly positive, or either mask entry is false, the entry is
+//     set to NaN;
+//   - otherwise the entry is incremented by 1 when g[rec, c1] + g[lig, c2] exceeds
+//     mean[rec, c1] + mean[lig, c2], and by 0 when it does not.
+//
+// Launch layout: 2D grid, one thread per entry; x indexes interactions, y indexes cluster
+// pairs. Each thread writes only its own entry of res, so no atomics are used.
+template <typename T>
+__global__ void interaction_kernel(const int* __restrict__ interactions,
+                                   const int* __restrict__ interaction_clusters,
+                                   const T* __restrict__ mean, T* __restrict__ res,
+                                   const bool* __restrict__ mask, const T* __restrict__ g,
+                                   int n_iter, int n_inter_clust, int n_cls) {
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+  const int j = blockIdx.y * blockDim.y + threadIdx.y;
+  if (i >= n_iter || j >= n_inter_clust) return;
+
+  const int rec = interactions[i * 2];
+  const int lig = interactions[i * 2 + 1];
+  const int c1 = interaction_clusters[j * 2];
+  const int c2 = interaction_clusters[j * 2 + 1];
+  const int rec_slot = rec * n_cls + c1;
+  const int lig_slot = lig * n_cls + c2;
+  const T m1 = mean[rec_slot];
+  const T m2 = mean[lig_slot];
+
+  const int out = i * n_inter_clust + j;
+  if (isnan(res[out])) return;
+
+  // Short-circuit order matters: mask is only consulted once both means are positive.
+  const bool testable = m1 > (T)0 && m2 > (T)0 && mask[rec_slot] && mask[lig_slot];
+  if (!testable) {
+    res[out] = nan("");
+    return;
+  }
+
+  const T g_sum = g[rec_slot] + g[lig_slot];
+  res[out] += (g_sum > (m1 + m2));
+}
+
+// Writes res_mean[i * n_inter_clust + j] = (m1 + m2) / 2 for every (interaction i, cluster
+// pair j) whose two means are both strictly positive, where m1 = mean[rec * n_cls + c1] and
+// m2 = mean[lig * n_cls + c2]. Entries that fail the test are not written, so they keep
+// whatever value the caller placed there.
+//
+// interactions holds pairs (rec, lig) at [2*i], [2*i + 1]; interaction_clusters holds pairs
+// (c1, c2) at [2*j], [2*j + 1].
+//
+// Launch layout: 2D grid, one thread per entry; x indexes interactions, y indexes cluster
+// pairs.
+template <typename T>
+__global__ void res_mean_kernel(const int* __restrict__ interactions,
+                                const int* __restrict__ interaction_clusters,
+                                const T* __restrict__ mean, T* __restrict__ res_mean, int n_inter,
+                                int n_inter_clust, int n_cls) {
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+  const int j = blockIdx.y * blockDim.y + threadIdx.y;
+  if (i >= n_inter || j >= n_inter_clust) return;
+
+  const int rec = interactions[i * 2];
+  const int lig = interactions[i * 2 + 1];
+  const int c1 = interaction_clusters[j * 2];
+  const int c2 = interaction_clusters[j * 2 + 1];
+  const T m1 = mean[rec * n_cls + c1];
+  const T m2 = mean[lig * n_cls + c2];
+
+  const bool both_positive = m1 > (T)0 && m2 > (T)0;
+  if (!both_positive) return;
+  res_mean[i * n_inter_clust + j] = (m1 + m2) / (T)2;
+}
diff --git a/src/rapids_singlecell/_cuda/ligrec/ligrec.cu b/src/rapids_singlecell/_cuda/ligrec/ligrec.cu
@@ -0,0 +1,175 @@
+#include <cuda_runtime.h>
+#include <nanobind/nanobind.h>
+#include <cstdint>
+
+#include "kernels_ligrec.cuh"
+
+namespace nb = nanobind;
+
+// Host-side launcher for sum_and_count_dense_kernel<T>.
+//
+// The uintptr_t arguments are raw addresses that are reinterpreted as typed pointers and
+// passed to the kernel unchanged; presumably they are device pointers supplied by the Python
+// caller (TODO confirm). Uses 32x32 thread blocks, with grid.x covering rows and grid.y
+// covering cols. No stream argument is given to the launch, and the launch status is not
+// checked here.
+template <typename T>
+static inline void launch_sum_count_dense(std::uintptr_t data, std::uintptr_t clusters,
+                                          std::uintptr_t sum, std::uintptr_t count, int rows,
+                                          int cols, int ncls) {
+  // Nothing to accumulate for an empty matrix. A grid dimension of 0 is an invalid launch
+  // configuration, and a negative extent would wrap in the unsigned grid-size arithmetic.
+  if (rows <= 0 || cols <= 0) return;
+  dim3 block(32, 32);
+  dim3 grid((rows + block.x - 1) / block.x, (cols + block.y - 1) / block.y);
+  sum_and_count_dense_kernel<T>
+      <<<grid, block>>>(reinterpret_cast<const T*>(data), reinterpret_cast<const int*>(clusters),
+                        reinterpret_cast<T*>(sum), reinterpret_cast<int*>(count), rows, cols, ncls);
+}
+
+// Host-side launcher for sum_and_count_sparse_kernel<T>.
+//
+// The uintptr_t arguments are raw addresses that are reinterpreted as typed pointers and
+// passed to the kernel unchanged; presumably they are device pointers supplied by the Python
+// caller (TODO confirm). Uses 1D blocks of 32 threads, one thread per row. No stream argument
+// is given to the launch, and the launch status is not checked here.
+template <typename T>
+static inline void launch_sum_count_sparse(std::uintptr_t indptr, std::uintptr_t index,
+                                           std::uintptr_t data, std::uintptr_t clusters,
+                                           std::uintptr_t sum, std::uintptr_t count, int rows,
+                                           int ncls) {
+  // Nothing to accumulate without rows. A grid dimension of 0 is an invalid launch
+  // configuration, and a negative extent would wrap in the unsigned grid-size arithmetic.
+  if (rows <= 0) return;
+  dim3 block(32);
+  dim3 grid((rows + block.x - 1) / block.x);
+  sum_and_count_sparse_kernel<T>
+      <<<grid, block>>>(reinterpret_cast<const int*>(indptr), reinterpret_cast<const int*>(index),
+                        reinterpret_cast<const T*>(data), reinterpret_cast<const int*>(clusters),
+                        reinterpret_cast<T*>(sum), reinterpret_cast<int*>(count), rows, ncls);
+}
+
+// Host-side launcher for mean_dense_kernel<T>.
+//
+// The uintptr_t arguments are raw addresses that are reinterpreted as typed pointers and
+// passed to the kernel unchanged; presumably they are device pointers supplied by the Python
+// caller (TODO confirm). Uses 32x32 thread blocks, with grid.x covering rows and grid.y
+// covering cols. No stream argument is given to the launch, and the launch status is not
+// checked here.
+template <typename T>
+static inline void launch_mean_dense(std::uintptr_t data, std::uintptr_t clusters, std::uintptr_t g,
+                                     int rows, int cols, int ncls) {
+  // Nothing to accumulate for an empty matrix. A grid dimension of 0 is an invalid launch
+  // configuration, and a negative extent would wrap in the unsigned grid-size arithmetic.
+  if (rows <= 0 || cols <= 0) return;
+  dim3 block(32, 32);
+  dim3 grid((rows + block.x - 1) / block.x, (cols + block.y - 1) / block.y);
+  mean_dense_kernel<T><<<grid, block>>>(reinterpret_cast<const T*>(data),
+                                        reinterpret_cast<const int*>(clusters),
+                                        reinterpret_cast<T*>(g), rows, cols, ncls);
+}
+
+// Host-side launcher for mean_sparse_kernel<T>.
+//
+// The uintptr_t arguments are raw addresses that are reinterpreted as typed pointers and
+// passed to the kernel unchanged; presumably they are device pointers supplied by the Python
+// caller (TODO confirm). Uses 1D blocks of 32 threads, one thread per row. No stream argument
+// is given to the launch, and the launch status is not checked here.
+template <typename T>
+static inline void launch_mean_sparse(std::uintptr_t indptr, std::uintptr_t index,
+                                      std::uintptr_t data, std::uintptr_t clusters,
+                                      std::uintptr_t g, int rows, int ncls) {
+  // Nothing to accumulate without rows. A grid dimension of 0 is an invalid launch
+  // configuration, and a negative extent would wrap in the unsigned grid-size arithmetic.
+  if (rows <= 0) return;
+  dim3 block(32);
+  dim3 grid((rows + block.x - 1) / block.x);
+  mean_sparse_kernel<T>
+      <<<grid, block>>>(reinterpret_cast<const int*>(indptr), reinterpret_cast<const int*>(index),
+                        reinterpret_cast<const T*>(data), reinterpret_cast<const int*>(clusters),
+                        reinterpret_cast<T*>(g), rows, ncls);
+}
+
+// Host-side launcher for elementwise_diff_kernel<T>.
+//
+// The uintptr_t arguments are raw addresses that are reinterpreted as typed pointers and
+// passed to the kernel unchanged; presumably they are device pointers supplied by the Python
+// caller (TODO confirm). Uses 32x32 thread blocks, with grid.x covering n_genes and grid.y
+// covering n_clusters. No stream argument is given to the launch, and the launch status is
+// not checked here.
+template <typename T>
+static inline void launch_elementwise_diff(std::uintptr_t g, std::uintptr_t total_counts,
+                                           int n_genes, int n_clusters) {
+  // Nothing to divide for an empty array. A grid dimension of 0 is an invalid launch
+  // configuration, and a negative extent would wrap in the unsigned grid-size arithmetic.
+  if (n_genes <= 0 || n_clusters <= 0) return;
+  dim3 block(32, 32);
+  dim3 grid((n_genes + block.x - 1) / block.x, (n_clusters + block.y - 1) / block.y);
+  elementwise_diff_kernel<T><<<grid, block>>>(
+      reinterpret_cast<T*>(g), reinterpret_cast<const T*>(total_counts), n_genes, n_clusters);
+}
+
+// Host-side launcher for interaction_kernel<T>.
+//
+// The uintptr_t arguments are raw addresses that are reinterpreted as typed pointers (mask as
+// const bool*, the index arrays as const int*) and passed to the kernel unchanged; presumably
+// they are device pointers supplied by the Python caller (TODO confirm). Uses 32x32 thread
+// blocks, with grid.x covering n_iter and grid.y covering n_inter_clust. No stream argument is
+// given to the launch, and the launch status is not checked here.
+template <typename T>
+static inline void launch_interaction(std::uintptr_t interactions,
+                                      std::uintptr_t interaction_clusters, std::uintptr_t mean,
+                                      std::uintptr_t res, std::uintptr_t mask, std::uintptr_t g,
+                                      int n_iter, int n_inter_clust, int ncls) {
+  // Nothing to update for an empty result. A grid dimension of 0 is an invalid launch
+  // configuration, and a negative extent would wrap in the unsigned grid-size arithmetic.
+  if (n_iter <= 0 || n_inter_clust <= 0) return;
+  dim3 block(32, 32);
+  dim3 grid((n_iter + block.x - 1) / block.x, (n_inter_clust + block.y - 1) / block.y);
+  interaction_kernel<T><<<grid, block>>>(
+      reinterpret_cast<const int*>(interactions),
+      reinterpret_cast<const int*>(interaction_clusters), reinterpret_cast<const T*>(mean),
+      reinterpret_cast<T*>(res), reinterpret_cast<const bool*>(mask), reinterpret_cast<const T*>(g),
+      n_iter, n_inter_clust, ncls);
+}
+
+// Host-side launcher for res_mean_kernel<T>.
+//
+// The uintptr_t arguments are raw addresses that are reinterpreted as typed pointers and
+// passed to the kernel unchanged; presumably they are device pointers supplied by the Python
+// caller (TODO confirm). Uses 32x32 thread blocks, with grid.x covering n_inter and grid.y
+// covering n_inter_clust. No stream argument is given to the launch, and the launch status is
+// not checked here.
+template <typename T>
+static inline void launch_res_mean(std::uintptr_t interactions, std::uintptr_t interaction_clusters,
+                                   std::uintptr_t mean, std::uintptr_t res_mean, int n_inter,
+                                   int n_inter_clust, int ncls) {
+  // Nothing to write for an empty result. A grid dimension of 0 is an invalid launch
+  // configuration, and a negative extent would wrap in the unsigned grid-size arithmetic.
+  if (n_inter <= 0 || n_inter_clust <= 0) return;
+  dim3 block(32, 32);
+  dim3 grid((n_inter + block.x - 1) / block.x, (n_inter_clust + block.y - 1) / block.y);
+  res_mean_kernel<T><<<grid, block>>>(reinterpret_cast<const int*>(interactions),
+                                      reinterpret_cast<const int*>(interaction_clusters),
+                                      reinterpret_cast<const T*>(mean),
+                                      reinterpret_cast<T*>(res_mean), n_inter, n_inter_clust, ncls);
+}
+
+// Python module `_ligrec_cuda`: exposes one function per kernel launcher.
+//
+// Every function takes its buffers as integer addresses (std::uintptr_t) followed by the
+// integer extents, and a trailing `itemsize` that selects the floating-point instantiation:
+// 4 dispatches to float, 8 to double, and any other value raises a ValueError.
+NB_MODULE(_ligrec_cuda, m) {
+  m.def("sum_count_dense", [](std::uintptr_t data, std::uintptr_t clusters, std::uintptr_t sum,
+                              std::uintptr_t count, int rows, int cols, int ncls, int itemsize) {
+    switch (itemsize) {
+      case 4:
+        launch_sum_count_dense<float>(data, clusters, sum, count, rows, cols, ncls);
+        break;
+      case 8:
+        launch_sum_count_dense<double>(data, clusters, sum, count, rows, cols, ncls);
+        break;
+      default:
+        throw nb::value_error("Unsupported itemsize (expected 4 or 8)");
+    }
+  });
+
+  m.def("sum_count_sparse", [](std::uintptr_t indptr, std::uintptr_t index, std::uintptr_t data,
+                               std::uintptr_t clusters, std::uintptr_t sum, std::uintptr_t count,
+                               int rows, int ncls, int itemsize) {
+    switch (itemsize) {
+      case 4:
+        launch_sum_count_sparse<float>(indptr, index, data, clusters, sum, count, rows, ncls);
+        break;
+      case 8:
+        launch_sum_count_sparse<double>(indptr, index, data, clusters, sum, count, rows, ncls);
+        break;
+      default:
+        throw nb::value_error("Unsupported itemsize (expected 4 or 8)");
+    }
+  });
+
+  m.def("mean_dense", [](std::uintptr_t data, std::uintptr_t clusters, std::uintptr_t g, int rows,
+                         int cols, int ncls, int itemsize) {
+    switch (itemsize) {
+      case 4:
+        launch_mean_dense<float>(data, clusters, g, rows, cols, ncls);
+        break;
+      case 8:
+        launch_mean_dense<double>(data, clusters, g, rows, cols, ncls);
+        break;
+      default:
+        throw nb::value_error("Unsupported itemsize (expected 4 or 8)");
+    }
+  });
+
+  m.def("mean_sparse",
+        [](std::uintptr_t indptr, std::uintptr_t index, std::uintptr_t data,
+           std::uintptr_t clusters, std::uintptr_t g, int rows, int ncls, int itemsize) {
+          switch (itemsize) {
+            case 4:
+              launch_mean_sparse<float>(indptr, index, data, clusters, g, rows, ncls);
+              break;
+            case 8:
+              launch_mean_sparse<double>(indptr, index, data, clusters, g, rows, ncls);
+              break;
+            default:
+              throw nb::value_error("Unsupported itemsize (expected 4 or 8)");
+          }
+        });
+
+  m.def("elementwise_diff", [](std::uintptr_t g, std::uintptr_t total_counts, int n_genes,
+                               int n_clusters, int itemsize) {
+    switch (itemsize) {
+      case 4:
+        launch_elementwise_diff<float>(g, total_counts, n_genes, n_clusters);
+        break;
+      case 8:
+        launch_elementwise_diff<double>(g, total_counts, n_genes, n_clusters);
+        break;
+      default:
+        throw nb::value_error("Unsupported itemsize (expected 4 or 8)");
+    }
+  });
+
+  m.def("interaction", [](std::uintptr_t interactions, std::uintptr_t interaction_clusters,
+                          std::uintptr_t mean, std::uintptr_t res, std::uintptr_t mask,
+                          std::uintptr_t g, int n_iter, int n_inter_clust, int ncls, int itemsize) {
+    switch (itemsize) {
+      case 4:
+        launch_interaction<float>(interactions, interaction_clusters, mean, res, mask, g, n_iter,
+                                  n_inter_clust, ncls);
+        break;
+      case 8:
+        launch_interaction<double>(interactions, interaction_clusters, mean, res, mask, g, n_iter,
+                                   n_inter_clust, ncls);
+        break;
+      default:
+        throw nb::value_error("Unsupported itemsize (expected 4 or 8)");
+    }
+  });
+
+  m.def("res_mean",
+        [](std::uintptr_t interactions, std::uintptr_t interaction_clusters, std::uintptr_t mean,
+           std::uintptr_t res_mean, int n_inter, int n_inter_clust, int ncls, int itemsize) {
+          switch (itemsize) {
+            case 4:
+              launch_res_mean<float>(interactions, interaction_clusters, mean, res_mean, n_inter,
+                                     n_inter_clust, ncls);
+              break;
+            case 8:
+              launch_res_mean<double>(interactions, interaction_clusters, mean, res_mean, n_inter,
+                                      n_inter_clust, ncls);
+              break;
+            default:
+              throw nb::value_error("Unsupported itemsize (expected 4 or 8)");
+          }
+        });
+}
diff --git a/src/rapids_singlecell/squidpy_gpu/_ligrec.py b/src/rapids_singlecell/squidpy_gpu/_ligrec.py