Skip to content

Commit 30414ab

Browse files
committed
add harmony
1 parent 5d327bd commit 30414ab

File tree

23 files changed

+761
-775
lines changed

23 files changed

+761
-775
lines changed

CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,4 +55,11 @@ if (RSC_BUILD_EXTENSIONS)
# nanobind + CUDA extension modules (only built when RSC_BUILD_EXTENSIONS is on).
add_nb_cuda_module(_cooc_cuda src/rapids_singlecell/_cuda/cooc/cooc.cu)
add_nb_cuda_module(_aggr_cuda src/rapids_singlecell/_cuda/aggr/aggr.cu)
add_nb_cuda_module(_spca_cuda src/rapids_singlecell/_cuda/spca/spca.cu)
# Harmony CUDA modules
add_nb_cuda_module(_harmony_scatter_cuda src/rapids_singlecell/_cuda/harmony/scatter/scatter.cu)
add_nb_cuda_module(_harmony_outer_cuda src/rapids_singlecell/_cuda/harmony/outer/outer.cu)
add_nb_cuda_module(_harmony_colsum_cuda src/rapids_singlecell/_cuda/harmony/colsum/colsum.cu)
add_nb_cuda_module(_harmony_kmeans_cuda src/rapids_singlecell/_cuda/harmony/kmeans/kmeans.cu)
add_nb_cuda_module(_harmony_normalize_cuda src/rapids_singlecell/_cuda/harmony/normalize/normalize.cu)
add_nb_cuda_module(_harmony_pen_cuda src/rapids_singlecell/_cuda/harmony/pen/pen.cu)
endif()
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
#include <cuda_runtime.h>
2+
#include <nanobind/nanobind.h>
3+
#include <cstdint>
4+
5+
#include "kernels_colsum.cuh"
6+
7+
namespace nb = nanobind;
8+
9+
// Launch colsum_kernel<T>: one 32-thread block per column of the row-major
// (rows x cols) matrix at device pointer A; out (length cols) is overwritten.
template <typename T>
static inline void launch_colsum(std::uintptr_t A, std::uintptr_t out, std::size_t rows,
                                 std::size_t cols) {
  if (cols == 0) return;  // a 0-block launch is an invalid configuration
  int threads = 32;
  // Clamp the grid: the kernel grid-strides over columns, so fewer blocks
  // than columns is still correct. The old bare (int)cols cast could
  // overflow to a negative block count for very wide matrices.
  const std::size_t max_blocks = 2147483647u;  // max gridDim.x (2^31 - 1)
  int blocks = (int)(cols < max_blocks ? cols : max_blocks);
  colsum_kernel<T>
      <<<blocks, threads>>>(reinterpret_cast<const T*>(A), reinterpret_cast<T*>(out), rows, cols);
  // Surface launch-configuration errors instead of failing silently.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) throw nb::value_error(cudaGetErrorString(err));
}
17+
18+
// Launch colsum_atomic_kernel<T>: one 32x32 block per tile of A.
// out (length cols) must be zero-initialized by the caller — the kernel
// accumulates into it with atomicAdd.
template <typename T>
static inline void launch_colsum_atomic(std::uintptr_t A, std::uintptr_t out, std::size_t rows,
                                        std::size_t cols) {
  if (rows == 0 || cols == 0) return;  // avoid an invalid 0-block launch
  // Compute the tile grid in 64-bit: the previous int arithmetic could
  // overflow for large matrices.
  std::size_t tile_rows = (rows + 31) / 32;
  std::size_t tile_cols = (cols + 31) / 32;
  std::size_t total_tiles = tile_rows * tile_cols;
  // The kernel does not grid-stride over tiles, so the grid must cover them all.
  if (total_tiles > 2147483647u) throw nb::value_error("matrix too large for colsum_atomic");
  int blocks = (int)total_tiles;
  dim3 threads(32, 32);
  colsum_atomic_kernel<T>
      <<<blocks, threads>>>(reinterpret_cast<const T*>(A), reinterpret_cast<T*>(out), rows, cols);
  // Surface launch-configuration errors instead of failing silently.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) throw nb::value_error(cudaGetErrorString(err));
}
28+
29+
NB_MODULE(_harmony_colsum_cuda, m) {
  // dtype_code: 0=float32, 1=float64, 2=int32. Legacy itemsize codes 4 and 8
  // are still accepted for float32/float64 respectively.
  m.def("colsum", [](std::uintptr_t A, std::uintptr_t out, std::size_t rows, std::size_t cols,
                     int dtype_code) {
    switch (dtype_code) {
      case 0:
      case 4:
        launch_colsum<float>(A, out, rows, cols);
        break;
      case 1:
      case 8:
        launch_colsum<double>(A, out, rows, cols);
        break;
      case 2:
        launch_colsum<int>(A, out, rows, cols);
        break;
      default:
        throw nb::value_error("Unsupported dtype_code (expected 0/1/2 or 4/8)");
    }
  });

  m.def("colsum_atomic", [](std::uintptr_t A, std::uintptr_t out, std::size_t rows,
                            std::size_t cols, int dtype_code) {
    switch (dtype_code) {
      case 0:
      case 4:
        launch_colsum_atomic<float>(A, out, rows, cols);
        break;
      case 1:
      case 8:
        launch_colsum_atomic<double>(A, out, rows, cols);
        break;
      case 2:
        launch_colsum_atomic<int>(A, out, rows, cols);
        break;
      default:
        throw nb::value_error("Unsupported dtype_code (expected 0/1/2 or 4/8)");
    }
  });
}
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#pragma once
2+
3+
#include <cuda_runtime.h>
4+
5+
// Column-sum of a row-major (rows x cols) matrix: out[c] = sum_i A[i*cols + c].
// One block per column (grid-stride over columns when gridDim.x < cols);
// block-wide reduction via warp shuffle plus a shared-memory combine.
// Preconditions: blockDim.x is a multiple of 32 (<= 1024); out has `cols`
// elements and is overwritten (not accumulated into).
template <typename T>
__global__ void colsum_kernel(const T* __restrict__ A, T* __restrict__ out, std::size_t rows,
                              std::size_t cols) {
  std::size_t tid = threadIdx.x;
  for (std::size_t col = blockIdx.x; col < cols; col += gridDim.x) {
    // Each thread accumulates a strided slice of this column.
    T acc = (T)0;
    for (std::size_t i = tid; i < rows; i += blockDim.x) {
      acc += A[i * cols + col];
    }
    // Intra-warp tree reduction.
    for (int offset = 16; offset > 0; offset >>= 1)
      acc += __shfl_down_sync(0xffffffff, acc, offset);
    // One partial per warp; the first warp combines them.
    __shared__ T s[32];
    if ((threadIdx.x & 31) == 0) s[threadIdx.x >> 5] = acc;
    __syncthreads();
    if (threadIdx.x < 32) {
      T val = (threadIdx.x < (blockDim.x >> 5)) ? s[threadIdx.x] : (T)0;
      for (int off = 16; off > 0; off >>= 1) val += __shfl_down_sync(0xffffffff, val, off);
      if (threadIdx.x == 0) out[col] = val;
    }
    // Bug fix: barrier before the next column iteration. Without it, warps
    // that race ahead can overwrite s[] while the first warp is still reading
    // it (manifests when gridDim.x < cols and blockDim.x > 32). The loop trip
    // count is uniform across the block, so this barrier is safe.
    __syncthreads();
  }
}
26+
27+
// Column-sum via 32x32 tiles: each block reduces one tile of the row-major
// (rows x cols) matrix A and atomically adds its per-column partials into out.
// Preconditions: launched with blockDim == (32, 32); out (length cols) must be
// zero-initialized by the caller, since results are accumulated with atomicAdd;
// atomicAdd on double requires SM60+.
template <typename T>
__global__ void colsum_atomic_kernel(const T* __restrict__ A, T* __restrict__ out, std::size_t rows,
                                     std::size_t cols) {
  std::size_t tile_cols = (cols + 31) / 32;
  // Map the linear block index to a (tile row, tile column) pair.
  std::size_t tid = blockIdx.x;
  std::size_t tile_r = tid / tile_cols;
  std::size_t tile_c = tid % tile_cols;
  // threadIdx.x walks rows, threadIdx.y walks columns, so each warp
  // (fixed threadIdx.y) covers the 32 rows of a single column.
  std::size_t row = tile_r * 32 + threadIdx.x;
  std::size_t col = tile_c * 32 + threadIdx.y;
  T v = (T)0;
  if (row < rows && col < cols) v = A[row * cols + col];
  // Warp shuffle reduction over the 32 row-elements of this column tile.
  for (int off = 16; off > 0; off >>= 1) v += __shfl_down_sync(0xffffffff, v, off);
  // Lane 0 of each warp publishes the tile's partial for its column.
  if (threadIdx.x == 0 && col < cols) atomicAdd(&out[col], v);
}
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#pragma once
2+
3+
#include <cuda_runtime.h>
4+
#include <type_traits>
5+
6+
// Accumulates the k-means error term sum_i r[i] * 2 * (1 - dot[i]) into *out,
// one atomicAdd per block.
// Preconditions: r and dot are at least 16-byte (float) / 32-byte (double)
// aligned (vectorized 4-wide loads below; cudaMalloc's 256-byte alignment
// satisfies this); *out must be zero-initialized by the caller; blockDim.x is
// a multiple of 32; atomicAdd(double*) requires SM60+.
template <typename T>
__global__ void kmeans_err_kernel(const T* __restrict__ r, const T* __restrict__ dot, std::size_t n,
                                  T* __restrict__ out) {
  T acc = (T)0;
  // 4-wide vector type matching T: float4 for float, double4 for double.
  using Vec = typename std::conditional<std::is_same<T, float>::value, float4, double4>::type;

  // Grid-stride loop over aligned 4-element chunks; every thread's i stays a
  // multiple of 4 throughout.
  std::size_t i = (blockIdx.x * blockDim.x + threadIdx.x) * 4;
  const std::size_t stride = gridDim.x * blockDim.x * 4;

  while (i + 3 < n) {
    Vec r4 = *(const Vec*)(r + i);
    Vec dot4 = *(const Vec*)(dot + i);
    acc += ((T*)&r4)[0] * (T)2 * ((T)1 - ((T*)&dot4)[0]);
    acc += ((T*)&r4)[1] * (T)2 * ((T)1 - ((T*)&dot4)[1]);
    acc += ((T*)&r4)[2] * (T)2 * ((T)1 - ((T*)&dot4)[2]);
    acc += ((T*)&r4)[3] * (T)2 * ((T)1 - ((T*)&dot4)[3]);
    i += stride;
  }
  // Scalar tail (up to 3 elements when n is not a multiple of 4). Exactly one
  // thread exits the chunked loop with i == (n/4)*4 — thread positions are
  // distinct multiples of 4 and the tail window is narrower than 4 — so no
  // element is processed twice.
  while (i < n) {
    T rv = r[i];
    T dotv = dot[i];
    acc += rv * (T)2 * ((T)1 - dotv);
    i++;
  }

  // Block reduction: intra-warp shuffle, one partial per warp in s[], then the
  // first warp combines the partials and lane 0 publishes via atomicAdd.
  for (int offset = 16; offset > 0; offset >>= 1) acc += __shfl_down_sync(0xffffffff, acc, offset);
  __shared__ T s[32];
  if ((threadIdx.x & 31) == 0) s[threadIdx.x >> 5] = acc;
  __syncthreads();
  if (threadIdx.x < 32) {
    T val = (threadIdx.x < (blockDim.x >> 5)) ? s[threadIdx.x] : (T)0;
    for (int offset = 16; offset > 0; offset >>= 1)
      val += __shfl_down_sync(0xffffffff, val, offset);
    if (threadIdx.x == 0) atomicAdd(out, val);
  }
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#include <cuda_runtime.h>
2+
#include <nanobind/nanobind.h>
3+
#include <cstdint>
4+
5+
#include "kernels_kmeans.cuh"
6+
7+
namespace nb = nanobind;
8+
9+
// Launch kmeans_err_kernel<T> over n elements. *out must be zero-initialized
// by the caller — the kernel accumulates into it with atomicAdd.
template <typename T>
static inline void launch_kmeans_err(std::uintptr_t r, std::uintptr_t dot, std::size_t n,
                                     std::uintptr_t out) {
  if (n == 0) return;  // avoid an invalid 0-block launch
  int threads = 256;
  // Cap the grid at 1024 blocks; the kernel grid-strides, so fewer blocks
  // than chunks is still correct. Clamp in size_t first — the old
  // (int)((n + threads - 1) / threads) could overflow int for huge n and
  // slip a negative block count past min().
  std::size_t want = (n + (std::size_t)threads - 1) / (std::size_t)threads;
  int blocks = (int)(want < (std::size_t)(8 * 128) ? want : (std::size_t)(8 * 128));
  kmeans_err_kernel<T><<<blocks, threads>>>(
      reinterpret_cast<const T*>(r), reinterpret_cast<const T*>(dot), n, reinterpret_cast<T*>(out));
  // Surface launch-configuration errors instead of failing silently.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) throw nb::value_error(cudaGetErrorString(err));
}
17+
18+
NB_MODULE(_harmony_kmeans_cuda, m) {
  // itemsize selects precision: 4 -> float32, 8 -> float64.
  m.def("kmeans_err",
        [](std::uintptr_t r, std::uintptr_t dot, std::size_t n, std::uintptr_t out, int itemsize) {
          switch (itemsize) {
            case 4:
              launch_kmeans_err<float>(r, dot, n, out);
              break;
            case 8:
              launch_kmeans_err<double>(r, dot, n, out);
              break;
            default:
              throw nb::value_error("Unsupported itemsize (expected 4 or 8)");
          }
        });
}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#pragma once
2+
3+
#include <cuda_runtime.h>
4+
5+
// In-place L1 row normalization of the row-major (rows x cols) matrix X:
//   X[r, :] /= max(sum_c |X[r, c]|, 1e-12)
// Precondition: launched with exactly one warp per block (blockDim.x == 32)
// and one block per row — shared[] has 32 slots and the shuffle reduction
// below assumes a single warp.
template <typename T>
__global__ void normalize_kernel_optimized(T* X, long long rows, long long cols) {
  __shared__ T shared[32];
  long long row = blockIdx.x;
  long long tid = threadIdx.x;
  if (row >= rows) return;
  // Each thread accumulates the absolute values of a strided slice of the row.
  T norm = (T)0;
  for (long long col = tid; col < cols; col += blockDim.x) {
    T v = X[row * cols + col];
    norm += (v < 0 ? -v : v);
  }
  shared[tid] = norm;
  __syncthreads();
  // Single-warp tree reduction through shared memory; each __shfl_down_sync
  // also synchronizes the warp, so lane i always shuffles in lane i+offset's
  // already-updated partial.
  for (long long offset = 16; offset > 0; offset /= 2) {
    shared[tid] += __shfl_down_sync(0xFFFFFFFF, shared[tid], offset);
  }
  __syncthreads();
  if (tid == 0) {
    // Clamp to 1e-12 so all-zero rows do not divide by zero, then store the
    // reciprocal so the scaling pass below is a multiply.
    T final_norm = shared[0];
    final_norm = final_norm < (T)1e-12 ? (T)1e-12 : final_norm;
    shared[0] = (T)1 / final_norm;
  }
  __syncthreads();
  for (long long col = tid; col < cols; col += blockDim.x) {
    X[row * cols + col] *= shared[0];
  }
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#include <cuda_runtime.h>
2+
#include <nanobind/nanobind.h>
3+
#include <cstdint>
4+
5+
#include "kernels_normalize.cuh"
6+
7+
namespace nb = nanobind;
8+
9+
// Launch normalize_kernel_optimized<T>: one 32-thread block per row, L1-
// normalizing each row of the row-major (rows x cols) matrix X in place.
template <typename T>
static inline void launch_normalize(std::uintptr_t X, long long rows, long long cols) {
  // Guard: rows <= 0 would previously produce an invalid grid (0 blocks, or a
  // huge one via negative -> unsigned conversion); cols <= 0 is a no-op anyway.
  if (rows <= 0 || cols <= 0) return;
  dim3 block(32);
  dim3 grid((unsigned)rows);
  normalize_kernel_optimized<T><<<grid, block>>>(reinterpret_cast<T*>(X), rows, cols);
  // Surface launch-configuration errors instead of failing silently.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) throw nb::value_error(cudaGetErrorString(err));
}
15+
16+
NB_MODULE(_harmony_normalize_cuda, m) {
  // itemsize selects precision: 4 -> float32, 8 -> float64.
  m.def("normalize", [](std::uintptr_t X, long long rows, long long cols, int itemsize) {
    switch (itemsize) {
      case 4:
        launch_normalize<float>(X, rows, cols);
        break;
      case 8:
        launch_normalize<double>(X, rows, cols);
        break;
      default:
        throw nb::value_error("Unsupported itemsize (expected 4 or 8)");
    }
  });
}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#pragma once
2+
3+
#include <cuda_runtime.h>
4+
5+
// Rank-one update of the row-major (n_cats x n_pcs) matrix E:
//   switcher == 0:  E -= outer(Pr_b, R_sum)
//   otherwise:      E += outer(Pr_b, R_sum)
// One thread per element, 1D launch with a bounds guard.
template <typename T>
__global__ void outer_kernel(T* __restrict__ E, const T* __restrict__ Pr_b,
                             const T* __restrict__ R_sum, long long n_cats, long long n_pcs,
                             long long switcher) {
  const long long idx = blockIdx.x * blockDim.x + threadIdx.x;
  const long long total = n_cats * n_pcs;
  if (idx >= total) return;
  const long long cat = idx / n_pcs;
  const long long pc = idx % n_pcs;
  const T update = Pr_b[cat] * R_sum[pc];
  if (switcher == 0)
    E[idx] -= update;
  else
    E[idx] += update;
}
19+
20+
// Subtracts the per-cell batch correction from the row-major
// (n_cells x n_pcs) matrix Z:
//   Z[cell, pc] -= W[cats[cell] + 1, pc] * R[cell]
// One thread per element, 1D launch with a bounds guard.
template <typename T>
__global__ void harmony_correction_kernel(T* __restrict__ Z, const T* __restrict__ W,
                                          const int* __restrict__ cats, const T* __restrict__ R,
                                          long long n_cells, long long n_pcs) {
  const long long idx = blockIdx.x * blockDim.x + threadIdx.x;
  const long long total = n_cells * n_pcs;
  if (idx >= total) return;
  const long long cell = idx / n_pcs;
  const long long pc = idx % n_pcs;
  const int cat = cats[cell];
  // NOTE(review): W is indexed at cat + 1, skipping its first row —
  // presumably an intercept row; confirm against the caller.
  Z[idx] -= W[(cat + 1) * n_pcs + pc] * R[cell];
}
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#include <cuda_runtime.h>
2+
#include <nanobind/nanobind.h>
3+
#include <cstdint>
4+
5+
#include "kernels_outer.cuh"
6+
7+
namespace nb = nanobind;
8+
9+
// Launch outer_kernel<T> over the n_cats x n_pcs matrix E with a 1D grid of
// 256-thread blocks.
template <typename T>
static inline void launch_outer(std::uintptr_t E, std::uintptr_t Pr_b, std::uintptr_t R_sum,
                                long long n_cats, long long n_pcs, long long switcher) {
  long long N = n_cats * n_pcs;
  if (N <= 0) return;  // avoid an invalid 0-block launch for empty inputs
  dim3 block(256);
  dim3 grid((unsigned)((N + block.x - 1) / block.x));
  outer_kernel<T><<<grid, block>>>(reinterpret_cast<T*>(E), reinterpret_cast<const T*>(Pr_b),
                                   reinterpret_cast<const T*>(R_sum), n_cats, n_pcs, switcher);
  // Surface launch-configuration errors instead of failing silently.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) throw nb::value_error(cudaGetErrorString(err));
}
18+
19+
// Launch harmony_correction_kernel<T> over the n_cells x n_pcs matrix Z with
// a 1D grid of 256-thread blocks. cats must hold one int32 category per cell.
template <typename T>
static inline void launch_harmony_corr(std::uintptr_t Z, std::uintptr_t W, std::uintptr_t cats,
                                       std::uintptr_t R, long long n_cells, long long n_pcs) {
  long long N = n_cells * n_pcs;
  if (N <= 0) return;  // avoid an invalid 0-block launch for empty inputs
  dim3 block(256);
  dim3 grid((unsigned)((N + block.x - 1) / block.x));
  harmony_correction_kernel<T><<<grid, block>>>(
      reinterpret_cast<T*>(Z), reinterpret_cast<const T*>(W), reinterpret_cast<const int*>(cats),
      reinterpret_cast<const T*>(R), n_cells, n_pcs);
  // Surface launch-configuration errors instead of failing silently.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) throw nb::value_error(cudaGetErrorString(err));
}
29+
30+
NB_MODULE(_harmony_outer_cuda, m) {
  // itemsize selects precision for both bindings: 4 -> float32, 8 -> float64.
  m.def("outer", [](std::uintptr_t E, std::uintptr_t Pr_b, std::uintptr_t R_sum, long long n_cats,
                    long long n_pcs, long long switcher, int itemsize) {
    switch (itemsize) {
      case 4:
        launch_outer<float>(E, Pr_b, R_sum, n_cats, n_pcs, switcher);
        break;
      case 8:
        launch_outer<double>(E, Pr_b, R_sum, n_cats, n_pcs, switcher);
        break;
      default:
        throw nb::value_error("Unsupported itemsize (expected 4 or 8)");
    }
  });

  m.def("harmony_corr", [](std::uintptr_t Z, std::uintptr_t W, std::uintptr_t cats,
                           std::uintptr_t R, long long n_cells, long long n_pcs, int itemsize) {
    switch (itemsize) {
      case 4:
        launch_harmony_corr<float>(Z, W, cats, R, n_cells, n_pcs);
        break;
      case 8:
        launch_harmony_corr<double>(Z, W, cats, R, n_cells, n_pcs);
        break;
      default:
        throw nb::value_error("Unsupported itemsize (expected 4 or 8)");
    }
  });
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#pragma once
2+
3+
#include <cuda_runtime.h>
4+
5+
// Elementwise penalty scaling of the row-major (n_rows x n_cols) matrix R by
// a per-category factor: R[row, col] *= penalty[cats[row], col].
// One thread per element, 1D launch with a bounds guard.
template <typename T>
__global__ void pen_kernel(T* __restrict__ R, const T* __restrict__ penalty,
                           const int* __restrict__ cats, std::size_t n_rows, std::size_t n_cols) {
  const std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  const std::size_t total = n_rows * n_cols;
  if (idx >= total) return;
  const std::size_t row = idx / n_cols;
  const std::size_t col = idx % n_cols;
  const int cat = cats[row];
  R[idx] *= penalty[(std::size_t)cat * n_cols + col];
}

0 commit comments

Comments
 (0)