Skip to content

Commit 30414ab

Browse files
committed
add harmony
1 parent 5d327bd commit 30414ab

File tree

23 files changed

+761
-775
lines changed

23 files changed

+761
-775
lines changed

CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,4 +55,11 @@ if (RSC_BUILD_EXTENSIONS)
# nanobind + CUDA extension modules (only built when RSC_BUILD_EXTENSIONS is on).
add_nb_cuda_module(_cooc_cuda src/rapids_singlecell/_cuda/cooc/cooc.cu)
add_nb_cuda_module(_aggr_cuda src/rapids_singlecell/_cuda/aggr/aggr.cu)
add_nb_cuda_module(_spca_cuda src/rapids_singlecell/_cuda/spca/spca.cu)
# Harmony CUDA modules
add_nb_cuda_module(_harmony_scatter_cuda src/rapids_singlecell/_cuda/harmony/scatter/scatter.cu)
add_nb_cuda_module(_harmony_outer_cuda src/rapids_singlecell/_cuda/harmony/outer/outer.cu)
add_nb_cuda_module(_harmony_colsum_cuda src/rapids_singlecell/_cuda/harmony/colsum/colsum.cu)
add_nb_cuda_module(_harmony_kmeans_cuda src/rapids_singlecell/_cuda/harmony/kmeans/kmeans.cu)
add_nb_cuda_module(_harmony_normalize_cuda src/rapids_singlecell/_cuda/harmony/normalize/normalize.cu)
add_nb_cuda_module(_harmony_pen_cuda src/rapids_singlecell/_cuda/harmony/pen/pen.cu)
endif()
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
#include <cuda_runtime.h>
2+
#include <nanobind/nanobind.h>
3+
#include <cstdint>
4+
5+
#include "kernels_colsum.cuh"
6+
7+
namespace nb = nanobind;
8+
9+
// Launch colsum_kernel<T>: one 32-thread block per column of the row-major
// (rows x cols) matrix at device pointer A; out (length cols) is overwritten.
template <typename T>
static inline void launch_colsum(std::uintptr_t A, std::uintptr_t out, std::size_t rows,
                                 std::size_t cols) {
  if (cols == 0) return;  // a 0-block launch is an invalid configuration
  int threads = 32;
  // Clamp the grid: the kernel grid-strides over columns, so fewer blocks
  // than columns is still correct. The old bare (int)cols cast could
  // overflow to a negative block count for very wide matrices.
  const std::size_t max_blocks = 2147483647u;  // max gridDim.x (2^31 - 1)
  int blocks = (int)(cols < max_blocks ? cols : max_blocks);
  colsum_kernel<T>
      <<<blocks, threads>>>(reinterpret_cast<const T*>(A), reinterpret_cast<T*>(out), rows, cols);
  // Surface launch-configuration errors instead of failing silently.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) throw nb::value_error(cudaGetErrorString(err));
}
17+
18+
// Launch colsum_atomic_kernel<T>: one 32x32 block per tile of A.
// out (length cols) must be zero-initialized by the caller — the kernel
// accumulates into it with atomicAdd.
template <typename T>
static inline void launch_colsum_atomic(std::uintptr_t A, std::uintptr_t out, std::size_t rows,
                                        std::size_t cols) {
  if (rows == 0 || cols == 0) return;  // avoid an invalid 0-block launch
  // Compute the tile grid in 64-bit: the previous int arithmetic could
  // overflow for large matrices.
  std::size_t tile_rows = (rows + 31) / 32;
  std::size_t tile_cols = (cols + 31) / 32;
  std::size_t total_tiles = tile_rows * tile_cols;
  // The kernel does not grid-stride over tiles, so the grid must cover them all.
  if (total_tiles > 2147483647u) throw nb::value_error("matrix too large for colsum_atomic");
  int blocks = (int)total_tiles;
  dim3 threads(32, 32);
  colsum_atomic_kernel<T>
      <<<blocks, threads>>>(reinterpret_cast<const T*>(A), reinterpret_cast<T*>(out), rows, cols);
  // Surface launch-configuration errors instead of failing silently.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) throw nb::value_error(cudaGetErrorString(err));
}
28+
29+
NB_MODULE(_harmony_colsum_cuda, m) {
  // dtype_code: 0=float32, 1=float64, 2=int32. Legacy itemsize codes 4 and 8
  // are still accepted for float32/float64 respectively.
  m.def("colsum", [](std::uintptr_t A, std::uintptr_t out, std::size_t rows, std::size_t cols,
                     int dtype_code) {
    switch (dtype_code) {
      case 0:
      case 4:
        launch_colsum<float>(A, out, rows, cols);
        break;
      case 1:
      case 8:
        launch_colsum<double>(A, out, rows, cols);
        break;
      case 2:
        launch_colsum<int>(A, out, rows, cols);
        break;
      default:
        throw nb::value_error("Unsupported dtype_code (expected 0/1/2 or 4/8)");
    }
  });

  m.def("colsum_atomic", [](std::uintptr_t A, std::uintptr_t out, std::size_t rows,
                            std::size_t cols, int dtype_code) {
    switch (dtype_code) {
      case 0:
      case 4:
        launch_colsum_atomic<float>(A, out, rows, cols);
        break;
      case 1:
      case 8:
        launch_colsum_atomic<double>(A, out, rows, cols);
        break;
      case 2:
        launch_colsum_atomic<int>(A, out, rows, cols);
        break;
      default:
        throw nb::value_error("Unsupported dtype_code (expected 0/1/2 or 4/8)");
    }
  });
}
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#pragma once
2+
3+
#include <cuda_runtime.h>
4+
5+
// Column-sum of a row-major (rows x cols) matrix: out[c] = sum_i A[i*cols + c].
// One block per column (grid-stride over columns when gridDim.x < cols);
// block-wide reduction via warp shuffle plus a shared-memory combine.
// Preconditions: blockDim.x is a multiple of 32 (<= 1024); out has `cols`
// elements and is overwritten (not accumulated into).
template <typename T>
__global__ void colsum_kernel(const T* __restrict__ A, T* __restrict__ out, std::size_t rows,
                              std::size_t cols) {
  std::size_t tid = threadIdx.x;
  for (std::size_t col = blockIdx.x; col < cols; col += gridDim.x) {
    // Each thread accumulates a strided slice of this column.
    T acc = (T)0;
    for (std::size_t i = tid; i < rows; i += blockDim.x) {
      acc += A[i * cols + col];
    }
    // Intra-warp tree reduction.
    for (int offset = 16; offset > 0; offset >>= 1)
      acc += __shfl_down_sync(0xffffffff, acc, offset);
    // One partial per warp; the first warp combines them.
    __shared__ T s[32];
    if ((threadIdx.x & 31) == 0) s[threadIdx.x >> 5] = acc;
    __syncthreads();
    if (threadIdx.x < 32) {
      T val = (threadIdx.x < (blockDim.x >> 5)) ? s[threadIdx.x] : (T)0;
      for (int off = 16; off > 0; off >>= 1) val += __shfl_down_sync(0xffffffff, val, off);
      if (threadIdx.x == 0) out[col] = val;
    }
    // Bug fix: barrier before the next column iteration. Without it, warps
    // that race ahead can overwrite s[] while the first warp is still reading
    // it (manifests when gridDim.x < cols and blockDim.x > 32). The loop trip
    // count is uniform across the block, so this barrier is safe.
    __syncthreads();
  }
}
26+
27+
// Column-sum via 32x32 tiles: each block reduces one tile of the row-major
// (rows x cols) matrix A and atomically adds its per-column partials into out.
// Preconditions: launched with blockDim == (32, 32); out (length cols) must be
// zero-initialized by the caller, since results are accumulated with atomicAdd;
// atomicAdd on double requires SM60+.
template <typename T>
__global__ void colsum_atomic_kernel(const T* __restrict__ A, T* __restrict__ out, std::size_t rows,
                                     std::size_t cols) {
  std::size_t tile_cols = (cols + 31) / 32;
  // Map the linear block index to a (tile row, tile column) pair.
  std::size_t tid = blockIdx.x;
  std::size_t tile_r = tid / tile_cols;
  std::size_t tile_c = tid % tile_cols;
  // threadIdx.x walks rows, threadIdx.y walks columns, so each warp
  // (fixed threadIdx.y) covers the 32 rows of a single column.
  std::size_t row = tile_r * 32 + threadIdx.x;
  std::size_t col = tile_c * 32 + threadIdx.y;
  T v = (T)0;
  if (row < rows && col < cols) v = A[row * cols + col];
  // Warp shuffle reduction over the 32 row-elements of this column tile.
  for (int off = 16; off > 0; off >>= 1) v += __shfl_down_sync(0xffffffff, v, off);
  // Lane 0 of each warp publishes the tile's partial for its column.
  if (threadIdx.x == 0 && col < cols) atomicAdd(&out[col], v);
}
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#pragma once
2+
3+
#include <cuda_runtime.h>
4+
#include <type_traits>
5+
6+
// Accumulates the k-means error term sum_i r[i] * 2 * (1 - dot[i]) into *out,
// one atomicAdd per block.
// Preconditions: r and dot are at least 16-byte (float) / 32-byte (double)
// aligned (vectorized 4-wide loads below; cudaMalloc's 256-byte alignment
// satisfies this); *out must be zero-initialized by the caller; blockDim.x is
// a multiple of 32; atomicAdd(double*) requires SM60+.
template <typename T>
__global__ void kmeans_err_kernel(const T* __restrict__ r, const T* __restrict__ dot, std::size_t n,
                                  T* __restrict__ out) {
  T acc = (T)0;
  // 4-wide vector type matching T: float4 for float, double4 for double.
  using Vec = typename std::conditional<std::is_same<T, float>::value, float4, double4>::type;

  // Grid-stride loop over aligned 4-element chunks; every thread's i stays a
  // multiple of 4 throughout.
  std::size_t i = (blockIdx.x * blockDim.x + threadIdx.x) * 4;
  const std::size_t stride = gridDim.x * blockDim.x * 4;

  while (i + 3 < n) {
    Vec r4 = *(const Vec*)(r + i);
    Vec dot4 = *(const Vec*)(dot + i);
    acc += ((T*)&r4)[0] * (T)2 * ((T)1 - ((T*)&dot4)[0]);
    acc += ((T*)&r4)[1] * (T)2 * ((T)1 - ((T*)&dot4)[1]);
    acc += ((T*)&r4)[2] * (T)2 * ((T)1 - ((T*)&dot4)[2]);
    acc += ((T*)&r4)[3] * (T)2 * ((T)1 - ((T*)&dot4)[3]);
    i += stride;
  }
  // Scalar tail (up to 3 elements when n is not a multiple of 4). Exactly one
  // thread exits the chunked loop with i == (n/4)*4 — thread positions are
  // distinct multiples of 4 and the tail window is narrower than 4 — so no
  // element is processed twice.
  while (i < n) {
    T rv = r[i];
    T dotv = dot[i];
    acc += rv * (T)2 * ((T)1 - dotv);
    i++;
  }

  // Block reduction: intra-warp shuffle, one partial per warp in s[], then the
  // first warp combines the partials and lane 0 publishes via atomicAdd.
  for (int offset = 16; offset > 0; offset >>= 1) acc += __shfl_down_sync(0xffffffff, acc, offset);
  __shared__ T s[32];
  if ((threadIdx.x & 31) == 0) s[threadIdx.x >> 5] = acc;
  __syncthreads();
  if (threadIdx.x < 32) {
    T val = (threadIdx.x < (blockDim.x >> 5)) ? s[threadIdx.x] : (T)0;
    for (int offset = 16; offset > 0; offset >>= 1)
      val += __shfl_down_sync(0xffffffff, val, offset);
    if (threadIdx.x == 0) atomicAdd(out, val);
  }
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#include <cuda_runtime.h>
2+
#include <nanobind/nanobind.h>
3+
#include <cstdint>
4+
5+
#include "kernels_kmeans.cuh"
6+
7+
namespace nb = nanobind;
8+
9+
// Launch kmeans_err_kernel<T> over n elements. *out must be zero-initialized
// by the caller — the kernel accumulates into it with atomicAdd.
template <typename T>
static inline void launch_kmeans_err(std::uintptr_t r, std::uintptr_t dot, std::size_t n,
                                     std::uintptr_t out) {
  if (n == 0) return;  // avoid an invalid 0-block launch
  int threads = 256;
  // Cap the grid at 1024 blocks; the kernel grid-strides, so fewer blocks
  // than chunks is still correct. Clamp in size_t first — the old
  // (int)((n + threads - 1) / threads) could overflow int for huge n and
  // slip a negative block count past min().
  std::size_t want = (n + (std::size_t)threads - 1) / (std::size_t)threads;
  int blocks = (int)(want < (std::size_t)(8 * 128) ? want : (std::size_t)(8 * 128));
  kmeans_err_kernel<T><<<blocks, threads>>>(
      reinterpret_cast<const T*>(r), reinterpret_cast<const T*>(dot), n, reinterpret_cast<T*>(out));
  // Surface launch-configuration errors instead of failing silently.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) throw nb::value_error(cudaGetErrorString(err));
}
17+
18+
NB_MODULE(_harmony_kmeans_cuda, m) {
  // itemsize selects precision: 4 -> float32, 8 -> float64.
  m.def("kmeans_err",
        [](std::uintptr_t r, std::uintptr_t dot, std::size_t n, std::uintptr_t out, int itemsize) {
          switch (itemsize) {
            case 4:
              launch_kmeans_err<float>(r, dot, n, out);
              break;
            case 8:
              launch_kmeans_err<double>(r, dot, n, out);
              break;
            default:
              throw nb::value_error("Unsupported itemsize (expected 4 or 8)");
          }
        });
}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#pragma once
2+
3+
#include <cuda_runtime.h>
4+
5+
// In-place L1 row normalization of the row-major (rows x cols) matrix X:
//   X[r, :] /= max(sum_c |X[r, c]|, 1e-12)
// Precondition: launched with exactly one warp per block (blockDim.x == 32)
// and one block per row — shared[] has 32 slots and the shuffle reduction
// below assumes a single warp.
template <typename T>
__global__ void normalize_kernel_optimized(T* X, long long rows, long long cols) {
  __shared__ T shared[32];
  long long row = blockIdx.x;
  long long tid = threadIdx.x;
  if (row >= rows) return;
  // Each thread accumulates the absolute values of a strided slice of the row.
  T norm = (T)0;
  for (long long col = tid; col < cols; col += blockDim.x) {
    T v = X[row * cols + col];
    norm += (v < 0 ? -v : v);
  }
  shared[tid] = norm;
  __syncthreads();
  // Single-warp tree reduction through shared memory; each __shfl_down_sync
  // also synchronizes the warp, so lane i always shuffles in lane i+offset's
  // already-updated partial.
  for (long long offset = 16; offset > 0; offset /= 2) {
    shared[tid] += __shfl_down_sync(0xFFFFFFFF, shared[tid], offset);
  }
  __syncthreads();
  if (tid == 0) {
    // Clamp to 1e-12 so all-zero rows do not divide by zero, then store the
    // reciprocal so the scaling pass below is a multiply.
    T final_norm = shared[0];
    final_norm = final_norm < (T)1e-12 ? (T)1e-12 : final_norm;
    shared[0] = (T)1 / final_norm;
  }
  __syncthreads();
  for (long long col = tid; col < cols; col += blockDim.x) {
    X[row * cols + col] *= shared[0];
  }
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#include <cuda_runtime.h>
2+
#include <nanobind/nanobind.h>
3+
#include <cstdint>
4+
5+
#include "kernels_normalize.cuh"
6+
7+
namespace nb = nanobind;
8+
9+
// Launch normalize_kernel_optimized<T>: one 32-thread block per row, L1-
// normalizing each row of the row-major (rows x cols) matrix X in place.
template <typename T>
static inline void launch_normalize(std::uintptr_t X, long long rows, long long cols) {
  // Guard: rows <= 0 would previously produce an invalid grid (0 blocks, or a
  // huge one via negative -> unsigned conversion); cols <= 0 is a no-op anyway.
  if (rows <= 0 || cols <= 0) return;
  dim3 block(32);
  dim3 grid((unsigned)rows);
  normalize_kernel_optimized<T><<<grid, block>>>(reinterpret_cast<T*>(X), rows, cols);
  // Surface launch-configuration errors instead of failing silently.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) throw nb::value_error(cudaGetErrorString(err));
}
15+
16+
NB_MODULE(_harmony_normalize_cuda, m) {
  // itemsize selects precision: 4 -> float32, 8 -> float64.
  m.def("normalize", [](std::uintptr_t X, long long rows, long long cols, int itemsize) {
    switch (itemsize) {
      case 4:
        launch_normalize<float>(X, rows, cols);
        break;
      case 8:
        launch_normalize<double>(X, rows, cols);
        break;
      default:
        throw nb::value_error("Unsupported itemsize (expected 4 or 8)");
    }
  });
}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#pragma once
2+
3+
#include <cuda_runtime.h>
4+
5+
// Rank-one update of the row-major (n_cats x n_pcs) matrix E:
//   switcher == 0:  E -= outer(Pr_b, R_sum)
//   otherwise:      E += outer(Pr_b, R_sum)
// One thread per element, 1D launch with a bounds guard.
template <typename T>
__global__ void outer_kernel(T* __restrict__ E, const T* __restrict__ Pr_b,
                             const T* __restrict__ R_sum, long long n_cats, long long n_pcs,
                             long long switcher) {
  const long long idx = blockIdx.x * blockDim.x + threadIdx.x;
  const long long total = n_cats * n_pcs;
  if (idx >= total) return;
  const long long cat = idx / n_pcs;
  const long long pc = idx % n_pcs;
  const T update = Pr_b[cat] * R_sum[pc];
  if (switcher == 0)
    E[idx] -= update;
  else
    E[idx] += update;
}
19+
20+
// Subtracts the per-cell batch correction from the row-major
// (n_cells x n_pcs) matrix Z:
//   Z[cell, pc] -= W[cats[cell] + 1, pc] * R[cell]
// One thread per element, 1D launch with a bounds guard.
template <typename T>
__global__ void harmony_correction_kernel(T* __restrict__ Z, const T* __restrict__ W,
                                          const int* __restrict__ cats, const T* __restrict__ R,
                                          long long n_cells, long long n_pcs) {
  const long long idx = blockIdx.x * blockDim.x + threadIdx.x;
  const long long total = n_cells * n_pcs;
  if (idx >= total) return;
  const long long cell = idx / n_pcs;
  const long long pc = idx % n_pcs;
  const int cat = cats[cell];
  // NOTE(review): W is indexed at cat + 1, skipping its first row —
  // presumably an intercept row; confirm against the caller.
  Z[idx] -= W[(cat + 1) * n_pcs + pc] * R[cell];
}
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#include <cuda_runtime.h>
2+
#include <nanobind/nanobind.h>
3+
#include <cstdint>
4+
5+
#include "kernels_outer.cuh"
6+
7+
namespace nb = nanobind;
8+
9+
// Launch outer_kernel<T> over the n_cats x n_pcs matrix E with a 1D grid of
// 256-thread blocks.
template <typename T>
static inline void launch_outer(std::uintptr_t E, std::uintptr_t Pr_b, std::uintptr_t R_sum,
                                long long n_cats, long long n_pcs, long long switcher) {
  long long N = n_cats * n_pcs;
  if (N <= 0) return;  // avoid an invalid 0-block launch for empty inputs
  dim3 block(256);
  dim3 grid((unsigned)((N + block.x - 1) / block.x));
  outer_kernel<T><<<grid, block>>>(reinterpret_cast<T*>(E), reinterpret_cast<const T*>(Pr_b),
                                   reinterpret_cast<const T*>(R_sum), n_cats, n_pcs, switcher);
  // Surface launch-configuration errors instead of failing silently.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) throw nb::value_error(cudaGetErrorString(err));
}
18+
19+
// Launch harmony_correction_kernel<T> over the n_cells x n_pcs matrix Z with
// a 1D grid of 256-thread blocks. cats must hold one int32 category per cell.
template <typename T>
static inline void launch_harmony_corr(std::uintptr_t Z, std::uintptr_t W, std::uintptr_t cats,
                                       std::uintptr_t R, long long n_cells, long long n_pcs) {
  long long N = n_cells * n_pcs;
  if (N <= 0) return;  // avoid an invalid 0-block launch for empty inputs
  dim3 block(256);
  dim3 grid((unsigned)((N + block.x - 1) / block.x));
  harmony_correction_kernel<T><<<grid, block>>>(
      reinterpret_cast<T*>(Z), reinterpret_cast<const T*>(W), reinterpret_cast<const int*>(cats),
      reinterpret_cast<const T*>(R), n_cells, n_pcs);
  // Surface launch-configuration errors instead of failing silently.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) throw nb::value_error(cudaGetErrorString(err));
}
29+
30+
NB_MODULE(_harmony_outer_cuda, m) {
  // itemsize selects precision for both bindings: 4 -> float32, 8 -> float64.
  m.def("outer", [](std::uintptr_t E, std::uintptr_t Pr_b, std::uintptr_t R_sum, long long n_cats,
                    long long n_pcs, long long switcher, int itemsize) {
    switch (itemsize) {
      case 4:
        launch_outer<float>(E, Pr_b, R_sum, n_cats, n_pcs, switcher);
        break;
      case 8:
        launch_outer<double>(E, Pr_b, R_sum, n_cats, n_pcs, switcher);
        break;
      default:
        throw nb::value_error("Unsupported itemsize (expected 4 or 8)");
    }
  });

  m.def("harmony_corr", [](std::uintptr_t Z, std::uintptr_t W, std::uintptr_t cats,
                           std::uintptr_t R, long long n_cells, long long n_pcs, int itemsize) {
    switch (itemsize) {
      case 4:
        launch_harmony_corr<float>(Z, W, cats, R, n_cells, n_pcs);
        break;
      case 8:
        launch_harmony_corr<double>(Z, W, cats, R, n_cells, n_pcs);
        break;
      default:
        throw nb::value_error("Unsupported itemsize (expected 4 or 8)");
    }
  });
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#pragma once
2+
3+
#include <cuda_runtime.h>
4+
5+
// Elementwise penalty scaling of the row-major (n_rows x n_cols) matrix R by
// a per-category factor: R[row, col] *= penalty[cats[row], col].
// One thread per element, 1D launch with a bounds guard.
template <typename T>
__global__ void pen_kernel(T* __restrict__ R, const T* __restrict__ penalty,
                           const int* __restrict__ cats, std::size_t n_rows, std::size_t n_cols) {
  const std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  const std::size_t total = n_rows * n_cols;
  if (idx >= total) return;
  const std::size_t row = idx / n_cols;
  const std::size_t col = idx % n_cols;
  const int cat = cats[row];
  R[idx] *= penalty[(std::size_t)cat * n_cols + col];
}

0 commit comments

Comments
 (0)