Skip to content

Commit 20cf11e

Browse files
committed
move decoupler
1 parent d45d6bf commit 20cf11e

File tree

5 files changed

+53
-116
lines changed

5 files changed

+53
-116
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ if (RSC_BUILD_EXTENSIONS)
5656
add_nb_cuda_module(_aggr_cuda src/rapids_singlecell/_cuda/aggr/aggr.cu)
5757
add_nb_cuda_module(_spca_cuda src/rapids_singlecell/_cuda/spca/spca.cu)
5858
add_nb_cuda_module(_ligrec_cuda src/rapids_singlecell/_cuda/ligrec/ligrec.cu)
59+
add_nb_cuda_module(_pv_cuda src/rapids_singlecell/_cuda/pv/pv.cu)
5960
# Harmony CUDA modules
6061
add_nb_cuda_module(_harmony_scatter_cuda src/rapids_singlecell/_cuda/harmony/scatter/scatter.cu)
6162
add_nb_cuda_module(_harmony_outer_cuda src/rapids_singlecell/_cuda/harmony/outer/outer.cu)
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#pragma once
2+
3+
#include <cuda_runtime.h>
4+
5+
// Reverse (right-to-left) cumulative minimum along each row of a dense,
// row-major matrix of doubles:
//
//     y[r, j] = min(x[r, j], x[r, j + 1], ..., x[r, m - 1])
//
// Launch layout: 1D grid of 1D blocks, one thread per row. Threads whose row
// index is >= n_rows exit immediately, so the grid only has to cover n_rows.
// No shared memory is used.
//
// Preconditions:
//   * x and y each hold at least n_rows * m doubles.
//   * x and y must not overlap (both are declared __restrict__).
//
// Notes:
//   * The comparison `v < cur` is false when v is NaN, so a NaN element does
//     not replace the running minimum; a NaN in the last column is carried
//     through the whole row because nothing compares less than NaN.
//   * Neighbouring threads touch addresses m doubles apart (one row each), so
//     accesses within a warp are strided rather than contiguous.
__global__ void rev_cummin64_kernel(const double* __restrict__ x, double* __restrict__ y,
                                    int n_rows, int m) {
    int r = blockDim.x * blockIdx.x + threadIdx.x;
    // Tail threads have no row; with m <= 0 there is no last column to seed
    // the scan from, and indexing m - 1 would be out of bounds.
    if (r >= n_rows || m <= 0) return;

    // Row offset is computed in size_t so r * m cannot overflow int.
    const double* xr = x + (size_t)r * m;
    double* yr = y + (size_t)r * m;

    // Seed the running minimum with the right-most element.
    double cur = xr[m - 1];
    yr[m - 1] = cur;

    // Sweep right -> left, carrying the smallest value seen so far.
    for (int j = m - 2; j >= 0; --j) {
        double v = xr[j];
        cur = (v < cur) ? v : cur;
        yr[j] = cur;
    }
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#include <cuda_runtime.h>
#include <nanobind/nanobind.h>

#include <cstdint>
#include <stdexcept>
#include <string>

#include "kernels_pv.cuh"
6+
7+
namespace nb = nanobind;
8+
9+
// Host-side launcher for rev_cummin64_kernel.
//
// x, y    : device addresses (passed as integers) of the input and output
//           buffers; each is reinterpreted as a pointer to double.
// n_rows  : number of rows; one thread is launched per row.
// m       : number of elements per row.
//
// The kernel is enqueued with <<<grid, block>>> and no explicit stream, i.e.
// on the default stream, and this function does not synchronize.
// NOTE(review): callers that produce x on a different, non-blocking stream
// need to order that work against the default stream themselves — confirm
// against the Python caller.
//
// Throws std::runtime_error if the launch is rejected by the CUDA runtime.
static inline void launch_rev_cummin64(std::uintptr_t x, std::uintptr_t y, int n_rows, int m) {
    // Nothing to compute. This also avoids a zero-sized grid (an invalid
    // launch configuration) and a negative n_rows being promoted to a huge
    // unsigned value in the ceil-div below.
    if (n_rows <= 0 || m <= 0) return;

    constexpr unsigned kThreadsPerBlock = 256;
    const dim3 block(kThreadsPerBlock);
    // Ceil-div in unsigned arithmetic: n_rows <= INT_MAX, so adding 255 cannot wrap.
    const dim3 grid((static_cast<unsigned>(n_rows) + kThreadsPerBlock - 1) / kThreadsPerBlock);

    rev_cummin64_kernel<<<grid, block>>>(reinterpret_cast<const double*>(x),
                                         reinterpret_cast<double*>(y), n_rows, m);

    // Kernel launches do not return a status; launch-configuration errors are
    // only visible through cudaGetLastError().
    const cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        throw std::runtime_error(std::string("rev_cummin64 kernel launch failed: ") +
                                 cudaGetErrorString(err));
    }
}
15+
16+
// Python extension module `_pv_cuda`.
//
// Exposes a single function, `rev_cummin64(x, y, n_rows, m)`, which forwards
// its four positional arguments unchanged to launch_rev_cummin64: two device
// addresses given as integers, followed by the row count and the row length.
NB_MODULE(_pv_cuda, m) {
    // Bind the launcher directly; its signature already matches what the
    // Python side passes, so no forwarding lambda is needed.
    m.def("rev_cummin64", &launch_rev_cummin64);
}

src/rapids_singlecell/decoupler_gpu/_helper/_pv.py

Lines changed: 11 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,19 @@
11
from __future__ import annotations
22

3+
try:
4+
from rapids_singlecell._cuda import _pv_cuda as _pv
5+
except ImportError:
6+
_pv = None
37
import cupy as cp
48
import numba as nb
59
import numpy as np
610

7-
# Reverse cumulative min along the last axis, per row (float64)
8-
_rev_cummin64 = cp.RawKernel(
9-
r"""
10-
extern "C" __global__
11-
void rev_cummin64(const double* __restrict__ x,
12-
double* __restrict__ y,
13-
const int n_rows,
14-
const int m)
15-
{
16-
int r = blockDim.x * blockIdx.x + threadIdx.x;
17-
if (r >= n_rows) return;
18-
19-
const double* xr = x + (size_t)r * m;
20-
double* yr = y + (size_t)r * m;
21-
22-
double cur = xr[m - 1];
23-
yr[m - 1] = cur;
24-
25-
// right -> left
26-
for (int j = m - 2; j >= 0; --j) {
27-
double v = xr[j];
28-
cur = (v < cur) ? v : cur;
29-
yr[j] = cur;
30-
}
31-
}
32-
""",
33-
"rev_cummin64",
34-
)
11+
12+
def _rev_cummin64(x, n_rows, m):
    """Reverse cumulative minimum along the last axis, per row (float64).

    Parameters
    ----------
    x
        CuPy array holding at least ``n_rows * m`` values. It is converted to
        a C-contiguous float64 array if it is not one already, because the
        compiled kernel indexes the buffer as dense row-major doubles.
    n_rows
        Number of rows to process.
    m
        Number of elements per row.

    Returns
    -------
    A new C-contiguous float64 CuPy array with the shape of ``x``. When
    ``n_rows`` or ``m`` is not positive, no kernel is launched and the
    returned buffer is left uninitialized.

    Raises
    ------
    ImportError
        If the compiled ``_pv_cuda`` extension could not be imported.
    ValueError
        If ``x`` holds fewer than ``n_rows * m`` elements.
    """
    if _pv is None:
        # The module-level import is wrapped in try/except and falls back to
        # None; fail with an explicit message instead of an AttributeError.
        raise ImportError(
            "rapids_singlecell._cuda._pv_cuda is not available; "
            "the CUDA extension modules must be built to use this function."
        )

    n_rows = int(n_rows)
    m = int(m)

    # No copy is made when x is already C-contiguous float64.
    x = cp.ascontiguousarray(x, dtype=cp.float64)
    y = cp.empty_like(x)

    if n_rows <= 0 or m <= 0:
        return y

    if x.size < n_rows * m:
        # The kernel would otherwise read and write past the end of the buffers.
        raise ValueError(
            f"x has {x.size} elements but n_rows * m = {n_rows * m} are required"
        )

    # The extension takes raw device addresses; x and y stay alive for the
    # duration of the call because they are referenced here.
    _pv.rev_cummin64(x.data.ptr, y.data.ptr, n_rows, m)
    return y
3517

3618

3719
def fdr_bh_axis1_cupy_optimized(ps, *, mem_gb: float = 4.0) -> cp.ndarray:
@@ -78,7 +60,6 @@ def fdr_bh_axis1_cupy_optimized(ps, *, mem_gb: float = 4.0) -> cp.ndarray:
7860

7961
out = cp.empty_like(ps, dtype=cp.float64)
8062

81-
threads = 256 # for the rev_cummin kernel
8263
for s in range(0, n_rows, B):
8364
e = min(n_rows, s + B)
8465
R = e - s
@@ -97,9 +78,7 @@ def fdr_bh_axis1_cupy_optimized(ps, *, mem_gb: float = 4.0) -> cp.ndarray:
9778
ps_bh = ps_sorted * scale # (R, m) float64
9879

9980
# 4) reverse cumulative min via custom kernel
100-
ps_mon = cp.empty_like(ps_bh)
101-
blocks = (R + threads - 1) // threads
102-
_rev_cummin64((blocks,), (threads,), (ps_bh, ps_mon, R, m))
81+
ps_mon = _rev_cummin64(ps_bh, R, m)
10382

10483
# 5) build inverse permutation without argsort (scatter)
10584
inv_order = cp.empty_like(order, dtype=cp.int32) # (R, m) int32

src/rapids_singlecell/tools/_kernels/_nan_mean_kernels.py

Lines changed: 0 additions & 84 deletions
This file was deleted.

0 commit comments

Comments
 (0)