Skip to content

Commit 4ffbec8

Browse files
authored
Merge pull request #169 from sissa-data-science/kstar_correction
added the chance to inlcude bonferroni correction for multiple testing in the computation of kstar. the likelihood ratio test used to select the kstar is now controlled by the quantile alpha instead of the value associated through the chi2 distribution to such quantile
2 parents 1f3b840 + 141fd97 commit 4ffbec8

17 files changed

Lines changed: 99236 additions & 95099 deletions

dadapy/_cython/cython_clustering.c

Lines changed: 10429 additions & 9823 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dadapy/_cython/cython_clustering_v2.c

Lines changed: 10829 additions & 10270 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dadapy/_cython/cython_density.c

Lines changed: 8287 additions & 6837 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dadapy/_cython/cython_density.pyx

Lines changed: 46 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import numpy as np
66
cimport numpy as np
77

88
from scipy.special import gammaln
9+
from scipy.stats import chi2
910

1011
from libc.math cimport exp, log, pi, pow
1112

@@ -23,30 +24,45 @@ ctypedef np.float64_t floatTYPE_t
2324
def _compute_kstar(floatTYPE_t id_sel,
2425
DTYPE_t Nele,
2526
DTYPE_t maxk,
26-
floatTYPE_t Dthr,#=23.92812698,
27+
floatTYPE_t alpha, #=1e-6
2728
np.ndarray[DTYPE_t, ndim = 2] dist_indices,
28-
np.ndarray[floatTYPE_t, ndim = 2] distances
29+
np.ndarray[floatTYPE_t, ndim = 2] distances,
30+
bint bonferroni_deloc,
31+
bint bonferroni_loc,
2932
):
33+
"""Likelihhod ratio test to find the k*, ie the farthest neighbour for which the density is
34+
considered approximately constant. We include also bonferroni correction for multiple testing,
35+
in the delocalised version (alpha scaled by number of points) and the local one. In principle, for
36+
the local one, the threshold is updated at each new test and in principle one should check that
37+
all dL passes test with the new threshold. However, since the threshold is always increasing (as alpha is reduced),
38+
practically one has to check only for the last dL (againt the proper threshold).
3039
40+
"""
3141

32-
cdef floatTYPE_t dL, vvi, vvj
33-
cdef DTYPE_t i, j, ksel
42+
cdef floatTYPE_t dL, vvi, vvj, thr
43+
cdef DTYPE_t i, j, ksel, h
3444
cdef np.ndarray[DTYPE_t, ndim = 1] kstar = np.empty(Nele, dtype=int)
3545
cdef floatTYPE_t prefactor = exp( id_sel / 2.0 * log(pi) - gammaln((id_sel + 2.0) / 2.0) )
46+
cdef floatTYPE_t alpha_eff = alpha / Nele if bonferroni_deloc else alpha
47+
cdef floatTYPE_t Dthr = chi2.isf(alpha_eff,1)
48+
cdef np.ndarray[floatTYPE_t, ndim = 1] Dthr_loc_arr = np.array([chi2.isf(alpha_eff / (h+1), 1) for h in range(maxk)], dtype=float)
3649

3750
for i in range(Nele):
3851
j = 4
3952
dL = 0.0
40-
while j < maxk and dL < Dthr:
53+
h = 0
54+
while j < maxk:
4155
ksel = j - 1
4256
vvi = prefactor * pow(distances[i, ksel], id_sel)
4357
vvj = prefactor * pow(distances[dist_indices[i, j], ksel], id_sel)
4458
dL = -2.0 * ksel * ( log(vvi) + log(vvj) - 2.0 * log(vvi + vvj) + log(4) )
45-
j = j + 1
46-
if j == maxk:
47-
kstar[i] = j - 1
48-
else:
49-
kstar[i] = j - 2
59+
thr = Dthr_loc_arr[h] if bonferroni_loc else Dthr
60+
if dL > thr:
61+
break
62+
else:
63+
j = j + 1
64+
h = h + 1
65+
kstar[i] = j - 1 # fall back to previous iteration where the test had passed
5066

5167
return kstar
5268

@@ -56,30 +72,38 @@ def _compute_kstar(floatTYPE_t id_sel,
5672
def _compute_kstar_interp(floatTYPE_t id_sel,
5773
DTYPE_t Nele,
5874
DTYPE_t maxk,
59-
floatTYPE_t Dthr, #=23.92812698,
75+
floatTYPE_t alpha, #=1e-6,
6076
np.ndarray[DTYPE_t, ndim = 2] cross_dist_indices,
6177
np.ndarray[floatTYPE_t, ndim = 2] cross_distances,
62-
np.ndarray[floatTYPE_t, ndim = 2] data_distances
78+
np.ndarray[floatTYPE_t, ndim = 2] data_distances,
79+
bint bonferroni_deloc,
80+
bint bonferroni_loc,
6381
):
6482

6583

66-
cdef floatTYPE_t dL, vvi, vvj
67-
cdef DTYPE_t i, j, ksel
84+
cdef floatTYPE_t dL, vvi, vvj, thr
85+
cdef DTYPE_t i, j, ksel, h
6886
cdef np.ndarray[DTYPE_t, ndim = 1] kstar = np.empty(Nele, dtype=int)
6987
cdef floatTYPE_t prefactor = exp( id_sel / 2.0 * log(pi) - gammaln((id_sel + 2.0) / 2.0) )
88+
cdef floatTYPE_t alpha_eff = alpha / Nele if bonferroni_deloc else alpha
89+
cdef floatTYPE_t Dthr = chi2.isf(alpha_eff,1)
90+
cdef np.ndarray[floatTYPE_t, ndim = 1] Dthr_loc_arr = np.array([chi2.isf(alpha_eff / (h+1), 1) for h in range(maxk)], dtype=float)
7091

7192
for i in range(Nele):
7293
j = 4
7394
dL = 0.0
74-
while j < maxk and dL < Dthr:
95+
h = 0
96+
while j < maxk:
7597
ksel = j - 1
7698
vvi = prefactor * pow(cross_distances[i, ksel], id_sel)
7799
vvj = prefactor * pow(data_distances[cross_dist_indices[i, j], ksel], id_sel)
78100
dL = -2.0 * ksel * ( log(vvi) + log(vvj) - 2.0 * log(vvi + vvj) + log(4) )
79-
j = j + 1
80-
if j == maxk:
81-
kstar[i] = j - 1
82-
else:
83-
kstar[i] = j - 2
84-
85-
return kstar
101+
thr = Dthr_loc_arr[h] if bonferroni_loc else Dthr
102+
if dL > thr:
103+
break
104+
else:
105+
j = j + 1
106+
h = h + 1
107+
kstar[i] = j - 1 # fall back to previous iteration where the test had passed
108+
109+
return kstar

0 commit comments

Comments
 (0)