|
| 1 | +import numpy as np |
| 2 | + |
| 3 | + |
def quantile_aggregation(pvals, gamma=0.5, gamma_min=0.05, adaptive=False):
    """Aggregate p-values across resampling repetitions via quantiles.

    Parameters
    ----------
    pvals : 2D ndarray (n_sampling_with_repetition, n_test)
        p-values to aggregate, one row per repetition.
    gamma : float
        Quantile level used by the fixed aggregation scheme.
    gamma_min : float
        Lower bound of the quantile grid for the adaptive scheme.
    adaptive : bool
        When True, use the adaptive aggregation; otherwise the fixed one.

    Returns
    -------
    1D ndarray
        Aggregated p-values, one per test.
    """
    # A single repetition needs no aggregation: return the row unchanged.
    if pvals.shape[0] == 1:
        return pvals[0]
    if adaptive:
        return _adaptive_quantile_aggregation(pvals, gamma_min)
    return _fixed_quantile_aggregation(pvals, gamma)
| 15 | + |
| 16 | + |
def fdr_threshold(pvals, fdr=0.1, method="bhq", reshaping_function=None):
    """Dispatch to an FDR-controlling threshold procedure.

    Parameters
    ----------
    pvals : 1D ndarray
        p-values (or e-values for ``method="ebh"``).
    fdr : float
        Target false discovery rate.
    method : str
        One of ``"bhq"`` (Benjamini-Hochberg), ``"bhy"``
        (Benjamini-Hochberg-Yekutieli) or ``"ebh"`` (e-BH).
    reshaping_function : callable or None
        Reshaping function forwarded to the BHY procedure only.

    Returns
    -------
    float
        Rejection threshold computed by the selected procedure.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported procedures.
    """
    if method == "bhq":
        return _bhq_threshold(pvals, fdr=fdr)
    if method == "bhy":
        return _bhy_threshold(pvals, fdr=fdr, reshaping_function=reshaping_function)
    if method == "ebh":
        return _ebh_threshold(pvals, fdr=fdr)
    # Fixed grammar of the error message ("is not support" -> "is not a supported").
    raise ValueError("{} is not a supported FDR control method".format(method))
| 26 | + |
| 27 | + |
def cal_fdp_power(selected, non_zero_index, r_index=False):
    """Calculate power and False Discovery Proportion of a selection.

    Parameters
    ----------
    selected : ndarray
        Indices of the selected non-null variables (1-based when they
        come from R via rpy2).
    non_zero_index : ndarray
        True indices of the non-null variables.
    r_index : bool
        True if ``selected`` uses R's 1-based indexing; it is then shifted
        down by one to match Python's 0-based indexing.

    Returns
    -------
    fdp : float
        False Discoveries Proportion among the selected variables.
    power : float
        Fraction of the non-null variables that were correctly selected.
    """
    # No selection at all: zero discoveries, zero power.
    if selected.size == 0:
        return 0.0, 0.0

    n_non_null = len(non_zero_index)

    # R indices start at 1, Python's at 0.
    if r_index:
        selected = selected - 1

    hits = np.intersect1d(selected, non_zero_index)
    false_hits = np.setdiff1d(selected, hits)

    fdp = len(false_hits) / max(1, len(selected))
    power = min(len(hits), n_non_null) / n_non_null

    return fdp, power
| 62 | + |
| 63 | + |
| 64 | +def _bhq_threshold(pvals, fdr=0.1): |
| 65 | + """Standard Benjamini-Hochberg for controlling False discovery rate""" |
| 66 | + n_features = len(pvals) |
| 67 | + pvals_sorted = np.sort(pvals) |
| 68 | + selected_index = 2 * n_features |
| 69 | + for i in range(n_features - 1, -1, -1): |
| 70 | + if pvals_sorted[i] <= fdr * (i + 1) / n_features: |
| 71 | + selected_index = i |
| 72 | + break |
| 73 | + if selected_index <= n_features: |
| 74 | + return pvals_sorted[selected_index] |
| 75 | + else: |
| 76 | + return -1.0 |
| 77 | + |
| 78 | + |
| 79 | +def _ebh_threshold(evals, fdr=0.1): |
| 80 | + """e-BH procedure for FDR control (see Wang and Ramdas 2021)""" |
| 81 | + n_features = len(evals) |
| 82 | + evals_sorted = -np.sort(-evals) # sort in descending order |
| 83 | + selected_index = 2 * n_features |
| 84 | + for i in range(n_features - 1, -1, -1): |
| 85 | + if evals_sorted[i] >= n_features / (fdr * (i + 1)): |
| 86 | + selected_index = i |
| 87 | + break |
| 88 | + if selected_index <= n_features: |
| 89 | + return evals_sorted[selected_index] |
| 90 | + else: |
| 91 | + return np.infty |
| 92 | + |
| 93 | + |
| 94 | +def _bhy_threshold(pvals, reshaping_function=None, fdr=0.1): |
| 95 | + """Benjamini-Hochberg-Yekutieli procedure for controlling FDR, with input |
| 96 | + shape function. Reference: Ramdas et al (2017) |
| 97 | + """ |
| 98 | + n_features = len(pvals) |
| 99 | + pvals_sorted = np.sort(pvals) |
| 100 | + selected_index = 2 * n_features |
| 101 | + # Default value for reshaping function -- defined in |
| 102 | + # Benjamini & Yekutieli (2001) |
| 103 | + if reshaping_function is None: |
| 104 | + temp = np.arange(n_features) |
| 105 | + sum_inverse = np.sum(1 / (temp + 1)) |
| 106 | + return _bhq_threshold(pvals, fdr / sum_inverse) |
| 107 | + else: |
| 108 | + for i in range(n_features - 1, -1, -1): |
| 109 | + if pvals_sorted[i] <= fdr * reshaping_function(i + 1) / n_features: |
| 110 | + selected_index = i |
| 111 | + break |
| 112 | + if selected_index <= n_features: |
| 113 | + return pvals_sorted[selected_index] |
| 114 | + else: |
| 115 | + return -1.0 |
| 116 | + |
| 117 | + |
| 118 | +def _fixed_quantile_aggregation(pvals, gamma=0.5): |
| 119 | + """Quantile aggregation function based on Meinshausen et al (2008) |
| 120 | +
|
| 121 | + Parameters |
| 122 | + ---------- |
| 123 | + pvals : 2D ndarray (n_sampling_with_repetition, n_test) |
| 124 | + p-value (adjusted) |
| 125 | +
|
| 126 | + gamma : float |
| 127 | + Percentile value used for aggregation. |
| 128 | +
|
| 129 | + Returns |
| 130 | + ------- |
| 131 | + 1D ndarray (n_tests, ) |
| 132 | + Vector of aggregated p-values |
| 133 | + """ |
| 134 | + converted_score = (1 / gamma) * (np.percentile(pvals, q=100 * gamma, axis=0)) |
| 135 | + |
| 136 | + return np.minimum(1, converted_score) |
| 137 | + |
| 138 | + |
def _adaptive_quantile_aggregation(pvals, gamma_min=0.05):
    """Adaptive version of the quantile aggregation method,
    Meinshausen et al. (2008).

    Takes the best fixed-gamma aggregation over a grid of gamma values in
    [gamma_min, 1] and pays a (1 - log(gamma_min)) multiplicity price for
    the adaptive choice of gamma.
    """
    gammas = np.arange(gamma_min, 1.05, 0.05)
    per_gamma = np.array(
        [_fixed_quantile_aggregation(pvals, g) for g in gammas]
    )
    return np.minimum(1, (1 - np.log(gamma_min)) * per_gamma.min(0))
| 146 | + |
| 147 | + |
| 148 | +def _lambda_max(X, y, use_noise_estimate=True): |
| 149 | + """Calculation of lambda_max, the smallest value of regularization parameter in |
| 150 | + lasso program for non-zero coefficient |
| 151 | + """ |
| 152 | + n_samples, _ = X.shape |
| 153 | + |
| 154 | + if not use_noise_estimate: |
| 155 | + return np.max(np.dot(X.T, y)) / n_samples |
| 156 | + |
| 157 | + norm_y = np.linalg.norm(y, ord=2) |
| 158 | + sigma_0 = (norm_y / np.sqrt(n_samples)) * 1e-3 |
| 159 | + sig_star = max(sigma_0, norm_y / np.sqrt(n_samples)) |
| 160 | + |
| 161 | + return np.max(np.abs(np.dot(X.T, y)) / (n_samples * sig_star)) |
| 162 | + |
| 163 | + |
| 164 | +def _check_vim_predict_method(method): |
| 165 | + """Check if the method is a valid method for variable importance measure |
| 166 | + prediction""" |
| 167 | + if method in ["predict", "predict_proba", "decision_function", "transform"]: |
| 168 | + return method |
| 169 | + else: |
| 170 | + raise ValueError( |
| 171 | + "The method {} is not a valid method for variable importance measure prediction".format( |
| 172 | + method |
| 173 | + ) |
| 174 | + ) |
0 commit comments