GenomicBasedRegression/tuning_parameters_KNN.txt at main · Nicolas-Radomski/GenomicBasedRegression · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
{
# Hyperparameter grid (Python dict literal) for a pipeline with a 'feature_selection' step and a 'model' step
# keys follow the '<step>__<parameter>' convention; each value is the list of candidates to try
# when combined with any model or pipeline, minimizing the number of hyperparameter combinations helps reduce overall search time
# as written, the active (uncommented) entries give 2 x 1 x 3 x 2 x 1 x 2 x 2 x 2 = 96 combinations
# NOTE(review): this file contains Python expressions (a bare function name, float('inf')), so it is not JSON;
# presumably the loader evaluates it with those names in scope -- confirm against the calling script
# the alternative sections below reuse the same keys: enable only ONE feature selection section and ONE model
# tuning section at a time, because a repeated key in a dict literal silently keeps only its last occurrence

# --- Feature selection (SelectKBest) ---
# used only to modify selected SKB behavior
'feature_selection__k': [25, 50], # number of top features to keep
'feature_selection__score_func': [mutual_info_regression], # score function (avoid chi2 in a regression context; f_regression captures only linear dependencies)

# --- Feature selection (SelectFromModel with lasso) ---
# used only to modify laSFM behavior
#'feature_selection__threshold': [-float('inf')], # disable thresholding to rely solely on max_features
#'feature_selection__max_features': [25, 50], # select exactly this number of top features
#'feature_selection__estimator__alpha': [0.01, 0.1, 1.0], # increase regularization to speed up convergence
#'feature_selection__estimator__max_iter': [500], # reduce iterations for faster training
#'feature_selection__estimator__tol': [1e-2], # relax convergence criteria to save time
#'feature_selection__estimator__fit_intercept': [True], # whether to estimate the intercept

# --- Feature selection (SelectFromModel with elasticnet) ---
# used only to modify enSFM behavior
#'feature_selection__threshold': [-float('inf')], # rank features by importance
#'feature_selection__max_features': [25, 50], # number of top features to keep
#'feature_selection__estimator__alpha': [0.1], # moderate regularization strength
#'feature_selection__estimator__l1_ratio': [0.5], # balanced L1/L2 penalty
#'feature_selection__estimator__max_iter': [300], # max iterations for convergence
#'feature_selection__estimator__tol': [1e-2], # relaxed convergence tolerance

# --- Feature selection (SelectFromModel with ridge) ---
# used only to modify riSFM behavior
#'feature_selection__threshold': [-float('inf')], # disable hard thresholding; rank by |coef|
#'feature_selection__max_features': [25, 50], # keep top-ranked features only
#'feature_selection__estimator__alpha': [0.1, 1.0, 10.0], # regularization strength
#'feature_selection__estimator__max_iter': [1000], # ensure convergence for high-dimensional OHE
#'feature_selection__estimator__tol': [1e-3, 1e-4], # trade-off speed vs numerical precision
#'feature_selection__estimator__fit_intercept': [True], # center target; recommended for regression

# --- Feature selection (SelectFromModel with random forest) ---
# used only to modify rfSFM behavior
#'feature_selection__threshold': [-float('inf')], # rank all features by importance
#'feature_selection__max_features': [25, 50], # number of top features to keep
#'feature_selection__estimator__n_estimators': [100], # number of trees
#'feature_selection__estimator__max_depth': [10], # shallow trees for speed

# --- Model tuning (KNeighborsRegressor), simplified grid (active) ---
# used only to modify KNeighborsRegressor behavior
# simplified tuning used together with SKB
'model__n_neighbors': [5, 15, 25], # small, medium, and larger neighborhood sizes
'model__weights': ['uniform', 'distance'], # equal weighting and distance-based weighting
'model__metric': ['minkowski'], # default Minkowski
'model__p': [1, 2], # Manhattan and Euclidean distances for Minkowski metric
'model__algorithm': ['auto', 'kd_tree'], # automatic and kd_tree backends for variety
'model__leaf_size': [20, 30], # smaller and medium leaf sizes balancing speed and memory

# --- Model tuning (KNeighborsRegressor), extended grid (alternative to the simplified grid above) ---
# used only to modify KNeighborsRegressor behavior
# differs from the simplified grid only by adding 'ball_tree' to the algorithms and 40 to the leaf sizes
#'model__n_neighbors': [5, 15, 25], # odd neighborhood sizes (5, 15, 25); more neighbors = smoother predictions
#'model__weights': ['uniform', 'distance'], # uniform = equal weighting; distance = weight by proximity
#'model__metric': ['minkowski'], # default metric; allows flexibility via p (Minkowski power)
#'model__p': [1, 2], # p=1 (Manhattan), p=2 (Euclidean); controls Minkowski behavior
#'model__algorithm': ['auto', 'ball_tree', 'kd_tree'], # backend choice; explore for large datasets
#'model__leaf_size': [20, 30, 40], # affects tree-based algorithms' speed vs. memory trade-off
}