Skip to content

Commit a1c1a10

Browse files
committed
feat: add configs and first scripts (#4)
* feat: add configs and scripts * feat: add list of tissues * update coverage badge --------- Co-authored-by: gaetandi <gaetandi@users.noreply.github.com>
1 parent 988693a commit a1c1a10

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

52 files changed

+13691
-244
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ repos:
4949
language: system
5050
types: [python]
5151
pass_filenames: false
52-
entry: uv run mypy src/leap/ configs/ --exclude src/leap/_version.py
52+
entry: uv run mypy src/leap/ configs/ scripts/ --exclude src/leap/_version.py
5353
- id: pydoclint
5454
name: Docstring linting with pydoclint
5555
language: system

Makefile

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,3 +48,20 @@ clean: ## Clean up temporary files and caches
4848
@rm -rf .pytest_cache .ruff_cache .mypy_cache htmlcov .coverage dist build *.egg-info
4949
@find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
5050
@echo "✅ Cleanup complete!"
51+
52+
clean-data: ## Remove all processed data, models, and results
53+
@echo "\nThe following files and directories will be permanently deleted:\n"
54+
@find data/processed -mindepth 1 -not -name ".gitkeep" 2>/dev/null || true
55+
@find models -mindepth 1 -not -path "models/README.md" -not -name ".gitkeep" 2>/dev/null || true
56+
@find results -mindepth 1 -not -path "results/README.md" -not -name ".gitkeep" 2>/dev/null || true
57+
@echo "\n⚠️ WARNING: This action cannot be undone."
58+
@read -p "Are you sure you want to continue? (yes/no) " -r; \
59+
if [[ $$REPLY =~ ^[Yy]es$$ ]]; then \
60+
echo "🧹 Removing processed data, models, and results..."; \
61+
find data/processed -mindepth 1 -not -name ".gitkeep" -delete 2>/dev/null || true; \
62+
find models -mindepth 1 -not -path "models/README.md" -not -name ".gitkeep" -delete 2>/dev/null || true; \
63+
find results -mindepth 1 -not -path "results/README.md" -not -name ".gitkeep" -delete 2>/dev/null || true; \
64+
echo "✅ Removal complete!"; \
65+
else \
66+
echo "Aborted."; \
67+
fi

README.md

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,28 @@ make tests # Run tests with coverage
2929

3030
## Usage
3131

32-
<!-- Usage instructions and examples coming soon -->
32+
### Clean up temporary files
33+
```bash
34+
make clean
35+
```
36+
Removes all temporary files, caches, and build artifacts.
37+
38+
### Clean up data and results
39+
```bash
40+
make clean-data
41+
```
42+
Removes all processed data, trained models, and results. **Warning: This action cannot be undone!**
43+
44+
### Run the complete LEAP pipeline
45+
```bash
46+
sh run_pipeline.sh
47+
```
48+
Runs the full LEAP pipeline end-to-end:
49+
1. Pretrain representations
50+
2. Train regression heads for multiple tasks, models, and seeds
51+
3. Ensemble predictions
52+
53+
The pipeline is configured to run on task 1 with the `mae_ps_enet` model across 5 different seeds.
3354

3455
## Citation
3556

badges/cov_badge.svg

Lines changed: 1 addition & 1 deletion
Loading
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
"""Define perturbation model parameters."""
2+
3+
PRED_MODEL_NAME: dict[str, str] = {
4+
"mae_pp_tdnn": "dnn_regressor",
5+
"mae_pp_mlp": "mlp_regressor",
6+
"mae_pp_lgbm": "lgbm_regressor",
7+
"mae_ps_knn": "knn_regressor",
8+
"mae_ps_mlp": "mlp_regressor_small",
9+
"mae_ps_lgbm": "lgbm_regressor_small",
10+
"mae_ps_enet": "elastic_net_regressor",
11+
}
12+
13+
PRED_MODEL_TYPE: dict[str, str] = {
14+
"mae_pp_tdnn": "pan_perturbation",
15+
"mae_pp_mlp": "pan_perturbation",
16+
"mae_pp_lgbm": "pan_perturbation",
17+
"mae_ps_knn": "multi_label",
18+
"mae_ps_mlp": "perturbation_specific",
19+
"mae_ps_lgbm": "perturbation_specific",
20+
"mae_ps_enet": "perturbation_specific",
21+
}
22+
23+
RPZ_MODEL_NAME: dict[str, str] = {
24+
"mae_pp_tdnn": "mae",
25+
"mae_pp_mlp": "mae",
26+
"mae_pp_lgbm": "mae",
27+
"mae_ps_knn": "mae",
28+
"mae_ps_mlp": "mae",
29+
"mae_ps_lgbm": "mae",
30+
"mae_ps_enet": "mae",
31+
}
32+
33+
USE_TRAINED_PREPROCESSOR: dict[str, bool] = {
34+
"mae_pp_tdnn": True,
35+
"mae_pp_mlp": True,
36+
"mae_pp_lgbm": True,
37+
"mae_ps_knn": True,
38+
"mae_ps_mlp": True,
39+
"mae_ps_lgbm": True,
40+
"mae_ps_enet": True,
41+
}
42+
43+
44+
USE_TRAINED_RPZ: dict[str, bool] = {
45+
"mae_pp_tdnn": True,
46+
"mae_pp_mlp": True,
47+
"mae_pp_lgbm": True,
48+
"mae_ps_knn": True,
49+
"mae_ps_mlp": True,
50+
"mae_ps_lgbm": True,
51+
"mae_ps_enet": True,
52+
}
53+
54+
# IMPORTANT: in LEAP we actually use depmap_gdsc_pdx (using all available data)
55+
PRETRAINED_DATA: dict[str, str] = {
56+
"mae_pp_tdnn": "depmap",
57+
"mae_pp_mlp": "depmap",
58+
"mae_pp_lgbm": "depmap",
59+
"mae_ps_knn": "depmap",
60+
"mae_ps_mlp": "depmap",
61+
"mae_ps_lgbm": "depmap",
62+
"mae_ps_enet": "depmap",
63+
}
64+
65+
66+
ENSEMBLING: dict[str, bool] = {
67+
"mae_pp_tdnn": True,
68+
"mae_pp_mlp": True,
69+
"mae_pp_lgbm": True,
70+
"mae_ps_knn": True,
71+
"mae_ps_mlp": True,
72+
"mae_ps_lgbm": True,
73+
"mae_ps_enet": True,
74+
}
75+
76+
ENSEMBLING_SAVE_MODELS_TO_DISK: dict[str, bool] = {
77+
"mae_pp_tdnn": True,
78+
"mae_pp_mlp": True,
79+
"mae_pp_lgbm": True,
80+
"mae_ps_knn": False,
81+
"mae_ps_mlp": False,
82+
"mae_ps_lgbm": False,
83+
"mae_ps_enet": False,
84+
}
85+
86+
USE_RAY: dict[str, bool] = {
87+
"mae_pp_tdnn": False,
88+
"mae_pp_mlp": False,
89+
"mae_pp_lgbm": False,
90+
"mae_ps_knn": False,
91+
"mae_ps_mlp": True,
92+
"mae_ps_lgbm": True,
93+
"mae_ps_enet": True,
94+
}
95+
96+
RAY_REMOTE_PARAMS: dict[str, dict | None] = {
97+
"mae_pp_tdnn": None,
98+
"mae_pp_mlp": None,
99+
"mae_pp_lgbm": None,
100+
"mae_ps_knn": None,
101+
"mae_ps_mlp": {"num_cpus": 1, "num_gpus": 0.05},
102+
"mae_ps_lgbm": {"num_cpus": 8, "num_gpus": 0},
103+
"mae_ps_enet": {"num_cpus": 1, "num_gpus": 0},
104+
}

configs/config_regression_model.py

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
"""Define configs for regression models to use in the pipeline."""

from ml_collections import config_dict

from leap.regression_models import ElasticNet, KnnRegressor, LGBMRegressor, TorchMLPRegressor
from leap.regression_models.utils import AlphaGridElasticNet


# Registry of instantiable regression-head configs. Each entry holds the
# target class under "_target_" plus its constructor keyword arguments.
# Keys here are referenced by name from the perturbation-model config
# (PRED_MODEL_NAME) — keep them in sync.
REGRESSION_MODEL: dict[str, config_dict.ConfigDict] = {
    "knn_regressor": config_dict.ConfigDict(
        {
            "_target_": KnnRegressor,
            "n_sample_neighbors": 5,  # default
            "weights": "uniform",  # default
            "n_jobs": 30,
        }
    ),
    "elastic_net_regressor": config_dict.ConfigDict(
        {
            "_target_": ElasticNet,
            # l1_ratio=1.0 makes this a pure Lasso; the alpha grid is tuned
            # separately in HPT_TUNING_PARAM_GRID below.
            "l1_ratio": 1.0,
        }
    ),
    # Large LGBM variant for pan-perturbation training.
    "lgbm_regressor": config_dict.ConfigDict(
        {
            "_target_": LGBMRegressor,
            "subsample_for_bin": 400000,
            "num_leaves": 4000,
            "min_split_gain": 0,
            "min_child_weight": 0.01,
            "min_child_samples": 5,
            "max_depth": 20,
            "learning_rate": 0.03,
            "reg_lambda": 0,
            "reg_alpha": 1,
            "colsample_bytree": 0.8,
            "n_estimators": 500,
            "subsample": 1,
            "random_state": 0,
            "n_jobs": 50,  # launch two in // on large vm
            "verbose": -1,
        }
    ),
    # Small LGBM variant for perturbation-specific training.
    "lgbm_regressor_small": config_dict.ConfigDict(
        {
            # Comment every time the default is changed
            "_target_": LGBMRegressor,
            "boosting_type": "gbdt",
            "num_leaves": 31,
            "max_depth": 10,  # After small grid, systematically better default is -1
            "learning_rate": 0.01,  # TO TUNE, but 0.01 works well. default is 0.1
            "n_estimators": 400,  # Default is 100, but 400 is better
            "subsample_for_bin": 200000,
            "objective": None,
            "class_weight": None,
            "min_split_gain": 0,
            "min_child_weight": 1e-3,
            "min_child_samples": (5),  # Tuning it is the next best thing to do, default 20
            "subsample": 1,
            "subsample_freq": 0,
            "colsample_bytree": 0.1,  # TO TUNE, much better when small, default is 1.0
            "reg_alpha": 1,  # After small grid, better when 1, default is 0
            "reg_lambda": 1,  # After small grid, marginally better when 1, default is 0
            "random_state": 0,  # for reproducibility
            "n_jobs": 8,  # small model so we can use less cores
            "verbose": -1,  # disable prints
        }
    ),
    # Large MLP variant for pan-perturbation training.
    "mlp_regressor": config_dict.ConfigDict(
        {
            "_target_": TorchMLPRegressor,
            "hidden_layer_sizes": (512, 256, 128, 64, 32, 16),
            "activation": "relu",
            "learning_rate_init": 0.001,
            "max_epochs": 200,
            "batch_size": 2048,
            "dropout_rate": 0.2,  # Best based on tests on 1a-small
            "random_seed": 0,
            "early_stopping_use": True,
            "early_stopping_split": 0.2,
            "early_stopping_patience": 20,
            "early_stopping_delta": 0.001,
            "optimizer_type": "adam",
            "weight_decay": 1e-5,
            "learning_rate_scheduler": True,  # Best based on tests on 1a-small
            "scheduler_factor": 0.1,
            # If the threshold is the same as the delta,
            # this needs to be smaller than the patience of the early stopping
            "scheduler_patience": 10,
            "scheduler_threshold": 0.001,
            "metric": "spearman",
            "scaler_name": "robust",
            "loss_function_name": "spearman",
        }
    ),
    # Same training setup as "mlp_regressor" but with a much smaller
    # architecture, for perturbation-specific training.
    "mlp_regressor_small": config_dict.ConfigDict(
        {
            "_target_": TorchMLPRegressor,
            "hidden_layer_sizes": (20, 20),
            "activation": "relu",
            "learning_rate_init": 0.001,
            "max_epochs": 200,
            "batch_size": 2048,
            "dropout_rate": 0.2,  # Best based on tests on 1a-small
            "random_seed": 0,
            "early_stopping_use": True,
            "early_stopping_split": 0.2,
            "early_stopping_patience": 20,
            "early_stopping_delta": 0.001,
            "optimizer_type": "adam",
            "weight_decay": 1e-5,
            "learning_rate_scheduler": True,  # Best based on tests on 1a-small
            "scheduler_factor": 0.1,
            # If the threshold is the same as the delta,
            # this needs to be smaller than the patience of the early stopping
            "scheduler_patience": 10,
            "scheduler_threshold": 0.001,
            "metric": "spearman",
            "scaler_name": "robust",
            "loss_function_name": "spearman",
        }
    ),
    # For the ETL tDNN paper comparison
    "dnn_regressor": config_dict.ConfigDict(
        {
            "_target_": TorchMLPRegressor,
            "hidden_layer_sizes": (250, 125, 60, 30),
            "activation": "relu",
            # "The learning rate was initialized at 0.001"
            "learning_rate_init": 0.001,
            # "otherwise the full learning process would take 100 epochs"
            "max_epochs": 100,
            "batch_size": 2048,
            "dropout_rate": 0.0,
            "random_seed": 0,
            # "The learning process would be early stopped if the reduction of
            # validation loss was smaller than 0.00001 in 20 epochs"
            "early_stopping_use": True,
            "early_stopping_split": 0.2,
            "early_stopping_patience": 20,
            "early_stopping_delta": 0.00001,
            # "The Adam optimizer was used with default setting for model learning"
            "optimizer_type": "adam",
            "weight_decay": 1e-5,
            # "The learning rate [...] was reduced by a factor of 10 if the reduction of
            # validation loss was smaller than 0.00001 in 10 epochs."
            "learning_rate_scheduler": True,
            "scheduler_factor": 0.1,
            "scheduler_patience": 10,
            "scheduler_threshold": 0.00001,
            "metric": "mse",  # In the ETL paper (tDNN) it's the mse (loss)
            "scaler_name": "standard",
            "loss_function_name": "mse",
        }
    ),
}

# Hyperparameter search grids, keyed like REGRESSION_MODEL.
# None means the model is used as-is, with no tuning.
HPT_TUNING_PARAM_GRID: dict[str, config_dict.ConfigDict | None] = {
    "knn_regressor": None,
    "elastic_net_regressor": config_dict.ConfigDict(
        {
            # Alpha values are derived from the data at fit time rather than
            # listed explicitly.
            "alpha": config_dict.ConfigDict(
                {
                    "_target_": AlphaGridElasticNet,
                    "alpha_min_ratio": 1e-3,
                    "n_alphas": 10,
                }
            ),
        }
    ),
    "lgbm_regressor": config_dict.ConfigDict(
        {
            "reg_alpha": [0, 1],
            # log-spaced between 1e-2 and 2e-1, rounded to the first non-zero decimal
            "learning_rate": [0.01, 0.02, 0.04, 0.09, 0.2],
        }
    ),
    "lgbm_regressor_small": config_dict.ConfigDict(
        {
            "learning_rate": [0.005, 0.01],
            "colsample_bytree": [0.05, 0.1, 0.15, 0.2, 0.25],
        }
    ),
    "mlp_regressor": config_dict.ConfigDict(
        {
            # log-spaced between 5e-4 and 1e-2, rounded to the first non-zero decimal
            "learning_rate_init": [0.0005, 0.001, 0.002, 0.005, 0.01],
            "batch_size": [2048, 8192],
        }
    ),
    "mlp_regressor_small": config_dict.ConfigDict(
        {
            # log-spaced between 5e-4 and 1e-2, rounded to the first non-zero decimal
            "learning_rate_init": [0.0005, 0.001, 0.002, 0.005, 0.01],
            "hidden_layer_sizes": [
                (20,),
                (20, 20),
            ],
        }
    ),
    "dnn_regressor": config_dict.ConfigDict(
        {
            # This correspond to the HPT done in the ETL paper (tDNN)
            # "In the analysis, the dropout rate was selected among 0, 0.1, 0.25, 0.45,
            # and 0.7 by minimizing the validation loss. It was the only hyperparameter
            # optimized in the model learning process.""
            "dropout_rate": [0, 0.1, 0.25, 0.45, 0.7],
        }
    ),
}

0 commit comments

Comments
 (0)