
Commit 1eacd3b

Merge pull request #40 from openproblems-bio/jalil
qc on perturbation effect added
2 parents a03da55 + b1506f2 commit 1eacd3b

File tree: 10 files changed (+485, -220 lines)


README.md

Lines changed: 4 additions & 2 deletions
@@ -90,9 +90,10 @@ Of note, we are using the `resources_test` datasets, which are small versions of
 Once got the prediction for a given dataset (e.g. op), use the following code to obtain evaluation scores.
 
 ```bash
-scripts/single_grn_evaluation.sh output/net.h5ad op --test_run
+bash scripts/run_grn_evaluation.sh --prediction=output/net.h5ad --save_dir=output/ --dataset=op --build_images=true --test_run=true
 ```
-**This** outputs the scores into `output/test_run/score_uns.yaml`. Of note, by passing `--test_run`, the evaluations are done on the test data. To use the actual data (`resources` folder), omit this flag.
+
+**This** outputs the scores into `output/score_uns.yaml`. Of note, by passing `--test_run`, the evaluations are done on the test data. To use the actual data (`resources` folder), omit this flag.
 
 
 ## Add a GRN inference method, evaluation metric, or dataset
@@ -109,6 +110,7 @@ To add a new component to the repository, follow the [Documentation](https://gen
 | Antoine Passimier | contributor |
 | Marco Stock | contributor |
 | Christian Arnold | contributor |
+| Jérémie Kalfon | contributor |
 
 ## API
 
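The updated README instructions can be exercised end to end with a short sketch like the one below. It assumes a prediction file already exists at `output/net.h5ad` (e.g. produced by a prior inference run) and simply chains the documented evaluation call with a look at the resulting scores file; paths are illustrative.

```bash
# Sketch: evaluate an existing prediction on the small test resources,
# then print the resulting scores. net.h5ad is assumed to come from an
# earlier inference step.
bash scripts/run_grn_evaluation.sh \
    --prediction=output/net.h5ad \
    --save_dir=output/ \
    --dataset=op \
    --build_images=true \
    --test_run=true

# Scores are written as YAML; set --test_run=false (or omit the flag, per the
# README) to evaluate on the full data in the `resources` folder.
cat output/score_uns.yaml
```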
docs/source/evaluation.rst

Lines changed: 10 additions & 1 deletion
@@ -32,7 +32,16 @@ The inferred network should have a tabular format with the following columns:
 See `resources_test/grn_models/op/collectri.h5ad` for an example of the expected format.
 
 For the regression based approaches, we used the pseudobulk version of the perturbation data while for the Wasserstein distance, the single cell data are used.
-See 'scripts/single_grn_evaluation.sh' for an example of how to run the evaluation.
 
 It should be noted that for Wasserstein distance, we have already computed all possible combination of TF-gene pairs and stored it in the `resources/grn_benchmark/prior/` folder.
 This substantially reduces the computation time during evaluation.
+
+To run the evaluation for a given GRN and dataset, use the following command:
+```bash
+bash scripts/run_grn_evaluation.sh --prediction=<inferred GRN (e.g. collectri.h5ad)> --save_dir=<e.g. output/> --dataset=<e.g. replogle> --build_images=<true or false; true for the first run> --test_run=<true or false; true to run on test data>
+```
+
+Example command:
+```bash
+bash scripts/run_grn_evaluation.sh --prediction=resources/grn_models/op/collectri.h5ad --save_dir=output/ --dataset=op --build_images=true --test_run=false
+```

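Since the evaluation script now takes named arguments, scoring the same inferred GRN against several datasets reduces to a loop over `--dataset`. A minimal sketch, where the dataset list and per-dataset output folders are illustrative assumptions rather than anything prescribed by the docs:

```bash
# Sketch: score one prediction against multiple datasets, writing each result
# to its own output folder. Dataset names here are examples only.
prediction=resources/grn_models/op/collectri.h5ad

for dataset in op replogle; do
    bash scripts/run_grn_evaluation.sh \
        --prediction=$prediction \
        --save_dir=output/${dataset}/ \
        --dataset=$dataset \
        --build_images=false \
        --test_run=false
done
```

`--build_images=false` is used here on the assumption that the Docker images were already built during a first run.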
scripts/run_all.sh

Lines changed: 5 additions & 5 deletions
@@ -1,11 +1,11 @@
 set -e
 
-datasets=('parsebioscience' 'xaira_HCT116' 'xaira_HEK293T') #'replogle' 'op' 'nakatake' 'adamson' 'norman'
-run_local=false
+datasets=('replogle') #'replogle' 'op' 'nakatake' 'adamson' 'norman'
+run_local=false # set to true to run locally, false to run on AWS
 
-run_grn_inference=false
+run_grn_inference=true
 run_grn_evaluation=false
-run_download=true
+run_download=false
 
 for dataset in "${datasets[@]}"; do
     if [ "$run_grn_inference" = true ]; then
@@ -33,7 +33,7 @@ for dataset in "${datasets[@]}"; do
     fi
 
     echo "Running GRN evaluation for dataset: $dataset"
-    bash scripts/run_grn_evaluation.sh $dataset $run_local
+    bash scripts/run_grn_evaluation.sh --dataset=$dataset --run_local=$run_local --build_images=false
     fi
 
     if [ "$run_download" = true ]; then

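For context, which stages run in `scripts/run_all.sh` is controlled entirely by the boolean toggles and the `datasets` array at the top of the file. Below is a reconstructed sketch of the evaluation path after this change; only the lines visible in the diff are taken as given, and the surrounding inference and download blocks are omitted.

```bash
# Reconstructed sketch (not the full script): how the toggles drive the
# per-dataset evaluation call after this commit.
set -e

datasets=('replogle')   # extend with e.g. 'op' 'nakatake' to cover more datasets
run_local=false         # set to true to run locally, false to run on AWS
run_grn_evaluation=true # the committed default is false; flip it to run evaluation

for dataset in "${datasets[@]}"; do
    if [ "$run_grn_evaluation" = true ]; then
        echo "Running GRN evaluation for dataset: $dataset"
        bash scripts/run_grn_evaluation.sh --dataset=$dataset --run_local=$run_local --build_images=false
    fi
done
```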
scripts/run_grn_evaluation.sh

Lines changed: 104 additions & 42 deletions
@@ -1,61 +1,98 @@
+# to run a single GRN evaluation, use the following command:
+# bash scripts/run_grn_evaluation.sh --prediction=<prediction file (e.g. prediction.h5ad)> --save_dir=<save dir> --dataset=<dataset (replogle)> --build_images=<true/false (building docker images-only needed one time)> --test_run=<true/false (to use test data)> --run_local=true>
+
+
+
 #!/bin/bash
-DATASET="${1:-replogle}" # Default dataset if not provided
-run_local="${2:-false}"
+set -e
+
+RUN_LOCAL="true"
+RUN_TEST=false
+PREDICTION="none"
+SAVE_DIR="none"
+BUILD_IMAGES=true
+
+# Parse arguments
+for arg in "$@"; do
+    case $arg in
+        --dataset=*)
+            DATASET="${arg#*=}"
+            shift
+            ;;
+        --prediction=*)
+            PREDICTION="${arg#*=}"
+            shift
+            ;;
+        --test_run=*)
+            RUN_TEST="${arg#*=}"
+            shift
+            ;;
+        --save_dir=*)
+            SAVE_DIR="${arg#*=}"
+            shift
+            ;;
+        --build_images=*)
+            BUILD_IMAGES="${arg#*=}"
+            shift
+            ;;
+        --run_local=*)
+            RUN_LOCAL="${arg#*=}"
+            shift
+            ;;
+        *)
+            echo "Unknown argument: $arg"
+            exit 1
+            ;;
+    esac
+done
 
-# datasets="norman replogle op nakatake adamson"
-datasets="$DATASET" #xaira_HCT116 xaira_HEK293T parsebioscience replogle
+if [ -z "${DATASET:-}" ]; then
+    echo "Error: DATASET must be provided. Use --dataset=<dataset_name>."
+    exit 1
+fi
 
 num_workers=10
 metric_ids="[regression_2, ws_distance]" #regression_1, regression_2, ws_distance
 RUN_ID="${DATASET}_evaluation"
+
 models_folder="${DATASET}/"
 reg_type="ridge"
 apply_skeleton=false
 apply_tf=true
 layer='lognorm'
+if [ "$RUN_TEST" = "false" ]; then
+    resource_folder="resources/"
+else
+    resource_folder="resources_test/"
+fi
 
-grn_names=(
-    "positive_control"
-    "pearson_corr"
-    "negative_control"
-    "scglue"
-    "scenicplus"
-    "celloracle"
-    "granie"
-    "figr"
-    "grnboost2"
-    "ppcor"
-    "portia"
-    "scenic"
-    "scprint"
-)
-
-if [ "$run_local" = true ]; then
-    resources_dir="./resources/"
+if [ "$RUN_LOCAL" = true ]; then
+    resources_dir="./${resource_folder}"
 else
-    resources_dir="s3://openproblems-data/resources/grn"
+    resources_dir="s3://openproblems-data/${resource_folder}/grn"
 fi
 
+if [ "$SAVE_DIR" != "none" ]; then
+    publish_dir="${SAVE_DIR}"
+else
+    publish_dir="${resources_dir}/results/${models_folder}"
+fi
 
-publish_dir="${resources_dir}/results/${models_folder}"
+mkdir -p "$publish_dir"
 echo "Publish dir: $publish_dir"
-grn_models_folder="${resources_dir}/results/${models_folder}/"
-grn_models_folder_local="./resources/results/${models_folder}/" # just to control the hetergenity of the models for different datasets
 
 
 params_dir="./params"
+mkdir -p "$params_dir"
 param_file="${params_dir}/${RUN_ID}.yaml"
 param_local="${params_dir}/${RUN_ID}_param_local.yaml"
 param_aws="s3://openproblems-data/resources/grn/results/params/${RUN_ID}_param_local.yaml"
 
-# Print GRN names correctly
-echo "GRN models: ${grn_names[@]}"
-
 # Ensure param_file is clean
 > "$param_local"
 > "$param_file"
 
-if [ "$run_local" = true ]; then
+if [ "$RUN_LOCAL" = true ]; then
     cat >> "$param_local" << HERE
 param_list:
 HERE
@@ -65,7 +102,8 @@ fi
 
 append_entry() {
     local grn_name="$1"
-    local dataset="$2"
+    local prediction="$2"
+    local dataset="$3"
     if [[ "$dataset" =~ ^(norman|nakatake|adamson)$ ]]; then
         layer_='X_norm'
     else
@@ -77,7 +115,7 @@ append_entry() {
     evaluation_data: ${resources_dir}/grn_benchmark/evaluation_data/${dataset}_bulk.h5ad
     tf_all: ${resources_dir}/grn_benchmark/prior/tf_all.csv
     regulators_consensus: ${resources_dir}/grn_benchmark/prior/regulators_consensus_${dataset}.json
-    prediction: ${grn_models_folder}/${dataset}.${grn_name}.${grn_name}.prediction.h5ad
+    prediction: ${prediction}
     skeleton: ${resources_dir}/grn_benchmark/prior/skeleton.csv
     apply_skeleton: ${apply_skeleton}
     apply_tf: ${apply_tf}
@@ -94,31 +132,55 @@ HERE
     fi
 }
 
-# Iterate over datasets and GRN models
-
-for dataset in $datasets; do
+if [ "$PREDICTION" != "none" ]; then
+    append_entry "single_run" $PREDICTION "$DATASET"
+else
+    grn_names=(
+        "positive_control"
+        "pearson_corr"
+        "negative_control"
+        "scglue"
+        "scenicplus"
+        "celloracle"
+        "granie"
+        "figr"
+        "grnboost2"
+        "ppcor"
+        "portia"
+        "scenic"
+        "scprint"
+    )
+    grn_models_folder="${resources_dir}/results/${models_folder}/"
+    grn_models_folder_local="./resources/results/${models_folder}/" # just to control the hetergenity of the models for different datasets
+
+    # Iterate over GRN models
     available_methods=()
     for grn_name in "${grn_names[@]}"; do
-        prediction_file="${grn_models_folder_local}/${dataset}.${grn_name}.${grn_name}.prediction.h5ad"
+        prediction_file="${grn_models_folder_local}/${DATASET}.${grn_name}.${grn_name}.prediction.h5ad"
         if [[ -f "${prediction_file}" ]]; then
-            append_entry "$grn_name" "$dataset"
+            prediction_file=${grn_models_folder}/${DATASET}.${grn_name}.${grn_name}.prediction.h5ad
+            append_entry "$grn_name" $prediction_file "$DATASET"
            available_methods+=("$grn_name")
-        else
-            echo "File not found: ${prediction_file}"
        fi
    done
    echo "Available methods:"
    printf '%s\n' "${available_methods[@]}" | sort -u
-done
+
+fi
 
 
 # Append final fields
-if [ "$run_local" = true ]; then
+if [ "$RUN_LOCAL" = true ]; then
     cat >> "$param_local" << HERE
 output_state: "state.yaml"
 publish_dir: "$publish_dir"
 HERE
-    viash ns build --parallel
+    if [ "$BUILD_IMAGES" = true ]; then
+        echo "Building Docker images..."
+        viash ns build --parallel --setup build -s src/metrics/
+    else
+        viash ns build --parallel
+    fi
     echo "Parameter file created: $param_local"
     nextflow run . \
        -main-script target/nextflow/workflows/run_grn_evaluation/main.nf \

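The rewritten script swaps the old positional parameters for `--key=value` flags, parsed with a `case` statement and `"${arg#*=}"` to strip the flag name. A self-contained illustration of that pattern follows; it handles only two of the flags and is not the real script.

```bash
#!/bin/bash
# Minimal sketch of the --key=value parsing style used in run_grn_evaluation.sh.
# Only --dataset and --test_run are handled here; the real script accepts more flags.
set -e

DATASET=""
TEST_RUN=false

for arg in "$@"; do
    case $arg in
        --dataset=*)  DATASET="${arg#*=}" ;;   # keep everything after the first '='
        --test_run=*) TEST_RUN="${arg#*=}" ;;
        *) echo "Unknown argument: $arg"; exit 1 ;;
    esac
done

if [ -z "$DATASET" ]; then
    echo "Error: DATASET must be provided. Use --dataset=<dataset_name>."
    exit 1
fi

echo "dataset=$DATASET  test_run=$TEST_RUN"
```

As a side note, the `shift` calls inside the original `case` branches are harmless but not strictly needed when iterating with `for arg in "$@"`, since the loop already visits every argument.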
scripts/run_process_data.sh

Lines changed: 5 additions & 5 deletions
@@ -3,9 +3,9 @@
 #SBATCH --output=logs/%j.out
 #SBATCH --error=logs/%j.err
 #SBATCH --ntasks=1
-#SBATCH --cpus-per-task=20
+#SBATCH --cpus-per-task=2
 #SBATCH --time=20:00:00
-#SBATCH --mem=1500GB
+#SBATCH --mem=1000GB
 #SBATCH --partition=cpu
 #SBATCH --mail-type=END,FAIL
 #SBATCH --mail-user=jalil.nourisa@gmail.com
@@ -18,6 +18,6 @@ set -e
 # python src/process_data/norman/script.py
 
 # python src/process_data/opsca/script.py
-# python src/process_data/replogle/script.py #--run_test #--run_test
-# python src/process_data/xaira/script.py #--run_test
-python src/process_data/parse_bioscience/script.py #--run_test
+# python src/process_data/replogle/script.py #--run_test #--run_test
+python src/process_data/xaira/script.py #--run_test
+# python src/process_data/parse_bioscience/script.py #--run_test

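`run_process_data.sh` is a SLURM batch script, so the processing step would normally be submitted to the scheduler rather than executed directly. A minimal usage sketch (partition and resource availability are cluster-specific):

```bash
# Submit the processing job; stdout/stderr go to logs/<jobid>.out and .err
# as configured in the #SBATCH header above.
mkdir -p logs
sbatch scripts/run_process_data.sh

# Check the job's state in the queue
squeue -u "$USER"
```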
scripts/sync_resources.sh

Lines changed: 1 addition & 1 deletion
@@ -18,4 +18,4 @@ set -e
 # aws s3 sync s3://openproblems-data/resources/grn/grn_models resources/grn_models --delete
 # aws s3 sync resources_test/ s3://openproblems-data/resources_test/grn/ --delete
 aws s3 sync resources/grn_benchmark/ s3://openproblems-data/resources/grn/grn_benchmark --delete
-aws s3 sync resources/extended_data/ s3://openproblems-data/resources/grn/extended_data --delete
+# aws s3 sync resources/extended_data/ s3://openproblems-data/resources/grn/extended_data --delete

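Because `aws s3 sync ... --delete` also removes remote objects that no longer exist locally, it can be worth previewing the sync before letting it delete anything. A hedged sketch using the AWS CLI's `--dryrun` flag:

```bash
# Preview what the grn_benchmark sync would change without touching S3,
# then run it for real once the plan looks right.
aws s3 sync resources/grn_benchmark/ s3://openproblems-data/resources/grn/grn_benchmark --delete --dryrun
aws s3 sync resources/grn_benchmark/ s3://openproblems-data/resources/grn/grn_benchmark --delete
```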
src/methods/single_omics/scprint/config.vsh.yaml

Lines changed: 1 addition & 1 deletion
@@ -69,4 +69,4 @@ runners:
       # docker_run_args: --gpus all
   - type: nextflow
     directives:
-      label: [midtime, midmem, midcpu, biggpu]
+      label: [midtime, highmem, midcpu, biggpu]
