
Commit 1eacd3b

Merge pull request #40 from openproblems-bio/jalil
qc on perturbation effect added
2 parents a03da55 + b1506f2 commit 1eacd3b

File tree: 10 files changed (+485, -220 lines)


README.md

Lines changed: 4 additions & 2 deletions
@@ -90,9 +90,10 @@ Of note, we are using the `resources_test` datasets, which are small versions of
 Once got the prediction for a given dataset (e.g. op), use the following code to obtain evaluation scores.
 
 ```bash
-scripts/single_grn_evaluation.sh output/net.h5ad op --test_run
+bash scripts/run_grn_evaluation.sh --prediction=output/net.h5ad --save_dir=output/ --dataset=op --build_images=true --test_run=true
 ```
-**This** outputs the scores into `output/test_run/score_uns.yaml`. Of note, by passing `--test_run`, the evaluations are done on the test data. To use the actual data (`resources` folder), omit this flag.
+
+**This** outputs the scores into `output/score_uns.yaml`. Of note, by passing `--test_run`, the evaluations are done on the test data. To use the actual data (`resources` folder), omit this flag.
 
 
 ## Add a GRN inference method, evaluation metric, or dataset
@@ -109,6 +110,7 @@ To add a new component to the repository, follow the [Documentation](https://gen
 | Antoine Passimier | contributor |
 | Marco Stock | contributor |
 | Christian Arnold | contributor |
+| Jérémie Kalfon | contributor |
 
 ## API
 
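The updated README instructions can be exercised end to end with a short sketch like the one below. It assumes a prediction file already exists at `output/net.h5ad` (e.g. produced by a prior inference run) and simply chains the documented evaluation call with a look at the resulting scores file; paths are illustrative.

```bash
# Sketch: evaluate an existing prediction on the small test resources,
# then print the resulting scores. net.h5ad is assumed to come from an
# earlier inference step.
bash scripts/run_grn_evaluation.sh \
    --prediction=output/net.h5ad \
    --save_dir=output/ \
    --dataset=op \
    --build_images=true \
    --test_run=true

# Scores are written as YAML; set --test_run=false (or omit the flag, per the
# README) to evaluate on the full data in the `resources` folder.
cat output/score_uns.yaml
```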
docs/source/evaluation.rst

Lines changed: 10 additions & 1 deletion
@@ -32,7 +32,16 @@ The inferred network should have a tabular format with the following columns:
 See `resources_test/grn_models/op/collectri.h5ad` for an example of the expected format.
 
 For the regression based approaches, we used the pseudobulk version of the perturbation data while for the Wasserstein distance, the single cell data are used.
-See 'scripts/single_grn_evaluation.sh' for an example of how to run the evaluation.
 
 It should be noted that for Wasserstein distance, we have already computed all possible combination of TF-gene pairs and stored it in the `resources/grn_benchmark/prior/` folder.
 This substantially reduces the computation time during evaluation.
+
+To run the evaluation for a given GRN and dataset, use the following command:
+```bash
+bash scripts/run_grn_evaluation.sh --prediction=<inferred GRN (e.g. collectri.h5ad)> --save_dir=<e.g. output/> --dataset=<e.g. replogle> --build_images=<true or false; true for the first run> --test_run=<true or false; true to run on test data>
+```
+
+Example command:
+```bash
+bash scripts/run_grn_evaluation.sh --prediction=resources/grn_models/op/collectri.h5ad --save_dir=output/ --dataset=op --build_images=true --test_run=false
+```

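Since the evaluation script now takes named arguments, scoring the same inferred GRN against several datasets reduces to a loop over `--dataset`. A minimal sketch, where the dataset list and per-dataset output folders are illustrative assumptions rather than anything prescribed by the docs:

```bash
# Sketch: score one prediction against multiple datasets, writing each result
# to its own output folder. Dataset names here are examples only.
prediction=resources/grn_models/op/collectri.h5ad

for dataset in op replogle; do
    bash scripts/run_grn_evaluation.sh \
        --prediction=$prediction \
        --save_dir=output/${dataset}/ \
        --dataset=$dataset \
        --build_images=false \
        --test_run=false
done
```

`--build_images=false` is used here on the assumption that the Docker images were already built during a first run.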
scripts/run_all.sh

Lines changed: 5 additions & 5 deletions
@@ -1,11 +1,11 @@
 set -e
 
-datasets=('parsebioscience' 'xaira_HCT116' 'xaira_HEK293T') #'replogle' 'op' 'nakatake' 'adamson' 'norman'
-run_local=false
+datasets=('replogle') #'replogle' 'op' 'nakatake' 'adamson' 'norman'
+run_local=false # set to true to run locally, false to run on AWS
 
-run_grn_inference=false
+run_grn_inference=true
 run_grn_evaluation=false
-run_download=true
+run_download=false
 
 for dataset in "${datasets[@]}"; do
     if [ "$run_grn_inference" = true ]; then
@@ -33,7 +33,7 @@ for dataset in "${datasets[@]}"; do
     fi
 
     echo "Running GRN evaluation for dataset: $dataset"
-    bash scripts/run_grn_evaluation.sh $dataset $run_local
+    bash scripts/run_grn_evaluation.sh --dataset=$dataset --run_local=$run_local --build_images=false
     fi
 
     if [ "$run_download" = true ]; then

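For context, which stages run in `scripts/run_all.sh` is controlled entirely by the boolean toggles and the `datasets` array at the top of the file. Below is a reconstructed sketch of the evaluation path after this change; only the lines visible in the diff are taken as given, and the surrounding inference and download blocks are omitted.

```bash
# Reconstructed sketch (not the full script): how the toggles drive the
# per-dataset evaluation call after this commit.
set -e

datasets=('replogle')   # extend with e.g. 'op' 'nakatake' to cover more datasets
run_local=false         # set to true to run locally, false to run on AWS
run_grn_evaluation=true # the committed default is false; flip it to run evaluation

for dataset in "${datasets[@]}"; do
    if [ "$run_grn_evaluation" = true ]; then
        echo "Running GRN evaluation for dataset: $dataset"
        bash scripts/run_grn_evaluation.sh --dataset=$dataset --run_local=$run_local --build_images=false
    fi
done
```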
scripts/run_grn_evaluation.sh

Lines changed: 104 additions & 42 deletions
@@ -1,61 +1,98 @@
+# to run a single GRN evaluation, use the following command:
+# bash scripts/run_grn_evaluation.sh --prediction=<prediction file (e.g. prediction.h5ad)> --save_dir=<save dir> --dataset=<dataset (replogle)> --build_images=<true/false (building docker images-only needed one time)> --test_run=<true/false (to use test data)> --run_local=true>
+
+
+
 #!/bin/bash
-DATASET="${1:-replogle}" # Default dataset if not provided
-run_local="${2:-false}"
+set -e
+
+RUN_LOCAL="true"
+RUN_TEST=false
+PREDICTION="none"
+SAVE_DIR="none"
+BUILD_IMAGES=true
+
+# Parse arguments
+for arg in "$@"; do
+    case $arg in
+        --dataset=*)
+            DATASET="${arg#*=}"
+            shift
+            ;;
+        --prediction=*)
+            PREDICTION="${arg#*=}"
+            shift
+            ;;
+        --test_run=*)
+            RUN_TEST="${arg#*=}"
+            shift
+            ;;
+        --save_dir=*)
+            SAVE_DIR="${arg#*=}"
+            shift
+            ;;
+        --build_images=*)
+            BUILD_IMAGES="${arg#*=}"
+            shift
+            ;;
+        --run_local=*)
+            RUN_LOCAL="${arg#*=}"
+            shift
+            ;;
+        *)
+            echo "Unknown argument: $arg"
+            exit 1
+            ;;
+    esac
+done
 
-# datasets="norman replogle op nakatake adamson"
-datasets="$DATASET" #xaira_HCT116 xaira_HEK293T parsebioscience replogle
+if [ -z "${DATASET:-}" ]; then
+    echo "Error: DATASET must be provided. Use --dataset=<dataset_name>."
+    exit 1
+fi
 
 num_workers=10
 metric_ids="[regression_2, ws_distance]" #regression_1, regression_2, ws_distance
 RUN_ID="${DATASET}_evaluation"
+
 models_folder="${DATASET}/"
 reg_type="ridge"
 apply_skeleton=false
 apply_tf=true
 layer='lognorm'
+if [ "$RUN_TEST" = "false" ]; then
+    resource_folder="resources/"
+else
+    resource_folder="resources_test/"
+fi
 
-grn_names=(
-    "positive_control"
-    "pearson_corr"
-    "negative_control"
-    "scglue"
-    "scenicplus"
-    "celloracle"
-    "granie"
-    "figr"
-    "grnboost2"
-    "ppcor"
-    "portia"
-    "scenic"
-    "scprint"
-)
-
-if [ "$run_local" = true ]; then
-    resources_dir="./resources/"
+if [ "$RUN_LOCAL" = true ]; then
+    resources_dir="./${resource_folder}"
 else
-    resources_dir="s3://openproblems-data/resources/grn"
+    resources_dir="s3://openproblems-data/${resource_folder}/grn"
 fi
 
+if [ "$SAVE_DIR" != "none" ]; then
+    publish_dir="${SAVE_DIR}"
+else
+    publish_dir="${resources_dir}/results/${models_folder}"
+fi
 
-publish_dir="${resources_dir}/results/${models_folder}"
+mkdir -p "$publish_dir"
 echo "Publish dir: $publish_dir"
-grn_models_folder="${resources_dir}/results/${models_folder}/"
-grn_models_folder_local="./resources/results/${models_folder}/" # just to control the hetergenity of the models for different datasets
 
 
 params_dir="./params"
+mkdir -p "$params_dir"
 param_file="${params_dir}/${RUN_ID}.yaml"
 param_local="${params_dir}/${RUN_ID}_param_local.yaml"
 param_aws="s3://openproblems-data/resources/grn/results/params/${RUN_ID}_param_local.yaml"
 
-# Print GRN names correctly
-echo "GRN models: ${grn_names[@]}"
-
 # Ensure param_file is clean
 > "$param_local"
 > "$param_file"
 
-if [ "$run_local" = true ]; then
+if [ "$RUN_LOCAL" = true ]; then
     cat >> "$param_local" << HERE
 param_list:
 HERE
@@ -65,7 +102,8 @@ fi
 
 append_entry() {
     local grn_name="$1"
-    local dataset="$2"
+    local prediction="$2"
+    local dataset="$3"
     if [[ "$dataset" =~ ^(norman|nakatake|adamson)$ ]]; then
         layer_='X_norm'
     else
@@ -77,7 +115,7 @@ append_entry() {
     evaluation_data: ${resources_dir}/grn_benchmark/evaluation_data/${dataset}_bulk.h5ad
     tf_all: ${resources_dir}/grn_benchmark/prior/tf_all.csv
     regulators_consensus: ${resources_dir}/grn_benchmark/prior/regulators_consensus_${dataset}.json
-    prediction: ${grn_models_folder}/${dataset}.${grn_name}.${grn_name}.prediction.h5ad
+    prediction: ${prediction}
     skeleton: ${resources_dir}/grn_benchmark/prior/skeleton.csv
     apply_skeleton: ${apply_skeleton}
     apply_tf: ${apply_tf}
@@ -94,31 +132,55 @@ HERE
     fi
 }
 
-# Iterate over datasets and GRN models
-
-for dataset in $datasets; do
+if [ "$PREDICTION" != "none" ]; then
+    append_entry "single_run" $PREDICTION "$DATASET"
+else
+    grn_names=(
+        "positive_control"
+        "pearson_corr"
+        "negative_control"
+        "scglue"
+        "scenicplus"
+        "celloracle"
+        "granie"
+        "figr"
+        "grnboost2"
+        "ppcor"
+        "portia"
+        "scenic"
+        "scprint"
+    )
+    grn_models_folder="${resources_dir}/results/${models_folder}/"
+    grn_models_folder_local="./resources/results/${models_folder}/" # just to control the hetergenity of the models for different datasets
+
+    # Iterate over GRN models
     available_methods=()
     for grn_name in "${grn_names[@]}"; do
-        prediction_file="${grn_models_folder_local}/${dataset}.${grn_name}.${grn_name}.prediction.h5ad"
+        prediction_file="${grn_models_folder_local}/${DATASET}.${grn_name}.${grn_name}.prediction.h5ad"
         if [[ -f "${prediction_file}" ]]; then
-            append_entry "$grn_name" "$dataset"
+            prediction_file=${grn_models_folder}/${DATASET}.${grn_name}.${grn_name}.prediction.h5ad
+            append_entry "$grn_name" $prediction_file "$DATASET"
            available_methods+=("$grn_name")
-        else
-            echo "File not found: ${prediction_file}"
        fi
    done
    echo "Available methods:"
    printf '%s\n' "${available_methods[@]}" | sort -u
-done
+
+fi
 
 
 # Append final fields
-if [ "$run_local" = true ]; then
+if [ "$RUN_LOCAL" = true ]; then
     cat >> "$param_local" << HERE
 output_state: "state.yaml"
 publish_dir: "$publish_dir"
 HERE
-    viash ns build --parallel
+    if [ "$BUILD_IMAGES" = true ]; then
+        echo "Building Docker images..."
+        viash ns build --parallel --setup build -s src/metrics/
+    else
+        viash ns build --parallel
+    fi
     echo "Parameter file created: $param_local"
     nextflow run . \
        -main-script target/nextflow/workflows/run_grn_evaluation/main.nf \

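The rewritten script swaps the old positional parameters for `--key=value` flags, parsed with a `case` statement and `"${arg#*=}"` to strip the flag name. A self-contained illustration of that pattern follows; it handles only two of the flags and is not the real script.

```bash
#!/bin/bash
# Minimal sketch of the --key=value parsing style used in run_grn_evaluation.sh.
# Only --dataset and --test_run are handled here; the real script accepts more flags.
set -e

DATASET=""
TEST_RUN=false

for arg in "$@"; do
    case $arg in
        --dataset=*)  DATASET="${arg#*=}" ;;   # keep everything after the first '='
        --test_run=*) TEST_RUN="${arg#*=}" ;;
        *) echo "Unknown argument: $arg"; exit 1 ;;
    esac
done

if [ -z "$DATASET" ]; then
    echo "Error: DATASET must be provided. Use --dataset=<dataset_name>."
    exit 1
fi

echo "dataset=$DATASET  test_run=$TEST_RUN"
```

As a side note, the `shift` calls inside the original `case` branches are harmless but not strictly needed when iterating with `for arg in "$@"`, since the loop already visits every argument.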
scripts/run_process_data.sh

Lines changed: 5 additions & 5 deletions
@@ -3,9 +3,9 @@
 #SBATCH --output=logs/%j.out
 #SBATCH --error=logs/%j.err
 #SBATCH --ntasks=1
-#SBATCH --cpus-per-task=20
+#SBATCH --cpus-per-task=2
 #SBATCH --time=20:00:00
-#SBATCH --mem=1500GB
+#SBATCH --mem=1000GB
 #SBATCH --partition=cpu
 #SBATCH --mail-type=END,FAIL
 #SBATCH --mail-user=jalil.nourisa@gmail.com
@@ -18,6 +18,6 @@ set -e
 # python src/process_data/norman/script.py
 
 # python src/process_data/opsca/script.py
-# python src/process_data/replogle/script.py #--run_test #--run_test
-# python src/process_data/xaira/script.py #--run_test
-python src/process_data/parse_bioscience/script.py #--run_test
+# python src/process_data/replogle/script.py #--run_test #--run_test
+python src/process_data/xaira/script.py #--run_test
+# python src/process_data/parse_bioscience/script.py #--run_test

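`run_process_data.sh` is a SLURM batch script, so the processing step would normally be submitted to the scheduler rather than executed directly. A minimal usage sketch (partition and resource availability are cluster-specific):

```bash
# Submit the processing job; stdout/stderr go to logs/<jobid>.out and .err
# as configured in the #SBATCH header above.
mkdir -p logs
sbatch scripts/run_process_data.sh

# Check the job's state in the queue
squeue -u "$USER"
```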
scripts/sync_resources.sh

Lines changed: 1 addition & 1 deletion
@@ -18,4 +18,4 @@ set -e
 # aws s3 sync s3://openproblems-data/resources/grn/grn_models resources/grn_models --delete
 # aws s3 sync resources_test/ s3://openproblems-data/resources_test/grn/ --delete
 aws s3 sync resources/grn_benchmark/ s3://openproblems-data/resources/grn/grn_benchmark --delete
-aws s3 sync resources/extended_data/ s3://openproblems-data/resources/grn/extended_data --delete
+# aws s3 sync resources/extended_data/ s3://openproblems-data/resources/grn/extended_data --delete

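Because `aws s3 sync ... --delete` also removes remote objects that no longer exist locally, it can be worth previewing the sync before letting it delete anything. A hedged sketch using the AWS CLI's `--dryrun` flag:

```bash
# Preview what the grn_benchmark sync would change without touching S3,
# then run it for real once the plan looks right.
aws s3 sync resources/grn_benchmark/ s3://openproblems-data/resources/grn/grn_benchmark --delete --dryrun
aws s3 sync resources/grn_benchmark/ s3://openproblems-data/resources/grn/grn_benchmark --delete
```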
src/methods/single_omics/scprint/config.vsh.yaml

Lines changed: 1 addition & 1 deletion
@@ -69,4 +69,4 @@ runners:
       # docker_run_args: --gpus all
   - type: nextflow
     directives:
-      label: [midtime, midmem, midcpu, biggpu]
+      label: [midtime, highmem, midcpu, biggpu]
