Commit d6b578f

Merge pull request #116 from openproblems-bio/jalil
parsebioscience dataset updated to remove minor cell types + sem and …
2 parents a434c71 + fe8b1a4 commit d6b578f

14 files changed (+376 −222 lines)

scripts/process_data/parsebioscience.sh

Lines changed: 1 addition & 1 deletion
@@ -14,4 +14,4 @@
 set -e
 
 echo "Processing bioscience"
-python src/process_data/main/parse_bioscience/script.py #--run_test
+python src/process_data/main/parsebioscience/script.py #--run_test

scripts/process_data/rest.sh

Lines changed: 4 additions & 4 deletions
@@ -13,9 +13,9 @@
 
 set -e
 
-python src/process_data/main/adamson/script.py
+# python src/process_data/main/adamson/script.py
 # python src/process_data/main/nakatake/script.py
-python src/process_data/main/norman/script.py
+# python src/process_data/main/norman/script.py
 
 # echo "Processing opsca"
 # python src/process_data/main/opsca/script.py
@@ -27,5 +27,5 @@ python src/process_data/main/norman/script.py
 
 # echo "Processing 300BCG"
 # python src/process_data/main/300BCG/script.py
-# echo "Processing IBD"
-# python src/process_data/main/ibd/script.py
+echo "Processing IBD"
+python src/process_data/main/ibd/script.py

scripts/run_all.sh

Lines changed: 22 additions & 19 deletions
@@ -1,11 +1,11 @@
 set -e
 
 # datasets=( 'replogle' 'op' 'nakatake' 'adamson' 'norman' 'xaira_HEK293T' 'xaira_HCT116' 'parsebioscience' 'ibd_uc' 'ibd_cd' '300BCG' ) #'replogle' 'op' 'nakatake' 'adamson' 'norman' 'xaira_HEK293T' 'xaira_HCT116' 'parsebioscience' 'ibd_uc' 'ibd_cd' '300BCG') #
-datasets=( 'op' 'replogle' ) #'replogle' 'op' 'nakatake' 'adamson' 'norman' 'xaira_HEK293T' 'xaira_HCT116' 'parsebioscience' 'ibd_uc' 'ibd_cd' '300BCG') #
+datasets=( 'ibd_uc' 'ibd_cd' ) #'replogle' 'op' 'nakatake' 'adamson' 'norman' 'xaira_HEK293T' 'xaira_HCT116' 'parsebioscience' 'ibd_uc' 'ibd_cd' '300BCG') #
 run_local=false # set to true to run locally, false to run on AWS
 
-run_grn_inference=false
-run_grn_evaluation=true
+run_grn_inference=true
+run_grn_evaluation=false
 run_download=false
 
 num_workers=20
@@ -33,29 +33,32 @@ for dataset in "${datasets[@]}"; do
     fi
 
     if [ "$run_grn_evaluation" = true ]; then
-        # if [ -f "$trace_file" ]; then
-        # dir=$(dirname "$trace_file")
-        # base=$(basename "$trace_file" .txt)
-        # today=$(date +%Y-%m-%d)
-        # cp "$trace_file" "${dir}/${base}_${today}.txt"
-        # fi
+
+        if [ -f "$trace_file" ]; then
+            dir=$(dirname "$trace_file")
+            base=$(basename "$trace_file" .txt)
+            today=$(date +%Y-%m-%d)
+            cp "$trace_file" "${dir}/${base}_${today}.txt"
+        fi
 
         # if [ "$run_local" = false ]; then
         # echo "Uploading inference results to AWS"
         # aws s3 sync resources/results/$dataset s3://openproblems-data/resources/grn/results/$dataset
         # aws s3 sync s3://openproblems-data/resources/grn/results/$dataset resources/results/$dataset
         # fi
-        # if [ "$run_local" = false ]; then
-        # echo "Downloading inference results from AWS"
-        # aws s3 sync s3://openproblems-data/resources/grn/results/$dataset resources/results/$dataset
-        # fi
-        # echo "Running consensus for dataset: $dataset"
-        # bash scripts/prior/run_consensus.sh $dataset # run consensus for Regression and ws distance -> needs to be run after adding each method and dataset
 
-        # if [ "$run_local" = false ]; then
-        # echo "Syncing prior results to AWS"
-        # aws s3 sync resources/grn_benchmark/prior s3://openproblems-data/resources/grn/grn_benchmark/prior
-        # fi
+
+        if [ "$run_local" = false ]; then
+            echo "Downloading inference results from AWS"
+            aws s3 sync s3://openproblems-data/resources/grn/results/$dataset resources/results/$dataset
+        fi
+        echo "Running consensus for dataset: $dataset"
+        bash scripts/prior/run_consensus.sh $dataset # run consensus for Regression and ws distance -> needs to be run after adding each method and dataset
+
+        if [ "$run_local" = false ]; then
+            echo "Syncing prior results to AWS"
+            aws s3 sync resources/grn_benchmark/prior s3://openproblems-data/resources/grn/grn_benchmark/prior
+        fi
 
         echo "Running GRN evaluation for dataset: $dataset"
         bash scripts/run_grn_evaluation.sh --dataset=$dataset --run_local=$run_local --build_images=false --num_workers=$num_workers

scripts/run_grn_inference.sh

Lines changed: 2 additions & 2 deletions
@@ -117,14 +117,14 @@ HERE
 
 if [[ "$DATASET" =~ ^(replogle|parsebioscience|xaira_HEK293T|xaira_HCT116)$ ]]; then
     methods="[pearson_corr, negative_control, positive_control, grnboost, portia, scenic, geneformer, scgpt, spearman_corr]"
-    # methods="[grnboost, scenic]"
+    # methods="[pearson_corr, negative_control, positive_control, portia, geneformer, scgpt, spearman_corr]"
     append_entry "$DATASET" "$methods"
     append_entry "$DATASET" "[scprint]" "true"
 
     echo $methods
 elif [ "$DATASET" = "op" ] || [ "$DATASET" = "ibd_cd" ] || [ "$DATASET" = "ibd_uc" ]; then
     methods="[pearson_corr, spearman_corr, negative_control, positive_control, grnboost, portia, scenic, scprint, geneformer, scgpt, figr, scenicplus, celloracle, granie, scglue]"
-    methods="[celloracle, scglue]"
+    # methods="[celloracle, scglue]"
 
     append_entry "$DATASET" "$methods"
     echo $methods

scripts/sync_resources.sh

Lines changed: 5 additions & 3 deletions
@@ -22,12 +22,14 @@ set -e
 
 # aws s3 sync s3://openproblems-data/resources/grn/grn_benchmark resources/grn_benchmark/ --no-sign-request
 # aws s3 sync resources/grn_benchmark/prior s3://openproblems-data/resources/grn/grn_benchmark/prior --delete
-# aws s3 sync resources/extended_data/ s3://openproblems-data/resources/grn/extended_data --delete
+aws s3 sync resources/extended_data/ s3://openproblems-data/resources/grn/extended_data --delete
 # aws s3 sync resources/results/experiment s3://openproblems-data/resources/grn/results/experiment --delete
 # aws s3 sync resources_test s3://openproblems-data/resources_test/grn/ --delete
 # aws s3 sync s3://openproblems-data/resources_test/grn/ resources_test --delete
-aws s3 sync s3://openproblems-data/resources/grn/grn_benchmark/ground_truth resources_test/grn_benchmark/ground_truth
+# aws s3 sync s3://openproblems-data/resources/grn/grn_benchmark/ground_truth resources_test/grn_benchmark/ground_truth
 # aws s3 sync resources/grn_benchmark/ground_truth s3://openproblems-data/resources/grn/grn_benchmark/ground_truth
-# aws s3 sync resources/grn_benchmark/ s3://openproblems-data/resources/grn/grn_benchmark/ --delete
+aws s3 sync resources/grn_benchmark/ s3://openproblems-data/resources/grn/grn_benchmark/ --delete
+# aws s3 sync resources/results/ibd_uc s3://openproblems-data/resources/grn/results/ibd_uc
+# aws s3 sync resources/results/ibd_cd s3://openproblems-data/resources/grn/results/ibd_cd
 
 # aws s3 sync s3://openproblems-data/resources/grn/grn_benchmark/ground_truth resources/grn_benchmark/ground_truth --no-sign-request

src/metrics/sem/helper.py

Lines changed: 104 additions & 98 deletions
@@ -6,6 +6,7 @@
 import pandas as pd
 import torch
 import anndata as ad
+import scanpy as sc
 import tqdm
 from scipy.sparse import csr_matrix
 from scipy.stats import ttest_rel, spearmanr, pearsonr, wilcoxon
@@ -378,40 +379,63 @@ def main(par):
         j = gene_dict[target]
         A[i, j] = float(weight)
 
-    # Only consider the genes that are actually present in the inferred GRN,
-    # and keep only the most-connected genes (for speed).
-    gene_mask = np.logical_or(np.any(A, axis=1), np.any(A, axis=0))
+    # Compute HVGs from full evaluation data (for HVG-based evaluation)
+    print("\n======== Computing HVGs from full evaluation data ========")
+    n_top_hvg = par['n_top_genes']
+    sc.pp.highly_variable_genes(adata, n_top_genes=n_top_hvg, flavor='seurat', layer=layer)
+    hvg_mask_full = adata.var['highly_variable'].values
+    hvg_genes = gene_names[hvg_mask_full]
+    print(f"Total HVGs identified: {hvg_mask_full.sum()}")
+
+    # For GRN-based evaluation: keep only most-connected genes in the GRN
+    print("\n======== Filtering genes for GRN-based evaluation ========")
+    gene_mask_grn = np.logical_or(np.any(A, axis=1), np.any(A, axis=0))
     in_degrees = np.sum(A != 0, axis=0)
     out_degrees = np.sum(A != 0, axis=1)
-    # n_genes = par['n_top_genes']
-    n_genes = 3000
-    idx = np.argsort(np.maximum(out_degrees, in_degrees))[:-n_genes]
-    gene_mask[idx] = False
-    X = X[:, gene_mask]
-    X = X.toarray() if isinstance(X, csr_matrix) else X
-    A = A[gene_mask, :][:, gene_mask]
-    gene_names = gene_names[gene_mask]
+    n_genes_grn = par['n_top_genes']
+    idx = np.argsort(np.maximum(out_degrees, in_degrees))[:-n_genes_grn]
+    gene_mask_grn[idx] = False
+
+    X_grn = X[:, gene_mask_grn]
+    X_grn = X_grn.toarray() if isinstance(X_grn, csr_matrix) else X_grn
+    A_grn = A[gene_mask_grn, :][:, gene_mask_grn]
+    gene_names_grn = gene_names[gene_mask_grn]
+    print(f"Genes for GRN-based evaluation: {len(gene_names_grn)}")
 
     # Remove self-regulations
     np.fill_diagonal(A, 0)
+    np.fill_diagonal(A_grn, 0)
 
     # Check whether the inferred GRN contains signed predictions
     if False:
         use_signs = np.any(A < 0)
     else:
         use_signs = False
 
-    # Center and scale dataset
-    scaler = StandardScaler()
-    scaler.fit(X[are_controls, :]) # Use controls only to infer statistics (to avoid data leakage)
-    X = scaler.transform(X)
-
-    # Get negative controls
-    X_controls = X[are_controls, :]
-    delta_X = compute_perturbations(X, are_controls, match_groups, loose_match_groups)
+    # Center and scale dataset for GRN-based evaluation
+    scaler_grn = StandardScaler()
+    X_grn_controls = X_grn[are_controls, :]
+    scaler_grn.fit(X_grn_controls)
+    X_grn_scaled = scaler_grn.transform(X_grn)
+    X_grn_controls_scaled = X_grn_scaled[are_controls, :]
+    delta_X_grn = compute_perturbations(X_grn_scaled, are_controls, match_groups, loose_match_groups)
+    delta_X_grn = delta_X_grn[~are_controls, :]
+
+    # Center and scale dataset for HVG-based evaluation (use all HVG genes, even if not in GRN)
+    X_hvg = X[:, hvg_mask_full]
+    X_hvg = X_hvg.toarray() if isinstance(X_hvg, csr_matrix) else X_hvg
+    A_hvg = A[hvg_mask_full, :][:, hvg_mask_full]
+    gene_names_hvg = gene_names[hvg_mask_full]
+    scaler_hvg = StandardScaler()
+    X_hvg_controls = X_hvg[are_controls, :]
+    scaler_hvg.fit(X_hvg_controls)
+    X_hvg_scaled = scaler_hvg.transform(X_hvg)
+    X_hvg_controls_scaled = X_hvg_scaled[are_controls, :]
+    delta_X_hvg = compute_perturbations(X_hvg_scaled, are_controls, match_groups, loose_match_groups)
+    delta_X_hvg = delta_X_hvg[~are_controls, :]
+    print(f"Genes for HVG-based evaluation: {len(gene_names_hvg)}")
 
     # Remove negative controls from downstream analysis
-    delta_X = delta_X[~are_controls, :]
     cv_groups = cv_groups[~are_controls]
     match_groups = match_groups[~are_controls]
    loose_match_groups = loose_match_groups[~are_controls]
@@ -420,94 +444,76 @@ def main(par):
     # Make sure that no compound ends up in both sets.
     try:
         splitter = GroupShuffleSplit(test_size=0.5, n_splits=2, random_state=seed) # Use consistent seed
-        train_idx, _ = next(splitter.split(delta_X, groups=cv_groups))
+        train_idx, _ = next(splitter.split(delta_X_grn, groups=cv_groups))
     except ValueError:
         print("Group k-fold failed. Using k-fold CV instead.")
         splitter = KFold(n_splits=2, random_state=seed, shuffle=True) # Use consistent seed
-        train_idx, _ = next(splitter.split(delta_X))
-    is_train = np.zeros(len(delta_X), dtype=bool)
+        train_idx, _ = next(splitter.split(delta_X_grn))
+    is_train = np.zeros(len(delta_X_grn), dtype=bool)
     is_train[train_idx] = True
 
-    # Create a split between genes: reporter genes and evaluation genes.
-    # All TFs and IEGs should be included in the reporter gene set.
-    n_genes = A.shape[1]
-    reg_mask = np.asarray(A != 0).any(axis=1)
-    ieg_mask = np.asarray([gene_name in IEG for gene_name in gene_names], dtype=bool)
-    is_reporter = np.logical_or(reg_mask, ieg_mask)
-    print(f"Proportion of reporter genes: {np.mean(is_reporter)}")
-    print(f"Use regulatory modes/signs: {use_signs}")
+    # ========== GRN-based evaluation ==========
+    print("\n======== Evaluate inferred GRN (GRN-based: most connected genes) ========")
+    n_genes_grn = A_grn.shape[1]
+    reg_mask_grn = np.asarray(A_grn != 0).any(axis=1)
+    ieg_mask_grn = np.asarray([gene_name in IEG for gene_name in gene_names_grn], dtype=bool)
+    is_reporter_grn = np.logical_or(reg_mask_grn, ieg_mask_grn)
+    print(f"Proportion of reporter genes (GRN): {np.mean(is_reporter_grn)}")
 
-    # Create baseline model
-    try:
-        A_baseline = create_grn_baseline(A)
-    except:
-        print("Failed to create baseline GRN. Using zero baseline.")
-        raise ValueError("Failed to create baseline GRN.")
-
-    # Evaluate inferred GRN
-    print("\n======== Evaluate inferred GRN ========")
-    scores = evaluate_grn(X_controls, delta_X, is_train, is_reporter, A, signed=use_signs)
+
+    scores_grn = evaluate_grn(X_grn_controls_scaled, delta_X_grn, is_train, is_reporter_grn, A_grn, signed=use_signs)
+    valid_scores_grn = scores_grn[~np.isnan(scores_grn)]
+
+    if len(valid_scores_grn) == 0:
+        print("WARNING: No valid genes to evaluate for GRN-based!")
+        sem_grn_score = 0.0
+    else:
+        sem_grn_score = float(np.mean(valid_scores_grn))
+        print(f"SEM GRN score (mean R²): {sem_grn_score:.4f}")
+        print(f"Valid genes evaluated: {len(valid_scores_grn)}/{len(scores_grn)}")
+        print(f"SEM GRN score (min): {np.min(valid_scores_grn):.4f}")
+        print(f"SEM GRN score (max): {np.max(valid_scores_grn):.4f}")
+
+    # ========== HVG-based evaluation ==========
+    print("\n======== Evaluate inferred GRN (HVG-based: highly variable genes) ========")
+    n_genes_hvg = A_hvg.shape[1]
+    reg_mask_hvg = np.asarray(A_hvg != 0).any(axis=1)
+    ieg_mask_hvg = np.asarray([gene_name in IEG for gene_name in gene_names_hvg], dtype=bool)
+    is_reporter_hvg = np.logical_or(reg_mask_hvg, ieg_mask_hvg)
+    print(f"Proportion of reporter genes (HVG): {np.mean(is_reporter_hvg)}")
+
+    scores_hvg = evaluate_grn(X_hvg_controls_scaled, delta_X_hvg, is_train, is_reporter_hvg, A_hvg, signed=use_signs)
+
+    # For HVGs: genes with no GRN connections get score of 0 (penalize missing connections)
+    has_parent_hvg = (np.asarray(A_hvg != 0).any(axis=0))
+    eval_mask_hvg = ~is_reporter_hvg
+    scores_hvg_penalized = scores_hvg.copy()
+    for j in range(len(scores_hvg_penalized)):
+        if eval_mask_hvg[j]:
+            if not has_parent_hvg[j]: # Gene has no connections in GRN
+                scores_hvg_penalized[j] = 0.0 # Penalize by setting score to 0
+            elif np.isnan(scores_hvg_penalized[j]):
+                scores_hvg_penalized[j] = 0.0 # Also set NaN to 0
 
-    # Keep only valid scores (non-NaN)
-    valid_scores = scores[~np.isnan(scores)]
+    valid_scores_hvg = scores_hvg_penalized[eval_mask_hvg]
 
-    if len(valid_scores) == 0:
-        # No valid genes to evaluate
-        print("WARNING: No valid genes to evaluate!")
-        results = {'sem': [0.0]}
+    if len(valid_scores_hvg) == 0:
+        print("WARNING: No valid HVG genes to evaluate!")
+        sem_hvg_score = 0.0
     else:
-        # Final score is mean of valid R² scores
-        final_score = float(np.mean(valid_scores))
-
-        print(f"\nMethod: {method_id}")
-        print(f"SEM score (mean R²): {final_score:.4f}")
-        print(f"Valid genes evaluated: {len(valid_scores)}/{len(scores)}")
-        print(f"SEM score (min): {np.min(valid_scores):.4f}")
-        print(f"SEM score (max): {np.max(valid_scores):.4f}")
-
-        results = {'sem': [float(final_score)]}
+        sem_hvg_score = float(np.mean(valid_scores_hvg))
+        n_missing = np.sum(~has_parent_hvg[eval_mask_hvg])
+        print(f"SEM HVG score (mean R²): {sem_hvg_score:.4f}")
+        print(f"HVG genes evaluated: {len(valid_scores_hvg)}")
+        print(f"HVG genes missing in GRN (penalized with 0): {n_missing}")
+        print(f"SEM HVG score (min): {np.min(valid_scores_hvg):.4f}")
+        print(f"SEM HVG score (max): {np.max(valid_scores_hvg):.4f}")
 
-    # Evaluate baseline GRN
-    if False:
-        print("\n======== Evaluate shuffled GRN ========")
-        scores_baseline = evaluate_grn(X_controls, delta_X, is_train, is_reporter, A_baseline, signed=use_signs)
-
-        # Keep only the genes for which both GRNs got a score
-        mask = ~np.logical_or(np.isnan(scores), np.isnan(scores_baseline))
-        scores = scores[mask]
-        scores_baseline = scores_baseline[mask]
-
-        rr_all = {}
-        # Perform rank test between actual scores and baseline
-        rr_all['spearman'] = float(np.mean(scores))
-        rr_all['spearman_shuffled'] = float(np.mean(scores_baseline))
-        if len(scores) == 0:
-            raise ValueError("No valid scores to compare between inferred GRN and baseline GRN.")
-        elif np.all(scores - scores_baseline == 0):
-            # Identical performance (suspicious - likely an error)
-            raise ValueError("Identical performance between inferred GRN and baseline GRN - likely an error.")
-        else:
-            res = wilcoxon(scores - scores_baseline, zero_method='wilcox', alternative='greater')
-            rr_all['Wilcoxon pvalue'] = float(res.pvalue)
-
-        print(rr_all)
-
-        eps = 1e-300 # very small number to avoid log(0)
-        pval_clipped = max(res.pvalue, eps)
-
-        # Set to 0 if not significant (p >= 0.05)
-        if res.pvalue >= 0.05:
-            score = 0.0
-            print(f"p-value: {res.pvalue:.6f} (not significant, p >= 0.05)")
-            print(f"SEM score set to 0")
-        else:
-            # Compute final score
-            score = -np.log10(pval_clipped)
-            print(f"p-value: {res.pvalue:.6f} (significant)")
-
-        print(f"Final score: {score}")
-        results['sem_precision'] = [float(np.log2(np.mean(scores) / (np.mean(scores_baseline) + 1e-6)))]
-        results['sem_n'] = [float(score)]
+    results = {
+        'sem_grn': [float(sem_grn_score)],
+        'sem_hvg': [float(sem_hvg_score)],
+        'sem': [float((sem_grn_score + sem_hvg_score) / 2)]
+    }
 
     df_results = pd.DataFrame(results)
     return df_results
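
Note on the metric change above: the commit splits the SEM metric into a GRN-based branch (scored over the most-connected genes in the inferred GRN) and an HVG-based branch (scored over highly variable genes), penalizes HVGs that the GRN leaves without any regulator by scoring them 0, and reports `sem` as the average of the two branch means. The following is a minimal standalone sketch of that aggregation, assuming NumPy arrays shaped like the ones in the diff; `combine_sem_scores` is a hypothetical helper name and the vectorized penalization is an illustrative equivalent of the loop shown above, not the repository's API:

import numpy as np

def combine_sem_scores(scores_grn, scores_hvg, A_hvg, is_reporter_hvg):
    # GRN-based branch: mean of the valid (non-NaN) per-gene R² scores, 0.0 if none are valid.
    valid_grn = scores_grn[~np.isnan(scores_grn)]
    sem_grn = float(np.mean(valid_grn)) if len(valid_grn) > 0 else 0.0

    # HVG-based branch: evaluation genes are the non-reporter HVGs.
    has_parent = np.asarray(A_hvg != 0).any(axis=0)   # gene has at least one regulator in the GRN
    eval_mask = ~is_reporter_hvg
    penalized = scores_hvg.copy()
    penalized[eval_mask & ~has_parent] = 0.0          # HVG missing from the GRN -> score 0
    penalized[eval_mask & np.isnan(penalized)] = 0.0  # NaN scores also become 0
    valid_hvg = penalized[eval_mask]
    sem_hvg = float(np.mean(valid_hvg)) if len(valid_hvg) > 0 else 0.0

    # Reported metric: average of the two branch means.
    return {'sem_grn': sem_grn, 'sem_hvg': sem_hvg, 'sem': (sem_grn + sem_hvg) / 2}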

src/metrics/sem/run_local.sh

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ save_dir="output/sem"
 mkdir -p "$save_dir"
 
 # datasets to process
-datasets=('op' 'replogle' "300BCG" 'ibd_uc' 'ibd_cd' 'parsebioscience' ) #"300BCG" "ibd" 'parsebioscience'
+datasets=('op' 'replogle' "300BCG" 'parsebioscience' ) #"300BCG" "ibd" 'parsebioscience'
 # datasets=('op')
 # methods to process
 methods=("grnboost" "pearson_corr" "negative_control" "positive_control" "ppcor" "portia" "scenic" "scprint" "scenicplus" "celloracle" "scglue" "figr" "granie")
