Merged · 39 commits
5422747
create new directory for enrichment analysis
ajlee21 Jan 8, 2021
e3473c3
move files
ajlee21 Jan 8, 2021
88ac5d1
update env to include gsva
ajlee21 Jan 8, 2021
f0cb337
add GSVA method
ajlee21 Jan 8, 2021
90f0fc0
update env to run ROAST
ajlee21 Jan 11, 2021
76ea2d4
update scripts and notebooks to run ROAST
ajlee21 Jan 11, 2021
03ff480
update scripts to use multiple enrichment methods and update test not…
ajlee21 Jan 12, 2021
0168256
fix error in test
ajlee21 Jan 12, 2021
3f43242
fix assert statements
ajlee21 Jan 12, 2021
c6fd746
fix file path for test
ajlee21 Jan 12, 2021
a494ec3
update gsa param based on updated scripts
ajlee21 Jan 12, 2021
6fbf1c2
add CAMERA method and start formatting
ajlee21 Jan 12, 2021
af2b5d5
add clusterProfiler to env
ajlee21 Jan 13, 2021
5849169
update function and nb to run ORA
ajlee21 Jan 13, 2021
cfd2f35
update comment about ORA output
ajlee21 Jan 13, 2021
cda056f
update enrichment scripts that were causing output errors
ajlee21 Jan 14, 2021
713f0ea
update analysis notebook
ajlee21 Jan 15, 2021
9e2d72f
format enrichment outputs
ajlee21 Jan 15, 2021
27bd9fd
plot summary ranking trend
ajlee21 Jan 15, 2021
4b00f8a
run enrichment analyses and add result files
ajlee21 Jan 16, 2021
42553c6
update comments
ajlee21 Jan 18, 2021
6acf7f3
update comments about takeaway and methods
ajlee21 Jan 18, 2021
7b6533f
fix conflict
ajlee21 Jan 18, 2021
1328b51
Merge branch 'ajlee21-add_enrich'
ajlee21 Jan 18, 2021
8f04367
fixed merge conflict
ajlee21 Jan 21, 2021
ccf38bb
Merge remote-tracking branch 'upstream/master'
ajlee21 Jan 22, 2021
2be744a
update scripts to allow for validation of other datasets
ajlee21 Jan 22, 2021
d932409
add and update pa notebooks to validate against GAPE
ajlee21 Jan 22, 2021
8eecbf9
fix one conflict message
ajlee21 Jan 22, 2021
270164e
update compare gene rank output in notebook
ajlee21 Jan 22, 2021
841ba56
update and verify pa analysis run
ajlee21 Jan 22, 2021
8700aed
add new notebook to create new training compendium with only pao1 sam…
ajlee21 Jan 23, 2021
b402f45
update processing script to account for different training set shape
ajlee21 Jan 23, 2021
dce8f2e
update processing of template experiment that was incorrectly removed
ajlee21 Jan 23, 2021
14ad494
train new compendium
ajlee21 Jan 23, 2021
93ed15d
add new nb and data files for new analysis using new training compendium
ajlee21 Jan 23, 2021
b6ce97f
update correlation plot to use different labeling depending on reference
ajlee21 Jan 25, 2021
dd5bfd7
update result figure and comments
ajlee21 Jan 25, 2021
2895eb9
updated based on PR
ajlee21 Jan 27, 2021
3 changes: 3 additions & 0 deletions configs/config_pseudomonas_33245.tsv
@@ -8,6 +8,9 @@ normalized_compendium_filename "/home/alexandra/Documents/Data/Generic_expressio
shared_genes_filename "/home/alexandra/Documents/Data/Generic_expression_patterns/shared_genes_pseudomonas.pickle"
scaler_filename "/home/alexandra/Documents/Data/Generic_expression_patterns/scaler_transform_pseudomonas.pickle"
rank_genes_by "logFC"
reference_gene_filename "GAPE_proportions.txt"
reference_gene_name_col "gene id"
reference_rank_col "prop DEGs"
pathway_DB_filename "https://raw.githubusercontent.com/greenelab/adage/master/Node_interpretation/pseudomonas_KEGG_terms.txt"
gsea_statistic 'log2FoldChange'
rank_pathways_by "padj"
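The three new reference_gene_* keys point the validation at the GAPE reference ranking instead of the Crow et al. DE prior. The exact loading code lives in the repo's modules and is not shown in this diff; as a rough illustration, the keys would be consumed roughly like this, assuming GAPE_proportions.txt is tab-delimited with the column names given above:

```python
# Hypothetical sketch of how the new reference_gene_* config keys are consumed.
# The file layout ("gene id" / "prop DEGs" columns) comes from the config values
# above; everything else here is assumed.
import pandas as pd

def load_reference_ranks(params):
    ref = pd.read_csv(params["reference_gene_filename"], sep="\t")
    ref = ref.set_index(params["reference_gene_name_col"])
    # A higher "prop DEGs" means the gene is more commonly DE in GAPE
    return ref[params["reference_rank_col"]]
```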
28 changes: 28 additions & 0 deletions configs/config_pseudomonas_pao1.tsv
@@ -0,0 +1,28 @@
local_dir "/home/alexandra/Documents/Data/Generic_expression_patterns_pao1/"
dataset_name "pseudomonas_analysis"
raw_template_filename "/home/alexandra/Documents/Data/Generic_expression_patterns_pao1/raw_pseudomonas_template_data.tsv"
processed_template_filename "/home/alexandra/Documents/Data/Generic_expression_patterns_pao1/processed_pseudomonas_template_data.tsv"
raw_compendium_filename "/home/alexandra/Documents/Data/Generic_expression_patterns_pao1/raw_pseudomonas_compendium_data.tsv"
processed_compendium_filename "/home/alexandra/Documents/Data/Generic_expression_patterns_pao1/processed_pseudomonas_compendium_data.tsv"
normalized_compendium_filename "/home/alexandra/Documents/Data/Generic_expression_patterns_pao1/normalized_pseudomonas_compendium_data.tsv"
shared_genes_filename "/home/alexandra/Documents/Data/Generic_expression_patterns_pao1/shared_genes_pseudomonas.pickle"
scaler_filename "/home/alexandra/Documents/Data/Generic_expression_patterns_pao1/scaler_transform_pseudomonas.pickle"
rank_genes_by "logFC"
reference_gene_filename "GAPE_proportions.txt"
reference_gene_name_col "gene id"
reference_rank_col "prop DEGs"
pathway_DB_filename "https://raw.githubusercontent.com/greenelab/adage/master/Node_interpretation/pseudomonas_KEGG_terms.txt"
gsea_statistic 'log2FoldChange'
rank_pathways_by "padj"
NN_architecture "NN_2500_30_pao1"
learning_rate 0.001
batch_size 10
epochs 100
kappa 0.01
intermediate_dim 2500
latent_dim 30
epsilon_std 1.0
validation_frac 0.25
project_id "E-GEOD-33245"
metadata_colname 'ml_data_source'
num_simulated 25
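These configs are plain two-column, tab-separated key/value files; the project loads them through ponyo's utilities (not shown here). A minimal stand-in parser, included only to make the format concrete, might look like:

```python
# Minimal stand-in parser for the key/value TSV configs above. The repo loads
# these via ponyo; this sketch exists only to document the file format.
import ast

def read_tsv_config(path):
    params = {}
    with open(path) as fh:
        for line in fh:
            if not line.strip():
                continue
            key, value = line.split(maxsplit=1)
            try:
                # Handles quoted strings, ints, and floats alike
                params[key] = ast.literal_eval(value.strip())
            except (ValueError, SyntaxError):
                params[key] = value.strip()
    return params

params = read_tsv_config("configs/config_pseudomonas_pao1.tsv")
assert params["latent_dim"] == 30
```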
28 changes: 6 additions & 22 deletions generic_expression_patterns_modules/process.py
@@ -358,15 +358,12 @@ def map_recount2_data(
ofh.write(sample_id + "\t" + "\t".join(output_values) + "\n")


# TO DO: decide if not needed
def process_raw_template_pseudomonas(
processed_compendium_filename,
project_id,
dataset_name,
metadata_colname,
sample_id_metadata_filename,
raw_template_filename,
processed_template_filename,
):
"""
Create processed pseudomonas template data file based on
@@ -388,23 +385,6 @@ def process_raw_template_pseudomonas(

template_data.to_csv(raw_template_filename, sep="\t")

sample_ids_to_drop = set()
if os.path.exists(sample_id_metadata_filename):
# Read in metadata and get samples to be dropped:
metadata = pd.read_csv(
sample_id_metadata_filename, sep="\t", header=0, index_col=0
)
sample_ids_to_drop = set(metadata[metadata["processing"] == "drop"].index)

# Write the processed pseudomonas template output file on disk
with open(raw_template_filename) as ifh, open(
processed_template_filename, "w"
) as ofh:
for idx, line in enumerate(ifh):
sample_id = line.split("\t")[0]
if idx == 0 or sample_id not in sample_ids_to_drop:
ofh.write(line)


def normalize_compendium(
mapped_filename, normalized_filename, scaler_filename,
@@ -444,17 +424,21 @@ def process_raw_compendium_pseudomonas(
"""
Create processed pseudomonas compendium data file based on raw compendium
data file (`raw_filename`), and normalize the processed compendium.

Note: This function was designed to process data from the pseudomonas
compendium defined in the ADAGE paper
(https://msystems.asm.org/content/1/1/e00025-15).
"""

# Create processed pseudomonas compendium data file
raw_compendium = pd.read_csv(raw_filename, header=0, index_col=0, sep="\t")

if raw_compendium.shape != (950, 5549):
if raw_compendium.shape[1] != 5549:
processed_compendium = raw_compendium.T
else:
processed_compendium = raw_compendium

assert processed_compendium.shape == (950, 5549)
assert processed_compendium.shape[1] == 5549

# Save transformed compendium data
processed_compendium.to_csv(processed_filename, sep="\t")
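The change above replaces the hard-coded (950, 5549) shape check with a genes-only check, so compendia with a different number of samples (such as the PAO1-only training compendium added in this PR) still pass. The pattern generalizes; a short sketch, with the gene count taken from the assert above:

```python
# Sketch of the relaxed orientation check: only the gene axis is pinned down,
# so a compendium with any number of samples passes. 5549 is the P. aeruginosa
# gene count from the assert above.
import pandas as pd

N_GENES = 5549

def orient_compendium(raw: pd.DataFrame) -> pd.DataFrame:
    # Transpose if genes arrived on the rows instead of the columns
    compendium = raw.T if raw.shape[1] != N_GENES else raw
    assert compendium.shape[1] == N_GENES, "unexpected number of genes"
    return compendium  # rows = samples, columns = genes
```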
27 changes: 19 additions & 8 deletions generic_expression_patterns_modules/ranking.py
@@ -19,6 +19,7 @@
from scipy import stats
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler


@@ -422,7 +423,9 @@ def get_shared_rank_scaled(
]

# Get correlation
r, p, ci_low, ci_high = spearman_ci(0.95, shared_rank_scaled_df, 1000, data_type)
r, p, ci_low, ci_high = spearman_ci(
0.95, shared_rank_scaled_df, 1000, ref_rank_col, data_type
)

correlations = {"r": r, "p": p, "ci_low": ci_low, "ci_high": ci_high}

@@ -467,9 +470,15 @@ def compare_gene_ranking(
marginal_kws={"color": "white"},
)

fig.set_axis_labels(
"SOPHIE", "DE prior (Crow et. al. 2019)", fontsize=14, fontname="Verdana"
)
if ref_rank_col == "DE_Prior_Rank":
fig.set_axis_labels(
"SOPHIE", "DE prior (Crow et. al. 2019)", fontsize=14, fontname="Verdana"
)
elif ref_rank_col == "prop DEGs":
fig.set_axis_labels(
"SOPHIE", "GAPE (Stanton lab, 2020)", fontsize=14, fontname="Verdana"
)
plt.colorbar()

fig.savefig(
output_figure_filename,
@@ -480,7 +489,7 @@
dpi=300,
)

return correlations
return correlations, shared_gene_rank_scaled_df
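compare_gene_ranking now returns the shared scaled-rank dataframe alongside the correlation stats, which is what the notebook updates below unpack. A usage sketch; the dict keys come from the `correlations` assignment above, while the argument order past the third is elided in the visible hunks:

```python
# Usage sketch of the new two-value return, mirrored from the notebooks below.
# Trailing positional args are not visible in this diff, hence the ellipsis.
corr, shared_ranking = ranking.compare_gene_ranking(
    summary_gene_ranks,
    DE_prior_filename,
    ref_gene_col,
    ...,  # ref_rank_col, output figure filename, etc.
)
print(f"r={corr['r']:.2f} CI=({corr['ci_low']:.2f}, {corr['ci_high']:.2f}) p={corr['p']:.1e}")
shared_ranking.head()  # per-gene scaled ranks shared between SOPHIE and the reference
```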


def compare_pathway_ranking(summary_df, reference_filename, output_figure_filename):
@@ -585,7 +594,7 @@ def add_pseudomonas_gene_name_col(summary_gene_ranks, base_dir):
return summary_gene_ranks


def spearman_ci(ci, gene_rank_df, num_permutations, data_type):
def spearman_ci(ci, gene_rank_df, num_permutations, ref_rank_col, data_type):
"""
Returns spearman correlation score and confidence interval

@@ -597,11 +606,13 @@ def spearman_ci(ci, gene_rank_df, num_permutations, data_type):
Dataframe containing our rank and the Crow et al. rank
num_permutations: int
The number of permutations to estimate the confidence interval
ref_rank_col: str
Name of column header containing reference ranks of genes
data_type: 'DE' or 'GSA'
"""
if data_type.lower() == "de":
r, p = stats.spearmanr(
gene_rank_df["Rank (simulated)"], gene_rank_df["DE_Prior_Rank"]
gene_rank_df["Rank (simulated)"], gene_rank_df[ref_rank_col]
)
elif data_type.lower() == "gsa":
r, p = stats.spearmanr(
@@ -615,7 +626,7 @@

if data_type.lower() == "de":
r_perm, p_perm = stats.spearmanr(
sample["Rank (simulated)"], sample["DE_Prior_Rank"]
sample["Rank (simulated)"], sample[ref_rank_col]
)
elif data_type.lower() == "gsa":
r_perm, p_perm = stats.spearmanr(
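spearman_ci now takes the reference column name instead of hard-coding "DE_Prior_Rank", so the same confidence-interval machinery works against both the Crow et al. DE prior and GAPE. For readers unfamiliar with the resampling step, here is a minimal standalone sketch of the percentile-bootstrap idea; it mirrors the structure of spearman_ci but is not the repo's exact implementation:

```python
# Minimal standalone sketch of a percentile-bootstrap CI for Spearman's r.
# Not the repo's exact code; the resampling scheme is assumed.
import numpy as np
import pandas as pd
from scipy import stats

def spearman_bootstrap_ci(df, col_a, col_b, ci=0.95, n_boot=1000, seed=0):
    rng = np.random.default_rng(seed)
    r, p = stats.spearmanr(df[col_a], df[col_b])
    r_boot = []
    for _ in range(n_boot):
        idx = rng.integers(0, len(df), size=len(df))  # resample rows with replacement
        r_b, _ = stats.spearmanr(df[col_a].values[idx], df[col_b].values[idx])
        r_boot.append(r_b)
    alpha = (1 - ci) / 2
    ci_low, ci_high = np.quantile(r_boot, [alpha, 1 - alpha])
    return r, p, ci_low, ci_high

# e.g. spearman_bootstrap_ci(shared_ranking, "Rank (simulated)", "prop DEGs")
```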
(file name not shown)
@@ -1151,7 +1151,7 @@
"\n",
"figure_filename = f\"gene_ranking_{col_to_rank_genes}.svg\"\n",
"\n",
"ranking.compare_gene_ranking(\n",
"corr, shared_ranking = ranking.compare_gene_ranking(\n",
" summary_gene_ranks,\n",
" DE_prior_filename,\n",
" ref_gene_col,\n",
(file name not shown)
@@ -517,7 +517,7 @@ def shift_template_experiment_with_metadatafile(

figure_filename = f"gene_ranking_{col_to_rank_genes}.svg"

ranking.compare_gene_ranking(
corr, shared_ranking = ranking.compare_gene_ranking(
summary_gene_ranks,
DE_prior_filename,
ref_gene_col,
(file name not shown)
@@ -1046,7 +1046,7 @@
"\n",
"figure_filename = f\"gene_ranking_{col_to_rank_genes}.svg\"\n",
"\n",
"ranking.compare_gene_ranking(\n",
"corr, shared_ranking = ranking.compare_gene_ranking(\n",
" summary_gene_ranks,\n",
" DE_prior_filename,\n",
" ref_gene_col,\n",
(file name not shown)
@@ -310,7 +310,7 @@

figure_filename = f"gene_ranking_{col_to_rank_genes}.svg"

ranking.compare_gene_ranking(
corr, shared_ranking = ranking.compare_gene_ranking(
summary_gene_ranks,
DE_prior_filename,
ref_gene_col,
90 changes: 38 additions & 52 deletions other_enrichment_methods/nbconverted/1_simulate_data.py
@@ -1,16 +1,15 @@

# coding: utf-8

# # Simulate gene expression data
#
#
# This notebook simulates gene expression data that can then be plugged into different enrichment methods

# In[1]:


get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('load_ext', 'rpy2.ipython')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic("load_ext", "autoreload")
get_ipython().run_line_magic("load_ext", "rpy2.ipython")
get_ipython().run_line_magic("autoreload", "2")

import os
import sys
@@ -19,6 +18,7 @@
import pickle

from rpy2.robjects import pandas2ri

pandas2ri.activate()

from ponyo import utils, simulate_expression_data
@@ -45,36 +45,28 @@

# Load params
local_dir = params["local_dir"]
dataset_name = params['dataset_name']
NN_architecture = params['NN_architecture']
num_runs = params['num_simulated']
project_id = params['project_id']
metadata_col_id = params['metadata_colname']
mapped_template_filename = params['mapped_template_filename']
processed_template_filename = params['processed_template_filename']
normalized_compendium_filename = params['normalized_compendium_filename']
scaler_filename = params['scaler_filename']
col_to_rank_genes = params['rank_genes_by']
col_to_rank_pathways = params['rank_pathways_by']
statistic = params['gsea_statistic']
count_threshold = params['count_threshold']
dataset_name = params["dataset_name"]
NN_architecture = params["NN_architecture"]
num_runs = params["num_simulated"]
project_id = params["project_id"]
metadata_col_id = params["metadata_colname"]
mapped_template_filename = params["mapped_template_filename"]
processed_template_filename = params["processed_template_filename"]
normalized_compendium_filename = params["normalized_compendium_filename"]
scaler_filename = params["scaler_filename"]
col_to_rank_genes = params["rank_genes_by"]
col_to_rank_pathways = params["rank_pathways_by"]
statistic = params["gsea_statistic"]
count_threshold = params["count_threshold"]

# Load metadata file with grouping assignments for samples
sample_id_metadata_filename = os.path.join(
base_dir,
dataset_name,
"data",
"metadata",
f"{project_id}_process_samples.tsv"
base_dir, dataset_name, "data", "metadata", f"{project_id}_process_samples.tsv"
)

# Load metadata file with grouping assignments for samples
metadata_filename = os.path.join(
base_dir,
dataset_name,
"data",
"metadata",
f"{project_id}_groups.tsv"
base_dir, dataset_name, "data", "metadata", f"{project_id}_groups.tsv"
)

# Load pickled file
@@ -87,22 +79,18 @@

# Output files
gene_summary_filename = os.path.join(
base_dir,
dataset_name,
f"generic_gene_summary_{project_id}.tsv"
base_dir, dataset_name, f"generic_gene_summary_{project_id}.tsv"
)

pathway_summary_filename = os.path.join(
base_dir,
dataset_name,
f"generic_pathway_summary_{project_id}.tsv"
base_dir, dataset_name, f"generic_pathway_summary_{project_id}.tsv"
)


# ### Simulate experiments using selected template experiment
#
#
# Workflow:
#
#
# 1. Get the gene expression data for the selected template experiment
# 2. Encode this experiment into a latent space using the trained VAE model
# 3. Linearly shift the encoded template experiment in the latent space
@@ -113,11 +101,11 @@


# Simulate multiple experiments
# This step creates the following files in "<local_dir>/pseudo_experiment/" directory:
# This step creates the following files in "<local_dir>/pseudo_experiment/" directory:
# - selected_simulated_data_SRP012656_<n>.txt
# - selected_simulated_encoded_data_SRP012656_<n>.txt
# - template_normalized_data_SRP012656_test.txt
# in which "<n>" is an integer in the range of [0, num_runs-1]
# in which "<n>" is an integer in the range of [0, num_runs-1]
os.makedirs(os.path.join(local_dir, "pseudo_experiment"), exist_ok=True)
for run_id in range(num_runs):
simulate_expression_data.shift_template_experiment(
@@ -129,12 +117,12 @@
scaler,
local_dir,
base_dir,
run_id
run_id,
)
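ponyo's shift_template_experiment performs steps 2 and 3 of the workflow described above. Conceptually it looks something like the sketch below; this is an illustration, not ponyo's implementation, and the Keras-style encoder/decoder interface and the choice of shift target are assumed:

```python
# Conceptual sketch of the latent-space shift (workflow steps 2-3 above).
# Not ponyo's actual code; encoder/decoder are assumed Keras-style VAE halves.
import numpy as np

def shift_template(template_expr, encoder, decoder, encoded_compendium, rng):
    latent = encoder.predict(template_expr)               # step 2: encode template
    # Assumed: pick a random encoded compendium sample as the new location
    target = encoded_compendium[rng.integers(len(encoded_compendium))]
    offset = target - latent.mean(axis=0)                 # step 3: linear shift
    return decoder.predict(latent + offset)               # decode simulated experiment
```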


# ## Process template and simulated experiments
#
#
# * Remove samples not required for comparison
# * Make sure ordering of samples matches metadata for proper comparison
# * Make sure values are cast as integers for using DESeq
@@ -145,31 +133,29 @@

if not os.path.exists(sample_id_metadata_filename):
sample_id_metadata_filename = None

stats.process_samples_for_DESeq(
mapped_template_filename,
metadata_filename,
processed_template_filename,
count_threshold,
sample_id_metadata_filename,
)
mapped_template_filename,
metadata_filename,
processed_template_filename,
count_threshold,
sample_id_metadata_filename,
)

for i in range(num_runs):
simulated_filename = os.path.join(
local_dir,
"pseudo_experiment",
f"selected_simulated_data_{project_id}_{i}.txt"
local_dir, "pseudo_experiment", f"selected_simulated_data_{project_id}_{i}.txt"
)
out_simulated_filename = os.path.join(
local_dir,
"pseudo_experiment",
f"selected_simulated_data_{project_id}_{i}_processed.txt"
f"selected_simulated_data_{project_id}_{i}_processed.txt",
)
stats.process_samples_for_DESeq(
simulated_filename,
metadata_filename,
out_simulated_filename,
count_threshold,
sample_id_metadata_filename,
)
)
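The three bullets in the processing section above pin down what process_samples_for_DESeq has to guarantee before DESeq runs. A rough sketch of that contract; the "processing" == "drop" convention comes from process.py earlier in this diff, and the rest (orientation, helper name) is assumed:

```python
# Rough sketch of the DESeq preparation contract described above: drop flagged
# samples, order rows to match the metadata, cast counts to integers.
import pandas as pd

def prepare_counts_for_deseq(counts_file, metadata_file, out_file, drop_file=None):
    counts = pd.read_csv(counts_file, sep="\t", index_col=0)      # samples x genes (assumed)
    metadata = pd.read_csv(metadata_file, sep="\t", index_col=0)
    if drop_file is not None:
        flags = pd.read_csv(drop_file, sep="\t", index_col=0)
        to_drop = flags[flags["processing"] == "drop"].index      # convention from process.py
        counts = counts.drop(index=to_drop, errors="ignore")
    counts = counts.loc[metadata.index]                           # align sample ordering
    counts = counts.round().astype(int)                           # DESeq expects integer counts
    counts.to_csv(out_file, sep="\t")
    return counts
```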
