Commit 286883d

final touches to prep for PR
Removed samples corresponding to organoids in the experiments file, fixed the prior-drug-file check using code from the pancPDO directory, removed some print lines and comments, and did general tidying. Added 'sarcpdo' to build_dataset.py and tested. Also added a few lines to docker-compose.yml so that build_dataset.py would run.
1 parent 93e7530 commit 286883d

7 files changed (+59 lines, -56 lines)

build/build_dataset.py

Lines changed: 3 additions & 1 deletion
@@ -43,6 +43,7 @@ def process_docker(dataset,validate):
         'mpnst': ['mpnst'],
         'mpnstpdx': ['mpnstpdx'],
         'cptac': ['cptac'],
+        'sarcpdo': ['sarcpdo'],
         'genes': ['genes'],
         'upload': ['upload']
     }
@@ -123,7 +124,8 @@ def process_omics(executor, dataset, should_continue):
         'broad_sanger': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
         'cptac': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
         'hcmi': ['mutations', 'transcriptomics'],
-        'mpnstpdx':['copy_number', 'mutations', 'proteomics', 'transcriptomics']
+        'mpnstpdx':['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
+        'sarcpdo': ['mutations', 'transcriptomics']
     }
 
     expected_omics = dataset_omics_files.get(dataset, [])

build/docker/docker-compose.yml

Lines changed: 10 additions & 0 deletions
@@ -53,6 +53,7 @@ services:
       HTTPS_PROXY: ${HTTPS_PROXY}
     platform: linux/amd64
     image: mpnstpdx:latest
+
   cptac:
     build:
       context: ../../
@@ -62,6 +63,15 @@ services:
     platform: linux/amd64
     image: cptac:latest
 
+  sarcpdo:
+    build:
+      context: ../../
+      dockerfile: build/docker/Dockerfile.sarcpdo
+      args:
+        HTTPS_PROXY: ${HTTPS_PROXY}
+    platform: linux/amd64
+    image: sarcpdo:latest
+
   genes:
     build:
       context: ../../

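For reference, the new sarcpdo service can be built on its own with a standard Docker Compose invocation (a hedged usage example, not part of this commit; the service name and compose-file path come from the diff above):

docker compose -f build/docker/docker-compose.yml build sarcpdo
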
build/sarcpdo/00_createSarcPDOSampleFile.py

Lines changed: 11 additions & 19 deletions
@@ -92,42 +92,34 @@ def download_and_format_rna_samples(synLoginObject):
 
     return rna_samples
 
-#def generate_samples_file(prev_samples_path):
-
-#    if prev_samples_path == "":
-        #maxval = 0
-#    else:
-#        maxval = max(pd.read_csv(prev_samples_path).improve_sample_id)
+
 
 if __name__ == "__main__":
-    print('in main')
+
     parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of sample files for the Sarcoma PDO project into a single samplesheet")
-    print('in line 97')
+
     parser.add_argument('-t', '--token', type=str, help='Synapse Token')
 
     parser.add_argument("-p", '--prevSamples', nargs="?", type=str, default ="", const = "", help = "Use this to provide previous sample file, will run sample file generation")
 
     args = parser.parse_args()
-    print(args)
+
     print("Logging into Synapse")
     PAT = args.token
     synObject = synapseclient.login(authToken=PAT)
 
     rnaTable = download_and_format_rna_samples(synObject)
-    print(rnaTable.shape)
     geneticTable = download_and_format_genetic_samples(synObject)
-    print(geneticTable.shape)
     merged = rnaTable.merge(geneticTable, how='outer')
-    print(merged.shape)
-    # change dash to underscore to align with omics data
-    #merged['other_id'] = merged['other_id'].str.replace("-2", "_2")
+
+
+    if (args.prevSamples):
+        prev_max_improve_id = max(pd.read_csv(args.prevSamples).improve_sample_id)
+    else:
+        prev_max_improve_id = 0
 
-    prev_max_improve_id = max(pd.read_csv(args.prevSamples).improve_sample_id)
     merged['improve_sample_id'] = range(prev_max_improve_id+1, prev_max_improve_id+merged.shape[0]+1)
 
     merged.to_csv('/tmp/sarcpdo_samples.csv', index=False)
 
-# validate with: linkml validate -s coderdata/schema/coderdata.yaml ~/Downloads/sarcpdo_samples.csv
-
-# test script : python3 00_createSarcPDOSampleFile.py -t $SYNAPSE_AUTH_TOKEN -p '~/Downloads/mpnstpdx_samples.csv'
-
+

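For local testing, the comment removed above showed an invocation along these lines (paths are examples; -p is optional and, per the new logic, seeds improve_sample_id numbering from a previous samples file):

python3 00_createSarcPDOSampleFile.py -t $SYNAPSE_AUTH_TOKEN -p '~/Downloads/mpnstpdx_samples.csv'
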
build/sarcpdo/01_createSarcPDOOmicsFiles.py

Lines changed: 4 additions & 15 deletions
@@ -55,10 +55,9 @@ def download_and_format_genomic_mutation(synLoginObject, genesTable, samplesTabl
     # reformat variant classification column to be accepted by linkML and correct
     mutation_merged["variant_classification"] =mutation_merged['Canonical_Variant_Classification']
 
-    #mutation_merged['variant_classification'] =
-    #mutation_merged['variant_classification'].replace("Missense", "Missense_Mutation", inplace=True)
+
     mutation_merged.replace({'variant_classification': "Missense"}, "Missense_Mutation", inplace=True)
-    #mutation_merged['variant_classification'] =
+
     mutation_merged.replace({'variant_classification': "Splice_Donor"}, "Splice_Site", inplace=True)
     mutation_merged.replace({'variant_classification': "Splice_Acceptor"}, "Splice_Site", inplace=True)
     mutation_merged.replace({'variant_classification': "Nonsense"}, "Nonsense_Mutation", inplace=True)
@@ -69,12 +68,6 @@ def download_and_format_genomic_mutation(synLoginObject, genesTable, samplesTabl
     mutation_merged.replace({'variant_classification': "Frameshift"}, "Frameshift_Variant", inplace=True)
     mutation_merged.replace({'variant_classification': "intergenic_variant"}, "Silent", inplace=True)
 
-    # mutation_merged['variant_classification'] = mutation_merged['variant_classification'].replace("Nonsense", "Nonsense_Mutation", inplace=True)
-    #mutation_merged['variant_classification'] = mutation_merged['variant_classification'].replace('intron', 'Intron', inplace=True)
-    #mutation_merged['variant_classification'] = mutation_merged['variant_classification'].replace("synonymous", "Silent", inplace=True)
-    #mutation_merged['variant_classification'] = mutation_merged['variant_classification'].replace("Inframe_Del", "In_Frame_Del", inplace=True)
-    #mutation_merged['variant_classification'] = mutation_merged['variant_classification'].replace("5_prime_UTR", "5' UTR", inplace=True)
-    #mutation_merged['variant_classification'] = mutation_merged['variant_classification'].replace("intergenic_variant", "Silent", inplace=True)
     mutation_merged_select = mutation_merged[['entrez_id', 'Sample_ID_Tumor', 'Name', 'variant_classification']]
     #merge with improve_ids
     samples['other_id_no_dash'] = samples['other_id'].str.replace("-2", "_2")
@@ -112,13 +105,9 @@ def download_and_format_genomic_mutation(synLoginObject, genesTable, samplesTabl
     if args.expression:
         download_and_format_transcriptomic(synObject, genes, samples).to_csv("/tmp/sarcpdo_transcriptomics.csv", index=False)
 
-    # if args.copy:
-    #     download_and_format_copy_number(synObject, genes, samples).to_csv('sarcpdo_copynumber.csv', index=False)
-
+
     if args.mutation:
-        download_and_format_genomic_mutation(synObject, genes, samples).to_csv('/tmp/sarcpdo_mutation.csv', index=False)
+        download_and_format_genomic_mutation(synObject, genes, samples).to_csv('/tmp/sarcpdo_mutations.csv', index=False)
 
-# validate with: linkml validate -s coderdata/schema/coderdata.yaml ~/Downloads/sarcpdo_samples.csv
 
 
-# command line testing: python3 01_createSarcPDOOmicsFiles.py -t $SYNAPSE_AUTH_TOKEN -s dev-environment/sarcpdo_samples.csv -g genes.csv -e

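A local-testing invocation of this script, mirroring the comment removed above (sample and gene file paths are examples; -e presumably maps to args.expression, i.e. the transcriptomics output):

python3 01_createSarcPDOOmicsFiles.py -t $SYNAPSE_AUTH_TOKEN -s dev-environment/sarcpdo_samples.csv -g genes.csv -e
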
build/sarcpdo/02_createSarcPDODrugsFile.py

Lines changed: 19 additions & 12 deletions
@@ -4,30 +4,38 @@
 import argparse
 import os
 
+#from utils.pubchem_retrieval import update_dataframe_and_write_tsv
 from pubchem_retrieval import update_dataframe_and_write_tsv
 
 
 def create_sarcpdo_drugs_file(synObject, prevDrugFilepath, outputPath):
     drug_query = synObject.tableQuery("select * from syn61892224")
     drug_data = drug_query.asDataFrame()
-    # check status of previous drug file
-    if not prevDrugFilepath:
-        # if sarcpdo_drugs.tsv is null, create the empty dataframe.
-        empty_drugs = pd.DataFrame(columns = ['improve_drug_id', 'chem_name', 'pubchem_id', 'canSMILES', 'InChIKey', 'formula', 'weight'])
-        empty_drugs.to_csv('outputPath', sep='\t', index=False)
 
     # get unique drugs
-    unique_drugs = drug_data['Drug_Name'].unique()
+    newdrugnames = drug_data['Drug_Name'].unique()
     # use helper functions in pubchem_retrieval.py
-    update_dataframe_and_write_tsv(unique_drugs, output_filename=outputPath, # specify ignore_chems as null?
-                                   batch_size=1, isname=True, time_limit=48 * 60 * 60)
+    alldrugs = []
+    if prevDrugFilepath is not None and prevDrugFilepath is not "":
+        prevdrugs = [pd.read_csv(t,sep='\t') for t in prevDrugFilepath.split(',')]
+        alldrugs = pd.concat(prevdrugs).drop_duplicates()
+
+    imps = alldrugs[alldrugs.chem_name.isin(newdrugnames)]
+    newdrugs = alldrugs[alldrugs.improve_drug_id.isin(imps.improve_drug_id)]
+
+    ##write drugs
+    newdrugs.to_csv(outputPath, sep='\t', index=False)
+
+    if len(alldrugs)==0 or len(newdrugnames)>len(set(newdrugs.improve_drug_id)): #we have more names we didn't match
+        print('Missing drugs in existing file, querying pubchem')
+        update_dataframe_and_write_tsv(newdrugnames,outputPath)
 
 
 if __name__ == "__main__":
-    print('in main')
+
     parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of drug data files for the Sarcoma PDO project")
-    parser.add_argument('-d', '--prevDrugFilePath', help='Path to a previous drug file for sarcpdo',default=None)
-    parser.add_argument('-o', '--outputPath', help='Output path for updated sarcpdo drug file', default = None)
+    parser.add_argument('-d', '--prevDrugFilePath', help='Path to a previous drug file for sarcpdo', default = None)
+    parser.add_argument('-o', '--outputPath', help='Output path for updated sarcpdo drug file', default = "/tmp/sarcpdo_drugs.tsv")
     parser.add_argument('-t', '--token', help='Synapse token')
 
     args = parser.parse_args()
@@ -37,4 +45,3 @@ def create_sarcpdo_drugs_file(synObject, prevDrugFilepath, outputPath):
 
     create_sarcpdo_drugs_file(synObject, args.prevDrugFilePath, args.outputPath)
 
-# command line testing: python3 02_createSarcPDODrugsFile.py -t $SYNAPSE_AUTH_TOKEN -d ../../../sarcpdo_drugs.csv -o sarcpdo_drugs.csv

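The removed test comment for this script documented a local run of the form shown below (paths are examples; with the new default added above, -o can be omitted to write /tmp/sarcpdo_drugs.tsv):

python3 02_createSarcPDODrugsFile.py -t $SYNAPSE_AUTH_TOKEN -d ../../../sarcpdo_drugs.csv -o sarcpdo_drugs.csv
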
build/sarcpdo/03_createSarcPDOExperimentFile.py

Lines changed: 7 additions & 8 deletions
@@ -14,7 +14,7 @@
 parser.add_argument('-d', '--drugFile', nargs = "?", type=str, default = "", help = "Use this to provide previously generated drugs file for this dataset to link with to experiment data.")
 
 args = parser.parse_args()
-print(args)
+
 print("Logging into Synapse")
 PAT = args.token
 synObject = synapseclient.login(authToken=PAT)
@@ -38,10 +38,13 @@
 
 # inner merge with samples because there are samples without experiment info and many Sample_ID's in experiments data without sample info
 experiments = drug_data.merge(sarcpdo_drugs, how='left').merge(sarcpdo_samples, how='inner')
-
-final_experiment = experiments[['improve_sample_id', 'improve_drug_id', 'Viability_Score']]
+# drop rows corresponding to organoids
+tumor_only = experiments[~experiments['model_type'].str.contains("organoid")]
+# select relevant columns
+final_experiment = tumor_only[['improve_sample_id', 'improve_drug_id', 'Viability_Score']]
+# add static info
 final_experiment.loc[:,['study']] = 'Landscape of Sarcoma'
-final_experiment.loc[:,['source']] = 'pharmacoGX'
+final_experiment.loc[:,['source']] = 'AlShihabietal2024'
 final_experiment.loc[:,['time']] = None
 final_experiment.loc[:,['time_unit']]= None
 final_experiment.loc[:,['dose_response_metric']] = 'published_auc'
@@ -50,7 +53,3 @@
 toReturn = final_experiment[['source', 'improve_sample_id', 'improve_drug_id', 'study', 'time', 'time_unit', 'dose_response_metric', 'dose_response_value']]
 
 toReturn.to_csv('/tmp/sarcpdo_experiments.tsv', sep='\t', index=False)
-
-
-# to test run
-# python3 03_createSarcPDOExperimentFile.py -t $SYNAPSE_AUTH_TOKEN -s sarcpdo_samples.csv -d sarcpdo_drugs.tsv

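A local test run, as shown in the comment removed above (samples and drugs paths are examples):

python3 03_createSarcPDOExperimentFile.py -t $SYNAPSE_AUTH_TOKEN -s sarcpdo_samples.csv -d sarcpdo_drugs.tsv
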
build/sarcpdo/build_drugs.sh

Lines changed: 5 additions & 1 deletion
@@ -4,7 +4,11 @@ set -euo pipefail
 trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
 
 echo "Running script with token and drugFile $1"
-python3 02_createSarcPDODrugsFile --token $SYNAPSE_AUTH_TOKEN -d $1 -o /tmp/sarcpdo_drugs.tsv
+# for running locally (from build directory):
+#python3 -m sarcpdo.02_createSarcPDODrugsFile --token $SYNAPSE_AUTH_TOKEN -d $1 -o /tmp/sarcpdo_drugs.tsv
+python3 02_createSarcPDODrugsFile.py --token $SYNAPSE_AUTH_TOKEN -d $1 -o /tmp/sarcpdo_drugs.tsv
 
 echo "Running build_drug_desc.py..."
+#for running locally:
+#python3 utils/build_drug_desc.py --drugtable /tmp/sarcpdo_drugs.tsv --desctable /tmp/sarcpdo_drug_descriptors.tsv.gz
 python3 build_drug_desc.py --drugtable /tmp/sarcpdo_drugs.tsv --desctable /tmp/sarcpdo_drug_descriptors.tsv.gz
