Skip to content

Commit 8d6f25d

Browse files
committed
added output parameters
1 parent fbeea3a commit 8d6f25d

File tree

8 files changed

+70
-65
lines changed

8 files changed

+70
-65
lines changed

README.md

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -60,31 +60,29 @@ Options:
6060

6161
### Examples
6262
Local Annotator
63-
Annotate somatic variants and CNAs using the local annotator:
63+
1. Annotate somatic variants and CNAs using the local annotator:
6464
> python main.py --annotator local --output path/to/output --somatic_variants path/to/snvs.tsv --ascatestimates path/to/ascat.tsv
6565
66-
Annotate CNAs using the local annotator:
66+
2. Annotate CNAs using the local annotator:
6767
> python main.py --annotator local --output path/to/output --copy_number_alterations path/to/cnas.tsv --ascatestimates path/to/ascat.tsv
6868
69-
External Annotator
70-
Annotate CNAs using OncoKB:
71-
> python main.py --annotator external --output path/to/output --oncokbcna --copy_number_alterations path/to/cnas.tsv
69+
External Annotator (execution order matters: by default, the CGI annotations supplement the OncoKB annotations)
70+
3. Annotate CNAs using OncoKB:
71+
> python main.py --annotator external --output path/to/output --oncokbcna --copy_number_alterations path/to/locally_annotated_cnas.tsv
7272
73-
Annotate somatic variants using OncoKB:
74-
>python main.py --annotator external --output path/to/output --oncokbsnv --somatic_variants path/to/snvs.tsv
73+
4. Annotate somatic variants using OncoKB:
74+
>python main.py --annotator external --output path/to/output --oncokbsnv --somatic_variants path/to/locally_annotated_snvs.tsv
7575
76-
Annotate CNAs using Cancer Genome Interpreter:
76+
5. Annotate CNAs using Cancer Genome Interpreter:
7777

78-
>python external_annotator.py --cgiquery --copy_number_alterations path/to/cnas.tsv
78+
>python external_annotator.py --cgiquery --copy_number_alterations path/to/oncokb_annotated_cnas.tsv
7979
80-
Annotate somatic variants using Cancer Genome Interpreter:
81-
>python main.py --annotator external --output path/to/output --cgiquery --somatic_variants path/to/snvs.tsv
80+
6. Annotate somatic variants using Cancer Genome Interpreter:
81+
>python main.py --annotator external --output path/to/output --cgiquery --somatic_variants path/to/oncokb_annotated_snvs.tsv
8282
83-
SLURM Scripts
83+
SLURM Scripts: edit the scripts to set the correct paths and SLURM sbatch parameters.
8484

8585
Submit a batch job to a SLURM cluster to annotate on multiple computing nodes:
8686
>./slurm_scripts/annotate_cnas.sh path/to/sample_list.txt
8787
8888
>./slurm_scripts/snv_annotation.sbatch path/to/sample_list.txt
89-
90-
### License

cgi_annotator.py

Lines changed: 29 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ def generate_cgi_cna_file_from_list(genelist):
105105
file2.write(row)
106106
file2.close()
107107

108-
def launch_cgi_job_with_mulitple_variant_types(mutations_file, cnas_file, transloc_file, cancer_type, reference):
108+
def launch_cgi_job_with_mulitple_variant_types(mutations_file=None, cnas_file=None, transloc_file=None, cancer_type="HGSOC", reference="GRCh38"):
109109
"""
110110
This function launches a CGI (Cancer Genome Interpreter) job with multiple variant types,
111111
using the CGI API. It takes in mutation, cnas, and translocation files, cancer type, and
@@ -174,7 +174,7 @@ def launch_cgi_job_with_mulitple_variant_types(mutations_file, cnas_file, transl
174174
return 0
175175

176176

177-
def query_cgi_job(jobid, snv_annotations: pd.DataFrame = None, cna_annotations: pd.DataFrame = None):
177+
def query_cgi_job(jobid, output, snv_annotations: pd.DataFrame = None, cna_annotations: pd.DataFrame = None):
178178
"""
179179
Query the CGI API with a job ID and save the results to the database.
180180
@@ -206,10 +206,8 @@ def query_cgi_job(jobid, snv_annotations: pd.DataFrame = None, cna_annotations:
206206
cgi_snvdf = None
207207
cgi_cnadf = None
208208
treatments = []
209+
209210
for fn in fnames:
210-
# reader = z.open(f)
211-
# for row in reader.readlines():
212-
# print(row)
213211
z.extract(fn)
214212
df = pd.read_csv(fn, sep="\t")
215213
print(fn)
@@ -277,14 +275,14 @@ def query_cgi_job(jobid, snv_annotations: pd.DataFrame = None, cna_annotations:
277275
snv_annotations.at[indxs, 'tumorTypeSummary'] = handle_string_field(cgi_snv["driver_statement"])
278276

279277
if isinstance(snv_annotations, pd.DataFrame):
280-
snv_annotations.to_csv("snv_annotated_cgi.csv", index=False, sep="\t", columns=['patient_id', 'sample_id', 'alteration', 'hugoSymbol', 'tumorType', 'consequence', 'oncogenic', 'mutationEffectDescription', 'gene_role', 'citationPMids', 'level_of_evidence', 'geneSummary', 'variantSummary', 'tumorTypeSummary'])
278+
snv_annotations.to_csv(output, index=False, sep="\t", columns=['patient_id', 'sample_id', 'alteration', 'hugoSymbol', 'tumorType', 'consequence', 'oncogenic', 'mutationEffectDescription', 'gene_role', 'citationPMids', 'level_of_evidence', 'geneSummary', 'variantSummary', 'tumorTypeSummary'])
281279
trdf = pd.DataFrame(treatments)
282-
trdf.to_csv("treatments_cgi_snv.csv", index=False, sep="\t")
280+
trdf.to_csv("treatments.csv", mode="a", index=False, sep="\t")
283281

284282
if isinstance(cna_annotations, pd.DataFrame):
285-
cna_annotations.to_csv("cna_annotated_cgi.csv", index=False, sep="\t", columns=['patient_id', 'sample_id', 'alteration', 'hugoSymbol', 'tumorType', 'oncogenic', 'mutationEffectDescription', 'gene_role', 'citationPMids', 'level_of_evidence', 'geneSummary', 'variantSummary', 'tumorTypeSummary'])
283+
cna_annotations.to_csv(output, index=False, sep="\t", columns=['patient_id', 'sample_id', 'alteration', 'hugoSymbol', 'tumorType', 'oncogenic', 'mutationEffectDescription', 'gene_role', 'citationPMids', 'level_of_evidence', 'geneSummary', 'variantSummary', 'tumorTypeSummary'])
286284
trdf = pd.DataFrame(treatments)
287-
trdf.to_csv("treatments_cgi_cna.csv", index=False, sep="\t")
285+
trdf.to_csv("treatments.csv", mode="a", index=False, sep="\t")
288286

289287
return 1
290288
else:
@@ -303,7 +301,7 @@ def generate_cgi_cna_file_from_list(genelist):
303301
file2.write(row)
304302
file2.close()
305303

306-
def generate_temp_cgi_query_files(snv_annotations: pd.DataFrame = None, cna_annotations: pd.DataFrame = None, translocs: pd.DataFrame = None):
304+
def generate_temp_cgi_query_files(snv_annotations: pd.DataFrame = None, cna_annotations: pd.DataFrame = None, translocs: pd.DataFrame = None, append_to_annotations: bool = True):
307305
"""
308306
Generate temporary CGI query files from annotations.
309307
@@ -315,16 +313,27 @@ def generate_temp_cgi_query_files(snv_annotations: pd.DataFrame = None, cna_anno
315313
header = "chr\tpos\tref\talt\tsample\n"
316314
try:
317315
if isinstance(snv_annotations, pd.DataFrame):
318-
with open("./tmp/snvs.ext", "w") as file1:
319-
file1.write(header)
320-
321-
uniques = snv_annotations[['alteration']].drop_duplicates()
322-
for indx, snv in uniques.iterrows():
323-
id = "SNV:"+snv['alteration']
324-
alt_split = snv['alteration'].split(':')
325-
row = alt_split[1]+'\t'+alt_split[2]+'\t'+alt_split[3]+'\t'+alt_split[4]+'\t'+id+'\n'
326-
file1.write(row)
327-
file1.close()
316+
if append_to_annotations:
317+
with open("./tmp/snvs.ext", "w") as file1:
318+
file1.write(header)
319+
320+
uniques = snv_annotations[['alteration']].drop_duplicates()
321+
for indx, snv in uniques.iterrows():
322+
id = "SNV:"+snv['alteration']
323+
alt_split = snv['alteration'].split(':')
324+
row = alt_split[1]+'\t'+alt_split[2]+'\t'+alt_split[3]+'\t'+alt_split[4]+'\t'+id+'\n'
325+
file1.write(row)
326+
file1.close()
327+
else:
328+
with open("./tmp/snvs.ext", "w") as file1:
329+
file1.write(header)
330+
331+
uniques = snv_annotations[['hugoSymbol', 'chromosome', 'position', 'reference_allele', 'sample_allele', 'tumorType', 'referenceGenome']].drop_duplicates()
332+
for indx, snv in uniques.iterrows():
333+
id = "SNV:"+snv['hugoSymbol']+':'+snv['chromosome']+':'+str(snv['position'])+':'+snv['reference_allele']+':'+snv['sample_allele']
334+
row = snv['chromosome']+'\t'+str(snv['position'])+'\t'+snv['reference_allele']+'\t'+snv['sample_allele']+'\t'+id+'\n' #+'\t'+cryptocode.encrypt(snv.samples, settings.CRYPTOCODE)+'\n'
335+
file1.write(row)
336+
file1.close()
328337

329338
if isinstance(cna_annotations, pd.DataFrame):
330339
header = "gene\tcna\tsample\n"

external_annotator.py

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,21 @@
1515
--cgijobid <str> Download results from CGI by jobid and apply annotations.
1616
--copy_number_alterations <str> Path to copy number alterations file.
1717
--somatic_variants <str> Path to somatic variants file.
18+
--output <str> Path to output file.
19+
1820
1921
Examples:
20-
python external_annotator.py --oncokbcna --copy_number_alterations path/to/cnas.tsv
21-
python external_annotator.py --oncokbsnv --somatic_variants path/to/snvs.tsv
22-
python external_annotator.py --cgiquery --somatic_variants path/to/snvs.tsv
23-
python external_annotator.py --cgiquery --copy_number_alterations path/to/cnas.tsv
24-
python external_annotator.py --cgiquery --cgijobid <jobid> --somatic_variants path/to/snvs.tsv
25-
python external_annotator.py --cgiquery --cgijobid <jobid> --copy_number_alterations path/to/cnas.tsv
22+
python external_annotator.py --oncokbcna --copy_number_alterations path/to/cnas.tsv --output path/to/output
23+
python external_annotator.py --oncokbsnv --somatic_variants path/to/snvs.tsv --output path/to/output
24+
python external_annotator.py --cgiquery --somatic_variants path/to/snvs.tsv --output path/to/output
25+
python external_annotator.py --cgiquery --copy_number_alterations path/to/cnas.tsv --output path/to/output
26+
python external_annotator.py --cgiquery --cgijobid <jobid> --somatic_variants path/to/snvs.tsv --output path/to/output
27+
python external_annotator.py --cgiquery --cgijobid <jobid> --copy_number_alterations path/to/cnas.tsv --output path/to/output
2628
'''
2729

2830
def main(**kwargs):
2931

30-
32+
output = kwargs.get("output", ".")
3133
if kwargs["oncokbcna"] and kwargs["copy_number_alterations"]:
3234

3335
cnas = pd.read_csv(kwargs["copy_number_alterations"], sep="\t")
@@ -46,7 +48,7 @@ def main(**kwargs):
4648
i = 0
4749
for c in chunks:
4850
i += 1
49-
query_oncokb_cnas_to_csv(c, i)
51+
query_oncokb_cnas_to_csv(c, output, i)
5052

5153

5254
if kwargs["oncokbsnv"] and kwargs["somatic_variants"]:
@@ -68,18 +70,18 @@ def main(**kwargs):
6870
i = 0
6971
for c in chunks:
7072
i += 1
71-
query_oncokb_somatic_mutations(c, i)
73+
query_oncokb_somatic_mutations(c, output, i)
7274

7375
if kwargs["cgiquery"] and kwargs["somatic_variants"]:
7476
snvs = pd.read_csv(kwargs["somatic_variants"], sep="\t", dtype='string')
7577

7678
if kwargs["cgijobid"]:
7779
jobid = kwargs["cgijobid"]
7880
else:
79-
generate_temp_cgi_query_files(snvs, None, None)
80-
jobid = launch_cgi_job_with_mulitple_variant_types("./tmp/snvs.ext",None, None, "OVSE", "hg38").replace('"', '')
81+
generate_temp_cgi_query_files(snv_annotations=snvs)
82+
jobid = launch_cgi_job_with_mulitple_variant_types(mutations_file="./tmp/snvs.ext", cancer_type="OVSE", reference="hg38").replace('"', '')
8183
time.sleep(30)
82-
while query_cgi_job(jobid, snvs) == 0:
84+
while query_cgi_job(jobid, output, snv_annotations=snvs) == 0:
8385
print("Waiting 30 seconds for the next try...")
8486
time.sleep(30)
8587

@@ -89,11 +91,11 @@ def main(**kwargs):
8991
if kwargs["cgijobid"]:
9092
jobid = kwargs["cgijobid"]
9193
else:
92-
generate_temp_cgi_query_files(None, cnas, None)
93-
jobid = launch_cgi_job_with_mulitple_variant_types(None, "./tmp/cnas.ext", None, "OVSE", "hg38").replace('"', '')
94+
generate_temp_cgi_query_files(cna_annotations=cnas)
95+
jobid = launch_cgi_job_with_mulitple_variant_types(cnas_file="./tmp/cnas.ext", cancer_type="OVSE", reference="hg38").replace('"', '')
9496

9597
time.sleep(30)
96-
while query_cgi_job(jobid, None, cnas) == 0:
98+
while query_cgi_job(jobid, output, cna_annotations=cnas) == 0:
9799
print("Waiting 30 seconds for the next try...")
98100
time.sleep(30)
99101

@@ -106,6 +108,8 @@ def add_arguments(parser):
106108
parser.add_argument('--cgijobid', type=str, help='Download results from CGI by jobid')
107109
parser.add_argument('--copy_number_alterations', type=str, help='Path to copy number alterations file')
108110
parser.add_argument('--somatic_variants', type=str, help='Path to somatic variants file')
111+
parser.add_argument('--output', type=str, default=".", help='Path to output directory for annotated files')
112+
109113

110114

111115
parser = argparse.ArgumentParser()

local_annotator.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -135,9 +135,9 @@ def process_sample(gname, cna_grp, snv_grps, output, cores, ascats):
135135

136136
if __name__ == "__main__":
137137
parser = argparse.ArgumentParser(description="Annotate genomic alterations.")
138-
parser.add_argument("--output", type=str, required=True, help="Path to output file")
139-
parser.add_argument("--somatic_variants", type=str, help="Path to somatic variants file")
140-
parser.add_argument("--copy_number_alterations", type=str, help="Path to copy number alterations file")
138+
parser.add_argument("--output", type=str, required=True, help="Path to output files")
139+
parser.add_argument("--somatic_variants", type=str, help="Path to somatic variants files")
140+
parser.add_argument("--copy_number_alterations", type=str, help="Path to copy number alterations files")
141141
parser.add_argument("--ascatestimates", type=str, required=True, help="Path to ASCAT estimates file")
142142
parser.add_argument("--cn_annotations", type=str, help="Path to filtered and annotated CNAs")
143143
parser.add_argument("--tumortype", type=str, default="HGSOC", help="Tumor type identifier (default: HGSOC)")

main.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,6 @@ def run_external_annotator(args):
5656

5757

5858
def main():
59-
parser = argparse.ArgumentParser(description="Run local and external annotators")
6059
parser = argparse.ArgumentParser(description="Run local and external annotators")
6160
parser.add_argument("--annotator", choices=["local", "external", "both"], required=True,
6261
help="Choose which annotator to run")

oncokb_annotator.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ def handle_treatments_oncokb(jsondata, alt_type, alteration):
3939
'description': description,
4040
'treatment': drugs,
4141
'level_of_evidence': level,
42+
'cgi_level':"",
4243
'citations': pmids,
4344
'tumorType': tumortype
4445
}))
@@ -65,7 +66,7 @@ def handle_drugs_field(jsondata):
6566
return None
6667

6768

68-
def query_oncokb_cnas_to_csv(cna_annotations: pd.DataFrame, i):
69+
def query_oncokb_cnas_to_csv(cna_annotations: pd.DataFrame, output, i):
6970

7071
"""
7172
Query OncoKB API to get annotations for copy number alterations (CNAs) and save the results to a CSV file.
@@ -146,16 +147,16 @@ def query_oncokb_cnas_to_csv(cna_annotations: pd.DataFrame, i):
146147
#print("Updated "+str(updatedf.count())+" CNAs")
147148
#cna_annotations.drop(columns=cna_annotations.columns[0], axis=1, inplace=True)
148149
header = False if i > 1 else True
149-
cna_annotations.to_csv("cna_annotated_oncokb.csv", mode="a", index=False, header=header, sep="\t", columns=['patient_id', 'sample_id', 'alteration', 'hugoSymbol', 'tumorType', 'consequence', 'oncogenic', 'mutationEffectDescription', 'gene_role', 'citationPMids', 'level_of_evidence', 'geneSummary', 'variantSummary', 'tumorTypeSummary'])
150+
cna_annotations.to_csv(output, mode="a", index=False, header=header, sep="\t", columns=['patient_id', 'sample_id', 'alteration', 'hugoSymbol', 'tumorType', 'consequence', 'oncogenic', 'mutationEffectDescription', 'gene_role', 'citationPMids', 'level_of_evidence', 'cgi_level', 'geneSummary', 'variantSummary', 'tumorTypeSummary'])
150151
trdf = pd.DataFrame(treatments)
151-
trdf.to_csv("treatments_oncokb.csv", mode="a", header=header, index=False, sep="\t")
152+
trdf.to_csv("treatments.csv", mode="a", header=header, index=False, sep="\t")
152153
else:
153154
print("Unable to request. Response: ", response.text)
154155

155156
return response
156157

157158

158-
def query_oncokb_somatic_mutations(snv_annotations: pd.DataFrame, i):
159+
def query_oncokb_somatic_mutations(snv_annotations: pd.DataFrame, output, i):
159160
"""
160161
Query OncoKB API to get annotations for somatic mutations and save the results to a CSV file.
161162
@@ -225,9 +226,9 @@ def query_oncokb_somatic_mutations(snv_annotations: pd.DataFrame, i):
225226

226227
print(snv_annotations)
227228
header = False if i > 1 else True
228-
snv_annotations.to_csv("snv_annotated_oncokb.csv", mode="a", header=header, index=False, sep="\t", columns=['patient_id', 'sample_id', 'alteration', 'hugoSymbol', 'tumorType', 'consequence', 'oncogenic', 'mutationEffectDescription', 'gene_role', 'citationPMids', 'level_of_evidence', 'geneSummary', 'variantSummary', 'tumorTypeSummary'])
229+
snv_annotations.to_csv(output, mode="a", header=header, index=False, sep="\t", columns=['patient_id', 'sample_id', 'alteration', 'hugoSymbol', 'tumorType', 'consequence', 'oncogenic', 'mutationEffectDescription', 'gene_role', 'citationPMids', 'level_of_evidence', 'cgi_level', 'geneSummary', 'variantSummary', 'tumorTypeSummary'])
229230
trdf = pd.DataFrame(treatments)
230-
trdf.to_csv("treatments_oncokb_snv.csv", header=header, mode="a", index=False, sep="\t")
231+
trdf.to_csv("treatments.csv", header=header, mode="a", index=False, sep="\t")
231232
#print("Updated " + str(len(snvdf)) + " CNAs")
232233
else:
233234
print("[ERROR] Unable to request. Response: ", print(response.text))

slurm_scripts/cna_annotation.sbatch

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,6 @@
55
#SBATCH --error=./logs/%A_%a.err
66
#SBATCH --cpus-per-task=1
77
#SBATCH --mem-per-cpu=5G
8-
#SBATCH --partition=general,evmbig
9-
#SBATCH --exclude=evm06,evm07,evm08,evm09,evm10,evmfull01,evmbig
10-
#SBATCH --array=1-850
118

129
echo $1
1310
ODANNOTATOR_PATH=""

slurm_scripts/snv_annotation.sbatch

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,6 @@
55
#SBATCH --error=./logs/%A_%a.err
66
#SBATCH --cpus-per-task=1
77
#SBATCH --mem-per-cpu=5G
8-
#SBATCH --partition=general,evmbig
9-
#SBATCH --exclude=evm06,evm07,evm08,evm09,evm10,evmfull01,evmbig
10-
#SBATCH --array=1-850
118

129
echo $1
1310
ODANNOTATOR_PATH=""

0 commit comments

Comments
 (0)