Skip to content

Commit 93acd5c

Browse files
authored
Merge pull request #10 from bbi-lab/v1.0.1
functional class logic
2 parents 3b4bc11 + 842d559 commit 93acd5c

File tree

2 files changed

+25
-12
lines changed

2 files changed

+25
-12
lines changed

bin/scoreSNVs

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ import sge_target
1818
from scipy.stats import norm
1919

2020
# Function to assign the most important SO summary term
21-
def get_simplified_consequence(vep_terms):
22-
consequence_df = pd.read_csv("/net/bbi/vol1/data/sge-analysis/etc/extended_ensembl_consequence.tsv", sep='\t')
21+
def get_simplified_consequence(vep_terms, ensemblfile):
22+
consequence_df = pd.read_csv(ensemblfile, sep='\t')
2323
# Create a dictionary mapping terms to their SO summary term (prioritizing importance)
2424
mapping_dict = (
2525
consequence_df.sort_values(by="Importance", ascending=False)
@@ -54,13 +54,13 @@ def saveZDistAndThresholds(args, mu, std, lthresh, uthresh):
5454

5555

5656
def makeFunctionalClasses(args, scoredf, exclude_nonsense_coords):
57-
neutral_cnd = ((scoredf["simplified_consequence"].isin(["synonymous_variant", "intron_variant"])) &
57+
neutral_cnd = ((scoredf["consequence"].isin(["synonymous_variant", "intron_variant"])) &
5858
(scoredf["variant_qc_flag"] == "PASS") &
5959
(~scoredf["pos"].isin(exclude_nonsense_coords)))
60-
nonsense_cnd = ((scoredf["simplified_consequence"].isin(["stop_gained",])) &
60+
nonsense_cnd = ((scoredf["consequence"].isin(["stop_gained",])) &
6161
(scoredf["variant_qc_flag"] == "PASS") &
6262
(~scoredf["pos"].isin(exclude_nonsense_coords)))
63-
nonsense_cnd_nofilt = ((scoredf["simplified_consequence"].isin(["stop_gained",])) &
63+
nonsense_cnd_nofilt = ((scoredf["consequence"].isin(["stop_gained",])) &
6464
(scoredf["variant_qc_flag"] == "PASS"))
6565
nonsense_count = scoredf[nonsense_cnd].shape[0]
6666
nonsense_count_nofilt = scoredf[nonsense_cnd_nofilt].shape[0]
@@ -382,6 +382,9 @@ def main():
382382
help='Gene to score')
383383
parser.add_argument('-V', '--vepdir', required=False,
384384
help="Path to directory with VEP files")
385+
parser.add_argument('-e', '--ensemblfile', required=False,
386+
default="/net/bbi/vol1/data/sge-analysis/etc/extended_ensembl_consequence.tsv",
387+
help="Path to ensembl consequence groupings in TSV format")
385388
parser.add_argument('-t', '--targetfile', required=False,
386389
help='Path to target file (can be auto detected)')
387390
parser.add_argument('-v', '--verbose', required=False, default=False,
@@ -477,11 +480,21 @@ def main():
477480
scoredf = scoredf.merge(annotdf[["pos_id", "Consequence",
478481
"amino_acid_change"]], on="pos_id")
479482

480-
scoredf["simplified_consequence"] = scoredf["Consequence"].apply(get_simplified_consequence)
481-
483+
scoredf["simplified_consequence"] = scoredf["Consequence"].apply(get_simplified_consequence, ensemblfile=args.ensemblfile)
484+
scoredf = scoredf.drop(columns=["Consequence"]).rename(columns={'simplified_consequence': 'consequence',
485+
'allele': 'alt'})
486+
482487
scoredf["variant_qc_flag"] = "PASS"
483-
scoredf = makeFunctionalClasses(args, scoredf, exclude_nonsense_coords)
484-
488+
if scoredf.shape[0] >= 1000:
489+
if args.verbose:
490+
sys.stderr.write("INFO: Dataset has at least 1000 variants (%d); proceeding with functional class assignment\n" % scoredf.shape[0])
491+
scoredf = makeFunctionalClasses(args, scoredf, exclude_nonsense_coords)
492+
else:
493+
if args.verbose:
494+
sys.stderr.write("INFO: Dataset has less than 1000 variants (%d); not performing functional class assignment\n" % scoredf.shape[0])
495+
scoredf["functional_consequence"] = ""
496+
scoredf["functional_consequence_zscore"] = ""
497+
485498
# now add counts
486499
snvlibs = countsdf.groupby(['pos_id'])['snvlib_count'].apply(lambda x: pd.Series(list(x))).unstack().reset_index().rename(columns={0: 'snvlib_lib1', 1: 'snvlib_lib2'})
487500

@@ -503,8 +516,8 @@ def main():
503516
).merge(d13r3, on="pos_id")
504517

505518
scoredf = scoredf[[
506-
"chrom", "pos", "ref", "allele", "exon", "target",
507-
"simplified_consequence", "score", "standard_error", "95_ci_upper", "95_ci_lower",
519+
"chrom", "pos", "ref", "alt", "exon", "target",
520+
"consequence", "score", "standard_error", "95_ci_upper", "95_ci_lower",
508521
"amino_acid_change", "functional_consequence",
509522
"functional_consequence_zscore", "variant_qc_flag",
510523
"snvlib_lib1", "snvlib_lib2",

etc/extended_ensembl_consequence.tsv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ Intronic 135 intron_variant A transcript variant occurring within an intron
4343
NMD_transcript_variant NMD_transcript_variant A variant in a transcript that is the target of NMD SO:0001621 NMD transcript variant MODIFIER 130
4444
non_coding_transcript_variant non_coding_transcript_variant A transcript variant of a non coding RNA gene SO:0001619 Non coding transcript variant MODIFIER 120
4545
coding_transcript_variant coding_transcript_variant A transcript variant of a protein coding gene SO:0001968 Coding transcript variant MODIFIER 110
46-
upstream_gene_variant upstream_gene_variant A sequence variant located 5' of a gene SO:0001631 Upstream gene variant MODIFIER 100
46+
upstream_gene_variant upstream_gene_variant A sequence variant located 5' of a gene SO:0001631 Upstream gene variant MODIFIER 100 upstream_gene_variant
4747
downstream_gene_variant downstream_gene_variant A sequence variant located 3' of a gene SO:0001632 Downstream gene variant MODIFIER 90
4848
TFBS_ablation TFBS_ablation A feature ablation whereby the deleted region includes a transcription factor binding site SO:0001895 TFBS ablation MODIFIER 80
4949
TFBS_amplification TFBS_amplification A feature amplification of a region containing a transcription factor binding site SO:0001892 TFBS amplification MODIFIER 70

0 commit comments

Comments
 (0)