seqExtract/rescue.py at master · sym24/seqExtract · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python

import os
import sys
from pybedtools import BedTool
import pysam
import csv

def get_cosmic_info(target_file):
    """Extract cosmic genes approved symbol into a set and
    a dictionary contains chromsome location
    """
    chrom = {}
    gene_set = set()
    with open(target_file, 'r') as csvfile:
        incsv = csv.reader(csvfile)
        hasheader = csv.Sniffer().has_header(csvfile.read(1024))
        csvfile.seek(0)
        if hasheader:
            next(incsv)
        for line in incsv:
            gene_set.add(line[0])
            if line[0] not in chrom:
                chrom[line[0]] = line[3]
        if 'Gene Symbol' in gene_set:
            print 'Failed to skip first row of csv file'
    print 'Total number of genes: ', len(gene_set)
    return gene_set, chrom

def rescue_miss_gene(miss_genes, hugo_table):
    """Rescue missing genes using hgnc_table
    by replacing old genes with new gene name
    """
    print '...Rescue missing genes...'
    newdict = {}
    # Store old gene names as dic key, new gene names and
    with open(hugo_table, 'r') as fi1:
        for line in csv.reader(fi1, dialect='excel-tab'):
            old = line[2].split(', ') + line[3].split(', ')
            for ele in old:
                if ele in newdict:
                    newdict[ele].append(line[0])
                else:
                    newdict[ele] = [line[0]]
    new_list = []
    still_miss_list = []
    new_gene_num = 0
    for oldgene in miss_genes:
        if oldgene in newdict:
            new_gene_num += 1
            for ele in newdict[oldgene]:
                new_list.append(ele)
        else:
            still_miss_list.append(oldgene)
    still_miss_set = set(still_miss_list)
    new_set = set(new_list)
    print 'New genes set: ', ', '.join(new_set)
    print 'Still missing old genes: ', ', '.join(still_miss_set)
    print '%d old genes found new names: '%new_gene_num
    return (new_set, still_miss_set, newdict, new_gene_num)

def check_rescue_bounds(miss_gene, chrom, newdict, new_set, buffer, gtf_file):
    """
    Extract rescued genes' bounds and check chromsome match to initial cosmic genes
    """
    print '...Get rescued gene boundaries...'
    bounds = {}
    for feature in BedTool(gtf_file):
        if feature[2] == 'exon' and feature.attrs.has_key('gene_id'):
             gene = feature.attrs['gene_id']
             if gene in new_set:
                 startp = int(feature.start) + 1 - buffer
                 endp = int(feature.stop) + buffer
                 try:
                     # update the starting and end point
                     if startp < bounds[gene][1]:
                         bounds[gene][1] = startp
                     if endp > bounds[gene][2]:
                         bounds[gene][2] = endp
                 except KeyError:
                     # add new key
                     bounds[gene] = [feature[0], startp, endp]

    failed_rescue = []
    new2old = {}
    for oldgene in miss_gene:
        if oldgene in newdict:
            print '{oldgene: newgenes}  {%s: %s}'%(oldgene, newdict[oldgene])
            flag1 = flag2 = 0
            for newgene in newdict[oldgene]:
                if newgene in bounds:
                    coord = bounds[newgene]
                    if coord[0][3:] == chrom[oldgene]:
                        print '...chromsome matches...old gene %s rescued: '%oldgene
                        print '...new gene %s is extracted t'%newgene
                        # if chromsome match, add old gene to new gene bounds
                        bounds[newgene].append(oldgene)
                        if newgene not in new2old:
                            new2old[newgene]=oldgene
                        else:
                            print '......!!!New gene %s already in bounds!!!...... '%newgene
                            print 'New gene %s ==> Existing old gene %s'%\
                            (newgene, new2old[newgene])
                    else:
                        flag2 += 1
                        print 'Sequence chromsome does not match. Delete key', newgene
                        del bounds[newgene]
                        if flag2 == len(newdict[oldgene]):
                            failed_rescue.append(oldgene)
                else:
                    flag1 += 1
                    print 'Unable to extract bounds for ', newgene
                    if flag1 == len(newdict[oldgene]):
                        failed_rescue.append(oldgene)
        else:
            print 'Old gene %s is not found in new gene dictionary'%oldgene

    return (bounds, failed_rescue)

def extract_rescue_seq(target_fa, ref_file, bounds):
    """
    Extract genes from reference genome and write it to output file.
    """
    print '...Extract rescued gene sequence...'
    with open(target_fa, 'a') as fi:
        ref_fasta = pysam.Fastafile(ref_file)
        for gene, coord in bounds.iteritems():
            seq = ref_fasta.fetch(coord[0], coord[1], coord[2])
            fi.write("%s%s(%s) %s:%d-%d\n%s\n"\
            %('>',gene,', '.join(coord[3:]),coord[0],coord[1],coord[2],seq))

if __name__ == '__main__':
    print 'rescue funtions for sequence extraction'