bertax_training/preprocessing/filter_fragments.py at c57316560ca5755cb9c195a4a04b8a91e6674d4e · rnajena/bertax_training · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import json
import argparse
from ete3 import NCBITaxa

offline_db = "./taxa.sqlite"

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', nargs='+')
    parser.add_argument('taxid', nargs='+', type=int)
    parser.add_argument('--out_prefix', default='filtered_')
    parser.add_argument('--save_other', action='store_true')
    parser.add_argument('--reverse', action='store_true')
    args = parser.parse_args()
    if (len(args.input) == 2):
        fragments = json.load(open(args.input[0]))
        species_list = [int(line.strip()) for line in
                        open(args.input[1]).readlines()]
        assert len(species_list) == len(fragments)
        iterator = zip(fragments, species_list)
        out_mode = 'json'
        print('json + txt ')
    elif (len(args.input) == 1):
        from Bio.SeqIO import parse
        records = parse(open(args.input[0]), 'fasta')
        iterator = ((str(r.seq), int(r.id)) for r in records)
        out_mode = 'fasta'
    else:
        raise Exception(
            'input either has to be a json and a txt-list or a fasta')
    print(('json + txt' if out_mode == 'json' else 'fasta')
          + ' has been provided as input, output will be of the same format')
    ncbi = NCBITaxa(dbfile=offline_db)
    species_filter = []
    # if (len(args.taxid) == 1 and str(args.taxid[0]).endswith('%')):
    #     perc = int(str(args.taxid[0])[:str(args.taxid[0]).index('%')])
    #     print(f'filtering {perc}% of all families')
    #     families = []
    #     for _, sp in iterator:
    #         fam = [_ for _ in ncbi.get_lineage(sp)
    #                if ncbi.get_rank([_])[_] == 'family']
    #         if len(fam) >= 1:
    #             families.append(fam[0])
    #     species_ =
    print('filtering ' + ','.join(ncbi.translate_to_names(args.taxid)))
    for taxid in args.taxid:
        try:
            species_filter.extend(ncbi.get_descendant_taxa(taxid, True))
        except ValueError:
            print('could not get descendant taxa of ' + ','.join(
                ncbi.translate_to_names([taxid]))
                + ', filtering only for this taxon; ')
            species_filter.append(taxid)
    filtered_fragments = []
    filtered_species = []
    filtered_fragments_other = []
    filtered_species_other = []
    for fragment, species in iterator:
        if ((not args.reverse and species not in species_filter) or (
                args.reverse and species in species_filter)):
            filtered_fragments.append(fragment)
            filtered_species.append(species)
        elif (args.save_other):
            filtered_fragments_other.append(fragment)
            filtered_species_other.append(species)
    tuples = [(filtered_fragments, filtered_species, '')]
    if (args.save_other):
        tuples.append((filtered_fragments_other, filtered_species_other,
                       '_other'))
    if (out_mode == 'json'):
        for fragments, species, suffix in tuples:
            json.dump(fragments, open(
                f'{args.out_prefix}_fragments{suffix}.json', 'w'))
            with open(f'{args.out_prefix}_species{suffix}.txt', 'w') as f:
                for sp in species:
                    f.write(str(sp) + '\n')
    else:
        for fragments, species, suffix in tuples:
            with open(args.out_prefix + suffix + '.fa', 'w') as f:
                for f, s in zip(fragments, species):
                    f.write(f'>{s}\n{f}\n')