-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathfasta_sumvals.py
More file actions
73 lines (65 loc) · 2.66 KB
/
fasta_sumvals.py
File metadata and controls
73 lines (65 loc) · 2.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env python
import argparse, collections, json, os, sys
def get_val_by_key(fhdr, key):
val = None
for i, tok in enumerate(fhdr.split()):
if i > 0 and len(tok.split('=')) == 2:
k, v = tok.split('=')
if k == key:
assert val == None, 'The header {} has duplicated key {}'.format(fhdr, key)
val = v
return val
def rm_key_val(fhdr, key):
ret = []
for i, tok in enumerate(fhdr.split()):
if i > 0 and len(tok.split('=')) == 2:
k, v = tok.split('=')
if k != key:
ret.append(tok)
else:
ret.append(tok)
return ' '.join(ret)
def main():
parser = argparse.ArgumentParser(description = 'Read FASTA records from stdin, merge records with duplicated sequences while summing values of the key (where the key-value pair is of the form <key>=<val> in comments), and write the merged FASTA records to stdout. ',
formatter_class = argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--key', default = '', type = str,
help = 'The key by which the summation is performed (e.g., TPM for FASTA header with TPM=... in its comment). ')
parser.add_argument('--seq2hdrs', default = '', type = str,
help = 'The original sequence-to-header mapping is stored to this json file. Skip generating this json file if this param is set to the empty string. ')
args = parser.parse_args()
fhdr = None
uhdrs = []
uhdr2seq = {}
seq2hdrs = collections.defaultdict(list)
seq2val = collections.defaultdict(int)
for line in sys.stdin:
if line.startswith('>'):
if fhdr:
fseq = ''.join(fseq)
if fseq not in seq2hdrs:
uhdrs.append(uhdr)
uhdr2seq[uhdr] = fseq
seq2hdrs[fseq].append(fhdr)
seq2val[fseq] += val
fhdr = line.strip()
val = get_val_by_key(fhdr, args.key)
assert val, F'The header {line} does not have the key {args.key}!'
val = float(val)
uhdr = rm_key_val(fhdr, args.key)
fseq = []
else:
fseq.append(line.strip())
if fhdr:
fseq = ''.join(fseq)
if fseq not in seq2hdrs:
uhdrs.append(uhdr)
uhdr2seq[uhdr] = fseq
seq2hdrs[fseq].append(fhdr)
seq2val[fseq] += val
for uhdr in uhdrs:
fseq = uhdr2seq[uhdr]
print(F'''{uhdr} {args.key}={seq2val[fseq]}\n{(fseq)}''')
if args.seq2hdrs:
with open(args.seq2hdrs, 'w') as file:
json.dump(seq2hdrs, file, indent=2)
if __name__ == '__main__': main()