def rename(infile1, infile2, outfile, n, prefix):
    """Generate the original name -> 90AA name rename list.

    Peptides are named ``<prefix>.XXX_XXX_XXX`` (for example
    ``GMSC10.90AA.000_000_000``), where the X's are a zero-padded,
    nine-digit running number.

    - Numbers are assigned in order of increasing number of copies, so the
      lower the number, the fewer copies of that peptide were present in
      the input data.
    - If the number of copies is the same, numbers are assigned in
      alphabetical order of the peptide sequences.

    Parameters
    ----------
    infile1 : path
        FASTA file of peptides to rename; read through ``fasta_iter``,
        which yields ``(ID, seq)`` pairs.
    infile2 : path
        gzip-compressed, tab-separated table with exactly two columns
        (cluster, member). The number of rows per cluster is the peptide's
        copy number.
    outfile : path
        xz-compressed output; one ``<ID>\\t<new name>`` line per peptide.
    n : int
        Number given to the first (least abundant) peptide.
    prefix : str
        Text placed before the running number, e.g. ``'GMSC10.90AA'``.

    Raises
    ------
    KeyError
        If an ID in ``infile1`` never appears in the first column of
        ``infile2``.
    ValueError
        If a line of ``infile2`` does not have exactly two columns.
    """
    # Count how many rows (members) each cluster has.
    number = {}
    with gzip.open(infile2, "rt") as f2:
        for line in f2:
            cluster, _member = line.strip().split("\t")
            number[cluster] = number.get(cluster, 0) + 1

    # A plain dict lookup is deliberate: an ID without a cluster entry is an
    # input inconsistency and should fail loudly rather than count as zero.
    records = [(number[ID], seq, ID) for ID, seq in fasta_iter(infile1)]

    # Order by copy number first, then by sequence to break ties.
    records.sort(key=itemgetter(0, 1))

    with lzma.open(outfile, "wt") as out:
        for index, (_count, _seq, ID) in enumerate(records, start=n):
            # The three slices below assume the number fits in nine digits.
            nf = f"{index:09}"
            out.write(f"{ID}\t{prefix}.{nf[:3]}_{nf[3:6]}_{nf[6:9]}\n")
def _open_text(path):
    """Open a gzip- or xz-compressed file for text reading.

    The format is detected from the file's leading magic bytes; anything
    that is not xz is handed to gzip, which was the only format accepted
    before.
    """
    with open(path, "rb") as probe:
        magic = probe.read(6)
    if magic == b"\xfd7zXZ\x00":
        return lzma.open(path, "rt")
    return gzip.open(path, "rt")


def rename_all(infile1, infile2, outfile):
    """Generate the original name - 100AA - 90AA rename list.

    Parameters
    ----------
    infile1 : path
        xz-compressed, tab-separated table with two columns:
        original name, 90AA name.
    infile2 : path
        gzip- or xz-compressed, tab-separated table with two columns:
        original name, 100AA name.
    outfile : path
        xz-compressed output. For every row of ``infile2`` whose original
        name is present in ``infile1``, one line
        ``<original name>\\t<100AA name>\\t<90AA name>`` is written, in
        ``infile2`` order.

    Raises
    ------
    ValueError
        If a line of either input does not have exactly two columns.
    """
    # original name -> 90AA name
    name = {}
    with lzma.open(infile1, "rt") as f1:
        for line in f1:
            old, new = line.strip().split("\t")
            name[old] = new

    with _open_text(infile2) as f2, lzma.open(outfile, "wt") as out:
        for line in f2:
            old, new = line.strip().split("\t")
            if old in name:
                # `name` is keyed by the original name, so look up `old`
                # (the 100AA name in `new` is not a key of this table).
                out.write(f"{old}\t{new}\t{name[old]}\n")
def getfaa(infile1, infile2, outfile):
    """Generate the renamed 90AA protein FASTA (.faa).

    Parameters
    ----------
    infile1 : path
        xz-compressed, tab-separated table with two columns:
        original name, new name.
    infile2 : path
        FASTA file read through ``fasta_iter``, which yields
        ``(ID, seq)`` pairs keyed by original name.
    outfile : path
        xz-compressed FASTA output; records keep the order of ``infile2``
        and carry the new name as header.

    Raises
    ------
    KeyError
        If an ID in ``infile2`` has no entry in ``infile1``.
    ValueError
        If a line of ``infile1`` does not have exactly two columns.
    """
    # original name -> new name
    name = {}
    with lzma.open(infile1, "rt") as f1:
        for line in f1:
            old, new = line.strip().split("\t")
            name[old] = new

    # The context manager closes the output even if a lookup below fails.
    with lzma.open(outfile, "wt") as out:
        for ID, seq in fasta_iter(infile2):
            out.write(f">{name[ID]}\n{seq}\n")
def getfna(infile1, infile2, outfile):
    """Generate the renamed 90AA nucleotide FASTA (.fna).

    Parameters
    ----------
    infile1 : path
        xz-compressed, tab-separated table with three columns:
        original name, 100AA name, 90AA name.
    infile2 : path
        FASTA file read through ``fasta_iter``, which yields
        ``(ID, seq)`` pairs keyed by 100AA name.
    outfile : path
        xz-compressed FASTA output holding every sequence whose 100AA name
        is listed in ``infile1``, renamed to its 90AA name and sorted by
        that name.

    Raises
    ------
    ValueError
        If a line of ``infile1`` does not have exactly three columns.
    """
    # 100AA name -> 90AA name
    table = {}
    with lzma.open(infile1, "rt") as f1:
        for line in f1:
            _old, name100, name90 = line.strip().split("\t")
            table[name100] = name90

    # 90AA name -> sequence, for the sequences that have a 90AA name.
    fasta = {}
    for ID, seq in fasta_iter(infile2):
        name90 = table.get(ID)
        if name90 is not None:
            fasta[name90] = seq

    # Opened only once everything is collected; the context manager closes
    # the file even if writing fails.
    with lzma.open(outfile, "wt") as out:
        for ID, seq in sorted(fasta.items()):
            out.write(f">{ID}\n{seq}\n")
# Inputs (expected in the working directory).
INPUT_FILE_1 = "metag_ProG_nonsingleton_0.9_clu_rep.faa.gz"
INPUT_FILE_2 = "metag_ProG_nonsingleton_0.9_clu.tsv.gz"
INPUT_FILE_3 = "100AA_rename.tsv.xz"
INPUT_FILE_4 = "100AA_GMSC.fna.xz"

# Outputs (written to the working directory).
OUTPUT_FILE_1 = "90AA_rename.tsv.xz"
OUTPUT_FILE_2 = "90AA_rename_all.tsv.xz"
OUTPUT_FILE_3 = "90AA_GMSC.faa.xz"
OUTPUT_FILE_4 = "90AA_GMSC.fna.xz"

# Each step consumes files produced by the previous ones, so the order matters.
rename(INPUT_FILE_1, INPUT_FILE_2, OUTPUT_FILE_1, 0, 'GMSC10.90AA')
rename_all(OUTPUT_FILE_1, INPUT_FILE_3, OUTPUT_FILE_2)
getfaa(OUTPUT_FILE_1, INPUT_FILE_1, OUTPUT_FILE_3)
getfna(OUTPUT_FILE_2, INPUT_FILE_4, OUTPUT_FILE_4)