1
+ '''
2
+ Map Pfam clan.
3
+ Count Pfam number.
4
+ '''
5
+
6
+ def select_pfam (infile ,outfile ):
7
+ with open (outfile ,'wt' ) as out :
8
+ with open (infile ,'rt' ) as f :
9
+ for line in f :
10
+ if line .startswith ('smorf' ):
11
+ out .write (line )
12
+ else :
13
+ linelist = line .strip ().split ('\t ' )
14
+ if len (linelist ) > 10 and linelist [10 ].startswith ('pfam' ):
15
+ out .write (line )
16
+
17
+ # map Pfam clan A
18
+ def pfam2clan (in_file1 ,in_file2 ,out_file ):
19
+ pfam_dict = {}
20
+
21
+ with open (in_file1 ,'rt' ) as f1 :
22
+ for line in f1 :
23
+ linelist = line .strip ().split ('\t ' )
24
+ pfam = linelist [0 ].replace ('PF' ,'' )
25
+ pfam_dict [pfam ] = f'{ linelist [1 ]} \t { linelist [2 ]} \t { linelist [4 ]} '
26
+
27
+ with open (out_file ,'wt' ) as out :
28
+ out .write (f'smorf\t cdd\t query_length\t score\t align_length\t identity\t evalue\t target_length\t tcov\t number\t accession\t short_name\t description\t PSSM_Length\t clan_id\t clan\t short_description\n ' )
29
+ with open (in_file2 ,'rt' ) as f2 :
30
+ for line in f2 :
31
+ if line .startswith ('smorf' ):
32
+ continue
33
+ else :
34
+ line = line .replace ('\n ' ,'' )
35
+ linelist = line .split ('\t ' )
36
+ pf = linelist [10 ].replace ('pfam' ,'' )
37
+ if pf in pfam_dict .keys ():
38
+ out .write (f'{ line } \t { pfam_dict [pf ]} \n ' )
39
+ else :
40
+ out .write (f'{ line } \n ' )
41
+
42
+ # map Pfam clan C
43
+ def map_clan_c (in_file1 ,in_file2 ,out_file ):
44
+ pfam_dict = {}
45
+
46
+ with open (in_file1 ,'rt' ) as f1 :
47
+ for line in f1 :
48
+ linelist = line .strip ().split ('\t ' )
49
+ pfam_dict [linelist [1 ]] = linelist [2 ]
50
+
51
+ with open (out_file ,'wt' ) as out :
52
+ with open (in_file2 ,'rt' ) as f2 :
53
+ for line in f2 :
54
+ line = line .strip ()
55
+ if line .startswith ('smorf' ):
56
+ out .write (f'{ line } \t clan_description\n ' )
57
+ else :
58
+ linelist = line .split ('\t ' )
59
+ if len (linelist ) == 17 and linelist [- 2 ] in pfam_dict .keys ():
60
+ out .write (f'{ line } \t { pfam_dict [linelist [- 2 ]]} \n ' )
61
+ else :
62
+ out .write (f'{ line } \n ' )
63
+
64
+ # group ribosomal and unknown proteins
65
+ def modify (infile ,outfile ):
66
+ with open (outfile ,'wt' ) as out :
67
+ with open (infile ,'rt' ) as f :
68
+ for line in f :
69
+ line = line .strip ()
70
+ if line .startswith ('smorf' ):
71
+ out .write (f'group\t { line } \n ' )
72
+ else :
73
+ linelist = line .split ('\t ' )
74
+ if len (linelist ) > 16 :
75
+ if len (linelist ) == 17 :
76
+ if 'ribosomal' in linelist [- 1 ].casefold ():
77
+ out .write (f'Ribosomal protein\t { line } \n ' )
78
+ elif 'uncharacterised' in linelist [- 1 ].casefold () or 'unknown' in linelist [- 1 ].casefold ():
79
+ out .write (f'Domain of unknown function\t { line } \n ' )
80
+ else :
81
+ group = linelist [- 1 ].split (',' )[0 ]
82
+ out .write (f'{ group } \t { line } \n ' )
83
+ else :
84
+ if 'ribosomal' in linelist [- 2 ].casefold ():
85
+ out .write (f'Ribosomal protein\t { line } \n ' )
86
+ else :
87
+ out .write (f'{ linelist [- 1 ]} \t { line } \n ' )
88
+ else :
89
+ out .write (f'\t { line } \n ' )
90
+
91
+ def count_pfam (infile ,outfile1 ,outfile2 ):
92
+ group = {}
93
+ pfam = {}
94
+ all = set ()
95
+
96
+ pfam_info = {}
97
+
98
+ out1 = open (outfile1 ,'wt' )
99
+ out2 = open (outfile2 ,'wt' )
100
+
101
+ with open (infile ,'rt' ) as f :
102
+ for line in f :
103
+ if line .startswith ('group' ):
104
+ continue
105
+ else :
106
+ linelist = line .strip ().split ('\t ' )
107
+ if linelist [0 ] not in group .keys ():
108
+ group [linelist [0 ]] = set ()
109
+ group [linelist [0 ]].add (linelist [1 ])
110
+ if linelist [11 ] not in pfam .keys ():
111
+ pfam [linelist [11 ]] = set ()
112
+ pfam_info [linelist [11 ]] = f'{ linelist [0 ]} \t { linelist [12 ]} \t { linelist [13 ]} \t { linelist [15 ]} \t { linelist [16 ]} \t { linelist [17 ]} '
113
+ pfam [linelist [11 ]].add (linelist [1 ])
114
+
115
+ all .add (linelist [1 ])
116
+
117
+ out1 .write (f'group\t count\t all_count\n ' )
118
+ for key ,value in group .items ():
119
+ number = len (value )
120
+ all_number = len (all )
121
+ out1 .write (f'{ key } \t { number } \t { all_number } \n ' )
122
+
123
+ out2 .write (f'pfam\t count\t all_count\t group\t short_name\t description\t clan_id\t clan\t short_description\n ' )
124
+ for key ,value in pfam .items ():
125
+ number = len (value )
126
+ all_number = len (all )
127
+ out2 .write (f'{ key } \t { number } \t { all_number } \t { pfam_info [key ]} \n ' )
128
+
129
+ out1 .close ()
130
+ out2 .close ()
131
+
132
+ infile1 = 'multi_genus_3_cdd.tsv'
133
+ infile2 = '90AA_multi_habitat_cdd.tsv'
134
+ outfile1 = 'multi_genus_3_pfam.tsv'
135
+ outfile2 = '90AA_multi_habitat_pfam.tsv'
136
+ select_pfam (infile1 ,outfile1 )
137
+ select_pfam (infile2 ,outfile2 )
138
+
139
+ pfama = 'Pfam-A.clans.tsv'
140
+ outfile3 = 'multi_genus_3_pfam_clan_A.tsv'
141
+ outfile4 = '90AA_multi_habitat_pfam_clan_A.tsv'
142
+ pfam2clan (pfama ,outfile1 ,outfile3 )
143
+ pfam2clan (pfama ,outfile2 ,outfile4 )
144
+
145
+ pfamc = 'pfam-c_format.txt'
146
+ outfile5 = 'multi_genus_3_pfam_clan_A_C.tsv'
147
+ outfile6 = '90AA_multi_habitat_pfam_clan_A_C.tsv'
148
+ map_clan_c (pfamc ,outfile5 ,outfile6 )
149
+
150
+ outfile7 = 'multi_genus_3_pfam_clan_A_C_group.tsv'
151
+ outfile8 = '90AA_multi_habitat_pfam_clan_A_C_group.tsv'
152
+ modify (outfile7 ,outfile8 )
153
+
154
+ outfile9 = 'multi_genus_3_pfam_group_count.tsv'
155
+ outfile10 = 'multi_genus_3_pfam_count.tsv'
156
+ outfile11 = '90AA_multi_habitat_pfam_group_count.tsv'
157
+ outfile12 = '90AA_multi_habitat_pfam_count.tsv'
158
+ count_pfam (outfile7 ,outfile9 ,outfile10 )
159
+ count_pfam (outfile8 ,outfile11 ,outfile12 )
0 commit comments