
Commit ea176a6

Major refactoring of auxiliary functions (#10)
* refactor map_habitat.py: remove unnecessary code repetition and improve readability, making the code cleaner
* refactor filter_length.py: minor changes to improve code readability
* fix problem with the merging step
* fix format of the table: previously, one column was separated by multiple spaces, which caused one of its values to be read as NA and broke the pandas merge procedure (see the sketch below the commit header)
* refactor and fix column renaming: column names were previously associated with the wrong float-valued columns; this is now fixed by always placing these columns as the first two
* fix variable naming
* refactor map_quality.py: fix some internal functions, reduce redundant work, and improve readability
* fix error in the loop meant to include only small ORFs: the loop previously included all sequences regardless of the earlier tests (length and already-seen); it now checks the length first, then the seen test, and only adds unseen sequences to the dictionary, avoiding overwriting and conflicts
* refactor and enhance the deeplca function: improve readability and reduce the complexity and time needed to compute the LCA for each smORF
* Update map_taxonomy.py
* fix error in the format of one variable
1 parent 0477159 commit ea176a6
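The table-format fix in the message above is worth a concrete illustration. Below is a minimal sketch of the failure mode described, with made-up sequence IDs and column names; the whitespace-regex read shown is only one possible remedy, not necessarily the one applied in this commit. When one column is separated by a run of spaces instead of a tab, pandas reads the row as a single field, fills the other column with NaN, and a merge keyed on that column then breaks.

import io

import pandas as pd

# Made-up rows: the first is tab-separated, the second uses a run of
# spaces between its two fields, mimicking the malformed table.
raw = "GMSC10.SMORF.000.000.000.001\t0.9\nGMSC10.SMORF.000.000.000.002    0.8\n"

# Read with sep='\t': the space-separated row parses as a single field,
# so its second column becomes NaN, which breaks a later merge on it.
broken = pd.read_csv(io.StringIO(raw), sep='\t', header=None,
                     names=['sseqid', 'score'])
print(broken['score'].isna().sum())  # 1

# Read splitting on any whitespace run: both rows parse correctly.
fixed = pd.read_csv(io.StringIO(raw), sep=r'\s+', header=None,
                    names=['sseqid', 'score'])
print(fixed['score'].isna().sum())   # 0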

7 files changed: +267 −159 lines changed

7 files changed

+267
-159
lines changed

gmsc_mapper/filter_length.py

Lines changed: 25 additions & 7 deletions

@@ -1,22 +1,40 @@
-def filter_length(queryfile,tmpdirname,N):
+message_error_all_long = '''GMSC-mapper Error: Input sequences are all more than 303nt or 100aa,
+which will be filtered. There are some options:
+1.If you don't want to filter, please use --nofilter flag.
+2.Please check if your input consist of contigs, which should
+use -i not --nt-genes or --aa-genes as input.
+'''
+
+message_error_longer='''GMSC-mapper Warning: Input has seqences more than 303nt or 100aa,
+which will be filtered. If you don't want to filter,
+please use --nofilter flag.
+'''
+
+
+def filter_length(queryfile, tmpdirname, N):
     import sys
+
     from os import path
     from .fasta import fasta_iter
-    filtered_file = path.join(tmpdirname,"filtered.faa")
+
+    filtered_file = path.join(tmpdirname, "filtered.faa")
 
-    with open(filtered_file,'wt') as of:
+    with open(filtered_file, 'wt') as of:
         all_longer_flag = 1
         longer_exist_flag = 0
-        for ID,seq in fasta_iter(queryfile):
+
+        for ID, seq in fasta_iter(queryfile):
             if len(seq) < N:
                 all_longer_flag = 0
                 of.write(f'>{ID}\n{seq}\n')
             else:
                 longer_exist_flag = 1
+
     if all_longer_flag:
-        sys.stderr.write("GMSC-mapper Error: Input sequences are all more than 303nt or 100aa,which will be filtered. 1.If you don't want to filter,please use --nofilter flag. 2.Please check if your input is contigs,which should use -i not --nt-genes or --aa-genes as input.\n")
+        sys.stderr.write(message_error_all_long)
         sys.exit(1)
     else:
         if longer_exist_flag:
-            print("GMSC-mapper Warning: Input has seqences more than 303nt or 100aa,which will be filtered.If you don't want to filter,please use --nofilter flag.\n")
-    return filtered_file
+            print(message_error_longer)
+
+    return filtered_file
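For reference, a usage sketch of the refactored function; the input path is hypothetical, and N is the length cutoff (e.g. 100 for amino-acid input, per the messages above):

import tempfile

from gmsc_mapper.filter_length import filter_length

# Writes sequences shorter than N to <tmpdir>/filtered.faa and returns
# that path; exits with message_error_all_long if nothing passes.
with tempfile.TemporaryDirectory() as tmpdir:
    filtered = filter_length('query.faa', tmpdir, 100)  # hypothetical input
    print(filtered)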

gmsc_mapper/map_habitat.py

Lines changed: 57 additions & 21 deletions

@@ -1,39 +1,75 @@
 import pandas as pd
+
 from os import path
 
-def smorf_habitat(outdir,habitatfile,resultfile):
-    habitat_file = path.join(outdir,"habitat.out.smorfs.tsv")
 
-    result = pd.read_csv(resultfile,sep='\t',header=None)
-    result = result.rename(columns={0:'qseqid',1:'sseqid'})
-    if habitatfile.endswith('.gz'):
-        reader = pd.read_csv(habitatfile,compression="gzip",sep="\t",chunksize=5000000,header=None)
-    if habitatfile.endswith('.xz'):
-        reader = pd.read_csv(habitatfile,compression="xz",sep="\t",chunksize=5000000,header=None)
-    if habitatfile.endswith('.bz2'):
-        reader = pd.read_csv(habitatfile,compression="bz2",sep="\t",chunksize=5000000,header=None)
-    else:
-        reader = pd.read_csv(habitatfile,sep="\t",chunksize=5000000,header=None)
+def fixdf(x):
+    x = x.dropna()
+    x = x.drop_duplicates()
+    return ','.join(x)
+
+
+def formatlabel(x):
+    x = x.split(',')
+    x = list(set(x))
+    x = sorted(x)
+    return ','.join(x)
+
+
+def smorf_habitat(outdir, habitatfile, resultfile):
+    habitat_file = path.join(outdir, "habitat.out.smorfs.tsv")
+
+    result = pd.read_csv(resultfile,
+                         sep='\t',
+                         header=None)
+
+    result.rename({0: 'qseqid', 1: 'sseqid'},
+                  axis=1,
+                  inplace=True)
+
+    reader = pd.read_table(habitatfile,
+                           sep="\t",
+                           chunksize=5_000_000,
+                           header=None,
+                           names=['sseqid', 'habitat'])
 
     output_list = []
     for chunk in reader:
-        chunk.columns = ['sseqid','habitat']
-        output_chunk = pd.merge(result,chunk,how='left')[['qseqid', 'habitat']]
+        output_chunk = result.merge(on='sseqid',
+                                    right=chunk,
+                                    how='left')
+        output_chunk = output_chunk[['qseqid', 'habitat']]
        output_list.append(output_chunk)
-    output = pd.concat(output_list, axis=0).sort_values(by='qseqid')
-    output = output.groupby('qseqid',as_index=False,sort=False).agg({'habitat':lambda x : ','.join(x.dropna().drop_duplicates())})
-    output['habitat'] = output['habitat'].apply(lambda x: ','.join(sorted(list(set(x.split(','))))))
-    output.to_csv(habitat_file,sep='\t',index=False)
+
+    output = pd.concat(output_list,
+                       axis=0)
+
+    output = output.sort_values(by='qseqid')
+
+    output = output.groupby('qseqid',
+                            as_index=False,
+                            sort=False)
+
+    output = output.agg({'habitat':lambda x : fixdf(x)})
+    output['habitat'] = output['habitat'].apply(lambda x: formatlabel(x))
+
+    output.to_csv(habitat_file,
+                  sep='\t',
+                  index=False)
+
+    wdf = output['habitat'].apply(lambda x: len(x.split(',')))
+    number_dict = dict(wdf.value_counts())
+    number_dict_normalize = dict(wdf.value_counts(normalize=True))
 
-    number_dict = dict(output['habitat'].apply(lambda x: len(x.split(','))).value_counts())
-    number_dict_normalize = dict(output['habitat'].apply(lambda x: len(x.split(','))).value_counts(normalize=True))
     if 1 in number_dict.keys():
         single_number = number_dict[1]
         single_percentage = number_dict_normalize[1]
     else:
         single_number = 0
         single_percentage = 0
+
     multi_number = output['habitat'].size - single_number
     multi_percentage = 1 - single_percentage
 
-    return single_number,single_percentage,multi_number,multi_percentage
+    return (single_number, single_percentage, multi_number, multi_percentage)
+
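The two helpers factored out above can be checked in isolation (the habitat values here are invented): fixdf collapses one query's non-null, de-duplicated habitat hits into a comma-separated cell, and formatlabel then de-duplicates and sorts the labels so the chunked results aggregate deterministically.

import pandas as pd

from gmsc_mapper.map_habitat import fixdf, formatlabel

# One query's habitat hits across chunks, including a missing value.
hits = pd.Series(['soil', None, 'marine', 'soil'])

print(fixdf(hits))                      # soil,marine
print(formatlabel('soil,marine,soil'))  # marine,soil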

gmsc_mapper/map_quality.py

Lines changed: 44 additions & 19 deletions

@@ -1,35 +1,60 @@
-from os import path
 import pandas as pd
 
-def smorf_quality(outdir,qualityfile,resultfile):
-    quality_file = path.join(outdir,"quality.out.smorfs.tsv")
+from os import path
+
 
-    result = pd.read_csv(resultfile,sep='\t',header=None)
-    result = result.rename(columns={0:'qseqid',1:'sseqid'})
-    if qualityfile.endswith('.gz'):
-        ref_quality = pd.read_csv(qualityfile,compression="gzip",sep='\t',header=None)
-    if qualityfile.endswith('.xz'):
-        ref_quality = pd.read_csv(qualityfile,compression="xz",sep='\t',header=None)
-    if qualityfile.endswith('.bz2'):
-        ref_quality = pd.read_csv(qualityfile,compression="bz2",sep='\t',header=None)
+def judgefunc(x):
+    x = x.split(',')
+    if 'high quality' in x:
+        return 'high quality'
     else:
-        ref_quality = pd.read_csv(qualityfile,sep="\t",header=None)
+        return 'low quality'
+
+
+def smorf_quality(outdir:str, qualityfile:str, resultfile:str) -> tuple:
+    result = pd.read_csv(resultfile,
+                         sep='\t',
+                         header=None)
+
+    result = result.rename({0:'qseqid', 1:'sseqid'},
+                           axis=1)
+
+    quality_file = path.join(outdir,
+                             "quality.out.smorfs.tsv")
+
+    ref_quality = pd.read_table(qualityfile,
+                                sep="\t",
+                                header=None)
 
     ref_quality.columns = ['sseqid']
     ref_quality['quality'] = 'high quality'
 
-    output = pd.merge(result,ref_quality,how='left')[['qseqid', 'quality']].fillna('low quality')
-    output = output.groupby('qseqid',as_index=False,sort=False).agg({'quality':lambda x : ','.join(x.drop_duplicates())})
-    rule = {"high quality":0,"low quality":1}
-    output['quality'] = output['quality'].apply(lambda x: sorted(x.split(','),key=lambda x:rule[x])[0])
-
+    output = result.merge(on='sseqid',
+                          right=ref_quality,
+                          how='left')
+
+    output = output[['qseqid', 'quality']]
+    output = output.fillna('low quality')
+
+    output = output.groupby('qseqid',
+                            as_index=False,
+                            sort=False)
+
+    output = output.agg({'quality': lambda x: ','.join(x.drop_duplicates())})
+    output['quality'] = output['quality'].apply(lambda x: judgefunc(x))
+
     number_dict = dict(output['quality'].value_counts())
     number_dict_normalize = dict(output['quality'].value_counts(normalize=True))
+
     if "high quality" in number_dict.keys():
         number = number_dict['high quality']
         percentage = number_dict_normalize['high quality']
     else:
         number = 0
         percentage = 0
-    output.to_csv(quality_file,sep='\t',index=False)
-    return number,percentage
+
+    output.to_csv(quality_file,
+                  sep='\t',
+                  index=False)
+
+    return (number, percentage)
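judgefunc replaces the old rule-dict-and-sort trick with an explicit test: after a query's merged hit labels are joined, the query counts as 'high quality' if any hit carried that label, and 'low quality' otherwise. A quick check with invented labels:

from gmsc_mapper.map_quality import judgefunc

print(judgefunc('low quality,high quality'))  # high quality
print(judgefunc('low quality'))               # low quality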

0 commit comments