Skip to content

Commit 34189b1

Browse files
committed
ENH update 00_Remove_redundancy_and_cluster
1 parent 5dc9634 commit 34189b1

12 files changed

+68
-113
lines changed

General_Scripts/00_Remove_redundancy_and_cluster/01_deduplicate_sort_merge.py renamed to General_Scripts/00_Remove_redundancy_and_cluster/01_deduplicate_sort_merge_extract.py

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@
22
Concept:
33
The number of all redundant smORFs from metagenomes and Progenome2 is 4,599,187,424
44
(metagenomes:4,564,570,019,Progenome2:34,617,405).
5-
The whole smORFs can't be sorted in memory at one time and need to be splited into 256 subfiles.
6-
Then we can de duplicate and sort each subfile separately.
7-
Finally, we merge all the subfiles to generate non-redundant sorted smORFs.
5+
The whole smORFs are too large to be sorted in memory.
6+
1. Split smORFs into 256 subfiles to deduplicate and sort each subfile separately.
7+
2. Merge all the subfiles to generate non-redundant sorted smORFs.
8+
3. Extract non-singletons and singletons.
89
'''
910

1011
from jug import TaskGenerator, bvalue
@@ -38,7 +39,7 @@ def splitseq(infile):
3839
return (outputlist)
3940

4041
'''
41-
De duplicate and sort every subfiles according to sequence alphabetical order.
42+
Deduplicate and sort subfiles according to sequence alphabetical order.
4243
Calculate the number of occurrences of each sequence.
4344
'''
4445
@TaskGenerator
@@ -81,10 +82,33 @@ def mergeseq(outfile):
8182
preseq = seq
8283
print("finish merge")
8384

85+
@TaskGenerator
86+
def extract_seq(infile1,infile2,outfile1,outfile2):
87+
fastaset = set()
88+
with gzip.open(infile1,"rt") as f:
89+
for line in f:
90+
line = line.strip()
91+
linelist = line.split("\t")
92+
if linelist[0] != "1":
93+
fastaset.add(linelist[1])
94+
95+
with gzip.open(outfile1, "wt", compresslevel=1) as out1, \
96+
gzip.open(outfile2, "wt", compresslevel=1) as out2:
97+
for ID,seq in fasta_iter(infile2):
98+
if seq in fastaset:
99+
out1.write(f'>{ID}\n{seq}\n')
100+
else:
101+
out2.write(f'>{ID}\n{seq}\n')
102+
84103
INPUT_FILE = "GMSC10.metag_Prog_smorfs.faa.gz"
85104
OUTPUT_FILE = "metag_ProG_dedup.faa.gz"
86105

87106
splits = splitseq(INPUT_FILE)
88107
for sp in bvalue(splits):
89108
dedup_fasta(sp)
90-
mergeseq(OUTPUT_FILE)
109+
mergeseq(OUTPUT_FILE)
110+
111+
INPUT_FILE_1 = "metag_ProG.raw_number.tsv.gz"
112+
OUT_FILE_1 = "metag_ProG_nonsingleton.faa.gz"
113+
OUT_FILE_2 = "metag_ProG_singleton.faa.gz"
114+
extract_seq(INPUT_FILE_1,OUTPUT_FILE,OUT_FILE_1,OUT_FILE_2)

General_Scripts/00_Remove_redundancy_and_cluster/02_extract.py

Lines changed: 0 additions & 32 deletions
This file was deleted.

General_Scripts/00_Remove_redundancy_and_cluster/03_linclust.sh renamed to General_Scripts/00_Remove_redundancy_and_cluster/02_linclust.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ cd clust_result
1212
#make db
1313
mmseqs createdb metag_ProG_nonsingleton.faa.gz metag_ProG_nonsingleton.DB
1414

15-
#clust with kmer:21,-c 0.9,--min-seq-id:0.9
15+
#clust with -c 0.9,--min-seq-id:0.9
1616
mmseqs linclust metag_ProG_nonsingleton.DB metag_ProG_nonsingleton_0.9_clu tmp -c 0.9 --min-seq-id 0.9
1717

1818
#Extract representative sequence

General_Scripts/00_Remove_redundancy_and_cluster/04_1_sig_select100AA.py renamed to General_Scripts/00_Remove_redundancy_and_cluster/03_1_sig_select_100AA.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,16 @@
55

66
def select(infile,outfile):
77
import random
8-
import lzma
98
n = 0
109
random.seed(1234)
1110
random_number = set(random.sample(range(150994369,287926875),1000))
1211
with open(outfile,'wt') as out:
13-
with lzma.open(infile,'rt') as f:
12+
with open(infile,'rt') as f:
1413
for line in f:
15-
linelist = line.strip().split('\t')
16-
if linelist[1] != '':
17-
if n in random_number:
18-
out.write(f'{linelist[0]}\t{linelist[1]}\n')
19-
n += 1
14+
member,cluster = line.strip().split('\t')
15+
if n in random_number:
16+
out.write(f'{member}\t{cluster}\n')
17+
n += 1
2018

2119
def select_100(infile1,infile2,outfile):
2220
from fasta import fasta_iter
@@ -44,16 +42,16 @@ def select_90(infile1,infile2,outfile):
4442
if h in smorf:
4543
out.write(f'>{h}\n{seq}\n')
4644

47-
clusterfile = 'all_0.9_0.5_family_sort.tsv.xz'
45+
clusterfile = 'metag_ProG_nonsingleton_0.9_clu.tsv'
4846
selected_cluster = 'selected_cluster.tsv'
4947
select(clusterfile,selected_cluster)
5048

5149
infile1 = 'selected_cluster.tsv'
52-
infile2 = '100AA_GMSC_sort.faa.xz'
50+
infile2 = '100AA_GMSC.faa.xz'
5351
outfile = 'selected_100AA.faa'
5452
select_100(infile1,infile2,outfile)
5553

5654
infile1 = 'selected_cluster.tsv'
57-
infile2 = '90AA_GMSC_sort.faa.gz'
55+
infile2 = '90AA_GMSC.faa.xz'
5856
outfile = 'selected_90AA.faa'
5957
select_90(infile1,infile2,outfile)
File renamed without changes.
File renamed without changes.
File renamed without changes.

General_Scripts/00_Remove_redundancy_and_cluster/07_diamond.sh renamed to General_Scripts/00_Remove_redundancy_and_cluster/06_diamond.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/usr/bin/env bash
22

33
#Concept:
4-
#Align all the singletons of raw data against non-singletons using Diamond (evalue:0.00001, identity:90).
4+
#Align all the singletons of raw data against cluster representatives using Diamond (evalue:0.00001, identity:90).
55

66
set -e
77
set -o pipefail
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
'''
2+
Concept:
3+
Identify the clusters to which rescued (aligned) singletons mapped, based on the best e-value.
4+
'''
5+
6+
def identify(infile,outfile):
7+
nameset = set()
8+
with open(outfile,'wt') as out:
9+
with open (infile) as f:
10+
for line in f:
11+
linelist = line.strip().split('\t')
12+
if linelist[0] in nameset:
13+
continue
14+
else:
15+
nameset.add(linelist[0])
16+
out.write(linelist[0]+"\t"+linelist[2]+"\n")
17+
18+
for i in range(24):
19+
INPUT_FILE_1 = "sub"+str(i)+".faa.gz.tsv"
20+
OUT_FILE_1 = "sub"+str(i)+".faa.gz.tsv.tmp"
21+
identify(INPUT_FILE_1,OUT_FILE_1)
22+
23+
# Then merge all the tmp subfiles into singleton_0.9.tsv

General_Scripts/00_Remove_redundancy_and_cluster/08_identify_clusters.py

Lines changed: 0 additions & 23 deletions
This file was deleted.

0 commit comments

Comments
 (0)