Skip to content

Commit 34189b1

Browse files
committed
ENH update 00_Remove_redundancy_and_cluster
1 parent 5dc9634 commit 34189b1

12 files changed

+68
-113
lines changed

General_Scripts/00_Remove_redundancy_and_cluster/01_deduplicate_sort_merge.py renamed to General_Scripts/00_Remove_redundancy_and_cluster/01_deduplicate_sort_merge_extract.py

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@
22
Concept:
33
The number of all redundant smORFs from metagenomes and Progenome2 is 4,599,187,424
44
(metagenomes:4,564,570,019,Progenome2:34,617,405).
5-
The whole smORFs can't be sorted in memory at one time and need to be splited into 256 subfiles.
6-
Then we can de duplicate and sort each subfile separately.
7-
Finally, we merge all the subfiles to generate non-redundant sorted smORFs.
5+
The whole smORFs are too large to be sorted in memory.
6+
1. Split smORFs into 256 subfiles to deduplicate and sort each subfile separately.
7+
2. Merge all the subfiles to generate non-redundant sorted smORFs.
8+
3. Extract non-singletons and singletons.
89
'''
910

1011
from jug import TaskGenerator, bvalue
@@ -38,7 +39,7 @@ def splitseq(infile):
3839
return (outputlist)
3940

4041
'''
41-
De duplicate and sort every subfiles according to sequence alphabetical order.
42+
Deduplicate and sort subfiles according to sequence alphabetical order.
4243
Calculate the number of occurrences of each sequence.
4344
'''
4445
@TaskGenerator
@@ -81,10 +82,33 @@ def mergeseq(outfile):
8182
preseq = seq
8283
print("finish merge")
8384

85+
@TaskGenerator
86+
def extract_seq(infile1,infile2,outfile1,outfile2):
87+
fastaset = set()
88+
with gzip.open(infile1,"rt") as f:
89+
for line in f:
90+
line = line.strip()
91+
linelist = line.split("\t")
92+
if linelist[0] != "1":
93+
fastaset.add(linelist[1])
94+
95+
with gzip.open(outfile1, "wt", compresslevel=1) as out1, \
96+
gzip.open(outfile2, "wt", compresslevel=1) as out2:
97+
for ID,seq in fasta_iter(infile2):
98+
if seq in fastaset:
99+
out1.write(f'>{ID}\n{seq}\n')
100+
else:
101+
out2.write(f'>{ID}\n{seq}\n')
102+
84103
INPUT_FILE = "GMSC10.metag_Prog_smorfs.faa.gz"
85104
OUTPUT_FILE = "metag_ProG_dedup.faa.gz"
86105

87106
splits = splitseq(INPUT_FILE)
88107
for sp in bvalue(splits):
89108
dedup_fasta(sp)
90-
mergeseq(OUTPUT_FILE)
109+
mergeseq(OUTPUT_FILE)
110+
111+
INPUT_FILE_1 = "metag_ProG.raw_number.tsv.gz"
112+
OUT_FILE_1 = "metag_ProG_nonsingleton.faa.gz"
113+
OUT_FILE_2 = "metag_ProG_singleton.faa.gz"
114+
extract_seq(INPUT_FILE_1,OUTPUT_FILE,OUT_FILE_1,OUT_FILE_2)

General_Scripts/00_Remove_redundancy_and_cluster/02_extract.py

Lines changed: 0 additions & 32 deletions
This file was deleted.

General_Scripts/00_Remove_redundancy_and_cluster/03_linclust.sh renamed to General_Scripts/00_Remove_redundancy_and_cluster/02_linclust.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ cd clust_result
1212
#make db
1313
mmseqs createdb metag_ProG_nonsingleton.faa.gz metag_ProG_nonsingleton.DB
1414

15-
#clust with kmer:21,-c 0.9,--min-seq-id:0.9
15+
#clust with -c 0.9,--min-seq-id:0.9
1616
mmseqs linclust metag_ProG_nonsingleton.DB metag_ProG_nonsingleton_0.9_clu tmp -c 0.9 --min-seq-id 0.9
1717

1818
#Extract representative sequence

General_Scripts/00_Remove_redundancy_and_cluster/04_1_sig_select100AA.py renamed to General_Scripts/00_Remove_redundancy_and_cluster/03_1_sig_select_100AA.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,16 @@
55

66
def select(infile,outfile):
77
import random
8-
import lzma
98
n = 0
109
random.seed(1234)
1110
random_number = set(random.sample(range(150994369,287926875),1000))
1211
with open(outfile,'wt') as out:
13-
with lzma.open(infile,'rt') as f:
12+
with open(infile,'rt') as f:
1413
for line in f:
15-
linelist = line.strip().split('\t')
16-
if linelist[1] != '':
17-
if n in random_number:
18-
out.write(f'{linelist[0]}\t{linelist[1]}\n')
19-
n += 1
14+
member,cluster = line.strip().split('\t')
15+
if n in random_number:
16+
out.write(f'{member}\t{cluster}\n')
17+
n += 1
2018

2119
def select_100(infile1,infile2,outfile):
2220
from fasta import fasta_iter
@@ -44,16 +42,16 @@ def select_90(infile1,infile2,outfile):
4442
if h in smorf:
4543
out.write(f'>{h}\n{seq}\n')
4644

47-
clusterfile = 'all_0.9_0.5_family_sort.tsv.xz'
45+
clusterfile = 'metag_ProG_nonsingleton_0.9_clu.tsv'
4846
selected_cluster = 'selected_cluster.tsv'
4947
select(clusterfile,selected_cluster)
5048

5149
infile1 = 'selected_cluster.tsv'
52-
infile2 = '100AA_GMSC_sort.faa.xz'
50+
infile2 = '100AA_GMSC.faa.xz'
5351
outfile = 'selected_100AA.faa'
5452
select_100(infile1,infile2,outfile)
5553

5654
infile1 = 'selected_cluster.tsv'
57-
infile2 = '90AA_GMSC_sort.faa.gz'
55+
infile2 = '90AA_GMSC.faa.xz'
5856
outfile = 'selected_90AA.faa'
5957
select_90(infile1,infile2,outfile)
File renamed without changes.
File renamed without changes.
File renamed without changes.

General_Scripts/00_Remove_redundancy_and_cluster/07_diamond.sh renamed to General_Scripts/00_Remove_redundancy_and_cluster/06_diamond.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/usr/bin/env bash
22

33
#Concept:
4-
#Align all the singletons of raw data against non-singletons using Diamond (evalue:0.00001, identity:90).
4+
#Align all the singletons of raw data against cluster representatives using Diamond (evalue:0.00001, identity:90).
55

66
set -e
77
set -o pipefail
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
'''
2+
Concept:
3+
Identify the clusters to which rescued (aligned) singletons mapped, based on the best e-value.
4+
'''
5+
6+
def identify(infile,outfile):
7+
nameset = set()
8+
with open(outfile,'wt') as out:
9+
with open (infile) as f:
10+
for line in f:
11+
linelist = line.strip().split('\t')
12+
if linelist[0] in nameset:
13+
continue
14+
else:
15+
nameset.add(linelist[0])
16+
out.write(linelist[0]+"\t"+linelist[2]+"\n")
17+
18+
for i in range(24):
19+
INPUT_FILE_1 = "sub"+str(i)+".faa.gz.tsv"
20+
OUT_FILE_1 = "sub"+str(i)+".faa.gz.tsv.tmp"
21+
identify(INPUT_FILE_1,OUT_FILE_1)
22+
23+
# Then merge all the tmp subfiles into singleton_0.9.tsv

General_Scripts/00_Remove_redundancy_and_cluster/08_identify_clusters.py

Lines changed: 0 additions & 23 deletions
This file was deleted.

0 commit comments

Comments
 (0)