BigDataBiology
diff --git a/General_Scripts/01_Taxonomy_mapping/01_map_prog_taxa_dedup.py b/General_Scripts/01_Taxonomy_mapping/01_map_prog_taxa_dedup.py
Lines changed: 6 additions & 6 deletions
diff --git a/General_Scripts/01_Taxonomy_mapping/03_lca_change_format.py b/General_Scripts/01_Taxonomy_mapping/03_lca_change_format.py
Lines changed: 38 additions & 45 deletions
diff --git a/General_Scripts/01_Taxonomy_mapping/04_map_metag_taxid_full.py b/General_Scripts/01_Taxonomy_mapping/04_map_metag_taxid_full.py
Lines changed: 13 additions & 17 deletions
diff --git a/General_Scripts/01_Taxonomy_mapping/05_dedup_cluster.py b/General_Scripts/01_Taxonomy_mapping/05_dedup_cluster.py
Lines changed: 2 additions & 2 deletions
diff --git a/General_Scripts/01_Taxonomy_mapping/06_map_taxonomy.py b/General_Scripts/01_Taxonomy_mapping/06_map_taxonomy.py
Lines changed: 11 additions & 16 deletions
@@ -1,7 +1,7 @@
 '''
 Concept:
-Map taxonomy of smORFs from Progenomes.
 De duplicate of smORFs from Progenomes.
+Map taxonomy of smORFs from Progenomes.
 '''
 
 import pandas as pd
@@ -42,12 +42,12 @@ def dedup_fasta(infile,outfile1,outfile2):
     out2.close()
     print("finish dedup and sort")
 
-prog = "./taxa/progenome/genome_prog.tsv"
-taxa = "./taxa/progenome/specI_genome_taxa.txt"
-prog_taxa = "./taxa/progenome/prog_specI_genome_taxa.tsv"
+prog = "genome_prog.tsv"
+taxa = "specI_genome_taxa.txt"
+prog_taxa = "prog_specI_genome_taxa.tsv"
 maptaxa_tsv(prog,taxa,prog_taxa)
 
 INPUT_FILE = "GMSC10.ProG_smorfs.faa.gz"  
-OUTPUT_FILE_1 = "./taxa/progenome/prog_dedup_sort.faa.gz"
-OUTPUT_FILE_2 = "./taxa/progenome/prog_redundant.tsv.gz"    
+OUTPUT_FILE_1 = "prog_dedup_sort.faa.gz"
+OUTPUT_FILE_2 = "prog_redundant.tsv.gz"    
 dedup_fasta(INPUT_FILE,OUTPUT_FILE_1,OUTPUT_FILE_2)
@@ -8,34 +8,32 @@ def mergeall(infile1,infile2,outfile):
     name = {}
     with gzip.open(infile1,"rt") as f1:
         for line in f1:
-            line = line.strip()
-            linelist = line.split("\t")
+            linelist = line.strip().split("\t")
             if linelist[0] in name.keys():
                 name[linelist[0]].append(linelist[1])
             else:
                 name[linelist[0]] = [linelist[1]]
 
-    out1 = gzip.open(outfile, "wt", compresslevel=1)         
-    
+    out1 = gzip.open(outfile, "wt", compresslevel=1)            
     with open(infile2,"rt") as f2:
         for line in f2:
-            line = line.strip()
-            linelist = line.split("\t")
-            for i in range (len(name[linelist[1]])):
-                out1.write(linelist[0]+"\t"+name[linelist[1]][i]+"\n")
+            linelist = line.strip().split("\t")
+            for item in name[linelist[1]]:
+                out1.write(f'{linelist[0]}\t{item}\n')
     out1.close()             
 
 def LCA(infile1,infile2,outfile):   
     import gzip
+
     name = {}
     cluster = {}
     change = {}
     taxa = set()
     flag = 1
+
     with gzip.open(infile1,"rt") as f1:
         for line in f1:
-            line = line.strip()
-            linelist = line.split("\t")
+            linelist = line.strip().split("\t")
             if len(linelist) == 11:
                 kindom = linelist[4]
                 phylum = linelist[5]
@@ -46,15 +44,13 @@ def LCA(infile1,infile2,outfile):
                 species = linelist[10]
                 name[linelist[1]] = [kindom,phylum,cla,order,family,genus,species]
             else:
-                name[linelist[1]] = []           
+                name[linelist[1]] = []      
+
     out1 = gzip.open(outfile, "wt", compresslevel=1)         
 
     with gzip.open(infile2,"rt") as f2:
         for line in f2:
-            line = line.strip()
-            linelist = line.split("\t")
-            rep = linelist[0] 
-            smorf = linelist[1]
+            rep,smorf = line.strip().split("\t")
             if rep in cluster.keys():
                 cluster[rep].append(smorf) 
                 lastrep = rep
@@ -63,10 +59,10 @@ def LCA(infile1,infile2,outfile):
                     cluster[rep] = [smorf]
                     for rank in range(7):
                         flag = 1
-                        for i in range(len(cluster[lastrep])):
+                        for item in cluster[lastrep]:
                             if taxa:
-                                if name[cluster[lastrep][i]]:
-                                    if name[cluster[lastrep][i]][6-rank] in taxa:
+                                if name[item]:
+                                    if name[item][6-rank] in taxa:
                                         continue
                                     else:
                                         flag = 0
@@ -75,8 +71,8 @@ def LCA(infile1,infile2,outfile):
                                 else:
                                     continue
                             else:
-                                if name[cluster[lastrep][i]]:
-                                    taxa.add(name[cluster[lastrep][i]][6-rank])
+                                if name[item]:
+                                    taxa.add(name[item][6-rank])
                                 else:
                                     continue
                         if flag == 1:
@@ -89,14 +85,14 @@ def LCA(infile1,infile2,outfile):
                         elif rank == 6 and flag == 0:
                             break
                         else:
-                            for x in range(len(cluster[lastrep])):
-                                if name[cluster[lastrep][x]]:
-                                    change[lastrep].append(name[cluster[lastrep][x]][j])
+                            for item in cluster[lastrep]:
+                                if name[item]:
+                                    change[lastrep].append(name[item][j])
                                     break
                                 else:
                                     continue
-                    for n in range(len(cluster[lastrep])):
-                        out1.write(lastrep+"\t"+cluster[lastrep][n]+"\t")
+                    for item in cluster[lastrep]:
+                        out1.write(f'{lastrep}\t{item}\t')
                         if len(change[lastrep]) != 0 :
                             if len(change[lastrep]) != 1:
                                 for m in range(len(change[lastrep])-1):
@@ -112,22 +108,20 @@ def LCA(infile1,infile2,outfile):
                     lastrep = rep
         for rank in range(7):
             flag = 1
-            for i in range(len(cluster[lastrep])):
+            for item in cluster[lastrep]:
                 if taxa:
-                    if name[cluster[lastrep][i]]:
-                        if name[cluster[lastrep][i]][6-rank] in taxa:
+                    if name[item]:
+                        if name[item][6-rank] in taxa:
                             continue
                         else:
                             flag = 0
-                            print(flag)
-                            print(taxa)
                             taxa = set()
                             break
                     else:
                         continue
                 else:
-                    if name[cluster[lastrep][i]]:
-                        taxa.add(name[cluster[lastrep][i]][6-rank])
+                    if name[item]:
+                        taxa.add(name[item][6-rank])
                     else:
                         continue
             if flag == 1:
@@ -140,14 +134,14 @@ def LCA(infile1,infile2,outfile):
             elif rank == 6 and flag == 0:
                 break
             else:
-                for x in range(len(cluster[lastrep])):
-                    if name[cluster[lastrep][x]]:
-                        change[lastrep].append(name[cluster[lastrep][x]][j])
+                for item in cluster[lastrep]:
+                    if name[item]:
+                        change[lastrep].append(name[item][j])
                         break
                     else:
                         continue
-        for n in range(len(cluster[lastrep])):
-            out1.write(lastrep+"\t"+cluster[lastrep][n]+"\t")
+        for item in cluster[lastrep]:
+            out1.write(f'{lastrep}\t{item}\t')
             if len(change[lastrep]) != 0 :
                 if len(change[lastrep]) != 1:
                     for m in range(len(change[lastrep])-1):
@@ -166,8 +160,7 @@ def change(infile1,outfile):
     out = gzip.open(outfile, "wt", compresslevel=1)
     with gzip.open(infile1,"rt") as f1:
         for line in f1:
-            line = line.strip()
-            linelist = line.split("\t")
+            linelist = line.strip().split("\t")
             out.write(linelist[1])
             if len(linelist) > 2:
                 for i in range(2,len(linelist)):
@@ -198,12 +191,12 @@ def change(infile1,outfile):
                 out.write("\n")
     out.close()       
 
-INPUT_FILE_1 = "./taxa/progenome/prog_redundant.tsv.gz"  
-INPUT_FILE_2 = "./taxa/progenome/clust_result/prog_dedup_0.9_clu.tsv"
-INPUT_FILE_3 = "./taxa/progenome/prog_specI_genome_taxa.tsv.gz"  
-OUTPUT_FILE_1 = "./taxa/progenome/prog_all_0.9_clu.tsv.gz"
-OUTPUT_FILE_2 = "./taxa/progenome/all_taxonomy.tsv.gz"
-OUTPUT_FILE_3 = "./taxa/progenome/prog_taxonomy_change.tsv.gz"
+INPUT_FILE_1 = "prog_redundant.tsv.gz"  
+INPUT_FILE_2 = "prog_dedup_0.9_clu.tsv"
+INPUT_FILE_3 = "prog_specI_genome_taxa.tsv.gz"  
+OUTPUT_FILE_1 = "prog_all_0.9_clu.tsv.gz"
+OUTPUT_FILE_2 = "all_taxonomy.tsv.gz"
+OUTPUT_FILE_3 = "prog_taxonomy_change.tsv.gz"
 
 mergeall(INPUT_FILE_1,INPUT_FILE_2,OUTPUT_FILE_1)
 LCA(INPUT_FILE_3,OUTPUT_FILE_1,OUTPUT_FILE_2)
 
@@ -1,6 +1,6 @@
 '''
 Concept:
-Map taxid for smORFs from metaG according to contig.
+Map taxid for smORFs from metaG based on contigs.
 Get fullname of taxonomy of taxid according to GTDB files.
 '''
 
@@ -15,13 +15,11 @@ def maptax(infile1,infile2,outfile):
     out = lzma.open(outfile, "wt") 
 
     f2 = lzma.open(infile2,"rt")
-    header = f2.readline()
     metag = f2.readline().strip().split("\t")
 
     with lzma.open(infile1,"rt") as f1:
         for line in f1:
-            line = line.strip()
-            linelist = line.split("\t")
+            linelist = line.strip().split("\t")
             if line.startswith("sample"):
                 continue
             else:
@@ -35,9 +33,9 @@ def maptax(infile1,infile2,outfile):
                         while metag[1] == lastsample:
                             contig2 = metag[1]+metag[2].split(" # ")[0].split("_")[0]+"_"+metag[2].split(" # ")[0].split("_")[1]
                             if contig2 in taxa.keys():
-                                out.write(metag[0]+"\t"+taxa[contig2]+"\n")   
+                                out.write(f'{metag[0]}\t{taxa[contig2]}\n')   
                             else:
-                                out.write(metag[0]+"\n")
+                                out.write(f'{metag[0]}\n')
                             metag = f2.readline().strip().split("\t")
                             if metag == [""]:
                                 break
@@ -53,9 +51,9 @@ def maptax(infile1,infile2,outfile):
         while metag[1] == lastsample:
             contig2 = metag[1]+metag[2].split(" # ")[0].split("_")[0]+"_"+metag[2].split(" # ")[0].split("_")[1]
             if contig2 in taxa.keys():
-                out.write(metag[0]+"\t"+taxa[contig2]+"\n")   
+                out.write(f'{metag[0]}\t{taxa[contig2]}\n')   
             else:
-                out.write(metag[0]+"\n")
+                out.write(f'{metag[0]}\n')
             metag = f2.readline().strip().split("\t")
             if metag == [""]:
                 break
@@ -74,14 +72,12 @@ def fulltax(infile1,infile2,outfile):
 
     with open(infile1,"rt") as f1:
         for line in f1:
-            line = line.strip()
-            linelist = line.split("\t")
+            linelist = line.strip().split("\t")
             taxonomy.append(linelist[1])
 
     with lzma.open(infile2,"rt") as f2:      
         for line in f2:
-            line = line.strip()
-            linelist = line.split("\t")
+            linelist = line.strip().split("\t")
             if line.startswith("sample"):
                 continue
             else:
@@ -92,7 +88,7 @@ def fulltax(infile1,infile2,outfile):
                     flag = 0
                     if linelist[3] == "superkingdom":
                         tax = "d__"+linelist[4]
-                        outf.write(linelist[2]+"\t"+tax+"\n")
+                        outf.write(f'{linelist[2]}\t{tax}\n')
                     else:
                         tax = linelist[3][0]+"__"+linelist[4]
                         for i in range(len(taxonomy)):
@@ -117,15 +113,15 @@ def fulltax(infile1,infile2,outfile):
                                 flag = 1
                                 break
                         if flag == 0:
-                            outf.write(linelist[2]+"\n")
+                            outf.write(f'{linelist[2]}\n')
 
     outf.close()
 
 INPUT_FILE_1 = "mmseqs2.lca_taxonomy.full.tsv.xz"  
 INPUT_FILE_2 = "GMSC10.metag_smorfs.rename.txt.xz"
-INPUT_FILE_3 = "./taxa/metag/gtdb_taxonomy.tsv"
-OUTPUT_FILE_1 = "./taxa/metag/metag_taxid.tsv.xz"
-OUTPUT_FILE_2 = "./taxa/metag/taxid_fullname_gtdb.tsv"
+INPUT_FILE_3 = "gtdb_taxonomy.tsv"
+OUTPUT_FILE_1 = "metag_taxid.tsv.xz"
+OUTPUT_FILE_2 = "taxid_fullname_gtdb.tsv"
 
 maptax(INPUT_FILE_1,INPUT_FILE_2,OUTPUT_FILE_1)
 fulltax(INPUT_FILE_3,INPUT_FILE_1,OUTPUT_FILE_2)
@@ -13,10 +13,10 @@
 def splitseq(infile):
     print("start splitseq")
     outputlist = [
-        f'/taxa/metag/dedup_cluster/split/sub_{ix:03}.faa.gz'
+        f'sub_{ix:03}.faa.gz'
         for ix in range(256)]
     outputfiles = [
-        gzip.open(f'/taxa/metag/dedup_cluster/split/sub_{ix:03}.faa.gz',compresslevel=1,  mode='wt')
+        gzip.open(f'sub_{ix:03}.faa.gz',compresslevel=1,  mode='wt')
         for ix in range(256)]
     for ID,seq in fasta_iter(infile):
         h = hashlib.sha256()
 
@@ -27,16 +27,15 @@ def metag_full(infile1,infile2,outpath):
 
     with open(infile1,"rt") as f1:
         for line in f1:
-            line = line.strip()
-            linelist = line.split("\t",1)
+            linelist = line.strip().split("\t",1)
             if len(linelist) == 2:
                 tax[linelist[0]] = linelist[1]
             else:
                 continue
+
     with lzma.open(infile2,"rt") as f2:
         for line in f2:
-            line = line.strip()
-            linelist = line.split("\t")
+            linelist = line.strip().split("\t")
             if n < 600000000:
                 if len(linelist) == 2:
                     if linelist[1] in tax.keys():
@@ -117,42 +116,38 @@ def metag_full(infile1,infile2,outpath):
 '''
 def map_cluster(infile1,infile2,infile3,outfile):
     tax = {}
-    n = 0
     out = lzma.open(outfile, "wt")
     with gzip.open(infile1,"rt") as f1:
         for line in f1:
-            line = line.strip()
-            linelist = line.split("\t",1)
+            linelist = line.strip().split("\t",1)
             if len(linelist) == 2:
                 tax[linelist[0]] = linelist[1]
             else:
                 continue
 
     with lzma.open(infile2,"rt") as f2:
         for line in f2:
-            line = line.strip()
-            linelist = line.split("\t",1)
+            linelist = line.strip().split("\t",1)
             if len(linelist) == 2:
                 tax[linelist[0]] = linelist[1]
             else:
                 continue    
 
     with gzip.open(infile3,"rt") as f4:
         for line in f4:
-            line = line.strip()
-            linelist = line.split("\t") 
+            linelist = line.strip().split("\t") 
             if linelist[1] in tax.keys():
                 out.write(linelist[0]+"\t"+linelist[1]+"\t"+tax[linelist[1]]+"\n")
             else:
                 out.write(linelist[0]+"\t"+linelist[1]+"\n")           
     out.close()
 
-INPUT_FILE_1 = "./taxa/metag/taxid_fullname_gtdb.tsv"   
-INPUT_FILE_2 = "./taxa/metag/metag_taxid.tsv.xz"
+INPUT_FILE_1 = "taxid_fullname_gtdb.tsv"   
+INPUT_FILE_2 = "metag_taxid.tsv.xz"
 INPUT_FILE_3 = "dedup_cluster.tsv.gz"
-INPUT_FILE_4 = "./taxa/progenome/prog_taxonomy_change.tsv.gz"  
-OUT_PATH_1 = "./taxa/metag/metag_taxonomy" 
-OUT_PATH_2 = "./taxa/metag/metag_cluster_taxonomy" 
+INPUT_FILE_4 = "prog_taxonomy_change.tsv.gz"  
+OUT_PATH_1 = "metag_taxonomy" 
+OUT_PATH_2 = "metag_cluster_taxonomy" 
 
 metag_full(INPUT_FILE_1,INPUT_FILE_2,OUT_PATH_1)
 for i in range(1,9):