Skip to content

Commit 83c83fa

Browse files
committed
Improve integration of mmseqs
- check exit code and report on error - check executable works (bioconda container does not run on arm). in such cases, skip mmseqs cluster step in pipeline.
1 parent 14653ce commit 83c83fa

File tree

2 files changed

+91
-96
lines changed

2 files changed

+91
-96
lines changed

FastOMA/_utils_roothog.py

Lines changed: 74 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11

22
import csv
33
import itertools
4+
import subprocess
45

56
import networkx as nx
67
from Bio import SeqIO
@@ -577,13 +578,9 @@ def merge_rhogs2(hogmaps, rhogs_prots, conf_infer_roothogs):
577578

578579
candidates_pair = find_rhog_candidate_pairs(hogmaps, rhogs_prots, conf_infer_roothogs) # rhogs_prots
579580

580-
print("There are " + str(len(candidates_pair)) + " candidate pairs of rhogs for merging.")
581+
logger.debug(f"There are {len(candidates_pair)} candidate pairs of rhogs for merging.")
581582
cluster_rhogs_list = cluster_rhogs(candidates_pair)
582-
583-
print("There are " + str(len(cluster_rhogs_list)) + " clusters.")
584-
logger.debug("There are " + str(len(cluster_rhogs_list)) + " clusters.")
585-
586-
print("** the recursion limit is "+str( sys.getrecursionlimit()))
583+
logger.debug(f"There are {len(cluster_rhogs_list)} clusters.")
587584
logger.debug("** the recursion limit is "+str( sys.getrecursionlimit()))
588585
cluster_rhogs_list = cluster_rhogs_nx(cluster_rhogs_list, candidates_pair)
589586

@@ -758,121 +755,109 @@ def merge_rhogs2(hogmaps, rhogs_prots, conf_infer_roothogs):
758755
# return rhogs_prots
759756

760757

761-
def collect_unmapped_singleton(rhogs_prots, unmapped,prot_recs_all,unmapped_singleton_fasta= "singleton_unmapped.fa"):
758+
def collect_unmapped_singleton(rhogs_prots, unmapped, prot_recs_all, unmapped_singleton_fasta="singleton_unmapped.fa"):
762759
unmapped_recs = []
763760
for species_name, prot_names in unmapped.items():
761+
all_recs_of_species = prot_recs_all[species_name]
764762
for prot_name in prot_names:
765-
if prot_name in prot_recs_all[species_name]: # some small prots are removed in the begining min_sequence_length
766-
prot_rec = prot_recs_all[species_name][prot_name]
767-
unmapped_recs.append(prot_rec)
763+
try:
764+
unmapped_recs.append(all_recs_of_species[prot_name])
765+
except KeyError:
766+
# some small prots are removed in the begining min_sequence_length
767+
pass
768+
logger.debug(f"collected sequence records of {len(unmapped_recs)} unmapped proteins.")
768769

769-
print(len(unmapped_recs))
770770
singleton_recs = []
771771
for rhogid, sp_prot_list in rhogs_prots.items():
772772
if len(sp_prot_list) == 1:
773773
species_name = sp_prot_list[0][0]
774774
prot_name = sp_prot_list[0][1]
775775
prot_rec = prot_recs_all[species_name][prot_name]
776776
singleton_recs.append(prot_rec)
777-
print(len(singleton_recs))
777+
logger.debug(f"collected sequence records of {len(singleton_recs)} singleton proteins.")
778778

779779
singleton_unmapped_recs = unmapped_recs + singleton_recs
780-
781780
SeqIO.write(singleton_unmapped_recs, unmapped_singleton_fasta , "fasta")
782-
781+
logger.debug(f"Wrote {len(singleton_unmapped_recs)} sequences of unmapped and singleton proteins to {unmapped_singleton_fasta}.")
783782
return len(singleton_unmapped_recs)
784783

785784

786-
import subprocess
787-
788-
789785
def run_linclust(fasta_to_cluster="singleton_unmapped.fa"): # todo move run_linclust to _wrapper.py . see easy-cluster below.
790786
# todo: change FastOMA.nf to assgin more cpus to infer_roothog step. mmseqs uses all cpus by default # num_threads = 10 , --threads " + str(num_threads) + "
791787
command_clust = mmseqs_executable_path +" easy-cluster " + fasta_to_cluster + " singleton_unmapped tmp_linclust"
792788
# easy-cluster is much better than easy-linclust but a bit slower. todo: make it as an arugment for user
793789

794-
logger.debug("linclust rooting started" + command_clust)
795-
process = subprocess.Popen(command_clust.split(), stdout=subprocess.PIPE)
790+
logger.debug("clustering rooting started " + command_clust)
791+
process = subprocess.Popen(command_clust.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
796792
output, error = process.communicate()
793+
logger.debug(f"clustering rooting finished with returncode {process.returncode}.")
794+
if process.returncode != 0:
795+
logger.error(f"mmseqs easy-cluster failed with returncode {process.returncode}.")
796+
logger.error(f"stdout:\n{output.decode('utf-8')}\n******************")
797+
logger.error(f"stderr:\n{error.decode('utf-8')}\n******************")
798+
raise RuntimeError("mmseqs easy-cluster failed.")
799+
return "singleton_unmapped_all_seqs.fasta" # the fasta-like file of clusters
800+
801+
802+
def write_clusters(fasta_clustered, address_rhogs_folder, min_rhog_size):
803+
"""parse the fasta-like output of mmseqs linclust and write each cluster
804+
805+
The format is described here: https://github.com/soedinglab/MMseqs2/wiki#cluster-fasta-like-format
806+
Example:
807+
>O67032 <- cluster 1
808+
>sp|O67032|RF1_AQUAE <- member1 of cluster 1
809+
MLKEAYISRLDKLQEKYRKLQEELSKPEVIQD...
810+
>sp|O84026|RF1_CHLTR <- member2 of cluster 1
811+
MEIKVLECLKRLEEVEKQISDPNIFSNPKEYS...
812+
>sp|P47500|RF1_MYCGE <- member3 of cluster 1
813+
MDFDKQLFFNVEKIVELTEQLEKDLNKPNLSF...
814+
>O66429 <- cluster 2
815+
>sp|O66429|EFTU_AQUAE <- member1 of cluster 2
816+
MAKEKFERTKEHVNVGTIGHVDHGKS
817+
>sp|P0CD71|EFTU_CHLTR <- member1 of cluster 1
818+
MSKETFQRNKPHINIGTIGHVDHGKTTLTAAITRAL
819+
"""
797820

798-
# if verbose: # todo print mmseqs logs/error/output for debugging
799-
# print("output:\n", output)
800-
# print("error:\n", error)
801-
802-
# if "Error analyzing file" in str(output) or error:
803-
# try:
804-
805-
return "done"
806-
807-
808-
def write_clusters(address_rhogs_folder, min_rhog_size):
809-
cluster_output_address = "singleton_unmapped_all_seqs.fasta"
810-
cluster_file = open(cluster_output_address, 'r')
811-
# cluster_dic = {}
812-
previous_line_start = " "
813-
# parsing memseqs fasta-like format https://github.com/soedinglab/MMseqs2/wiki#cluster-fasta-like-format
814-
# >A0A0R4IFW6
815-
# >tr|A0A0R4IFW6|A0A0R4IFW6_DANRE||DANRE||1000001584 tr|A0A0R
816-
# MSRKTTSKRHYKPSSEIDDAALARKREYW
817-
clusters = []
818-
cluster = []
819-
# line_strip = " "
820-
# line_number =1
821-
for line in cluster_file:
822-
line_strip = line.strip()
823-
# print(line_number,previous_line_start,line_strip[0],cluster )# "_",previous_line_start,line_strip[0])
824-
if previous_line_start == " ":
825-
clusters = []
826-
cluster = []
827-
elif line_strip[0] == ">" and previous_line_start == ">": # new cluster started at previous line
828-
if len(cluster) >= 5: # more than one records [ID1, seq1,ID2, seq2, newcluster_id]
829-
clusters.append(cluster[:-1]) # last item is the id of new cluster
830-
cluster = [line_strip]
831-
elif line_strip[0] == ">" and previous_line_start != ">":
832-
cluster.append(line_strip)
833-
elif line_strip[0] != ">":
834-
cluster.append(line_strip)
835-
836-
previous_line_start = line_strip[0]
837-
# line_number+=1
821+
with open(fasta_clustered, 'rt') as cluster_file:
822+
previous_line_start = " "
823+
clusters = []
824+
cluster = []
825+
for line in cluster_file:
826+
line_strip = line.strip()
827+
if previous_line_start == " ":
828+
clusters = []
829+
cluster = []
830+
elif line_strip[0] == ">" and previous_line_start == ">": # new cluster started at previous line
831+
if len(cluster) >= 5:
832+
# more than one records [ID1, seq1,ID2, seq2, newcluster_id], i.e. non-trivial cluster
833+
clusters.append(cluster[:-1]) # last item is the id of new cluster
834+
cluster = [line_strip]
835+
elif line_strip[0] == ">" and previous_line_start != ">":
836+
cluster.append(line_strip)
837+
elif line_strip[0] != ">":
838+
cluster.append(line_strip)
839+
previous_line_start = line_strip[0]
838840

839841
# for last cluster
840842
if len(cluster) >= 4: # more than one records [ID1, seq1,ID2, seq2]
841843
clusters.append(cluster)
844+
logger.debug(f"Number of non-trivial linclust clusters raw is {len(clusters)}.")
842845

843-
logger.debug("Number of linclust clusters raw is " + str(len(clusters)))
844-
# the record id is parsed by mmseqs very ad hoc. so better use the fasta-like file
845-
# cluster_output_address = "singleton_unmapped_cluster.tsv"
846-
# cluster_file = open(cluster_output_address, 'r')
847-
# cluster_dic = {}
848-
# for line in cluster_file:
849-
# line_strip = line.strip()
850-
# rep, prot= line_strip.split()
851-
# if rep in cluster_dic:
852-
# cluster_dic[rep].append(prot) # the frist line includ (rep,rep)
853-
# else:
854-
# cluster_dic[rep]=[prot]
855-
# cluster_list = []
856-
# for rep, prot_list in cluster_dic.items():
857-
# if len(prot_list)>1:
858-
# cluster_list.append(prot_list)
859-
860-
861-
cluster_iter= 1000*10
846+
cluster_iter = 10000
862847
for cluster in clusters:
848+
# cluster is like ['>sp|O67032|RF1_AQUAE', 'MLKEAYISRLDKLQEKYRKLQEELSKPEVIQD...', ...], i.e. an id and its sequence.
849+
# thus, check for double the min_rhog_size to have at least min_rhog_size proteins
863850
if len(cluster) >= 2 * min_rhog_size:
864-
species_names= [] # it seems that genes from same species tend to cluster together, discard such clusters
865-
for pr_idi in range(int(len(cluster)/2)):
851+
species_names = set([]) # it seems that genes from same species tend to cluster together, discard such clusters
852+
for pr_idi in range(int(len(cluster)/2)):
866853
species_name = cluster[pr_idi*2].split(" ")[0].split("||")[1]
867-
species_names.append(species_name)
868-
if len(set(species_names))>1:
869-
file_idx = open(address_rhogs_folder + "/HOG_clust" + str(cluster_iter) + ".fa", "w")
870-
for line in cluster:
871-
file_idx.write(line + "\n")
872-
file_idx.close()
873-
cluster_iter+=1
874-
875-
return cluster_iter-1
854+
species_names.add(species_name)
855+
if len(species_names) > 1:
856+
with open(address_rhogs_folder + "/HOG_clust" + str(cluster_iter) + ".fa", "w") as file_idx:
857+
for line in cluster:
858+
file_idx.write(line + "\n")
859+
cluster_iter += 1
860+
return cluster_iter - 10000
876861

877862

878863
def parse_isoform_file(species_names, folder=None):

FastOMA/infer_roothogs.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os.path
2+
import subprocess
23
from shutil import which
34

45
from . import _utils_roothog, logger
@@ -59,15 +60,24 @@ def fastoma_infer_roothogs():
5960

6061
min_rhog_size = 2
6162
rhogid_written_list = _utils_roothog.write_rhog(rhogs_prots, prot_recs_all, conf.out_rhog_folder, min_rhog_size)
62-
linclust_available=which("mmseqs") # True #
63+
linclust_available = which("mmseqs") # True #
6364
# if memseqs is not installed the output will be empty / None
6465
if linclust_available:
65-
num_unmapped_singleton = _utils_roothog.collect_unmapped_singleton(rhogs_prots, unmapped, prot_recs_all, "singleton_unmapped.fa")
66-
if num_unmapped_singleton:
67-
result_linclust = _utils_roothog.run_linclust(fasta_to_cluster="singleton_unmapped.fa")
68-
logger.debug(" linclust is done %s", result_linclust)
69-
num_clusters = _utils_roothog.write_clusters(conf.out_rhog_folder, min_rhog_size)
70-
logger.debug("we wrote %d new clusters with linclust ", num_clusters)
66+
try:
67+
res = subprocess.run([linclust_available, "-h"], check=True, text=True, capture_output=True)
68+
except subprocess.CalledProcessError as e:
69+
logger.error("mmseqs linclust is not working: %s", e)
70+
logger.error("Skipping clustering of unmapped singletons")
71+
linclust_available = False
72+
if linclust_available:
73+
singleton_unmapped_path = "singleton_unmapped.fa"
74+
cnt_unmapped_singleton = _utils_roothog.collect_unmapped_singleton(rhogs_prots, unmapped, prot_recs_all, unmapped_singleton_fasta=singleton_unmapped_path)
75+
if cnt_unmapped_singleton:
76+
cluster_file = _utils_roothog.run_linclust(fasta_to_cluster="singleton_unmapped.fa")
77+
num_clusters = _utils_roothog.write_clusters(cluster_file, conf.out_rhog_folder, min_rhog_size)
78+
logger.debug("we wrote %d new clusters with linclust", num_clusters)
79+
else:
80+
logger.info("mmseqs linclust / easy-cluster not available, skipping clustering of unmapped and singletons")
7181

7282

7383
if __name__ == "__main__":

0 commit comments

Comments
 (0)