|
1 | 1 |
|
2 | 2 | import csv |
3 | 3 | import itertools |
| 4 | +import subprocess |
4 | 5 |
|
5 | 6 | import networkx as nx |
6 | 7 | from Bio import SeqIO |
@@ -577,13 +578,9 @@ def merge_rhogs2(hogmaps, rhogs_prots, conf_infer_roothogs): |
577 | 578 |
|
578 | 579 | candidates_pair = find_rhog_candidate_pairs(hogmaps, rhogs_prots, conf_infer_roothogs) # rhogs_prots |
579 | 580 |
|
580 | | - print("There are " + str(len(candidates_pair)) + " candidate pairs of rhogs for merging.") |
| 581 | + logger.debug(f"There are {len(candidates_pair)} candidate pairs of rhogs for merging.") |
581 | 582 | cluster_rhogs_list = cluster_rhogs(candidates_pair) |
582 | | - |
583 | | - print("There are " + str(len(cluster_rhogs_list)) + " clusters.") |
584 | | - logger.debug("There are " + str(len(cluster_rhogs_list)) + " clusters.") |
585 | | - |
586 | | - print("** the recursion limit is "+str( sys.getrecursionlimit())) |
| 583 | + logger.debug(f"There are {len(cluster_rhogs_list)} clusters.") |
587 | 584 | logger.debug("** the recursion limit is "+str( sys.getrecursionlimit())) |
588 | 585 | cluster_rhogs_list = cluster_rhogs_nx(cluster_rhogs_list, candidates_pair) |
589 | 586 |
|
@@ -758,121 +755,109 @@ def merge_rhogs2(hogmaps, rhogs_prots, conf_infer_roothogs): |
758 | 755 | # return rhogs_prots |
759 | 756 |
|
760 | 757 |
|
def collect_unmapped_singleton(rhogs_prots, unmapped, prot_recs_all, unmapped_singleton_fasta="singleton_unmapped.fa"):
    """Gather unmapped and singleton protein records and write them to one FASTA file.

    :param rhogs_prots: dict rhogid -> list of (species_name, prot_name) pairs
    :param unmapped: dict species_name -> iterable of unmapped protein names
    :param prot_recs_all: dict species_name -> dict prot_name -> SeqRecord
    :param unmapped_singleton_fasta: output FASTA file name
    :return: number of sequences written
    """
    unmapped_recs = []
    for species_name, prot_names in unmapped.items():
        species_recs = prot_recs_all[species_name]
        # some small prots are removed in the beginning (min_sequence_length),
        # so a protein name may be absent from the record dict
        unmapped_recs.extend(species_recs[prot_name] for prot_name in prot_names
                             if prot_name in species_recs)
    logger.debug(f"collected sequence records of {len(unmapped_recs)} unmapped proteins.")

    # rootHOGs with exactly one (species, protein) pair are singletons
    singleton_recs = [prot_recs_all[sp_prot_list[0][0]][sp_prot_list[0][1]]
                      for sp_prot_list in rhogs_prots.values()
                      if len(sp_prot_list) == 1]
    logger.debug(f"collected sequence records of {len(singleton_recs)} singleton proteins.")

    singleton_unmapped_recs = unmapped_recs + singleton_recs
    SeqIO.write(singleton_unmapped_recs, unmapped_singleton_fasta, "fasta")
    logger.debug(f"Wrote {len(singleton_unmapped_recs)} sequences of unmapped and singleton proteins to {unmapped_singleton_fasta}.")
    return len(singleton_unmapped_recs)
784 | 783 |
|
785 | 784 |
|
786 | | -import subprocess |
787 | | - |
788 | | - |
def run_linclust(fasta_to_cluster="singleton_unmapped.fa"):  # todo move run_linclust to _wrapper.py . see easy-cluster below.
    """Cluster the given FASTA file with ``mmseqs easy-cluster``.

    easy-cluster is much better than easy-linclust but a bit slower.
    todo: make the clustering mode an argument for the user.
    todo: change FastOMA.nf to assign more cpus to infer_roothogs step;
    mmseqs uses all cpus by default.

    :param fasta_to_cluster: FASTA file with the sequences to cluster
    :return: name of the fasta-like file of clusters written by mmseqs
    :raises RuntimeError: when mmseqs exits with a non-zero returncode
    """
    # build argv as a list: no shell, and robust against spaces in the
    # executable path or in the FASTA file name (str.split() would break those)
    command_clust = [mmseqs_executable_path, "easy-cluster", fasta_to_cluster,
                     "singleton_unmapped", "tmp_linclust"]
    logger.debug("clustering rooting started " + " ".join(command_clust))
    process = subprocess.run(command_clust, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    logger.debug(f"clustering rooting finished with returncode {process.returncode}.")
    if process.returncode != 0:
        logger.error(f"mmseqs easy-cluster failed with returncode {process.returncode}.")
        logger.error(f"stdout:\n{process.stdout.decode('utf-8')}\n******************")
        logger.error(f"stderr:\n{process.stderr.decode('utf-8')}\n******************")
        raise RuntimeError("mmseqs easy-cluster failed.")
    return "singleton_unmapped_all_seqs.fasta"  # the fasta-like file of clusters
| 801 | + |
def _parse_mmseqs_clusters(cluster_file):
    """Parse an open mmseqs fasta-like cluster stream into non-trivial clusters.

    Each returned cluster is a flat list alternating member-id lines and
    sequence lines: [ID1, seq1, ID2, seq2, ...]. Clusters with a single
    member (trivial clusters) are discarded.
    """
    clusters = []
    cluster = []
    previous_line_start = " "  # sentinel meaning "no line seen yet"
    for line in cluster_file:
        line_strip = line.strip()
        if not line_strip:
            # skip blank lines (e.g. a trailing newline); indexing [0] on an
            # empty string would raise IndexError
            continue
        if previous_line_start == " ":
            # the very first line is the representative header of the first
            # cluster; it carries no member data, so skip it
            pass
        elif line_strip[0] == ">" and previous_line_start == ">":
            # two consecutive headers: the previous line was the
            # representative id of a NEW cluster, so flush the finished one
            if len(cluster) >= 5:
                # more than one record [ID1, seq1, ID2, seq2, newcluster_id], i.e. non-trivial cluster
                clusters.append(cluster[:-1])  # last item is the id of the new cluster
            cluster = [line_strip]
        else:
            # member header following a sequence line, or a sequence line
            cluster.append(line_strip)
        previous_line_start = line_strip[0]

    # for the last cluster: no trailing representative header follows it
    if len(cluster) >= 4:  # more than one record [ID1, seq1, ID2, seq2]
        clusters.append(cluster)
    return clusters


def write_clusters(fasta_clustered, address_rhogs_folder, min_rhog_size):
    """parse the fasta-like output of mmseqs easy-cluster and write each cluster

    The format is described here: https://github.com/soedinglab/MMseqs2/wiki#cluster-fasta-like-format
    Example:
    >O67032                 <- cluster 1
    >sp|O67032|RF1_AQUAE    <- member1 of cluster 1
    MLKEAYISRLDKLQEKYRKLQEELSKPEVIQD...
    >sp|O84026|RF1_CHLTR    <- member2 of cluster 1
    MEIKVLECLKRLEEVEKQISDPNIFSNPKEYS...
    >sp|P47500|RF1_MYCGE    <- member3 of cluster 1
    MDFDKQLFFNVEKIVELTEQLEKDLNKPNLSF...
    >O66429                 <- cluster 2
    >sp|O66429|EFTU_AQUAE   <- member1 of cluster 2
    MAKEKFERTKEHVNVGTIGHVDHGKS
    >sp|P0CD71|EFTU_CHLTR   <- member2 of cluster 2
    MSKETFQRNKPHINIGTIGHVDHGKTTLTAAITRAL

    :param fasta_clustered: path of the fasta-like cluster file from mmseqs
    :param address_rhogs_folder: folder in which HOG_clust<N>.fa files are written
    :param min_rhog_size: minimum number of proteins a written cluster must have
    :return: number of cluster rootHOG files written
    """
    with open(fasta_clustered, 'rt') as cluster_file:
        clusters = _parse_mmseqs_clusters(cluster_file)
    logger.debug(f"Number of non-trivial linclust clusters raw is {len(clusters)}.")

    # cluster rootHOG ids start at 10000 so the file names do not clash with real rhog ids
    cluster_iter = 10000
    for cluster in clusters:
        # cluster is like ['>sp|O67032|RF1_AQUAE', 'MLKEAYISRLDKLQEKYRKLQEELSKPEVIQD...', ...],
        # i.e. an id and its sequence. thus, check for double the min_rhog_size
        # to have at least min_rhog_size proteins
        if len(cluster) < 2 * min_rhog_size:
            continue
        # it seems that genes from same species tend to cluster together; discard such
        # single-species clusters. the id format is e.g. tr|A0A..|..||DANRE||1000001584
        species_names = {cluster[pr_idi * 2].split(" ")[0].split("||")[1]
                         for pr_idi in range(len(cluster) // 2)}
        if len(species_names) > 1:
            with open(address_rhogs_folder + "/HOG_clust" + str(cluster_iter) + ".fa", "w") as file_idx:
                for line in cluster:
                    file_idx.write(line + "\n")
            cluster_iter += 1
    return cluster_iter - 10000
876 | 861 |
|
877 | 862 |
|
878 | 863 | def parse_isoform_file(species_names, folder=None): |
|
0 commit comments