@@ -64,9 +64,9 @@ def read_infer_xml_rhog(rhogid, inferhog_concurrent_on, pickles_rhog_folder, pi
6464 # os.makedirs("./genetrees")
6565
6666 logger .info ("\n " + "==" * 10 + "\n Start working on root hog: " + rhogid + ". \n " )
67- rhog_i_prot_address = rhogs_fa_folder + "/ HOG_" + rhogid + " .fa"
67+ rhog_i_prot_address = os . path . join ( rhogs_fa_folder , f" HOG_{ rhogid } .fa")
6868 rhog_i = list (SeqIO .parse (rhog_i_prot_address , "fasta" ))
69- logger .info ("number of proteins in the rHOG is " + str ( len (rhog_i )) + "." )
69+ logger .info ("number of proteins in the rHOG is %d." , len (rhog_i ))
7070 # the file "species_tree_checked.nwk" is created by the check_input.py
7171 (species_tree ) = _utils_subhog .read_species_tree (conf_infer_subhhogs .species_tree )
7272
@@ -88,9 +88,11 @@ def read_infer_xml_rhog(rhogid, inferhog_concurrent_on, pickles_rhog_folder, pi
8888 if not keep_subhog_each_pickle :
8989 shutil .rmtree (pickles_subhog_folder )
9090
91+ tot_genes , placed_genes = 0 , 0
9192 hogs_rhogs_xml = []
9293 for hog_i in hogs_a_rhog :
93- if len (hog_i ._members ) >= inferhog_min_hog_size_xml :
94+ tot_genes += len (hog_i )
95+ if len (hog_i ) >= inferhog_min_hog_size_xml :
9496 # could be improved # hogs_a_rhog_xml = hog_i.to_orthoxml(**gene_id_name)
9597 hogs_a_rhog_xml_raw = hog_i .to_orthoxml () # <generef > <paralg object >
9698 if orthoxml_v03 and 'paralogGroup' in str (hogs_a_rhog_xml_raw ) :
@@ -106,17 +108,25 @@ def read_infer_xml_rhog(rhogid, inferhog_concurrent_on, pickles_rhog_folder, pi
106108 else :
107109 hogs_a_rhog_xml = hogs_a_rhog_xml_raw
108110 hogs_rhogs_xml .append (hogs_a_rhog_xml )
111+ placed_genes += len (hog_i )
112+ if conf_infer_subhhogs .v > 1 :
113+ logger .debug ("writing an orthoxml stub file for hog %s" , hog_i .hogid )
114+ with open (f'orthostub_{ rhogid } _xml_{ hog_i .hogid } .xml' , 'wb' ) as handle :
115+ ET .indent (hogs_a_rhog_xml , space = ' ' , level = 0 )
116+ oxml_et = ET .ElementTree (hogs_a_rhog_xml )
117+ oxml_et .write (handle , encoding = "utf-8" )
109118 else :
110119 logger .debug ("we are not reporting due to fastoma singleton hog |*| " + str (list (hog_i ._members )[0 ]))
111- if len (hog_i . _members ) > 1 :
120+ if len (hog_i ) > 1 :
112121 logger .warning ("issue 166312309 this is not a singleton " + str (hog_i ._members ))
113122
114-
115123 pickles_rhog_file = pickles_rhog_folder + '/file_' + rhogid + '.pickle'
116124 with open (pickles_rhog_file , 'wb' ) as handle :
117125 # dill_pickle.dump(hogs_rhogs_xml, handle, protocol=dill_pickle.HIGHEST_PROTOCOL)
118126 pickle .dump (hogs_rhogs_xml , handle , protocol = pickle .HIGHEST_PROTOCOL )
119- logger .info ("All subHOGs for the rootHOG %s as OrthoXML format is written in %s" , rhogid , pickles_rhog_file ) # todo report how many genes are in the group for this rootHOG (out of how many were in the initial grouping from omamer)
127+ logger .info ("All subHOGs for the rootHOG %s as OrthoXML format is written in %s" , rhogid , pickles_rhog_file )
128+ logger .info (" --> placed %d out of %d proteins (%.2f%%). %d proteins in original roothog" ,
129+ placed_genes , tot_genes , 100 * placed_genes / tot_genes , len (rhog_i ))
120130 # to see orthoxml as string, you might need to do it for different idx
121131 # idx=0; from xml.dom import minidom; import xml.etree.ElementTree as ET; minidom.parseString(ET.tostring(hogs_rhogs_xml[idx])).toprettyxml(indent=" ")
122132 del hogs_a_rhog # to be memory efficient
0 commit comments