Skip to content

Commit af91dfb

Browse files
committed
[ADD] improve output, store orthoxml stub files in debug mode
1 parent e3a0cca commit af91dfb

File tree

2 files changed

+19
-6
lines changed

2 files changed

+19
-6
lines changed

FastOMA/_hog_class.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,9 @@ def taxlevel(self):
144144
def rhogid(self):
145145
return self._rhogid
146146

147+
def __len__(self):
148+
return len(self._members)
149+
147150
def get_members(self):
148151
return set(self._members)
149152

FastOMA/_infer_subhog.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,9 @@ def read_infer_xml_rhog(rhogid, inferhog_concurrent_on, pickles_rhog_folder, pi
6464
# os.makedirs("./genetrees")
6565

6666
logger.info("\n" + "==" * 10 + "\n Start working on root hog: " + rhogid + ". \n")
67-
rhog_i_prot_address = rhogs_fa_folder + "/HOG_" + rhogid+ ".fa"
67+
rhog_i_prot_address = os.path.join(rhogs_fa_folder, f"HOG_{rhogid}.fa")
6868
rhog_i = list(SeqIO.parse(rhog_i_prot_address, "fasta"))
69-
logger.info("number of proteins in the rHOG is " + str(len(rhog_i)) + ".")
69+
logger.info("number of proteins in the rHOG is %d.", len(rhog_i))
7070
# the file "species_tree_checked.nwk" is created by the check_input.py
7171
(species_tree) = _utils_subhog.read_species_tree(conf_infer_subhhogs.species_tree)
7272

@@ -88,9 +88,11 @@ def read_infer_xml_rhog(rhogid, inferhog_concurrent_on, pickles_rhog_folder, pi
8888
if not keep_subhog_each_pickle:
8989
shutil.rmtree(pickles_subhog_folder)
9090

91+
tot_genes, placed_genes = 0, 0
9192
hogs_rhogs_xml = []
9293
for hog_i in hogs_a_rhog:
93-
if len(hog_i._members) >= inferhog_min_hog_size_xml:
94+
tot_genes += len(hog_i)
95+
if len(hog_i) >= inferhog_min_hog_size_xml:
9496
# could be improved # hogs_a_rhog_xml = hog_i.to_orthoxml(**gene_id_name)
9597
hogs_a_rhog_xml_raw = hog_i.to_orthoxml() # <generef > <paralg object >
9698
if orthoxml_v03 and 'paralogGroup' in str(hogs_a_rhog_xml_raw) :
@@ -106,17 +108,25 @@ def read_infer_xml_rhog(rhogid, inferhog_concurrent_on, pickles_rhog_folder, pi
106108
else:
107109
hogs_a_rhog_xml = hogs_a_rhog_xml_raw
108110
hogs_rhogs_xml.append(hogs_a_rhog_xml)
111+
placed_genes += len(hog_i)
112+
if conf_infer_subhhogs.v > 1:
113+
logger.debug("writing an orthoxml stub file for hog %s", hog_i.hogid)
114+
with open(f'orthostub_{rhogid}_xml_{hog_i.hogid}.xml', 'wb') as handle:
115+
ET.indent(hogs_a_rhog_xml, space=' ', level=0)
116+
oxml_et = ET.ElementTree(hogs_a_rhog_xml)
117+
oxml_et.write(handle, encoding="utf-8")
109118
else:
110119
logger.debug("we are not reporting due to fastoma singleton hog |*| " + str(list(hog_i._members)[0]))
111-
if len(hog_i._members) > 1:
120+
if len(hog_i) > 1:
112121
logger.warning("issue 166312309 this is not a singleton "+str(hog_i._members))
113122

114-
115123
pickles_rhog_file = pickles_rhog_folder + '/file_' + rhogid + '.pickle'
116124
with open(pickles_rhog_file, 'wb') as handle:
117125
# dill_pickle.dump(hogs_rhogs_xml, handle, protocol=dill_pickle.HIGHEST_PROTOCOL)
118126
pickle.dump(hogs_rhogs_xml, handle, protocol=pickle.HIGHEST_PROTOCOL)
119-
logger.info("All subHOGs for the rootHOG %s as OrthoXML format is written in %s", rhogid, pickles_rhog_file) # todo report how many genes are in the group for this rootHOG (out of how many were in the initial grouping from omamer)
127+
logger.info("All subHOGs for the rootHOG %s as OrthoXML format is written in %s", rhogid, pickles_rhog_file)
128+
logger.info(" --> placed %d out of %d proteins (%.2f%%). %d proteins in original roothog",
129+
placed_genes, tot_genes, 100*placed_genes/tot_genes, len(rhog_i))
120130
# to see orthoxml as string, you might need to do it for different idx
121131
# idx=0; from xml.dom import minidom; import xml.etree.ElementTree as ET; minidom.parseString(ET.tostring(hogs_rhogs_xml[idx])).toprettyxml(indent=" ")
122132
del hogs_a_rhog # to be memory efficient

0 commit comments

Comments
 (0)