11"""
22 OMAmer - tree-driven and alignment-free protein assignment to sub-families
33
4- (C) 2024 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
4+ (C) 2024-2025 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
55 (C) 2022-2023 Alex Warwick Vesztrocy <alex.warwickvesztrocy@unil.ch>
66 (C) 2019-2021 Victor Rossier <victor.rossier@unil.ch> and
77 Alex Warwick Vesztrocy <alex@warwickvesztrocy.co.uk>
@@ -499,13 +499,13 @@ def _get_child_prots(hogs, hog2protoffs, child_prots_off):
499499 # TODO: check what else would break. this could be used if someone wanted to build a
500500 # database for flat OGs.
501501 LOG .warning ("No nesting structure in HOGs defined in OrthoXML." )
502- else :
503- self .db .create_carray (
504- "/" ,
505- "ChildrenHOG" ,
506- obj = np .array (child_hogs , dtype = np .uint32 ),
507- filters = self ._compr ,
508- )
502+ child_hogs = [ 0 ] # adding sentinel in case no nested HOGs are defined.
503+ self .db .create_carray (
504+ "/" ,
505+ "ChildrenHOG" ,
506+ obj = np .array (child_hogs , dtype = np .uint32 ),
507+ filters = self ._compr ,
508+ )
509509 self .db .create_carray (
510510 "/" ,
511511 "ChildrenProt" ,
@@ -1063,14 +1063,14 @@ def __init__(
10631063 self .include_younger_fams = include_younger_fams
10641064
10651065 def setup_hogparser (self , oxml_path ):
1066- from .HOGParser import HOGParser
1066+ from .orthoxml_parser import OrthoxmlParser
10671067
10681068 LOG .debug ("loading species table from orthoxml" )
10691069 self .oxml_path = oxml_path
1070- self .hog_parser = HOGParser (oxml_path )
1070+ self .hog_parser = OrthoxmlParser (oxml_path )
10711071
10721072 def parse_oxml (self , nspecies_below ):
1073- from .HOGParser import is_orthologGroup_node
1073+ from .orthoxml_parser import is_orthologGroup_node
10741074
10751075 def generate_hog_tab (hog ):
10761076 # this is more memory intensive (we cache the results)
@@ -1142,7 +1142,7 @@ def generate_hogs_for_prot(hog, gene_data):
11421142 hogs = []
11431143 entries = []
11441144 LOG .debug ("loading hog structure and membership from orthoxml" )
1145- for hog in tqdm (self .hog_parser .HOGs (auto_clean = True ), desc = "Parsing HOGs" ):
1145+ for hog in tqdm (self .hog_parser .iter_hogs (auto_clean = True ), desc = "Parsing HOGs" ):
11461146 hog_data = list (filter (lambda x : x is not None , generate_hog_tab (hog )))
11471147 gene_data = list (generate_prot_tab (hog ))
11481148 hog_data += list (generate_hogs_for_prot (hog , gene_data ))
@@ -1169,7 +1169,7 @@ def generate_hogs_for_prot(hog, gene_data):
11691169 # set index as the protein id so that we can look up the hog id.
11701170 ent_tab = ent_tab [ent_tab ["hogid" ].isin (set (hog_tab ["ID" ]))].set_index ("protId" )
11711171
1172- return ( hog_tab .to_records (), ent_tab )
1172+ return hog_tab .to_records (), ent_tab
11731173
11741174 ### main function ###
11751175 def build_database (self , oxml_path , sequence_files , structure_files , stree_path ):
@@ -1183,7 +1183,7 @@ def build_database(self, oxml_path, sequence_files, structure_files, stree_path)
11831183 self .add_taxid_col ()
11841184
11851185 LOG .debug ("build hog_table from orthoxml" )
1186- ( hog_tab , ent_tab ) = self .parse_oxml (nspecies_below )
1186+ hog_tab , ent_tab = self .parse_oxml (nspecies_below )
11871187
11881188 LOG .debug ("select and strip OMA HOGs" )
11891189 (
@@ -1271,13 +1271,26 @@ def select_and_filter_OMA_proteins(
12711271
12721272 ent_tab = ent_tab [ent_tab ["hogid" ].map (lambda x : x in oma_hog2hog )]
12731273
1274- LOG .debug (" - loading proteins from FASTA sequences, for selected HOGs" )
1274+ # Make ent_tab into a dict {protID -> (hogid, species)} as it's
1275+ # more efficient for querying individual protID than pd.DataFrame.loc
1276+
1277+ duplicates = ent_tab [ent_tab .index .duplicated (keep = False )]
1278+
1279+ # if found duplicated keys (i.e. protId), raise error
1280+ if len (duplicates ) > 0 :
1281+ nonunique_keys = duplicates .index .unique ()
1282+ error_message = "Found duplicated protein IDs:\n " + "\n \t " .join (s for s in nonunique_keys )
1283+ LOG .error (error_message )
1284+ raise RuntimeError (error_message )
1285+
1286+ ent_tab = ent_tab [~ ent_tab .index .duplicated (keep = "first" )].to_dict ("index" )
12751287
12761288 prot_off = 0 # pointer to protein in protein table
12771289
12781290 # store rows for species and protein tables and sequence buffer
12791291 seq_buffs = []
12801292
1293+ LOG .debug (" - loading proteins from FASTA sequences for selected HOGs" )
12811294 prot_tab = self .db .create_table (
12821295 "/" ,
12831296 "Protein" ,
@@ -1299,20 +1312,20 @@ def select_and_filter_OMA_proteins(
12991312 SeqIO .parse (fp , "fasta" ),
13001313 desc = "Parsing sequences ({})" .format (os .path .basename (fasta_fn )),
13011314 ):
1302- if rec .description in ent_tab . index :
1315+ if rec .description in ent_tab :
13031316 # this seems to be most common, full header is used in standalone orthoXML
13041317 prot_id = rec .description
1305- elif rec .id in ent_tab . index :
1318+ elif rec .id in ent_tab :
13061319 # otherwise we might only see the first part of the id.
13071320 prot_id = rec .id
13081321 else :
13091322 # otherwise we can't do anything...
13101323 continue
1311-
13121324 # get hog id, skip if we have filtered it out
1313- r = ent_tab . loc [prot_id ]
1325+ r = ent_tab [prot_id ]
13141326 hog_id = r ["hogid" ]
13151327 sp = r ["species" ]
1328+
13161329 # (hog_id, sp) = entry_mapping[prot_id]
13171330 if hog_id not in oma_hog2hog :
13181331 continue
@@ -1345,6 +1358,7 @@ def select_and_filter_OMA_proteins(
13451358 # update offset of protein row in table
13461359 prot_off += 1
13471360
1361+
13481362 # store species info
13491363 sp_rows = [()] * len (species ) # keep sorted
13501364 for sp in sp2sp_off :
@@ -1419,6 +1433,7 @@ def select_and_filter_OMA_proteins(
14191433 structure_buff = np .concatenate (ss_buffs )
14201434 return fam2hogs , hog2protoffs , hog2tax , hog2oma_hog , seq_buff , structure_buff
14211435
1436+
14221437 def add_taxid_col (self ):
14231438 """
14241439 Add the NCBI taxon id from orthoxml
0 commit comments