1313import pickle
1414from abc import ABC
1515from collections import OrderedDict
16- from typing import Any , Dict , Generator , List , Optional , Tuple
16+ from typing import Any , Dict , Generator , List , Optional , Tuple , Union
1717
1818import fastobo
1919import networkx as nx
@@ -244,16 +244,26 @@ def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph:
244244 with open (data_path , encoding = "utf-8" ) as chebi :
245245 chebi = "\n " .join (l for l in chebi if not l .startswith ("xref:" ))
246246
247- elements = [
248- term_callback (clause )
249- for clause in fastobo .loads (chebi )
250- if clause and ":" in str (clause .id )
251- ]
247+ elements = []
248+ for term_doc in fastobo .loads (chebi ):
249+ if (
250+ term_doc
251+ and isinstance (term_doc .id , fastobo .id .PrefixedIdent )
252+ and term_doc .id .prefix == "CHEBI"
253+ ):
254+ term_dict = term_callback (term_doc )
255+ if term_dict :
256+ elements .append (term_dict )
252257
253258 g = nx .DiGraph ()
254259 for n in elements :
255260 g .add_node (n ["id" ], ** n )
256- g .add_edges_from ([(p , q ["id" ]) for q in elements for p in q ["parents" ]])
261+
262+ # Only take the edges which connect existing nodes, to avoid implicitly creating nodes for obsolete terms
263+ # https://github.com/ChEB-AI/python-chebai/pull/55#issuecomment-2386654142
264+ g .add_edges_from (
265+ [(p , q ["id" ]) for q in elements for p in q ["parents" ] if g .has_node (p )]
266+ )
257267
258268 print ("Compute transitive closure" )
259269 return nx .transitive_closure_dag (g )
@@ -397,7 +407,9 @@ def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
397407 """
398408 try :
399409 filename = self .processed_file_names_dict ["data" ]
400- data_chebi_version = torch .load (os .path .join (self .processed_dir , filename ))
410+ data_chebi_version = torch .load (
411+ os .path .join (self .processed_dir , filename ), weights_only = False
412+ )
401413 except FileNotFoundError :
402414 raise FileNotFoundError (
403415 f"File data.pt doesn't exists. "
@@ -418,7 +430,8 @@ def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
418430 data_chebi_train_version = torch .load (
419431 os .path .join (
420432 self ._chebi_version_train_obj .processed_dir , filename_train
421- )
433+ ),
434+ weights_only = False ,
422435 )
423436 except FileNotFoundError :
424437 raise FileNotFoundError (
@@ -812,7 +825,7 @@ def chebi_to_int(s: str) -> int:
812825 return int (s [s .index (":" ) + 1 :])
813826
814827
815- def term_callback (doc ) -> dict :
828+ def term_callback (doc : fastobo . term . TermFrame ) -> Union [ Dict , bool ] :
816829 """
817830 Extracts information from a ChEBI term document.
818831 This function takes a ChEBI term document as input and extracts relevant information such as the term ID, parents,
@@ -852,6 +865,12 @@ def term_callback(doc) -> dict:
852865 parents .append (chebi_to_int (str (clause .term )))
853866 elif isinstance (clause , fastobo .term .NameClause ):
854867 name = str (clause .name )
868+
869+ if isinstance (clause , fastobo .term .IsObsoleteClause ):
870+ if clause .obsolete :
871+ # If the term document has an obsolete clause set to true, skip this document.
872+ return False
873+
855874 return {
856875 "id" : chebi_to_int (str (doc .id )),
857876 "parents" : parents ,
0 commit comments