1010from io import StringIO
1111from pathlib import Path
1212from textwrap import dedent
13- from typing import Any
13+ from typing import Any , TypeAlias
1414
1515import bioregistry
1616import networkx as nx
3737 SynonymTypeDef ,
3838 Term ,
3939 TypeDef ,
40+ build_ontology ,
4041 default_reference ,
41- make_ad_hoc_ontology ,
4242)
4343from ..struct_utils import Annotation , Stanza
4444from ..typedef import comment as has_comment
@@ -174,7 +174,7 @@ def from_obonet(
174174
175175 missing_typedefs : set [ReferenceTuple ] = set ()
176176
177- subset_typedefs = _get_subsetdefs (graph .graph , ontology_prefix = ontology_prefix )
177+ subset_typedefs = _get_subsetdefs (graph .graph , ontology_prefix = ontology_prefix , strict = strict )
178178
179179 root_terms : list [Reference ] = []
180180 property_values : list [Annotation ] = []
@@ -243,20 +243,22 @@ def from_obonet(
243243 use_tqdm = use_tqdm ,
244244 )
245245
246- return make_ad_hoc_ontology (
247- _ontology = ontology_prefix ,
248- _name = name ,
249- _auto_generated_by = graph .graph .get ("auto-generated-by" ),
250- _typedefs = list (typedefs .values ()),
251- _synonym_typedefs = list (synonym_typedefs .values ()),
252- _date = date ,
253- _data_version = data_version ,
254- _root_terms = root_terms ,
246+ return build_ontology (
247+ prefix = ontology_prefix ,
248+ name = name ,
249+ auto_generated_by = graph .graph .get ("auto-generated-by" ),
250+ typedefs = list (typedefs .values ()),
251+ synonym_typedefs = list (synonym_typedefs .values ()),
252+ date = date ,
253+ version = data_version ,
254+ idspaces = idspaces ,
255+ root_terms = root_terms ,
256+ subsetdefs = subset_typedefs ,
257+ properties = property_values ,
258+ imports = imports ,
259+ # ontology_iri
260+ # ontology_version_iri
255261 terms = terms ,
256- _property_values = property_values ,
257- _subsetdefs = subset_typedefs ,
258- _imports = imports ,
259- _idspaces = idspaces ,
260262 )
261263
262264
@@ -268,7 +270,7 @@ def _get_terms(
268270 upgrade : bool ,
269271 typedefs : Mapping [ReferenceTuple , TypeDef ],
270272 synonym_typedefs : Mapping [ReferenceTuple , SynonymTypeDef ],
271- subset_typedefs ,
273+ subset_typedefs : SubsetTypeDefs ,
272274 missing_typedefs : set [ReferenceTuple ],
273275 macro_config : MacroConfig ,
274276 use_tqdm : bool = False ,
@@ -332,7 +334,13 @@ def _get_terms(
332334 missing_typedefs = missing_typedefs ,
333335 )
334336 _process_replaced_by (term , data , ontology_prefix = ontology_prefix , strict = strict )
335- _process_subsets (term , data , ontology_prefix = ontology_prefix , strict = strict )
337+ _process_subsets (
338+ term ,
339+ data ,
340+ ontology_prefix = ontology_prefix ,
341+ strict = strict ,
342+ subset_typedefs = subset_typedefs ,
343+ )
336344 _process_intersection_of (term , data , ontology_prefix = ontology_prefix , strict = strict )
337345 _process_union_of (term , data , ontology_prefix = ontology_prefix , strict = strict )
338346 _process_equivalent_to (term , data , ontology_prefix = ontology_prefix , strict = strict )
@@ -379,13 +387,19 @@ def _process_creation_date(term: Stanza, data) -> None:
379387
380388def _process_union_of (term : Stanza , data , * , ontology_prefix : str , strict : bool ) -> None :
381389 for reference in iterate_node_reference_tag (
382- "union_of" , data = data , ontology_prefix = ontology_prefix , strict = strict , node = term .reference
390+ term ,
391+ "union_of" ,
392+ data = data ,
393+ ontology_prefix = ontology_prefix ,
394+ strict = strict ,
395+ node = term .reference ,
383396 ):
384397 term .append_union_of (reference )
385398
386399
387400def _process_equivalent_to (term : Stanza , data , * , ontology_prefix : str , strict : bool ) -> None :
388401 for reference in iterate_node_reference_tag (
402+ term ,
389403 "equivalent_to" ,
390404 data = data ,
391405 ontology_prefix = ontology_prefix ,
@@ -397,6 +411,7 @@ def _process_equivalent_to(term: Stanza, data, *, ontology_prefix: str, strict:
397411
398412def _process_disjoint_from (term : Stanza , data , * , ontology_prefix : str , strict : bool ) -> None :
399413 for reference in iterate_node_reference_tag (
414+ term ,
400415 "disjoint_from" ,
401416 data = data ,
402417 ontology_prefix = ontology_prefix ,
@@ -408,15 +423,15 @@ def _process_disjoint_from(term: Stanza, data, *, ontology_prefix: str, strict:
408423
409424def _process_alts (term : Stanza , data , * , ontology_prefix : str , strict : bool ) -> None :
410425 for alt_reference in iterate_node_reference_tag (
411- "alt_id" , data , node = term .reference , strict = strict , ontology_prefix = ontology_prefix
426+ term , "alt_id" , data , node = term .reference , strict = strict , ontology_prefix = ontology_prefix
412427 ):
413428 term .append_alt (alt_reference )
414429
415430
416431def _process_parents (term : Stanza , data , * , ontology_prefix : str , strict : bool ) -> None :
417432 for tag in ["is_a" , "instance_of" ]:
418433 for parent in iterate_node_reference_tag (
419- tag , data , node = term .reference , strict = strict , ontology_prefix = ontology_prefix
434+ term , tag , data , node = term .reference , strict = strict , ontology_prefix = ontology_prefix
420435 ):
421436 term .append_parent (parent )
422437
@@ -512,20 +527,35 @@ def _process_relations(
512527
513528def _process_replaced_by (stanza : Stanza , data , * , ontology_prefix : str , strict : bool ) -> None :
514529 for reference in iterate_node_reference_tag (
515- "replaced_by" , data , node = stanza .reference , strict = strict , ontology_prefix = ontology_prefix
530+ stanza ,
531+ "replaced_by" ,
532+ data ,
533+ node = stanza .reference ,
534+ strict = strict ,
535+ ontology_prefix = ontology_prefix ,
516536 ):
517537 stanza .append_replaced_by (reference )
518538
519539
520- def _process_subsets (stanza : Stanza , data , * , ontology_prefix : str , strict : bool ) -> None :
540+ UNDEFINED_SUBSETS = set ()
541+
542+
543+ def _process_subsets (
544+ stanza : Stanza , data , * , ontology_prefix : str , strict : bool , subset_typedefs : SubsetTypeDefs
545+ ) -> None :
521546 for reference in iterate_node_reference_tag (
547+ stanza ,
522548 "subset" ,
523549 data ,
524550 node = stanza .reference ,
525551 strict = strict ,
526552 ontology_prefix = ontology_prefix ,
527553 counter = SUBSET_ERROR_COUNTER ,
528554 ):
555+ if reference not in subset_typedefs :
556+ if reference not in UNDEFINED_SUBSETS :
557+ logger .warning ("[%s] undefined subset: %s" , stanza .curie , reference )
558+ UNDEFINED_SUBSETS .add (reference )
529559 stanza .append_subset (reference )
530560
531561
@@ -669,8 +699,13 @@ def _handle_xref(
669699SUBSET_ERROR_COUNTER : Counter [tuple [str , str ]] = Counter ()
670700
671701
672- def _get_subsetdefs (graph : nx .MultiDiGraph , ontology_prefix : str ) -> list [tuple [Reference , str ]]:
673- rv = []
702+ SubsetTypeDefs : TypeAlias = dict [Reference , str ]
703+
704+
705+ def _get_subsetdefs (
706+ graph : nx .MultiDiGraph , ontology_prefix : str , * , strict : bool = False
707+ ) -> SubsetTypeDefs :
708+ rv = {}
674709 for subsetdef in graph .get ("subsetdef" , []):
675710 left , _ , right = subsetdef .partition (" " )
676711 if not right :
@@ -682,11 +717,12 @@ def _get_subsetdefs(graph: nx.MultiDiGraph, ontology_prefix: str) -> list[tuple[
682717 name = right ,
683718 line = subsetdef ,
684719 counter = SUBSET_ERROR_COUNTER ,
720+ strict = strict ,
685721 )
686722 if left_ref is None :
687723 continue
688724 right = right .strip ('"' )
689- rv . append (( left_ref , right ))
725+ rv [ left_ref ] = right
690726 return rv
691727
692728
@@ -812,6 +848,7 @@ def iterate_typedefs(
812848 # can't really have a pre-defined set of synonym typedefs here!
813849 synonym_typedefs : Mapping [ReferenceTuple , SynonymTypeDef ] = {}
814850 typedefs : Mapping [ReferenceTuple , TypeDef ] = {}
851+ subset_typedefs : SubsetTypeDefs = {} # FIXME
815852 missing_typedefs : set [ReferenceTuple ] = set ()
816853 for data in graph .graph .get ("typedefs" , []):
817854 if "id" in data :
@@ -889,7 +926,13 @@ def iterate_typedefs(
889926 missing_typedefs = missing_typedefs ,
890927 )
891928 _process_replaced_by (typedef , data , ontology_prefix = ontology_prefix , strict = strict )
892- _process_subsets (typedef , data , ontology_prefix = ontology_prefix , strict = strict )
929+ _process_subsets (
930+ typedef ,
931+ data ,
932+ ontology_prefix = ontology_prefix ,
933+ strict = strict ,
934+ subset_typedefs = subset_typedefs ,
935+ )
893936 _process_intersection_of (typedef , data , ontology_prefix = ontology_prefix , strict = strict )
894937 _process_union_of (typedef , data , ontology_prefix = ontology_prefix , strict = strict )
895938 _process_equivalent_to (typedef , data , ontology_prefix = ontology_prefix , strict = strict )
@@ -904,6 +947,7 @@ def iterate_typedefs(
904947 _process_holds_over_chain (typedef , data , ontology_prefix = ontology_prefix , strict = strict )
905948 typedef .disjoint_over .extend (
906949 iterate_node_reference_tag (
950+ typedef ,
907951 "disjoint_over" ,
908952 data ,
909953 node = typedef .reference ,
@@ -913,6 +957,7 @@ def iterate_typedefs(
913957 )
914958 typedef .transitive_over .extend (
915959 iterate_node_reference_tag (
960+ typedef ,
916961 "transitive_over" ,
917962 data ,
918963 node = typedef .reference ,
@@ -926,6 +971,7 @@ def iterate_typedefs(
926971
927972def _process_consider (stanza : Stanza , data , * , ontology_prefix : str , strict : bool = False ):
928973 for reference in iterate_node_reference_tag (
974+ stanza ,
929975 "consider" ,
930976 data ,
931977 node = stanza .reference ,
@@ -1349,6 +1395,7 @@ def _parse_default_prop(property_id, ontology_prefix) -> Reference | None:
13491395
13501396
13511397def iterate_node_reference_tag (
1398+ stanza : Stanza ,
13521399 tag : str ,
13531400 data : Mapping [str , Any ],
13541401 * ,
@@ -1359,21 +1406,35 @@ def iterate_node_reference_tag(
13591406 counter : Counter [tuple [str , str ]] | None = None ,
13601407) -> Iterable [Reference ]:
13611408 """Extract a list of CURIEs from the data."""
1362- for identifier in data .get (tag , []):
1409+ for str_or_curie_or_uri in data .get (tag , []):
13631410 reference = _obo_parse_identifier (
1364- identifier ,
1411+ str_or_curie_or_uri ,
13651412 strict = strict ,
13661413 node = node ,
13671414 ontology_prefix = ontology_prefix ,
13681415 upgrade = upgrade ,
13691416 counter = counter ,
13701417 )
1371- if reference is None :
1418+ if reference is not None :
1419+ yield reference
1420+ elif tag == "subset" :
1421+ # this is to avoid the millions of 2:STAR and 3:STAR errors when parsing ChEBI that makes
1422+ # it take forever. In general, most of the subset identifiers are totally borked.
1423+ if str_or_curie_or_uri not in SUBSET_INVALIDS :
1424+ logger .warning (
1425+ "[%s] %s - could not parse subset identifier: %s" ,
1426+ stanza .curie ,
1427+ tag ,
1428+ str_or_curie_or_uri ,
1429+ )
1430+ SUBSET_INVALIDS .add (str_or_curie_or_uri )
1431+ else :
13721432 logger .warning (
1373- "[%s] %s - could not parse identifier: %s" , ontology_prefix , tag , identifier
1433+ "[%s] %s - could not parse identifier: %s" , stanza . curie , tag , str_or_curie_or_uri
13741434 )
1375- else :
1376- yield reference
1435+
1436+
1437+ SUBSET_INVALIDS : set [str ] = set ()
13771438
13781439
13791440def _process_intersection_of (
0 commit comments