Skip to content

Commit fb43c6d

Browse files
authored
Update subset handling (#476)
This plagues the ChEBI parsing since there are thousands of `2:STAR` and `3:STAR` values for subsets, which are not actually CURIEs but get serialized this way due to how OWLAPI works
1 parent 12a9c28 commit fb43c6d

File tree

9 files changed

+160
-69
lines changed

9 files changed

+160
-69
lines changed

src/pyobo/getters.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ def get_ontology(
144144

145145
if version is None:
146146
version = _get_version_from_artifact(prefix)
147+
logger.info(f"[%s] current version is {version}", prefix)
147148

148149
if force_process:
149150
obonet_json_gz_path = None

src/pyobo/struct/functional/obo_to_functional.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ def get_ontology_axioms(obo_ontology: Obo) -> Iterable[f.Box]:
6969

7070
if obo_ontology.subsetdefs:
7171
yield f.Declaration("oboInOwl:SubsetProperty", type="AnnotationProperty")
72-
for subset_typedef, subset_label in obo_ontology.subsetdefs:
72+
for subset_typedef, subset_label in obo_ontology.subsetdefs.items():
7373
yield f.Declaration(subset_typedef, type="AnnotationProperty")
7474
yield m.LabelMacro(subset_typedef, subset_label)
7575
yield f.SubAnnotationPropertyOf(subset_typedef, "oboInOwl:SubsetProperty")

src/pyobo/struct/obo/reader.py

Lines changed: 94 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from io import StringIO
1111
from pathlib import Path
1212
from textwrap import dedent
13-
from typing import Any
13+
from typing import Any, TypeAlias
1414

1515
import bioregistry
1616
import networkx as nx
@@ -37,8 +37,8 @@
3737
SynonymTypeDef,
3838
Term,
3939
TypeDef,
40+
build_ontology,
4041
default_reference,
41-
make_ad_hoc_ontology,
4242
)
4343
from ..struct_utils import Annotation, Stanza
4444
from ..typedef import comment as has_comment
@@ -174,7 +174,7 @@ def from_obonet(
174174

175175
missing_typedefs: set[ReferenceTuple] = set()
176176

177-
subset_typedefs = _get_subsetdefs(graph.graph, ontology_prefix=ontology_prefix)
177+
subset_typedefs = _get_subsetdefs(graph.graph, ontology_prefix=ontology_prefix, strict=strict)
178178

179179
root_terms: list[Reference] = []
180180
property_values: list[Annotation] = []
@@ -243,20 +243,22 @@ def from_obonet(
243243
use_tqdm=use_tqdm,
244244
)
245245

246-
return make_ad_hoc_ontology(
247-
_ontology=ontology_prefix,
248-
_name=name,
249-
_auto_generated_by=graph.graph.get("auto-generated-by"),
250-
_typedefs=list(typedefs.values()),
251-
_synonym_typedefs=list(synonym_typedefs.values()),
252-
_date=date,
253-
_data_version=data_version,
254-
_root_terms=root_terms,
246+
return build_ontology(
247+
prefix=ontology_prefix,
248+
name=name,
249+
auto_generated_by=graph.graph.get("auto-generated-by"),
250+
typedefs=list(typedefs.values()),
251+
synonym_typedefs=list(synonym_typedefs.values()),
252+
date=date,
253+
version=data_version,
254+
idspaces=idspaces,
255+
root_terms=root_terms,
256+
subsetdefs=subset_typedefs,
257+
properties=property_values,
258+
imports=imports,
259+
# ontology_iri
260+
# ontology_version_iri
255261
terms=terms,
256-
_property_values=property_values,
257-
_subsetdefs=subset_typedefs,
258-
_imports=imports,
259-
_idspaces=idspaces,
260262
)
261263

262264

@@ -268,7 +270,7 @@ def _get_terms(
268270
upgrade: bool,
269271
typedefs: Mapping[ReferenceTuple, TypeDef],
270272
synonym_typedefs: Mapping[ReferenceTuple, SynonymTypeDef],
271-
subset_typedefs,
273+
subset_typedefs: SubsetTypeDefs,
272274
missing_typedefs: set[ReferenceTuple],
273275
macro_config: MacroConfig,
274276
use_tqdm: bool = False,
@@ -332,7 +334,13 @@ def _get_terms(
332334
missing_typedefs=missing_typedefs,
333335
)
334336
_process_replaced_by(term, data, ontology_prefix=ontology_prefix, strict=strict)
335-
_process_subsets(term, data, ontology_prefix=ontology_prefix, strict=strict)
337+
_process_subsets(
338+
term,
339+
data,
340+
ontology_prefix=ontology_prefix,
341+
strict=strict,
342+
subset_typedefs=subset_typedefs,
343+
)
336344
_process_intersection_of(term, data, ontology_prefix=ontology_prefix, strict=strict)
337345
_process_union_of(term, data, ontology_prefix=ontology_prefix, strict=strict)
338346
_process_equivalent_to(term, data, ontology_prefix=ontology_prefix, strict=strict)
@@ -379,13 +387,19 @@ def _process_creation_date(term: Stanza, data) -> None:
379387

380388
def _process_union_of(term: Stanza, data, *, ontology_prefix: str, strict: bool) -> None:
381389
for reference in iterate_node_reference_tag(
382-
"union_of", data=data, ontology_prefix=ontology_prefix, strict=strict, node=term.reference
390+
term,
391+
"union_of",
392+
data=data,
393+
ontology_prefix=ontology_prefix,
394+
strict=strict,
395+
node=term.reference,
383396
):
384397
term.append_union_of(reference)
385398

386399

387400
def _process_equivalent_to(term: Stanza, data, *, ontology_prefix: str, strict: bool) -> None:
388401
for reference in iterate_node_reference_tag(
402+
term,
389403
"equivalent_to",
390404
data=data,
391405
ontology_prefix=ontology_prefix,
@@ -397,6 +411,7 @@ def _process_equivalent_to(term: Stanza, data, *, ontology_prefix: str, strict:
397411

398412
def _process_disjoint_from(term: Stanza, data, *, ontology_prefix: str, strict: bool) -> None:
399413
for reference in iterate_node_reference_tag(
414+
term,
400415
"disjoint_from",
401416
data=data,
402417
ontology_prefix=ontology_prefix,
@@ -408,15 +423,15 @@ def _process_disjoint_from(term: Stanza, data, *, ontology_prefix: str, strict:
408423

409424
def _process_alts(term: Stanza, data, *, ontology_prefix: str, strict: bool) -> None:
410425
for alt_reference in iterate_node_reference_tag(
411-
"alt_id", data, node=term.reference, strict=strict, ontology_prefix=ontology_prefix
426+
term, "alt_id", data, node=term.reference, strict=strict, ontology_prefix=ontology_prefix
412427
):
413428
term.append_alt(alt_reference)
414429

415430

416431
def _process_parents(term: Stanza, data, *, ontology_prefix: str, strict: bool) -> None:
417432
for tag in ["is_a", "instance_of"]:
418433
for parent in iterate_node_reference_tag(
419-
tag, data, node=term.reference, strict=strict, ontology_prefix=ontology_prefix
434+
term, tag, data, node=term.reference, strict=strict, ontology_prefix=ontology_prefix
420435
):
421436
term.append_parent(parent)
422437

@@ -512,20 +527,35 @@ def _process_relations(
512527

513528
def _process_replaced_by(stanza: Stanza, data, *, ontology_prefix: str, strict: bool) -> None:
514529
for reference in iterate_node_reference_tag(
515-
"replaced_by", data, node=stanza.reference, strict=strict, ontology_prefix=ontology_prefix
530+
stanza,
531+
"replaced_by",
532+
data,
533+
node=stanza.reference,
534+
strict=strict,
535+
ontology_prefix=ontology_prefix,
516536
):
517537
stanza.append_replaced_by(reference)
518538

519539

520-
def _process_subsets(stanza: Stanza, data, *, ontology_prefix: str, strict: bool) -> None:
540+
UNDEFINED_SUBSETS = set()
541+
542+
543+
def _process_subsets(
544+
stanza: Stanza, data, *, ontology_prefix: str, strict: bool, subset_typedefs: SubsetTypeDefs
545+
) -> None:
521546
for reference in iterate_node_reference_tag(
547+
stanza,
522548
"subset",
523549
data,
524550
node=stanza.reference,
525551
strict=strict,
526552
ontology_prefix=ontology_prefix,
527553
counter=SUBSET_ERROR_COUNTER,
528554
):
555+
if reference not in subset_typedefs:
556+
if reference not in UNDEFINED_SUBSETS:
557+
logger.warning("[%s] undefined subset: %s", stanza.curie, reference)
558+
UNDEFINED_SUBSETS.add(reference)
529559
stanza.append_subset(reference)
530560

531561

@@ -669,8 +699,13 @@ def _handle_xref(
669699
SUBSET_ERROR_COUNTER: Counter[tuple[str, str]] = Counter()
670700

671701

672-
def _get_subsetdefs(graph: nx.MultiDiGraph, ontology_prefix: str) -> list[tuple[Reference, str]]:
673-
rv = []
702+
SubsetTypeDefs: TypeAlias = dict[Reference, str]
703+
704+
705+
def _get_subsetdefs(
706+
graph: nx.MultiDiGraph, ontology_prefix: str, *, strict: bool = False
707+
) -> SubsetTypeDefs:
708+
rv = {}
674709
for subsetdef in graph.get("subsetdef", []):
675710
left, _, right = subsetdef.partition(" ")
676711
if not right:
@@ -682,11 +717,12 @@ def _get_subsetdefs(graph: nx.MultiDiGraph, ontology_prefix: str) -> list[tuple[
682717
name=right,
683718
line=subsetdef,
684719
counter=SUBSET_ERROR_COUNTER,
720+
strict=strict,
685721
)
686722
if left_ref is None:
687723
continue
688724
right = right.strip('"')
689-
rv.append((left_ref, right))
725+
rv[left_ref] = right
690726
return rv
691727

692728

@@ -812,6 +848,7 @@ def iterate_typedefs(
812848
# can't really have a pre-defined set of synonym typedefs here!
813849
synonym_typedefs: Mapping[ReferenceTuple, SynonymTypeDef] = {}
814850
typedefs: Mapping[ReferenceTuple, TypeDef] = {}
851+
subset_typedefs: SubsetTypeDefs = {} # FIXME
815852
missing_typedefs: set[ReferenceTuple] = set()
816853
for data in graph.graph.get("typedefs", []):
817854
if "id" in data:
@@ -889,7 +926,13 @@ def iterate_typedefs(
889926
missing_typedefs=missing_typedefs,
890927
)
891928
_process_replaced_by(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
892-
_process_subsets(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
929+
_process_subsets(
930+
typedef,
931+
data,
932+
ontology_prefix=ontology_prefix,
933+
strict=strict,
934+
subset_typedefs=subset_typedefs,
935+
)
893936
_process_intersection_of(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
894937
_process_union_of(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
895938
_process_equivalent_to(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
@@ -904,6 +947,7 @@ def iterate_typedefs(
904947
_process_holds_over_chain(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
905948
typedef.disjoint_over.extend(
906949
iterate_node_reference_tag(
950+
typedef,
907951
"disjoint_over",
908952
data,
909953
node=typedef.reference,
@@ -913,6 +957,7 @@ def iterate_typedefs(
913957
)
914958
typedef.transitive_over.extend(
915959
iterate_node_reference_tag(
960+
typedef,
916961
"transitive_over",
917962
data,
918963
node=typedef.reference,
@@ -926,6 +971,7 @@ def iterate_typedefs(
926971

927972
def _process_consider(stanza: Stanza, data, *, ontology_prefix: str, strict: bool = False):
928973
for reference in iterate_node_reference_tag(
974+
stanza,
929975
"consider",
930976
data,
931977
node=stanza.reference,
@@ -1349,6 +1395,7 @@ def _parse_default_prop(property_id, ontology_prefix) -> Reference | None:
13491395

13501396

13511397
def iterate_node_reference_tag(
1398+
stanza: Stanza,
13521399
tag: str,
13531400
data: Mapping[str, Any],
13541401
*,
@@ -1359,21 +1406,35 @@ def iterate_node_reference_tag(
13591406
counter: Counter[tuple[str, str]] | None = None,
13601407
) -> Iterable[Reference]:
13611408
"""Extract a list of CURIEs from the data."""
1362-
for identifier in data.get(tag, []):
1409+
for str_or_curie_or_uri in data.get(tag, []):
13631410
reference = _obo_parse_identifier(
1364-
identifier,
1411+
str_or_curie_or_uri,
13651412
strict=strict,
13661413
node=node,
13671414
ontology_prefix=ontology_prefix,
13681415
upgrade=upgrade,
13691416
counter=counter,
13701417
)
1371-
if reference is None:
1418+
if reference is not None:
1419+
yield reference
1420+
elif tag == "subset":
1421+
# this is to avoid the millions of 2:STAR and 3:STAR errors when parsing ChEBI that makes
1422+
# it take forever. In general, most of the subset identifiers are totally borked.
1423+
if str_or_curie_or_uri not in SUBSET_INVALIDS:
1424+
logger.warning(
1425+
"[%s] %s - could not parse subset identifier: %s",
1426+
stanza.curie,
1427+
tag,
1428+
str_or_curie_or_uri,
1429+
)
1430+
SUBSET_INVALIDS.add(str_or_curie_or_uri)
1431+
else:
13721432
logger.warning(
1373-
"[%s] %s - could not parse identifier: %s", ontology_prefix, tag, identifier
1433+
"[%s] %s - could not parse identifier: %s", stanza.curie, tag, str_or_curie_or_uri
13741434
)
1375-
else:
1376-
yield reference
1435+
1436+
1437+
SUBSET_INVALIDS: set[str] = set()
13771438

13781439

13791440
def _process_intersection_of(

src/pyobo/struct/struct.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -600,7 +600,7 @@ class Obo:
600600
#: A cache of terms
601601
_items: list[Term] | None = field(init=False, default=None, repr=False)
602602

603-
subsetdefs: ClassVar[list[tuple[Reference, str]] | None] = None
603+
subsetdefs: ClassVar[dict[Reference, str] | None] = None
604604

605605
property_values: ClassVar[list[Annotation] | None] = None
606606

@@ -696,7 +696,7 @@ def _get_prefixes(self) -> set[str]:
696696
prefixes.update(stanza._get_prefixes())
697697
for synonym_typedef in self.synonym_typedefs or []:
698698
prefixes.update(synonym_typedef._get_prefixes())
699-
prefixes.update(subset.prefix for subset, _ in self.subsetdefs or [])
699+
prefixes.update(subset.prefix for subset in self.subsetdefs or [])
700700
# _iterate_property_pairs covers metadata, root terms,
701701
# and properties in self.property_values
702702
prefixes.update(_get_prefixes_from_annotations(self._iterate_property_pairs()))
@@ -711,7 +711,7 @@ def _get_references(self) -> dict[str, set[Reference]]:
711711
for rr in itt.chain(self, self.typedefs or [], self.synonym_typedefs or []):
712712
for prefix, references in rr._get_references().items():
713713
rv[prefix].update(references)
714-
for subset, _ in self.subsetdefs or []:
714+
for subset in self.subsetdefs or {}:
715715
rv[subset.prefix].add(subset)
716716
# _iterate_property_pairs covers metadata, root terms,
717717
# and properties in self.property_values
@@ -881,7 +881,7 @@ def iterate_obo_lines(
881881
for imp in self.imports or []:
882882
yield f"import: {imp}"
883883
# 7
884-
for subset, subset_remark in self.subsetdefs or []:
884+
for subset, subset_remark in (self.subsetdefs or {}).items():
885885
yield f'subsetdef: {reference_escape(subset, ontology_prefix=self.ontology)} "{subset_remark}"'
886886
# 8
887887
for synonym_typedef in sorted(self.synonym_typedefs or []):
@@ -2400,7 +2400,7 @@ def build_ontology(
24002400
version: str | None = None,
24012401
idspaces: dict[str, str] | None = None,
24022402
root_terms: list[Reference] | None = None,
2403-
subsetdefs: list[tuple[Reference, str]] | None = None,
2403+
subsetdefs: dict[Reference, str] | None = None,
24042404
properties: list[Annotation] | None = None,
24052405
imports: list[str] | None = None,
24062406
description: str | None = None,
@@ -2410,6 +2410,8 @@ def build_ontology(
24102410
repository: str | None = None,
24112411
ontology_iri: str | None = None,
24122412
ontology_version_iri: str | None = None,
2413+
auto_generated_by: str | None = None,
2414+
date: datetime.datetime | None = None,
24132415
) -> Obo:
24142416
"""Build an ontology from parts."""
24152417
if name is None:
@@ -2459,10 +2461,10 @@ def build_ontology(
24592461
return make_ad_hoc_ontology(
24602462
_ontology=prefix,
24612463
_name=name,
2462-
# _auto_generated_by
2464+
_auto_generated_by=auto_generated_by,
24632465
_typedefs=typedefs,
24642466
_synonym_typedefs=synonym_typedefs,
2465-
# _date: datetime.datetime | None = None,
2467+
_date=date,
24662468
_data_version=version,
24672469
_idspaces=idspaces,
24682470
_root_terms=root_terms,
@@ -2485,7 +2487,7 @@ def make_ad_hoc_ontology(
24852487
_data_version: str | None = None,
24862488
_idspaces: Mapping[str, str] | None = None,
24872489
_root_terms: list[Reference] | None = None,
2488-
_subsetdefs: list[tuple[Reference, str]] | None = None,
2490+
_subsetdefs: dict[Reference, str] | None = None,
24892491
_property_values: list[Annotation] | None = None,
24902492
_imports: list[str] | None = None,
24912493
_ontology_iri: str | None = None,

0 commit comments

Comments
 (0)