Skip to content

Commit be34523

Browse files
authored
Update HGNC export (#486)
Motivated by biopragmatics/bioregistry#1811. This PR does the following: 1. Adds missing relations (e.g., ones that appear in the logical definitions of other ones) 2. Adds explicit labels for placeholder SO terms that haven't been reviewed yet (see The-Sequence-Ontology/SO-Ontologies#667 and The-Sequence-Ontology/SO-Ontologies#668) 3. Directly incorporates HGNC gene groups content into the HGNC export (while simultaneously addressing some version management issues for HGNC gene groups). This does not solve the more generic issue that PyOBO exports OFN and OWL products using IRIs instead of stringified CURIEs in annotations. This means that if the IRI isn't defined, then Protege tries its best to guess a label, which pollutes the class list when exploring. - protegeproject/protege#1157 - protegeproject/protege#1321 - protegeproject/protege#1322 - protegeproject/protege#1323
1 parent 9ac001c commit be34523

File tree

4 files changed

+62
-50
lines changed

4 files changed

+62
-50
lines changed

src/pyobo/resources/so.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,12 @@
2121

2222
def get_so_name(so_id: str) -> str | None:
2323
"""Get the name from the identifier."""
24+
if so_id == "0003002":
25+
# see https://github.com/The-Sequence-Ontology/SO-Ontologies/pull/668
26+
return "viral integration site"
27+
if so_id == "0003001":
28+
# see https://github.com/The-Sequence-Ontology/SO-Ontologies/pull/667
29+
return "cluster RNA gene"
2430
return load_so().get(so_id)
2531

2632

src/pyobo/sources/hgnc/hgnc.py

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -14,27 +14,25 @@
1414

1515
from pyobo.api.utils import get_version
1616
from pyobo.resources.so import get_so_name
17-
from pyobo.struct import (
18-
Annotation,
19-
Obo,
20-
OBOLiteral,
21-
Reference,
22-
Term,
23-
from_species,
24-
has_gene_product,
25-
is_mentioned_by,
26-
member_of,
27-
orthologous,
28-
transcribes_to,
29-
)
17+
from pyobo.sources.hgnc.hgncgenefamily import GENE_GROUP_PREFIX, get_gene_family_terms
18+
from pyobo.struct import Annotation, Obo, OBOLiteral, Reference, Term
3019
from pyobo.struct.struct import gene_symbol_synonym, previous_gene_symbol, previous_name
3120
from pyobo.struct.typedef import (
3221
comment,
22+
enables,
3323
ends,
3424
exact_match,
25+
from_species,
3526
gene_product_enables,
27+
gene_product_of,
28+
has_gene_product,
29+
has_member,
30+
is_mentioned_by,
3631
located_in,
32+
member_of,
33+
orthologous,
3734
starts,
35+
transcribes_to,
3836
)
3937
from pyobo.utils.path import ensure_path
4038

@@ -137,6 +135,7 @@
137135
"homeodb", # TODO add to bioregistry, though this is defunct
138136
"mamit-trnadb", # TODO add to bioregistry, though this is defunct
139137
"mane_select", # TODO
138+
"gene_group", # gene_group_id is needed, but this just has label
140139
}
141140

142141
#: A mapping from HGNC's locus_type annotations to sequence ontology identifiers
@@ -195,8 +194,10 @@ class HGNCGetter(Obo):
195194
typedefs = [
196195
from_species,
197196
has_gene_product,
197+
gene_product_of,
198198
gene_product_enables,
199199
transcribes_to,
200+
has_member,
200201
orthologous,
201202
member_of,
202203
exact_match,
@@ -205,6 +206,7 @@ class HGNCGetter(Obo):
205206
starts,
206207
ends,
207208
comment,
209+
enables,
208210
]
209211
synonym_typedefs = [
210212
previous_name,
@@ -262,6 +264,8 @@ def get_terms(version: str | None = None, force: bool = False) -> Iterable[Term]
262264
for so_id in sorted(_so_ids)
263265
]
264266

267+
yield from get_gene_family_terms(version=version, force=force)
268+
265269
statuses = set()
266270
for entry in tqdm(entries, desc=f"Mapping {PREFIX}", unit="gene", unit_scale=True):
267271
name, symbol, identifier = (
@@ -393,15 +397,10 @@ def get_terms(version: str | None = None, force: bool = False) -> Iterable[Term]
393397
term.append_mentioned_by(Reference(prefix="pubmed", identifier=str(pubmed_id)))
394398

395399
gene_group_ids = entry.pop("gene_group_id", [])
396-
gene_groups = entry.pop("gene_group", [])
397-
for gene_group_id, gene_group_label in zip(gene_group_ids, gene_groups, strict=False):
400+
for gene_group_id in gene_group_ids:
398401
term.append_relationship(
399402
member_of,
400-
Reference(
401-
prefix="hgnc.genegroup",
402-
identifier=str(gene_group_id),
403-
name=gene_group_label,
404-
),
403+
Reference(prefix=GENE_GROUP_PREFIX, identifier=str(gene_group_id)),
405404
)
406405

407406
for alias_symbol in entry.pop("alias_symbol", []):

src/pyobo/sources/hgnc/hgncgenefamily.py

Lines changed: 36 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,20 @@
55

66
import pandas as pd
77

8-
from ...struct import Obo, Reference, Term, is_mentioned_by
8+
from ...api.utils import get_version
9+
from ...struct.struct import Obo, Reference, Term
910
from ...struct.struct import abbreviation as symbol_type
10-
from ...struct.typedef import enables, exact_match, from_species
11+
from ...struct.typedef import enables, exact_match, from_species, is_mentioned_by
1112
from ...utils.path import ensure_df
1213

1314
__all__ = [
15+
"GENE_GROUP_REFERENCE",
16+
"GENE_GROUP_TERM",
1417
"HGNCGroupGetter",
18+
"get_gene_family_terms",
1519
]
1620

17-
PREFIX = "hgnc.genegroup"
21+
GENE_GROUP_PREFIX = "hgnc.genegroup"
1822
FAMILIES_URL = "https://storage.googleapis.com/public-download-files/hgnc/csv/csv/genefamily_db_tables/family.csv"
1923
FAMILIES_ALIAS_URL = "https://storage.googleapis.com/public-download-files/hgnc/csv/csv/genefamily_db_tables/family_alias.csv"
2024
HIERARCHY_URL = "https://storage.googleapis.com/public-download-files/hgnc/csv/csv/genefamily_db_tables/hierarchy.csv"
@@ -23,19 +27,21 @@
2327
class HGNCGroupGetter(Obo):
2428
"""An ontology representation of HGNC's gene group nomenclature."""
2529

26-
ontology = PREFIX
30+
ontology = GENE_GROUP_PREFIX
2731
bioversions_key = "hgnc"
2832
synonym_typedefs = [symbol_type]
2933
typedefs = [from_species, enables, exact_match, is_mentioned_by]
3034

3135
def iter_terms(self, force: bool = False) -> Iterable[Term]:
3236
"""Iterate over terms in the ontology."""
33-
return get_terms(force=force)
37+
return get_gene_family_terms(force=force)
3438

3539

36-
def get_hierarchy(force: bool = False) -> Mapping[str, list[str]]:
40+
def get_hierarchy(*, version: str | None = None, force: bool = False) -> Mapping[str, list[str]]:
3741
"""Get the HGNC Gene Families hierarchy as a dictionary."""
38-
df = ensure_df(PREFIX, url=HIERARCHY_URL, force=force, sep=",")
42+
if version is None:
43+
version = get_version("hgnc")
44+
df = ensure_df(GENE_GROUP_PREFIX, url=HIERARCHY_URL, force=force, sep=",", version=version)
3945
d = defaultdict(list)
4046
for parent_id, child_id in df.values:
4147
d[child_id].append(parent_id)
@@ -44,51 +50,51 @@ def get_hierarchy(force: bool = False) -> Mapping[str, list[str]]:
4450

4551
COLUMNS = ["id", "abbreviation", "name", "pubmed_ids", "desc_comment", "desc_go"]
4652

53+
GENE_GROUP_REFERENCE = Reference(prefix="SO", identifier="0005855", name="gene group")
54+
GENE_GROUP_TERM = Term(reference=GENE_GROUP_REFERENCE)
4755

48-
def get_terms(force: bool = False) -> Iterable[Term]:
56+
57+
def get_gene_family_terms(*, version: str | None = None, force: bool = False) -> Iterable[Term]:
4958
"""Get the HGNC Gene Group terms."""
50-
terms = list(_get_terms_helper(force=force))
51-
hierarchy = get_hierarchy(force=force)
59+
if version is None:
60+
version = get_version("hgnc")
61+
62+
terms = list(_get_terms_helper(force=force, version=version))
63+
hierarchy = get_hierarchy(force=force, version=version)
5264

53-
id_to_term = {term.reference.identifier: term for term in terms}
65+
id_to_term = {term.identifier: term for term in terms}
5466
for child_id, parent_ids in hierarchy.items():
5567
child: Term = id_to_term[child_id]
5668
for parent_id in parent_ids:
57-
parent: Term = id_to_term[parent_id]
58-
child.append_parent(
59-
Reference(
60-
prefix=PREFIX,
61-
identifier=parent_id,
62-
name=parent.name,
63-
)
64-
)
65-
gene_group = Reference(prefix="SO", identifier="0005855", name="gene group")
66-
yield Term(reference=gene_group)
69+
child.append_parent(id_to_term[parent_id])
70+
yield GENE_GROUP_TERM
6771
for term in terms:
6872
if not term.parents:
69-
term.append_parent(gene_group)
73+
term.append_parent(GENE_GROUP_REFERENCE)
7074
yield from terms
7175

7276

73-
def _get_terms_helper(force: bool = False) -> Iterable[Term]:
74-
alias_df = ensure_df(PREFIX, url=FAMILIES_ALIAS_URL, force=force, sep=",")
77+
def _get_terms_helper(version: str, force: bool = False) -> Iterable[Term]:
78+
alias_df = ensure_df(
79+
GENE_GROUP_PREFIX, url=FAMILIES_ALIAS_URL, force=force, sep=",", version=version
80+
)
7581
aliases = defaultdict(set)
7682
for _id, family_id, alias in alias_df.values:
7783
aliases[family_id].add(alias)
7884

79-
df = ensure_df(PREFIX, url=FAMILIES_URL, force=force, sep=",")
85+
df = ensure_df(GENE_GROUP_PREFIX, url=FAMILIES_URL, force=force, sep=",", version=version)
8086
for gene_group_id, symbol, name, pubmed_ids, definition, desc_go in df[COLUMNS].values:
8187
if not definition or pd.isna(definition):
8288
definition = None
8389
term = Term(
84-
reference=Reference(prefix=PREFIX, identifier=gene_group_id, name=name),
90+
reference=Reference(prefix=GENE_GROUP_PREFIX, identifier=gene_group_id, name=name),
8591
definition=definition,
8692
)
8793
if pubmed_ids and pd.notna(pubmed_ids):
88-
for s in pubmed_ids.replace(" ", ",").split(","):
89-
s = s.strip()
90-
if s:
91-
term.append_mentioned_by(Reference(prefix="pubmed", identifier=s))
94+
for pubmed_id in pubmed_ids.replace(" ", ",").split(","):
95+
pubmed_id = pubmed_id.strip()
96+
if pubmed_id:
97+
term.append_mentioned_by(Reference(prefix="pubmed", identifier=pubmed_id))
9298
if desc_go and pd.notna(desc_go):
9399
go_id = desc_go[len("http://purl.uniprot.org/go/") :]
94100
term.append_relationship(enables, Reference(prefix="GO", identifier=go_id))

src/pyobo/struct/typedef.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
"is_antagonist_of",
5656
"is_defined_by",
5757
"is_inverse_agonist_of",
58+
"is_mentioned_by",
5859
"located_in",
5960
"mapping_has_confidence",
6061
"mapping_has_justification",

0 commit comments

Comments
 (0)