Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions src/pyobo/identifier_utils/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
from __future__ import annotations

import logging
from collections.abc import Callable
from functools import lru_cache, wraps
from typing import Annotated, ClassVar
from typing import Annotated, ClassVar, ParamSpec, TypeVar

import bioregistry
import click
Expand Down Expand Up @@ -266,7 +267,11 @@ def _parse_str_or_curie_or_uri_helper(
return rv


def wrap_norm_prefix(f):
S = ParamSpec("S")
T = TypeVar("T")


def wrap_norm_prefix(f: Callable[S, T]) -> Callable[S, T]:
"""Decorate a function that take in a prefix to auto-normalize, or return None if it can't be normalized."""

@wraps(f)
Expand Down
239 changes: 239 additions & 0 deletions src/pyobo/sources/plastchem.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
"""Import PlastChem."""

from collections import Counter
from collections.abc import Iterable
from typing import Any

import pandas as pd
from tabulate import tabulate
from tqdm import tqdm

from pyobo import Obo, get_grounder
from pyobo.struct import Reference, Term, TypeDef, default_reference
from pyobo.struct.typedef import (
exact_match,
has_canonical_smiles,
has_inchi,
has_isomeric_smiles,
has_role,
)
from pyobo.utils.path import ensure_path

__all__ = ["PlastChemGetter"]

PREFIX = "plastchem"
URL = "https://zenodo.org/records/10701706/files/plastchem_db_v1.0.xlsx?download=1"
VERSION = "1.0"

# Hazard list name -> short description. See page 45 of the report for explanation.
# Only the keys are consumed in this module (to build HAZARD_LIST_REFERENCES);
# the descriptions are kept as documentation.
HAZARD_LISTS: dict[str, str] = {
    # The Red List contains the 3651 chemicals of concern that are currently not
    # regulated internationally. These chemicals are hazardous according to
    # well-established criteria (one or more hazard criteria) and should be regulated.
    "Red": "chemicals of concern",
    # The Orange List covers 1168 chemicals that have been classified as less
    # hazardous (e.g., carcinogenic, mutagenic category 2). They may be further
    # watched, as additional hazard traits may be identified.
    "Orange": "less hazardous",
    # For the 28 chemicals on the Watch List, a hazard evaluation is currently under
    # development or inconclusive. Similar to the Orange List, it includes chemicals
    # that have potential to become chemicals of concern once fully assessed.
    "Watch": "under assessment",
    # The chemicals on the White List are classified as not hazardous but their
    # hazard profiles are incomplete. While there is some level of evidence that
    # White List chemicals are not of concern, the incomplete hazard assessment
    # warrants prioritization for further evaluation to provide a complete hazard
    # profile.
    "White": "not hazardous",
    # The largest list, the Grey List, includes 10 345 plastic chemicals without
    # hazard information. Those chemicals constitute the biggest knowledge gap as
    # their hazard properties are unknown based on the authoritative sources
    # consulted. In the absence of this information, no regulatory action is
    # possible at this point.
    "Grey": "no hazard data",
    "MEA": "regulated globally",  # Basel, Stockholm, Minamata
}
# One default reference per hazard list, keyed by "<list name>_list" (e.g. "Red_list");
# the same string is used as the local identifier of the reference.
HAZARD_LIST_REFERENCES = {
    f"{listn}_list": default_reference(PREFIX, f"{listn}_list") for listn in HAZARD_LISTS
}

# Parent term under which get_terms() places every hazard-list term.
HAZARD_LIST_ROOT = default_reference(PREFIX, "list")

# Custom "onList" typedef. It is declared on the ontology class but get_terms()
# does not yet attach it to any chemical (see the "TODO add hazard lists?" there).
TYPEDEF = TypeDef(reference=default_reference(PREFIX, "onList"))


class PlastChemGetter(Obo):
    """An ontology representation of PlastChem."""

    ontology = PREFIX
    static_version = VERSION
    # The ``Obo`` base class reads the plural ``typedefs`` attribute; the
    # singular spelling ``typedef`` is not picked up, which would leave every
    # property used by the terms undeclared in the exported ontology.
    typedefs = [
        TYPEDEF,
        has_inchi,
        has_canonical_smiles,
        has_isomeric_smiles,
        exact_match,
        has_role,
    ]

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Iterate over terms in the ontology.

        :param force: Accepted for compatibility with the base class signature.
            It is currently not forwarded to :func:`get_terms`, so it does not
            trigger a re-download of the source spreadsheet.
        :returns: The hazard-list terms followed by one term per chemical.
        """
        return get_terms()


def _ground_with_plural_fallback(grounder: Any, text: str) -> Any:
    """Return the grounder's best match for ``text``, retrying with trailing "s" characters stripped."""
    return grounder.get_best_match(text) or grounder.get_best_match(text.rstrip("s"))


def get_terms() -> Iterable[Term]:
    """Yield the hazard-list terms, then one term per PlastChem chemical.

    The hazard-list root and its children are emitted first. Afterwards the
    "Full database" sheet of the PlastChem spreadsheet is read and each row
    that has a ``plastchem_ID`` becomes a term carrying:

    - a name taken from ``pubchem_name``, falling back to ``iupac_name``
    - exact matches to CAS, PubChem, and InChIKey where available
    - canonical/isomeric SMILES and InChI string annotations
    - ``has_role`` relationships for every harmonized function that has an
      entry in ``CHEBI_ROLE_MAP``

    Once all rows have been consumed, two diagnostic tables are written with
    :func:`tqdm.write`: the ECHA groupings together with their ChEBI grounding,
    and the harmonized functions that have no ChEBI role mapping yet.

    :returns: An iterable of terms; it is a generator, so the spreadsheet is
        only loaded once iteration moves past the hazard-list terms.
    """
    yield Term(reference=HAZARD_LIST_ROOT)
    for hazard_list_reference in HAZARD_LIST_REFERENCES.values():
        term = Term(reference=hazard_list_reference)
        term.append_parent(HAZARD_LIST_ROOT)
        yield term

    echa_counter: Counter[str] = Counter()
    # ECHA grouping -> (ChEBI CURIE, ChEBI name), or None when grounding failed.
    # Failures are cached as well so each grouping is grounded exactly once
    # instead of once per row it appears in.
    echa_examples: dict[str, tuple[Any, Any] | None] = {}
    function_counter: Counter[str] = Counter()
    # harmonized function -> (ChEBI CURIE, ChEBI name) of one example chemical
    function_examples: dict[str, tuple[Any, Any]] = {}
    chebi_grounder = get_grounder("chebi")

    path = ensure_path(PREFIX, url=URL, version=VERSION)
    df = pd.read_excel(path, sheet_name="Full database", dtype=str, skiprows=1)
    # TODO group by CAS number and add alt-id annotations
    for _, row in df.iterrows():
        if pd.isna(row["plastchem_ID"]):
            continue

        # Prefer the PubChem name, then the IUPAC name; otherwise leave unnamed.
        name: str | None = None
        for name_column in ("pubchem_name", "iupac_name"):
            if pd.notna(row[name_column]):
                name = row[name_column]
                break
        term = Term.from_triple(PREFIX, row["plastchem_ID"], name)

        cas = row.pop("cas")
        cas_fixed = row.pop("cas_fixed")
        # A CAS xref is only added when both columns are populated.
        # NOTE(review): the raw ``cas`` value is used even when it differs from
        # ``cas_fixed`` (modulo a leading apostrophe) - confirm which column
        # should take precedence.
        if pd.notna(cas_fixed) and pd.notna(cas):
            term.append_exact_match(Reference(prefix="cas", identifier=cas))

        if pd.notna(pubchem_id := row.pop("pubchem_cid")):
            term.append_exact_match(Reference(prefix="pubchem", identifier=pubchem_id))

        if pd.notna(canonical_smiles := row.pop("canonical_smiles")):
            term.annotate_string(has_canonical_smiles, canonical_smiles)
        if pd.notna(isomeric_smiles := row.pop("isomeric_smiles")):
            term.annotate_string(has_isomeric_smiles, isomeric_smiles)
        if pd.notna(inchi := row.pop("inchi")):
            term.annotate_string(has_inchi, inchi)
        if pd.notna(inchikey := row.pop("inchikey")):
            term.append_exact_match(Reference(prefix="inchikey", identifier=inchikey))

        if pd.notna(echa_grouping := row.pop("ECHA_grouping")):
            echa_counter[echa_grouping] += 1
            if echa_grouping not in echa_examples:
                match = _ground_with_plural_fallback(chebi_grounder, echa_grouping)
                echa_examples[echa_grouping] = (match.curie, match.name) if match else None

        # TODO add hazard lists?

        # NIAS means non-intentionally added substance
        for raw_function in _get_sep(row, "Harmonized_functions"):
            # Normalize to the key style used by CHEBI_ROLE_MAP
            func = raw_function.replace("_", " ").lower()
            if role := CHEBI_ROLE_MAP.get(func):
                term.append_relationship(has_role, role)

            function_counter[func] += 1
            # Remember the first groundable chemical as an example for this function
            if func not in function_examples and name is not None:
                match = _ground_with_plural_fallback(chebi_grounder, name)
                # chebi:15702 is explicitly excluded as an example
                # (presumably a known spurious grounding - TODO confirm)
                if match and match.curie != "chebi:15702":
                    function_examples[func] = match.curie, match.name

        # TODO ECHA_grouping
        # TODO ground to chebi:
        #  - Harmonized_functions
        #  - original_function_plasticmap
        #  - original_function_cpp
        #  - original_primary_function_aurisano
        #  - original_other_function_aurisano
        #  - industrial_sector_plasticmap

        yield term

    # Diagnostics: ECHA groupings with the grounding cached above, which
    # includes matches found through the plural fallback.
    tqdm.write(
        tabulate(
            [
                (
                    echa_name,
                    example[0] if (example := echa_examples.get(echa_name)) else None,
                    count,
                )
                for echa_name, count in echa_counter.most_common()
            ],
            headers=["ECHA", "chebi", "count"],
        )
    )
    tqdm.write("")

    # Diagnostics: functions without a ChEBI role mapping. The "chebi" column
    # is therefore always empty; it is kept so the table layout stays stable.
    unmapped_rows = [
        (
            function_name,
            "",
            *function_examples.get(function_name, (None, None)),
            count,
        )
        for function_name, count in function_counter.most_common()
        if not CHEBI_ROLE_MAP.get(function_name)
    ]
    tqdm.write(
        tabulate(
            unmapped_rows,
            headers=["function", "chebi", "example_chebi", "example_chebi_name", "count"],
        )
    )


# Harmonized function label -> ChEBI role reference.
# Keys are matched against PlastChem's "Harmonized_functions" values after
# get_terms() has replaced underscores with spaces and lower-cased them.
# A value of None marks a function that is known but intentionally left
# unmapped; lookups treat it the same as a missing key.
CHEBI_ROLE_MAP: dict[str, Reference | None] = {
    "plasticizer": Reference.from_curie("CHEBI:79056", name="plasticiser"),
    "catalyst": Reference.from_curie("CHEBI:35223", name="catalyst"),
    "monomer": Reference.from_curie("CHEBI:74236", name="polymerization monomer"),
    "antioxidant": Reference.from_curie("CHEBI:22586", name="antioxidant"),
    "flame retardant": Reference.from_curie("CHEBI:79314"),
    "blowing agent": Reference.from_curie("CHEBI:747328"),
    "filler": Reference.from_curie("CHEBI:747333"),
    "stabilizer": Reference.from_curie("CHEBI:747331"),
    # colorant and pigment currently share the same ChEBI reference
    "colorant": Reference.from_curie("CHEBI:37958"),  # TODO add synonym
    "pigment": Reference.from_curie("CHEBI:37958"),  # TODO add synonym
    "lubricant": Reference.from_curie("CHEBI:747329"),
    "biocide": Reference.from_curie("CHEBI:33281"),  # TODO add synonym
    "solvent": Reference.from_curie("CHEBI:46787"),
    "emulsifier": Reference.from_curie("CHEBI:63046"),
    "surfactant": Reference.from_curie("CHEBI:35195"),
    "anti-fog additive": Reference.from_curie("CHEBI:747327"),
    "other processing aids": Reference.from_curie(
        "CHEBI:747334"
    ),  # this is the super class for processing aid
    "antistatic agent": Reference.from_curie("CHEBI:747335"),
    "adhesive": Reference.from_curie("CHEBI:747337"),
    "unspecified additive": Reference.from_curie("CHEBI:747326"),  # parent class
    "heat stabilizer": Reference.from_curie("CHEBI:747338"),
    "light stabilizer": Reference.from_curie("CHEBI:747339"),
    "viscosity modifier": Reference.from_curie("CHEBI:747340"),
    "impact modifier": Reference.from_curie("CHEBI:747341"),
    "initiator": Reference.from_curie("CHEBI:747342"),
    "crosslinking agent": Reference.from_curie("CHEBI:50684"),
    "odor agent": Reference.from_curie("CHEBI:747343"),
    "impurity": Reference.from_curie("CHEBI:143130"),
    "ultraviolet-absorbing agent": Reference.from_curie("CHEBI:73335"),
    "polymerization aid": Reference.from_curie("CHEBI:747345"),
    # Non-intentionally added substances (NIAS)
    "nias": Reference.from_curie("CHEBI:747346"),
    # the following map up to NIAS
    "intermediate": Reference.from_curie("CHEBI:747346"),
    "degradation product": Reference.from_curie("CHEBI:747346"),
    # TODO not sure how to model this. other starting
    #  substances are initiator and monomer
    "unspecified raw material": None,
}


def _get_sep(row: dict[str, Any], key: str) -> list[str]:
if pd.notna(row[key]):
return row[key].split(";")
return []


if __name__ == "__main__":
    # Script entry point: invoke the getter's CLI with the --owl, --obo and
    # --force flags.
    PlastChemGetter.cli(["--owl", "--obo", "--force"])
10 changes: 10 additions & 0 deletions src/pyobo/struct/typedef.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
"from_species",
"gene_product_enables",
"gene_product_member_of",
"has_canonical_smiles",
"has_contributor",
"has_creator",
"has_curation_status",
Expand All @@ -37,6 +38,7 @@
"has_gene_product",
"has_homepage",
"has_inchi",
"has_isomeric_smiles",
"has_mailbox",
"has_mature",
"has_member",
Expand Down Expand Up @@ -329,6 +331,14 @@
)

has_smiles = TypeDef(reference=v.has_smiles, is_metadata_tag=True).append_xref(v.debio_has_smiles)
# The canonical and isomeric variants are metadata tags like ``has_smiles`` and
# carry the same cross-reference to the generic debio "has SMILES" property.
has_canonical_smiles = TypeDef(reference=v.has_canonical_smiles, is_metadata_tag=True).append_xref(
    v.debio_has_smiles
)
# https://chemkg.github.io/chemrof/isomeric_smiles_string/
has_isomeric_smiles = TypeDef(reference=v.has_isomeric_smiles, is_metadata_tag=True).append_xref(
    v.debio_has_smiles
)

has_inchi = TypeDef(reference=v.has_inchi, is_metadata_tag=True).append_xref(v.debio_has_inchi)

Expand Down
2 changes: 2 additions & 0 deletions src/pyobo/struct/vocabulary.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ def _c(c: curies.Reference) -> Reference:

debio_has_smiles = Reference(prefix="debio", identifier="0000022", name="has SMILES")
has_smiles = Reference(prefix="chemrof", identifier="smiles_string")
# More specific chemrof identifiers for the canonical and isomeric SMILES variants
has_canonical_smiles = Reference(prefix="chemrof", identifier="canonical_smiles_string")
has_isomeric_smiles = Reference(prefix="chemrof", identifier="isomeric_smiles_string")

is_mentioned_by = Reference(prefix="mito", identifier="isMentionedBy", name="is mentioned by")
mentions = Reference(prefix="mito", identifier="mentions", name="mentions")
Expand Down
Loading