Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 8 additions & 19 deletions metpo/analysis/analyze_ontology_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,15 @@
3. Redundant sources that could be removed
"""

import re
from collections import defaultdict

import click
import pandas as pd

from metpo.utils.sssom_utils import extract_prefix, parse_sssom_curie_map

def extract_term_source(iri: str) -> str:

def extract_term_source(iri: str, curie_map: dict[str, str] | None = None) -> str:
"""
Extract the defining ontology from an IRI.

Expand All @@ -27,22 +28,9 @@ def extract_term_source(iri: str) -> str:
- http://purl.obolibrary.org/obo/GO_0008150 → GO
- https://w3id.org/biolink/vocab/phenotype → biolink
"""
# OBO pattern: .../obo/PREFIX_ID
obo_match = re.search(r"/obo/([A-Za-z]+)_\d+", iri)
if obo_match:
return obo_match.group(1).upper()

# Biolink pattern
if "biolink" in iri:
return "biolink"

# DOI pattern
if "doi.org" in iri:
doi_match = re.search(r"doi\.org/[^/]+/([A-Za-z]+)", iri)
if doi_match:
return doi_match.group(1)

# Default: return as-is
extracted = extract_prefix(iri, curie_map)
if extracted:
return extracted.upper()
return "unknown"


Expand All @@ -53,9 +41,10 @@ def main(input):

print(f"Loading mappings from: {input}")
df = pd.read_csv(input, sep="\t", comment="#")
curie_map = parse_sssom_curie_map(input)

# Extract term source from IRI
df["term_source"] = df["object_id"].apply(extract_term_source)
df["term_source"] = df["object_id"].apply(lambda x: extract_term_source(x, curie_map))

print(f"\nLoaded {len(df)} mappings")
print(f"Unique ontology files (object_source): {df['object_source'].nunique()}")
Expand Down
33 changes: 12 additions & 21 deletions metpo/analysis/extract_definitions_from_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

import pandas as pd

from metpo.utils.sssom_utils import extract_prefix, parse_sssom_curie_map

# Configuration
SSSOM_FILE = "../../data/mappings/metpo_mappings_combined_relaxed.sssom.tsv"
METPO_SHEET = "../../src/templates/metpo_sheet.tsv"
Expand Down Expand Up @@ -48,26 +50,14 @@ def parse_object_label(label: str) -> tuple[str, str | None]:
return label.strip(), None


def extract_ontology_prefix(iri: str) -> str:
"""Extract ontology prefix from IRI."""
# Handle obo format
if "obo/" in iri:
match = re.search(r"/obo/([A-Z]+)_", iri)
if match:
return match.group(1)

# Handle DOI format
if "doi.org" in iri:
return "DOI"

# Handle other formats
if "biolink" in iri.lower():
return "BIOLINK"
if "dsmz" in iri.lower():
return "D3O"
if "mdatahub" in iri.lower():
return "MEO"

def extract_ontology_prefix(
    iri: str,
    curie_map: dict[str, str] | None = None,
) -> str:
    """Return the ontology prefix for a CURIE/IRI, upper-cased, else ``UNKNOWN``."""
    prefix = extract_prefix(iri, curie_map)
    return prefix.upper() if prefix else "UNKNOWN"


Expand Down Expand Up @@ -114,6 +104,7 @@ def load_sssom_mappings() -> pd.DataFrame:

# Read from data start
df = pd.read_csv(SSSOM_FILE, sep="\t", skiprows=data_start)
curie_map = parse_sssom_curie_map(SSSOM_FILE)

# Extract distance from comment
df["distance"] = df["comment"].apply(extract_distance_from_comment)
Expand All @@ -124,7 +115,7 @@ def load_sssom_mappings() -> pd.DataFrame:
)

# Extract ontology prefix
df["ontology"] = df["object_id"].apply(extract_ontology_prefix)
df["ontology"] = df["object_id"].apply(lambda x: extract_ontology_prefix(x, curie_map))

return df

Expand Down
24 changes: 19 additions & 5 deletions metpo/pipeline/chromadb_semantic_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,14 @@
from dotenv import load_dotenv
from tqdm import tqdm

from metpo.utils.sssom_utils import normalize_object_id

CORE_CURIE_MAP = {
"METPO": "http://purl.obolibrary.org/obo/METPO_",
"skos": "http://www.w3.org/2004/02/skos/core#",
"semapv": "https://w3id.org/semapv/vocab/",
}


def load_metpo_terms(
tsv_path: str,
Expand Down Expand Up @@ -178,12 +186,18 @@ def write_sssom_output(
else:
filtered = [m for m in matches if m["distance"] <= max_distance]

normalized_rows: list[dict] = []
curie_map = dict(CORE_CURIE_MAP)
for m in filtered:
normalized_object_id, object_prefixes = normalize_object_id(str(m["match_iri"]))
curie_map.update(object_prefixes)
normalized_rows.append({**m, "normalized_object_id": normalized_object_id})

with Path(output_path).open("w", encoding="utf-8", newline="") as f:
# Write metadata block
f.write("# curie_map:\n")
f.write("# METPO: http://purl.obolibrary.org/obo/METPO_\n")
f.write("# skos: http://www.w3.org/2004/02/skos/core#\n")
f.write("# semapv: https://w3id.org/semapv/vocab/\n")
for prefix in sorted(curie_map):
f.write(f"# {prefix}: {curie_map[prefix]}\n")
f.write(
f"# mapping_set_id: metpo-ontology-mappings-{datetime.now(UTC).date().isoformat()}\n"
)
Expand Down Expand Up @@ -264,14 +278,14 @@ def write_sssom_output(
]
)

for m in filtered:
for m in normalized_rows:
similarity = 1.0 - (m["distance"] / 2.0)
writer.writerow(
[
m["metpo_id"],
m["metpo_label"],
similarity_to_predicate(similarity),
m["match_iri"],
m["normalized_object_id"],
m["match_document"],
"semapv:SemanticSimilarityThresholdMatching",
f"{similarity:.6f}",
Expand Down
26 changes: 8 additions & 18 deletions metpo/presentations/analyze_primary_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from collections import Counter
from pathlib import Path

from metpo.utils.sssom_utils import extract_prefix, parse_sssom_curie_map

# PRIMARY SOURCE 1: ChromaDB databases
CHROMA_COMBINED = Path("/home/mark/gitrepos/metpo/notebooks/chroma_combined/chroma.sqlite3")
CHROMA_OLS_20 = Path("/home/mark/gitrepos/metpo/notebooks/chroma_ols_20/chroma.sqlite3")
Expand Down Expand Up @@ -65,30 +67,18 @@ def analyze_sssom(sssom_path):
target_prefixes = Counter()
predicates = Counter()
total_mappings = 0
curie_map = parse_sssom_curie_map(sssom_path)

with Path(sssom_path).open() as f:
reader = csv.DictReader(f, delimiter="\t")
data_lines = (line for line in f if not line.startswith("#"))
reader = csv.DictReader(data_lines, delimiter="\t")
for row in reader:
total_mappings += 1

# Extract prefix from object_id
obj_id = row.get("object_id", "")
if "/obo/" in obj_id:
# e.g., http://purl.obolibrary.org/obo/PATO_0000001
prefix = obj_id.split("/obo/")[1].split("_")[0] if "_" in obj_id else ""
if prefix:
target_prefixes[prefix] += 1
elif "doi.org/10.1601" in obj_id:
target_prefixes["N4L"] += 1
elif "purl.dsmz.de" in obj_id:
if "d3o" in obj_id.lower():
target_prefixes["D3O"] += 1
elif "miso" in obj_id.lower():
target_prefixes["MISO"] += 1
elif "mdatahub.org" in obj_id.lower():
target_prefixes["MEO"] += 1
elif "biolink" in obj_id.lower():
target_prefixes["BIOLINK"] += 1
prefix = extract_prefix(row.get("object_id", ""), curie_map)
if prefix:
target_prefixes[prefix.upper()] += 1

pred = row.get("predicate_id", "")
predicates[pred] += 1
Expand Down
1 change: 1 addition & 0 deletions metpo/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Shared utilities for METPO scripts."""
149 changes: 149 additions & 0 deletions metpo/utils/sssom_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
"""Helpers for parsing SSSOM metadata and identifiers."""

import re
from pathlib import Path

# Matches a single curie_map entry in an SSSOM comment header, e.g.
# "#   GO: http://purl.obolibrary.org/obo/GO_" — requires two or more
# whitespace characters after "#", capturing (prefix, expansion).
CURIE_MAP_LINE = re.compile(r"^#\s{2,}([A-Za-z][\w.-]*):\s*(\S+)\s*$")

# Fallback prefix -> IRI-base table used when no parsed curie_map is supplied.
KNOWN_IRI_PREFIXES = {
    "biolink": "https://w3id.org/biolink/vocab/",
    "d3o": "https://purl.dsmz.de/schema/",
    "doi": "http://doi.org/",  # canonical http form; https variant handled separately
    "bipon": "http://www.semanticweb.org/BiPON/",
}


def strip_angle_brackets(identifier: str) -> str:
    """Return the identifier with surrounding ``<...>`` angle brackets removed."""
    trimmed = str(identifier).strip()
    if len(trimmed) >= 2 and trimmed[0] == "<" and trimmed[-1] == ">":
        # Strip the brackets, then any whitespace they enclosed.
        trimmed = trimmed[1:-1].strip()
    return trimmed


def parse_sssom_curie_map(sssom_path: str | Path) -> dict[str, str]:
"""Parse ``# curie_map`` prefixes from an SSSOM TSV header."""
curie_map: dict[str, str] = {}
in_curie_map = False

with Path(sssom_path).open(encoding="utf-8") as handle:
for line in handle:
if not line.startswith("#"):
break

text = line.rstrip("\n")
if text.strip() == "# curie_map:":
in_curie_map = True
continue

if in_curie_map:
# End curie_map block at first normal metadata line like:
# "# mapping_set_id: ...".
if text.startswith("# ") and not text.startswith("# "):
in_curie_map = False
continue

match = CURIE_MAP_LINE.match(text)
if match:
prefix, expansion = match.groups()
curie_map[prefix] = expansion
continue

# Any non-matching comment line ends the curie_map block.
in_curie_map = False

return curie_map


def iri_to_curie(
iri: str, known_iri_prefixes: dict[str, str] | None = None
) -> tuple[str, str] | None:
"""Convert common IRI forms to CURIEs when safely possible."""
prefixes = known_iri_prefixes or KNOWN_IRI_PREFIXES

# OBO-style compact form, e.g. http://purl.obolibrary.org/obo/GO_0008152 -> GO:0008152
if iri.startswith("http://purl.obolibrary.org/obo/"):
local = iri.removeprefix("http://purl.obolibrary.org/obo/")
if "_" in local:
prefix, suffix = local.split("_", 1)
if prefix and suffix:
return f"{prefix}:{suffix}", f"http://purl.obolibrary.org/obo/{prefix}_"
return None

for prefix, base in prefixes.items():
if iri.startswith(base):
local = iri.removeprefix(base)
if local:
return f"{prefix}:{local}", base

# Accept https variant for doi
if iri.startswith("https://doi.org/"):
local = iri.removeprefix("https://doi.org/")
doi_base = prefixes.get("doi", "http://doi.org/")
if local:
return f"doi:{local}", doi_base

return None


def normalize_object_id(
    raw_identifier: str,
    known_iri_prefixes: dict[str, str] | None = None,
) -> tuple[str, dict[str, str]]:
    """Normalize mapping object id to CURIE where possible, else plain IRI (no <>).

    Returns the normalized identifier together with a (possibly empty)
    ``{prefix: expansion}`` mapping for curie_map bookkeeping.
    """
    identifier = strip_angle_brackets(raw_identifier)
    if not identifier:
        return identifier, {}

    curie_result = iri_to_curie(identifier, known_iri_prefixes=known_iri_prefixes)
    if curie_result is None:
        # No safe CURIE normalization available; keep the bare IRI.
        return identifier, {}

    curie, expansion = curie_result
    prefix = curie.split(":", 1)[0]
    return curie, {prefix: expansion}


def extract_prefix(identifier: str, curie_map: dict[str, str] | None = None) -> str | None:
    """
    Extract prefix from CURIE/IRI identifier.

    Preference order:
    1. CURIE prefix when object is already CURIE-like.
    2. curie_map expansion match when object is an IRI.
    3. conservative structural fallbacks (OBO IRIs, known hosts).
    """
    text = strip_angle_brackets(identifier)
    if not text:
        return None

    lowered = text.lower()
    looks_like_iri = lowered.startswith(("http://", "https://"))

    result: str | None = None
    if ":" in text and not looks_like_iri:
        # CURIE-like: everything before the first colon is the prefix.
        result, _, _ = text.partition(":")
    elif curie_map:
        # Prefer the longest matching expansion so nested bases resolve
        # to the most specific prefix.
        hits = [(p, e) for p, e in curie_map.items() if text.startswith(e)]
        if hits:
            result = max(hits, key=lambda pair: len(pair[1]))[0]

    if result is not None:
        return result

    # Conservative structural fallbacks for well-known IRI shapes.
    if "/obo/" in text and "_" in text:
        return text.split("/obo/")[1].split("_")[0]
    for needle, prefix in (
        ("doi.org", "doi"),
        ("biolink", "biolink"),
        ("purl.dsmz.de", "d3o"),
        ("mdatahub.org", "meo"),
        ("semanticweb.org/bipon/", "bipon"),
    ):
        if needle in lowered:
            return prefix
    return None
Loading