Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 28 additions & 19 deletions metpo/analysis/analyze_ontology_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@
import click
import pandas as pd

from metpo.utils.sssom_utils import extract_prefix, parse_sssom_curie_map

def extract_term_source(iri: str) -> str:

def extract_term_source(iri: str, curie_map: dict[str, str] | None = None) -> str:
"""
Extract the defining ontology from an IRI.

Expand All @@ -27,23 +29,29 @@ def extract_term_source(iri: str) -> str:
- http://purl.obolibrary.org/obo/GO_0008150 → GO
- https://w3id.org/biolink/vocab/phenotype → biolink
"""
# OBO pattern: .../obo/PREFIX_ID
obo_match = re.search(r"/obo/([A-Za-z]+)_\d+", iri)
if obo_match:
return obo_match.group(1).upper()

# Biolink pattern
if "biolink" in iri:
return "biolink"

# DOI pattern
if "doi.org" in iri:
doi_match = re.search(r"doi\.org/[^/]+/([A-Za-z]+)", iri)
if doi_match:
return doi_match.group(1)

# Default: return as-is
return "unknown"
text = str(iri).strip()
source = "unknown"
if not text:
return source

extracted = extract_prefix(text, curie_map)
if extracted:
source = extracted.upper()
else:
# OBO pattern: .../obo/PREFIX_ID
obo_match = re.search(r"/obo/([A-Za-z]+)_\d+", text)
if obo_match:
source = obo_match.group(1).upper()
elif "biolink" in text.lower():
source = "biolink"
elif "doi.org" in text.lower():
doi_match = re.search(r"doi\.org/[^/]+/([A-Za-z]+)", text, flags=re.IGNORECASE)
if doi_match:
source = doi_match.group(1)
elif "purl.dsmz.de" in text.lower():
source = "D3O"

return source


@click.command()
Expand All @@ -53,9 +61,10 @@ def main(input):

print(f"Loading mappings from: {input}")
df = pd.read_csv(input, sep="\t", comment="#")
curie_map = parse_sssom_curie_map(input)

# Extract term source from IRI
df["term_source"] = df["object_id"].apply(extract_term_source)
df["term_source"] = df["object_id"].apply(lambda x: extract_term_source(x, curie_map))

print(f"\nLoaded {len(df)} mappings")
print(f"Unique ontology files (object_source): {df['object_source'].nunique()}")
Expand Down
46 changes: 28 additions & 18 deletions metpo/analysis/extract_definitions_from_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

import pandas as pd

from metpo.utils.sssom_utils import extract_prefix, parse_sssom_curie_map

# Configuration
SSSOM_FILE = "../../data/mappings/metpo_mappings_combined_relaxed.sssom.tsv"
METPO_SHEET = "../../src/templates/metpo_sheet.tsv"
Expand Down Expand Up @@ -48,27 +50,34 @@ def parse_object_label(label: str) -> tuple[str, str | None]:
return label.strip(), None


def extract_ontology_prefix(iri: str) -> str:
def extract_ontology_prefix(
iri: str,
curie_map: dict[str, str] | None = None,
) -> str:
"""Extract ontology prefix from IRI."""
# Handle obo format
if "obo/" in iri:
match = re.search(r"/obo/([A-Z]+)_", iri)
if match:
return match.group(1)
text = str(iri).strip().strip("<>").strip()
prefix = "UNKNOWN"
if not text:
return prefix

# Handle DOI format
if "doi.org" in iri:
return "DOI"
extracted = extract_prefix(text, curie_map)
if extracted:
return extracted.upper()

# Handle other formats
if "biolink" in iri.lower():
return "BIOLINK"
if "dsmz" in iri.lower():
return "D3O"
if "mdatahub" in iri.lower():
return "MEO"
if "obo/" in text:
match = re.search(r"/obo/([A-Z]+)_", text)
if match:
prefix = match.group(1)
elif "doi.org" in text.lower():
prefix = "DOI"
elif "biolink" in text.lower():
prefix = "BIOLINK"
elif "dsmz" in text.lower():
prefix = "D3O"
elif "mdatahub" in text.lower():
prefix = "MEO"

return "UNKNOWN"
return prefix


def load_metpo_sheet() -> pd.DataFrame:
Expand Down Expand Up @@ -114,6 +123,7 @@ def load_sssom_mappings() -> pd.DataFrame:

# Read from data start
df = pd.read_csv(SSSOM_FILE, sep="\t", skiprows=data_start)
curie_map = parse_sssom_curie_map(SSSOM_FILE)

# Extract distance from comment
df["distance"] = df["comment"].apply(extract_distance_from_comment)
Expand All @@ -124,7 +134,7 @@ def load_sssom_mappings() -> pd.DataFrame:
)

# Extract ontology prefix
df["ontology"] = df["object_id"].apply(extract_ontology_prefix)
df["ontology"] = df["object_id"].apply(lambda x: extract_ontology_prefix(x, curie_map))

return df

Expand Down
78 changes: 73 additions & 5 deletions metpo/pipeline/chromadb_semantic_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,68 @@
from dotenv import load_dotenv
from tqdm import tqdm

# Prefixes always written to the SSSOM "# curie_map:" header block.
# write_sssom_output seeds its header with a copy of this dict and then
# merges in any per-row prefixes discovered by normalize_object_id.
CORE_CURIE_MAP = {
    "METPO": "http://purl.obolibrary.org/obo/METPO_",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "semapv": "https://w3id.org/semapv/vocab/",
}

# Non-OBO IRI bases that iri_to_curie can safely compact to CURIEs.
# Note: "doi" is declared with the http:// base, but iri_to_curie also
# accepts the https://doi.org/ variant and maps it to this same expansion.
KNOWN_IRI_PREFIXES = {
    "biolink": "https://w3id.org/biolink/vocab/",
    "d3o": "https://purl.dsmz.de/schema/",
    "doi": "http://doi.org/",
    "bipon": "http://www.semanticweb.org/BiPON/",
}


def strip_angle_brackets(identifier: str) -> str:
    """Return ``identifier`` trimmed of whitespace and any surrounding ``<...>``.

    Whitespace that was *inside* the brackets is trimmed as well (so
    ``"< http://x >"`` yields ``"http://x"``), matching the behaviour of the
    same-named helper in ``metpo.utils.sssom_utils`` so both copies agree.
    """
    text = identifier.strip()
    if text.startswith("<") and text.endswith(">"):
        # Strip again after unwrapping: "<  iri  >" must yield "iri".
        return text[1:-1].strip()
    return text


def iri_to_curie(iri: str) -> tuple[str, str] | None:
    """Compact a well-known IRI into a ``(CURIE, prefix_expansion)`` pair.

    Handles OBO PURLs (``.../obo/GO_0008152`` -> ``GO:0008152``), the bases
    listed in ``KNOWN_IRI_PREFIXES``, and the ``https://doi.org/`` variant of
    the doi base. Returns ``None`` when no safe compaction is known.
    """
    obo_base = "http://purl.obolibrary.org/obo/"
    if iri.startswith(obo_base):
        remainder = iri.removeprefix(obo_base)
        prefix, separator, local_id = remainder.partition("_")
        if separator and prefix and local_id:
            return f"{prefix}:{local_id}", f"{obo_base}{prefix}_"
        # OBO PURL without a PREFIX_LOCALID shape: leave uncompacted.
        return None

    for known_prefix, known_base in KNOWN_IRI_PREFIXES.items():
        remainder = iri.removeprefix(known_base)
        if remainder != iri and remainder:
            return f"{known_prefix}:{remainder}", known_base

    https_doi = "https://doi.org/"
    if iri.startswith(https_doi):
        remainder = iri.removeprefix(https_doi)
        if remainder:
            # Normalize to the declared (http) doi expansion.
            return f"doi:{remainder}", KNOWN_IRI_PREFIXES["doi"]

    return None


def normalize_object_id(raw_identifier: str) -> tuple[str, dict[str, str]]:
    """Normalize a mapping object id for SSSOM output.

    Returns the identifier as a CURIE where a safe compaction exists
    (together with the ``{prefix: expansion}`` entry needed in the header
    curie_map), otherwise the bare IRI without angle brackets and an empty
    prefix dict.
    """
    cleaned = strip_angle_brackets(raw_identifier)
    if not cleaned:
        return cleaned, {}

    compacted = iri_to_curie(cleaned)
    if compacted is None:
        # No safe CURIE normalization known: keep the plain IRI.
        return cleaned, {}

    curie, expansion = compacted
    return curie, {curie.partition(":")[0]: expansion}


def load_metpo_terms(
tsv_path: str,
Expand Down Expand Up @@ -178,12 +240,18 @@ def write_sssom_output(
else:
filtered = [m for m in matches if m["distance"] <= max_distance]

normalized_rows: list[dict] = []
curie_map = dict(CORE_CURIE_MAP)
for m in filtered:
normalized_object_id, object_prefixes = normalize_object_id(str(m["match_iri"]))
curie_map.update(object_prefixes)
normalized_rows.append({**m, "normalized_object_id": normalized_object_id})

with Path(output_path).open("w", encoding="utf-8", newline="") as f:
# Write metadata block
f.write("# curie_map:\n")
f.write("# METPO: http://purl.obolibrary.org/obo/METPO_\n")
f.write("# skos: http://www.w3.org/2004/02/skos/core#\n")
f.write("# semapv: https://w3id.org/semapv/vocab/\n")
for prefix in sorted(curie_map):
f.write(f"# {prefix}: {curie_map[prefix]}\n")
f.write(
f"# mapping_set_id: metpo-ontology-mappings-{datetime.now(UTC).date().isoformat()}\n"
)
Expand Down Expand Up @@ -264,14 +332,14 @@ def write_sssom_output(
]
)

for m in filtered:
for m in normalized_rows:
similarity = 1.0 - (m["distance"] / 2.0)
writer.writerow(
[
m["metpo_id"],
m["metpo_label"],
similarity_to_predicate(similarity),
m["match_iri"],
m["normalized_object_id"],
m["match_document"],
"semapv:SemanticSimilarityThresholdMatching",
f"{similarity:.6f}",
Expand Down
26 changes: 8 additions & 18 deletions metpo/presentations/analyze_primary_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from collections import Counter
from pathlib import Path

from metpo.utils.sssom_utils import extract_prefix, parse_sssom_curie_map

# PRIMARY SOURCE 1: ChromaDB databases
CHROMA_COMBINED = Path("/home/mark/gitrepos/metpo/notebooks/chroma_combined/chroma.sqlite3")
CHROMA_OLS_20 = Path("/home/mark/gitrepos/metpo/notebooks/chroma_ols_20/chroma.sqlite3")
Expand Down Expand Up @@ -65,30 +67,18 @@ def analyze_sssom(sssom_path):
target_prefixes = Counter()
predicates = Counter()
total_mappings = 0
curie_map = parse_sssom_curie_map(sssom_path)

with Path(sssom_path).open() as f:
reader = csv.DictReader(f, delimiter="\t")
data_lines = [line for line in f if not line.startswith("#")]
reader = csv.DictReader(data_lines, delimiter="\t")
for row in reader:
total_mappings += 1

# Extract prefix from object_id
obj_id = row.get("object_id", "")
if "/obo/" in obj_id:
# e.g., http://purl.obolibrary.org/obo/PATO_0000001
prefix = obj_id.split("/obo/")[1].split("_")[0] if "_" in obj_id else ""
if prefix:
target_prefixes[prefix] += 1
elif "doi.org/10.1601" in obj_id:
target_prefixes["N4L"] += 1
elif "purl.dsmz.de" in obj_id:
if "d3o" in obj_id.lower():
target_prefixes["D3O"] += 1
elif "miso" in obj_id.lower():
target_prefixes["MISO"] += 1
elif "mdatahub.org" in obj_id.lower():
target_prefixes["MEO"] += 1
elif "biolink" in obj_id.lower():
target_prefixes["BIOLINK"] += 1
prefix = extract_prefix(row.get("object_id", ""), curie_map)
if prefix:
target_prefixes[prefix.upper()] += 1

pred = row.get("predicate_id", "")
predicates[pred] += 1
Expand Down
1 change: 1 addition & 0 deletions metpo/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Shared utilities for METPO scripts."""
77 changes: 77 additions & 0 deletions metpo/utils/sssom_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""Helpers for parsing SSSOM metadata and identifiers."""

import re
from pathlib import Path

# Matches one "#<spaces>key: value" comment line from an SSSOM header.
# NOTE(review): this pattern also matches non-curie_map metadata lines such
# as "# mapping_set_id: ...", so callers must guard against treating every
# match as a prefix declaration.
CURIE_MAP_LINE = re.compile(r"^#\s{0,4}([A-Za-z][\w.-]*):\s*(\S+)\s*$")


def strip_angle_brackets(identifier: str) -> str:
    """Return identifier without surrounding angle brackets.

    Leading/trailing whitespace is trimmed, and whitespace that sat inside
    the brackets is trimmed as well.
    """
    trimmed = str(identifier).strip()
    is_bracketed = trimmed[:1] == "<" and trimmed[-1:] == ">" and len(trimmed) >= 2
    return trimmed[1:-1].strip() if is_bracketed else trimmed


def parse_sssom_curie_map(sssom_path: str | Path) -> dict[str, str]:
"""Parse ``# curie_map`` prefixes from an SSSOM TSV header."""
curie_map: dict[str, str] = {}
in_curie_map = False

with Path(sssom_path).open(encoding="utf-8") as handle:
for line in handle:
if not line.startswith("#"):
break

text = line.rstrip("\n")
if text.strip() == "# curie_map:":
in_curie_map = True
continue

if in_curie_map:
match = CURIE_MAP_LINE.match(text)
if match:
prefix, expansion = match.groups()
curie_map[prefix] = expansion
continue

# End curie_map block at the first non-entry comment line.
if text.startswith("# ") and ":" in text:
in_curie_map = False

return curie_map


def extract_prefix(identifier: str, curie_map: dict[str, str] | None = None) -> str | None:
"""
Extract prefix from CURIE/IRI identifier.

Preference order:
1. CURIE prefix when object is already CURIE-like.
2. curie_map expansion match when object is an IRI.
3. conservative structural fallbacks (OBO IRIs).
"""
text = strip_angle_brackets(identifier)
if not text:
return None

lowered = text.lower()
is_iri = lowered.startswith(("http://", "https://"))
if ":" in text and not is_iri:
return text.split(":", 1)[0]

if curie_map:
best_prefix: str | None = None
best_len = -1
for prefix, expansion in curie_map.items():
if text.startswith(expansion) and len(expansion) > best_len:
best_prefix = prefix
best_len = len(expansion)
if best_prefix is not None:
return best_prefix

if "/obo/" in text and "_" in text:
return text.split("/obo/")[1].split("_")[0]

return None
Loading