diff --git a/metpo/analysis/analyze_ontology_value.py b/metpo/analysis/analyze_ontology_value.py index cfc08810..6652d7c0 100644 --- a/metpo/analysis/analyze_ontology_value.py +++ b/metpo/analysis/analyze_ontology_value.py @@ -11,14 +11,15 @@ 3. Redundant sources that could be removed """ -import re from collections import defaultdict import click import pandas as pd +from metpo.utils.sssom_utils import extract_prefix, parse_sssom_curie_map -def extract_term_source(iri: str) -> str: + +def extract_term_source(iri: str, curie_map: dict[str, str] | None = None) -> str: """ Extract the defining ontology from an IRI. @@ -27,22 +28,9 @@ def extract_term_source(iri: str) -> str: - http://purl.obolibrary.org/obo/GO_0008150 → GO - https://w3id.org/biolink/vocab/phenotype → biolink """ - # OBO pattern: .../obo/PREFIX_ID - obo_match = re.search(r"/obo/([A-Za-z]+)_\d+", iri) - if obo_match: - return obo_match.group(1).upper() - - # Biolink pattern - if "biolink" in iri: - return "biolink" - - # DOI pattern - if "doi.org" in iri: - doi_match = re.search(r"doi\.org/[^/]+/([A-Za-z]+)", iri) - if doi_match: - return doi_match.group(1) - - # Default: return as-is + extracted = extract_prefix(iri, curie_map) + if extracted: + return extracted.upper() return "unknown" @@ -53,9 +41,10 @@ def main(input): print(f"Loading mappings from: {input}") df = pd.read_csv(input, sep="\t", comment="#") + curie_map = parse_sssom_curie_map(input) # Extract term source from IRI - df["term_source"] = df["object_id"].apply(extract_term_source) + df["term_source"] = df["object_id"].apply(lambda x: extract_term_source(x, curie_map)) print(f"\nLoaded {len(df)} mappings") print(f"Unique ontology files (object_source): {df['object_source'].nunique()}") diff --git a/metpo/analysis/extract_definitions_from_mappings.py b/metpo/analysis/extract_definitions_from_mappings.py index 13e4849f..e8eb4cc9 100644 --- a/metpo/analysis/extract_definitions_from_mappings.py +++ 
def extract_ontology_prefix(
    iri: str,
    curie_map: dict[str, str] | None = None,
) -> str:
    """Return the upper-cased ontology prefix for a CURIE or IRI.

    The lookup itself is delegated to ``extract_prefix``, which receives the
    optional SSSOM ``curie_map`` unchanged.

    Args:
        iri: CURIE or IRI taken from a mapping row.
        curie_map: Optional prefix -> expansion mapping parsed from the SSSOM
            header.

    Returns:
        The detected prefix in upper case, or the sentinel ``"UNKNOWN"`` when
        ``extract_prefix`` yields nothing (``None`` or an empty string).
    """
    prefix = extract_prefix(iri, curie_map)
    return prefix.upper() if prefix else "UNKNOWN"
metpo.utils.sssom_utils import normalize_object_id + +CORE_CURIE_MAP = { + "METPO": "http://purl.obolibrary.org/obo/METPO_", + "skos": "http://www.w3.org/2004/02/skos/core#", + "semapv": "https://w3id.org/semapv/vocab/", +} + def load_metpo_terms( tsv_path: str, @@ -178,12 +186,18 @@ def write_sssom_output( else: filtered = [m for m in matches if m["distance"] <= max_distance] + normalized_rows: list[dict] = [] + curie_map = dict(CORE_CURIE_MAP) + for m in filtered: + normalized_object_id, object_prefixes = normalize_object_id(str(m["match_iri"])) + curie_map.update(object_prefixes) + normalized_rows.append({**m, "normalized_object_id": normalized_object_id}) + with Path(output_path).open("w", encoding="utf-8", newline="") as f: # Write metadata block f.write("# curie_map:\n") - f.write("# METPO: http://purl.obolibrary.org/obo/METPO_\n") - f.write("# skos: http://www.w3.org/2004/02/skos/core#\n") - f.write("# semapv: https://w3id.org/semapv/vocab/\n") + for prefix in sorted(curie_map): + f.write(f"# {prefix}: {curie_map[prefix]}\n") f.write( f"# mapping_set_id: metpo-ontology-mappings-{datetime.now(UTC).date().isoformat()}\n" ) @@ -264,14 +278,14 @@ def write_sssom_output( ] ) - for m in filtered: + for m in normalized_rows: similarity = 1.0 - (m["distance"] / 2.0) writer.writerow( [ m["metpo_id"], m["metpo_label"], similarity_to_predicate(similarity), - m["match_iri"], + m["normalized_object_id"], m["match_document"], "semapv:SemanticSimilarityThresholdMatching", f"{similarity:.6f}", diff --git a/metpo/presentations/analyze_primary_sources.py b/metpo/presentations/analyze_primary_sources.py index d02462db..7aa9825e 100644 --- a/metpo/presentations/analyze_primary_sources.py +++ b/metpo/presentations/analyze_primary_sources.py @@ -12,6 +12,8 @@ from collections import Counter from pathlib import Path +from metpo.utils.sssom_utils import extract_prefix, parse_sssom_curie_map + # PRIMARY SOURCE 1: ChromaDB databases CHROMA_COMBINED = 
Path("/home/mark/gitrepos/metpo/notebooks/chroma_combined/chroma.sqlite3") CHROMA_OLS_20 = Path("/home/mark/gitrepos/metpo/notebooks/chroma_ols_20/chroma.sqlite3") @@ -65,30 +67,18 @@ def analyze_sssom(sssom_path): target_prefixes = Counter() predicates = Counter() total_mappings = 0 + curie_map = parse_sssom_curie_map(sssom_path) with Path(sssom_path).open() as f: - reader = csv.DictReader(f, delimiter="\t") + data_lines = (line for line in f if not line.startswith("#")) + reader = csv.DictReader(data_lines, delimiter="\t") for row in reader: total_mappings += 1 # Extract prefix from object_id - obj_id = row.get("object_id", "") - if "/obo/" in obj_id: - # e.g., http://purl.obolibrary.org/obo/PATO_0000001 - prefix = obj_id.split("/obo/")[1].split("_")[0] if "_" in obj_id else "" - if prefix: - target_prefixes[prefix] += 1 - elif "doi.org/10.1601" in obj_id: - target_prefixes["N4L"] += 1 - elif "purl.dsmz.de" in obj_id: - if "d3o" in obj_id.lower(): - target_prefixes["D3O"] += 1 - elif "miso" in obj_id.lower(): - target_prefixes["MISO"] += 1 - elif "mdatahub.org" in obj_id.lower(): - target_prefixes["MEO"] += 1 - elif "biolink" in obj_id.lower(): - target_prefixes["BIOLINK"] += 1 + prefix = extract_prefix(row.get("object_id", ""), curie_map) + if prefix: + target_prefixes[prefix.upper()] += 1 pred = row.get("predicate_id", "") predicates[pred] += 1 diff --git a/metpo/utils/__init__.py b/metpo/utils/__init__.py new file mode 100644 index 00000000..e08d5cc7 --- /dev/null +++ b/metpo/utils/__init__.py @@ -0,0 +1 @@ +"""Shared utilities for METPO scripts.""" diff --git a/metpo/utils/sssom_utils.py b/metpo/utils/sssom_utils.py new file mode 100644 index 00000000..3a318824 --- /dev/null +++ b/metpo/utils/sssom_utils.py @@ -0,0 +1,149 @@ +"""Helpers for parsing SSSOM metadata and identifiers.""" + +import re +from pathlib import Path + +CURIE_MAP_LINE = re.compile(r"^#\s{2,}([A-Za-z][\w.-]*):\s*(\S+)\s*$") + +KNOWN_IRI_PREFIXES = { + "biolink": 
"https://w3id.org/biolink/vocab/", + "d3o": "https://purl.dsmz.de/schema/", + "doi": "http://doi.org/", + "bipon": "http://www.semanticweb.org/BiPON/", +} + + +def strip_angle_brackets(identifier: str) -> str: + """Return identifier without surrounding angle brackets.""" + text = str(identifier).strip() + if text.startswith("<") and text.endswith(">"): + return text[1:-1].strip() + return text + + +def parse_sssom_curie_map(sssom_path: str | Path) -> dict[str, str]: + """Parse ``# curie_map`` prefixes from an SSSOM TSV header.""" + curie_map: dict[str, str] = {} + in_curie_map = False + + with Path(sssom_path).open(encoding="utf-8") as handle: + for line in handle: + if not line.startswith("#"): + break + + text = line.rstrip("\n") + if text.strip() == "# curie_map:": + in_curie_map = True + continue + + if in_curie_map: + # End curie_map block at first normal metadata line like: + # "# mapping_set_id: ...". + if text.startswith("# ") and not text.startswith("# "): + in_curie_map = False + continue + + match = CURIE_MAP_LINE.match(text) + if match: + prefix, expansion = match.groups() + curie_map[prefix] = expansion + continue + + # Any non-matching comment line ends the curie_map block. + in_curie_map = False + + return curie_map + + +def iri_to_curie( + iri: str, known_iri_prefixes: dict[str, str] | None = None +) -> tuple[str, str] | None: + """Convert common IRI forms to CURIEs when safely possible.""" + prefixes = known_iri_prefixes or KNOWN_IRI_PREFIXES + + # OBO-style compact form, e.g. 
http://purl.obolibrary.org/obo/GO_0008152 -> GO:0008152 + if iri.startswith("http://purl.obolibrary.org/obo/"): + local = iri.removeprefix("http://purl.obolibrary.org/obo/") + if "_" in local: + prefix, suffix = local.split("_", 1) + if prefix and suffix: + return f"{prefix}:{suffix}", f"http://purl.obolibrary.org/obo/{prefix}_" + return None + + for prefix, base in prefixes.items(): + if iri.startswith(base): + local = iri.removeprefix(base) + if local: + return f"{prefix}:{local}", base + + # Accept https variant for doi + if iri.startswith("https://doi.org/"): + local = iri.removeprefix("https://doi.org/") + doi_base = prefixes.get("doi", "http://doi.org/") + if local: + return f"doi:{local}", doi_base + + return None + + +def normalize_object_id( + raw_identifier: str, + known_iri_prefixes: dict[str, str] | None = None, +) -> tuple[str, dict[str, str]]: + """Normalize mapping object id to CURIE where possible, else plain IRI (no <>).""" + clean = strip_angle_brackets(raw_identifier) + if not clean: + return clean, {} + + converted = iri_to_curie(clean, known_iri_prefixes=known_iri_prefixes) + if converted is not None: + curie, expansion = converted + prefix = curie.split(":", 1)[0] + return curie, {prefix: expansion} + + # Keep plain IRI if no safe CURIE normalization is available. + return clean, {} + + +def extract_prefix(identifier: str, curie_map: dict[str, str] | None = None) -> str | None: + """ + Extract prefix from CURIE/IRI identifier. + + Preference order: + 1. CURIE prefix when object is already CURIE-like. + 2. curie_map expansion match when object is an IRI. + 3. conservative structural fallbacks (OBO IRIs). 
+ """ + text = strip_angle_brackets(identifier) + detected: str | None = None + if not text: + return detected + + lowered = text.lower() + is_iri = lowered.startswith(("http://", "https://")) + if ":" in text and not is_iri: + detected = text.split(":", 1)[0] + elif curie_map: + best_prefix: str | None = None + best_len = -1 + for prefix, expansion in curie_map.items(): + if text.startswith(expansion) and len(expansion) > best_len: + best_prefix = prefix + best_len = len(expansion) + detected = best_prefix + + if detected is None: + if "/obo/" in text and "_" in text: + detected = text.split("/obo/")[1].split("_")[0] + elif "doi.org" in lowered: + detected = "doi" + elif "biolink" in lowered: + detected = "biolink" + elif "purl.dsmz.de" in lowered: + detected = "d3o" + elif "mdatahub.org" in lowered: + detected = "meo" + elif "semanticweb.org/bipon/" in lowered: + detected = "bipon" + + return detected