Skip to content

Commit 1fdf407

Browse files
committed
Standardize SSSOM object_id normalization and curie_map-based prefix parsing (#351)
1 parent 4bf2de6 commit 1fdf407

File tree

6 files changed

+216
-60
lines changed

6 files changed

+216
-60
lines changed

metpo/analysis/analyze_ontology_value.py

Lines changed: 28 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,10 @@
1717
import click
1818
import pandas as pd
1919

20+
from metpo.utils.sssom_utils import extract_prefix, parse_sssom_curie_map
2021

21-
def extract_term_source(iri: str) -> str:
22+
23+
def extract_term_source(iri: str, curie_map: dict[str, str] | None = None) -> str:
    """
    Extract the defining ontology from an IRI or CURIE.

    Resolution prefers a curie_map-driven prefix lookup (``extract_prefix``)
    and falls back to structural pattern matching on the raw identifier.

    Examples:
    - http://purl.obolibrary.org/obo/GO_0008150 → GO
    - https://w3id.org/biolink/vocab/phenotype → BIOLINK

    Every recognized source is reported in upper case (GO, BIOLINK, D3O, …)
    so downstream value counts do not split across case variants; anything
    unrecognized yields the ``"unknown"`` sentinel.
    """
    text = str(iri).strip()
    if not text:
        return "unknown"

    # Preferred path: prefix resolution against the SSSOM curie_map.
    extracted = extract_prefix(text, curie_map)
    if extracted:
        return extracted.upper()

    # OBO pattern: .../obo/PREFIX_ID
    obo_match = re.search(r"/obo/([A-Za-z]+)_\d+", text)
    if obo_match:
        return obo_match.group(1).upper()

    lowered = text.lower()
    if "biolink" in lowered:
        # Upper-cased for consistency with the curie_map path, which reports
        # the same IRIs as BIOLINK (extracted.upper()).
        return "BIOLINK"
    if "doi.org" in lowered:
        doi_match = re.search(r"doi\.org/[^/]+/([A-Za-z]+)", text, flags=re.IGNORECASE)
        if doi_match:
            return doi_match.group(1).upper()
        # doi.org identifier with no extractable registrant token.
        return "unknown"
    if "purl.dsmz.de" in lowered:
        return "D3O"

    return "unknown"
4755

4856

4957
@click.command()
@@ -53,9 +61,10 @@ def main(input):
5361

5462
print(f"Loading mappings from: {input}")
5563
df = pd.read_csv(input, sep="\t", comment="#")
64+
curie_map = parse_sssom_curie_map(input)
5665

5766
# Extract term source from IRI
58-
df["term_source"] = df["object_id"].apply(extract_term_source)
67+
df["term_source"] = df["object_id"].apply(lambda x: extract_term_source(x, curie_map))
5968

6069
print(f"\nLoaded {len(df)} mappings")
6170
print(f"Unique ontology files (object_source): {df['object_source'].nunique()}")

metpo/analysis/extract_definitions_from_mappings.py

Lines changed: 28 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616

1717
import pandas as pd
1818

19+
from metpo.utils.sssom_utils import extract_prefix, parse_sssom_curie_map
20+
1921
# Configuration
2022
SSSOM_FILE = "../../data/mappings/metpo_mappings_combined_relaxed.sssom.tsv"
2123
METPO_SHEET = "../../src/templates/metpo_sheet.tsv"
@@ -48,27 +50,34 @@ def parse_object_label(label: str) -> tuple[str, str | None]:
4850
return label.strip(), None
4951

5052

51-
def extract_ontology_prefix(iri: str) -> str:
53+
def extract_ontology_prefix(
    iri: str,
    curie_map: dict[str, str] | None = None,
) -> str:
    """Extract ontology prefix from IRI.

    Resolves via the SSSOM curie_map first (``extract_prefix``), then falls
    back to structural pattern matching.  Recognized prefixes are returned in
    upper case; unrecognized identifiers yield ``"UNKNOWN"``.
    """
    # NOTE(review): str.strip("<>") removes any run of angle brackets at the
    # ends, not just one balanced pair — acceptable for SSSOM object_ids.
    text = str(iri).strip().strip("<>").strip()
    if not text:
        return "UNKNOWN"

    # Preferred path: curie_map-driven prefix resolution.
    extracted = extract_prefix(text, curie_map)
    if extracted:
        return extracted.upper()

    lowered = text.lower()
    if "obo/" in text:
        # Mixed-case OBO prefixes (e.g. NCBITaxon) must match too, hence
        # [A-Za-z]+ rather than [A-Z]+; result upper-cased for consistency
        # with the curie_map path and the sibling analysis script.
        match = re.search(r"/obo/([A-Za-z]+)_", text)
        if match:
            return match.group(1).upper()
        return "UNKNOWN"
    if "doi.org" in lowered:
        return "DOI"
    if "biolink" in lowered:
        return "BIOLINK"
    if "dsmz" in lowered:
        return "D3O"
    if "mdatahub" in lowered:
        return "MEO"

    return "UNKNOWN"
7281

7382

7483
def load_metpo_sheet() -> pd.DataFrame:
@@ -114,6 +123,7 @@ def load_sssom_mappings() -> pd.DataFrame:
114123

115124
# Read from data start
116125
df = pd.read_csv(SSSOM_FILE, sep="\t", skiprows=data_start)
126+
curie_map = parse_sssom_curie_map(SSSOM_FILE)
117127

118128
# Extract distance from comment
119129
df["distance"] = df["comment"].apply(extract_distance_from_comment)
@@ -124,7 +134,7 @@ def load_sssom_mappings() -> pd.DataFrame:
124134
)
125135

126136
# Extract ontology prefix
127-
df["ontology"] = df["object_id"].apply(extract_ontology_prefix)
137+
df["ontology"] = df["object_id"].apply(lambda x: extract_ontology_prefix(x, curie_map))
128138

129139
return df
130140

metpo/pipeline/chromadb_semantic_mapper.py

Lines changed: 73 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,68 @@
4242
from dotenv import load_dotenv
4343
from tqdm import tqdm
4444

45+
# Prefixes always written to the SSSOM "# curie_map:" header, independent of
# which object prefixes actually occur in the mapping rows.
CORE_CURIE_MAP = {
    "METPO": "http://purl.obolibrary.org/obo/METPO_",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "semapv": "https://w3id.org/semapv/vocab/",
}

# Non-OBO IRI bases that iri_to_curie() may safely compact to CURIEs.
# Keys become CURIE prefixes; values are the exact IRI prefixes matched.
KNOWN_IRI_PREFIXES = {
    "biolink": "https://w3id.org/biolink/vocab/",
    "d3o": "https://purl.dsmz.de/schema/",
    "doi": "http://doi.org/",
    "bipon": "http://www.semanticweb.org/BiPON/",
}
57+
58+
59+
def strip_angle_brackets(identifier: str) -> str:
    """Drop one surrounding ``<...>`` pair from *identifier*, if present."""
    trimmed = identifier.strip()
    wrapped = trimmed.startswith("<") and trimmed.endswith(">")
    return trimmed[1:-1] if wrapped else trimmed
64+
65+
66+
def iri_to_curie(iri: str) -> tuple[str, str] | None:
    """Convert common IRI forms to CURIEs when safely possible.

    Returns ``(curie, expansion)`` for a safe compaction, or ``None`` when the
    IRI should be kept as-is.
    """
    obo_base = "http://purl.obolibrary.org/obo/"
    if iri.startswith(obo_base):
        # OBO-style compact form, e.g. .../obo/GO_0008152 -> GO:0008152
        remainder = iri[len(obo_base):]
        obo_prefix, underscore, local_id = remainder.partition("_")
        if underscore and obo_prefix and local_id:
            return f"{obo_prefix}:{local_id}", f"{obo_base}{obo_prefix}_"
        return None

    for known_prefix, base in KNOWN_IRI_PREFIXES.items():
        if iri.startswith(base) and len(iri) > len(base):
            return f"{known_prefix}:{iri[len(base):]}", base

    # Accept the https variant of DOI; the emitted expansion stays the
    # canonical http form from KNOWN_IRI_PREFIXES.
    https_doi = "https://doi.org/"
    if iri.startswith(https_doi) and len(iri) > len(https_doi):
        return f"doi:{iri[len(https_doi):]}", KNOWN_IRI_PREFIXES["doi"]

    return None
90+
91+
92+
def normalize_object_id(raw_identifier: str) -> tuple[str, dict[str, str]]:
    """Normalize mapping object id to CURIE where possible, else plain IRI (no <>).

    Returns the normalized identifier together with any ``{prefix: expansion}``
    entry that the emitted curie_map must declare for it (empty when none).
    """
    candidate = strip_angle_brackets(raw_identifier)
    if not candidate:
        return candidate, {}

    compacted = iri_to_curie(candidate)
    if compacted is None:
        # No safe CURIE normalization available; keep the plain IRI.
        return candidate, {}

    curie, expansion = compacted
    return curie, {curie.partition(":")[0]: expansion}
106+
45107

46108
def load_metpo_terms(
47109
tsv_path: str,
@@ -178,12 +240,18 @@ def write_sssom_output(
178240
else:
179241
filtered = [m for m in matches if m["distance"] <= max_distance]
180242

243+
normalized_rows: list[dict] = []
244+
curie_map = dict(CORE_CURIE_MAP)
245+
for m in filtered:
246+
normalized_object_id, object_prefixes = normalize_object_id(str(m["match_iri"]))
247+
curie_map.update(object_prefixes)
248+
normalized_rows.append({**m, "normalized_object_id": normalized_object_id})
249+
181250
with Path(output_path).open("w", encoding="utf-8", newline="") as f:
182251
# Write metadata block
183252
f.write("# curie_map:\n")
184-
f.write("# METPO: http://purl.obolibrary.org/obo/METPO_\n")
185-
f.write("# skos: http://www.w3.org/2004/02/skos/core#\n")
186-
f.write("# semapv: https://w3id.org/semapv/vocab/\n")
253+
for prefix in sorted(curie_map):
254+
f.write(f"# {prefix}: {curie_map[prefix]}\n")
187255
f.write(
188256
f"# mapping_set_id: metpo-ontology-mappings-{datetime.now(UTC).date().isoformat()}\n"
189257
)
@@ -264,14 +332,14 @@ def write_sssom_output(
264332
]
265333
)
266334

267-
for m in filtered:
335+
for m in normalized_rows:
268336
similarity = 1.0 - (m["distance"] / 2.0)
269337
writer.writerow(
270338
[
271339
m["metpo_id"],
272340
m["metpo_label"],
273341
similarity_to_predicate(similarity),
274-
m["match_iri"],
342+
m["normalized_object_id"],
275343
m["match_document"],
276344
"semapv:SemanticSimilarityThresholdMatching",
277345
f"{similarity:.6f}",

metpo/presentations/analyze_primary_sources.py

Lines changed: 8 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
from collections import Counter
1313
from pathlib import Path
1414

15+
from metpo.utils.sssom_utils import extract_prefix, parse_sssom_curie_map
16+
1517
# PRIMARY SOURCE 1: ChromaDB databases
1618
CHROMA_COMBINED = Path("/home/mark/gitrepos/metpo/notebooks/chroma_combined/chroma.sqlite3")
1719
CHROMA_OLS_20 = Path("/home/mark/gitrepos/metpo/notebooks/chroma_ols_20/chroma.sqlite3")
@@ -65,30 +67,18 @@ def analyze_sssom(sssom_path):
6567
target_prefixes = Counter()
6668
predicates = Counter()
6769
total_mappings = 0
70+
curie_map = parse_sssom_curie_map(sssom_path)
6871

6972
with Path(sssom_path).open() as f:
70-
reader = csv.DictReader(f, delimiter="\t")
73+
data_lines = [line for line in f if not line.startswith("#")]
74+
reader = csv.DictReader(data_lines, delimiter="\t")
7175
for row in reader:
7276
total_mappings += 1
7377

7478
# Extract prefix from object_id
75-
obj_id = row.get("object_id", "")
76-
if "/obo/" in obj_id:
77-
# e.g., http://purl.obolibrary.org/obo/PATO_0000001
78-
prefix = obj_id.split("/obo/")[1].split("_")[0] if "_" in obj_id else ""
79-
if prefix:
80-
target_prefixes[prefix] += 1
81-
elif "doi.org/10.1601" in obj_id:
82-
target_prefixes["N4L"] += 1
83-
elif "purl.dsmz.de" in obj_id:
84-
if "d3o" in obj_id.lower():
85-
target_prefixes["D3O"] += 1
86-
elif "miso" in obj_id.lower():
87-
target_prefixes["MISO"] += 1
88-
elif "mdatahub.org" in obj_id.lower():
89-
target_prefixes["MEO"] += 1
90-
elif "biolink" in obj_id.lower():
91-
target_prefixes["BIOLINK"] += 1
79+
prefix = extract_prefix(row.get("object_id", ""), curie_map)
80+
if prefix:
81+
target_prefixes[prefix.upper()] += 1
9282

9383
pred = row.get("predicate_id", "")
9484
predicates[pred] += 1

metpo/utils/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
"""Shared utilities for METPO scripts."""
2+

metpo/utils/sssom_utils.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
"""Helpers for parsing SSSOM metadata and identifiers."""
2+
3+
import re
4+
from pathlib import Path
5+
6+
# Matches one curie_map header entry, e.g. "#   GO: http://purl.obolibrary.org/obo/GO_".
# Group 1 = prefix, group 2 = expansion; up to four spaces of indent tolerated.
CURIE_MAP_LINE = re.compile(r"^#\s{0,4}([A-Za-z][\w.-]*):\s*(\S+)\s*$")
7+
8+
9+
def strip_angle_brackets(identifier: str) -> str:
    """Return identifier without surrounding angle brackets."""
    trimmed = str(identifier).strip()
    if trimmed.startswith("<") and trimmed.endswith(">"):
        # Strip again: "< x >" should yield "x", not " x ".
        trimmed = trimmed[1:-1].strip()
    return trimmed
15+
16+
17+
def parse_sssom_curie_map(sssom_path: str | Path) -> dict[str, str]:
18+
"""Parse ``# curie_map`` prefixes from an SSSOM TSV header."""
19+
curie_map: dict[str, str] = {}
20+
in_curie_map = False
21+
22+
with Path(sssom_path).open(encoding="utf-8") as handle:
23+
for line in handle:
24+
if not line.startswith("#"):
25+
break
26+
27+
text = line.rstrip("\n")
28+
if text.strip() == "# curie_map:":
29+
in_curie_map = True
30+
continue
31+
32+
if in_curie_map:
33+
match = CURIE_MAP_LINE.match(text)
34+
if match:
35+
prefix, expansion = match.groups()
36+
curie_map[prefix] = expansion
37+
continue
38+
39+
# End curie_map block at the first non-entry comment line.
40+
if text.startswith("# ") and ":" in text:
41+
in_curie_map = False
42+
43+
return curie_map
44+
45+
46+
def extract_prefix(identifier: str, curie_map: dict[str, str] | None = None) -> str | None:
    """
    Extract prefix from CURIE/IRI identifier.

    Resolution order:
    1. the CURIE prefix, when the identifier is already CURIE-like;
    2. the longest matching curie_map expansion, when it is an IRI;
    3. a conservative structural fallback for OBO-style IRIs.

    Returns ``None`` when no prefix can be determined.
    """
    candidate = strip_angle_brackets(identifier)
    if not candidate:
        return None

    is_iri = candidate.lower().startswith(("http://", "https://"))
    if ":" in candidate and not is_iri:
        # Already CURIE-like: everything before the first colon is the prefix.
        return candidate.split(":", 1)[0]

    if curie_map:
        # Longest expansion wins; ties keep the earliest map entry.
        hits = [
            (name, expansion)
            for name, expansion in curie_map.items()
            if candidate.startswith(expansion)
        ]
        if hits:
            return max(hits, key=lambda item: len(item[1]))[0]

    if "/obo/" in candidate and "_" in candidate:
        # OBO IRI fallback: .../obo/PREFIX_LOCALID
        return candidate.split("/obo/")[1].split("_")[0]

    return None

0 commit comments

Comments
 (0)