From 6fd3a68e05a516976cc8526ff1305bc289f86fbe Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Wed, 21 Jan 2026 16:18:53 -0500 Subject: [PATCH 01/20] feat(etl): implement clinical indication section parser --- .../etl/sections/clinical_indication.py | 132 ++++++++++++++++-- tests/etl/sections/__init__.py | 0 .../etl/sections/test_clinical_indication.py | 124 ++++++++++++++++ 3 files changed, 248 insertions(+), 8 deletions(-) create mode 100644 tests/etl/sections/__init__.py create mode 100644 tests/etl/sections/test_clinical_indication.py diff --git a/src/prenatalppkt/etl/sections/clinical_indication.py b/src/prenatalppkt/etl/sections/clinical_indication.py index de31325..4cf3431 100644 --- a/src/prenatalppkt/etl/sections/clinical_indication.py +++ b/src/prenatalppkt/etl/sections/clinical_indication.py @@ -1,12 +1,128 @@ -""" -Clinical indication section parser (SKELETON). +from __future__ import annotations -TODO @VarenyaJ: Map indications to ICD-10 and HPO terms -""" +import json +import re +from typing import Dict, List, Union -from typing import Dict +def parse_clinical_indication(data: Union[str, Dict], source_format: str) -> Dict: + """ + Parse clinical indication / reason for exam from different source formats. -def parse_clinical_indication(data: str, source_format: str = "viewpoint_text") -> Dict: - """Extract indication for ultrasound exam.""" - return {"indication_text": "", "icd10_codes": [], "hpo_terms": []} + Supported formats: + - observer_json + - viewpoint_text + - viewpoint_hl7 + + Returns a normalized Dict with indication metadata. 
+ """ + if source_format == "observer_json": + if isinstance(data, str): + data = json.loads(data) + result = _parse_observer_indication(data) + + elif source_format == "viewpoint_text": + if not isinstance(data, str): + raise ValueError("viewpoint_text data must be a string") + result = _parse_viewpoint_text_indication(data) + + elif source_format == "viewpoint_hl7": + if not isinstance(data, str): + raise ValueError("viewpoint_hl7 data must be a string") + result = _parse_viewpoint_hl7_indication(data) + + else: + raise ValueError(f"Unsupported source_format: {source_format}") + + # Standardized return schema + result.setdefault("icd10_codes", []) + result.setdefault("hpo_terms", []) + result["source_format"] = source_format + return result + + +# --------------------------------------------------------------------- +# Observer JSON +# --------------------------------------------------------------------- + + +def _parse_observer_indication(json_data: Dict) -> Dict: + """ + Extract indication from Observer JSON. + Known locations: + - exam.indication + - exam.finalize.indication + """ + indication_text = "" + + exam = json_data.get("exam", {}) + if isinstance(exam, dict): + indication_text = ( + exam.get("indication") or exam.get("finalize", {}).get("indication") or "" + ) + + return {"indication_text": indication_text.strip(), "raw_data": json_data} + + +# --------------------------------------------------------------------- +# ViewPoint Text +# --------------------------------------------------------------------- + + +def _parse_viewpoint_text_indication(text: str) -> Dict: + """ + Extract indication section from ViewPoint text reports. 
+ + Expected pattern: + Indication + ========== + [free text] + """ + indication_text = "" + + pattern = re.compile( + r"Indication\s*\n=+\n(?P<body>.*?)(?:\n[A-Z][^\n]*\n=+|\Z)", + re.DOTALL | re.IGNORECASE, + ) + + match = pattern.search(text) + if match: + indication_text = match.group("body").strip() + + return {"indication_text": indication_text, "raw_data": {"text": text}} + + +# --------------------------------------------------------------------- +# ViewPoint HL7 +# --------------------------------------------------------------------- + + +def _parse_viewpoint_hl7_indication(hl7: str) -> Dict: + """ + Extract indication from HL7 ORU^R01 messages. + + Common pattern: + OBX||ST|RequestedProcedure.Indication^Indication|1|Advanced maternal age + """ + indication_lines: List[str] = [] + + for line in hl7.splitlines(): + if not line.startswith("OBX"): + continue + + fields = line.split("|") + if len(fields) < 6: + continue + + observation_id = fields[3] + value_field = fields[5] + + if "RequestedProcedure.Indication" in observation_id: + # HL7 values may be caret-delimited + value = value_field.split("^")[0] + if value: + indication_lines.append(value.strip()) + + indication_text = " ".join(indication_lines) + + return {"indication_text": indication_text, "raw_data": {"hl7": hl7}} diff --git a/tests/etl/sections/__init__.py b/tests/etl/sections/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/etl/sections/test_clinical_indication.py b/tests/etl/sections/test_clinical_indication.py new file mode 100644 index 0000000..9d1116c --- /dev/null +++ b/tests/etl/sections/test_clinical_indication.py @@ -0,0 +1,124 @@ +import json +import pytest + +from prenatalppkt.etl.sections.clinical_indication import parse_clinical_indication + + +# --------------------------------------------------------------------- +# Observer JSON +# --------------------------------------------------------------------- + + +class TestClinicalIndicationObserver: + def 
test_basic_indication(self): + data = json.dumps({"exam": {"indication": "Advanced maternal age, dating"}}) + + result = parse_clinical_indication(data, "observer_json") + + assert "Advanced maternal age" in result["indication_text"] + assert result["source_format"] == "observer_json" + assert result["icd10_codes"] == [] + assert result["hpo_terms"] == [] + + def test_fallback_finalize_indication(self): + data = json.dumps( + {"exam": {"finalize": {"indication": "Poor obstetric history"}}} + ) + + result = parse_clinical_indication(data, "observer_json") + assert result["indication_text"] == "Poor obstetric history" + + def test_missing_indication(self): + data = json.dumps({"exam": {}}) + result = parse_clinical_indication(data, "observer_json") + assert result["indication_text"] == "" + + +# --------------------------------------------------------------------- +# ViewPoint Text +# --------------------------------------------------------------------- + + +class TestClinicalIndicationViewPointText: + def test_basic_indication(self): + text = """Indication +========== +Advanced maternal age, dating + +History +======= +Previous cesarean section +""" + result = parse_clinical_indication(text, "viewpoint_text") + + assert "Advanced maternal age" in result["indication_text"] + assert "History" not in result["indication_text"] + assert result["source_format"] == "viewpoint_text" + + def test_multiline_indication(self): + text = """Indication +========== +Advanced maternal age +Previous cesarean section +IVF pregnancy +""" + result = parse_clinical_indication(text, "viewpoint_text") + + assert "IVF pregnancy" in result["indication_text"] + assert result["indication_text"].count("\n") >= 1 + + def test_missing_indication_section(self): + text = """Fetal Biometry +============ +HC 175.0 mm +""" + result = parse_clinical_indication(text, "viewpoint_text") + assert result["indication_text"] == "" + + +# --------------------------------------------------------------------- +# 
ViewPoint HL7 +# --------------------------------------------------------------------- + + +class TestClinicalIndicationViewPointHL7: + def test_basic_indication(self): + hl7 = ( + "MSH|^~\\&|\n" + "OBX||ST|RequestedProcedure.Indication^Indication|1|Advanced maternal age\n" + "OBX||ST|RequestedProcedure.Indication^Indication|2|Dating scan\n" + ) + + result = parse_clinical_indication(hl7, "viewpoint_hl7") + + assert "Advanced maternal age" in result["indication_text"] + assert "Dating scan" in result["indication_text"] + assert result["source_format"] == "viewpoint_hl7" + + def test_no_indication_obx(self): + hl7 = "MSH|^~\\&|\nOBX||NM|SomeOtherField|1|123\n" + result = parse_clinical_indication(hl7, "viewpoint_hl7") + assert result["indication_text"] == "" + + +# --------------------------------------------------------------------- +# Edge Cases +# --------------------------------------------------------------------- + + +class TestClinicalIndicationEdgeCases: + def test_invalid_format(self): + with pytest.raises(ValueError): + parse_clinical_indication("data", "unknown_format") + + def test_non_string_text(self): + with pytest.raises(ValueError): + parse_clinical_indication({"bad": "data"}, "viewpoint_text") + + def test_special_characters(self): + text = """Indication +========== +Advanced maternal age - >=35 years +""" + result = parse_clinical_indication(text, "viewpoint_text") + assert ">=35" in result["indication_text"] From 492b8541ca3589e329033a6b47087f7d35b5b433 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Wed, 21 Jan 2026 22:22:40 -0500 Subject: [PATCH 02/20] feat(etl): implement pregnancy dating section parser --- .../etl/sections/pregnancy_dating.py | 212 +++++++++++++++++- tests/etl/sections/test_pregnancy_dating.py | 106 +++++++++ 2 files changed, 306 insertions(+), 12 deletions(-) create mode 100644 tests/etl/sections/test_pregnancy_dating.py diff --git a/src/prenatalppkt/etl/sections/pregnancy_dating.py 
b/src/prenatalppkt/etl/sections/pregnancy_dating.py index 4c79114..93cf017 100644 --- a/src/prenatalppkt/etl/sections/pregnancy_dating.py +++ b/src/prenatalppkt/etl/sections/pregnancy_dating.py @@ -1,20 +1,208 @@ -""" -Pregnancy dating section parser (SKELETON). +from __future__ import annotations -TODO @VarenyaJ: Parse LMP, EDD, assigned dating method; Handle multiple dating methods (LMP, US, IVF) -""" +import json +import re +from datetime import datetime +from typing import Dict, Optional, Union -from typing import Dict +from prenatalppkt.gestational_age import GestationalAge -def parse_pregnancy_dating(data: str, source_format: str = "viewpoint_text") -> Dict: - """Extract pregnancy dating information.""" +DATE_FORMATS = ["%Y-%m-%d", "%m/%d/%Y", "%Y%m%d"] + + +def parse_pregnancy_dating(data: Union[str, Dict], source_format: str) -> Dict: + """ + Parse pregnancy dating information from ultrasound reports. + + Supported formats: + - observer_json + - viewpoint_text + - viewpoint_hl7 + """ + if source_format == "observer_json": + if isinstance(data, str): + data = json.loads(data) + result = _parse_observer_pregnancy(data) + + elif source_format == "viewpoint_text": + if not isinstance(data, str): + raise ValueError("viewpoint_text data must be a string") + result = _parse_viewpoint_text_pregnancy(data) + + elif source_format == "viewpoint_hl7": + if not isinstance(data, str): + raise ValueError("viewpoint_hl7 data must be a string") + result = _parse_viewpoint_hl7_pregnancy(data) + + else: + raise ValueError(f"Unsupported source_format: {source_format}") + + result["source_format"] = source_format + return result + + +# --------------------------------------------------------------------- +# Observer JSON +# --------------------------------------------------------------------- + + +def _parse_observer_pregnancy(json_data: Dict) -> Dict: + exam = json_data.get("exam", {}) + + lmp = exam.get("lmp") + edd = exam.get("edd") or exam.get("estimated_due_date") + 
dating_method = exam.get("dating_method") + + ga_by_lmp = _calculate_ga_from_lmp(lmp) if lmp else None + + return { + "lmp": lmp, + "edd": edd, + "assigned_edd": edd, + "dating_method": dating_method, + "ga_by_lmp": ga_by_lmp, + "ga_by_ultrasound": None, + "assigned_ga": ga_by_lmp, + "raw_data": json_data, + } + + +# --------------------------------------------------------------------- +# ViewPoint Text +# --------------------------------------------------------------------- + + +def _parse_viewpoint_text_pregnancy(text: str) -> Dict: + """ + Extract pregnancy dating from ViewPoint text reports. + + Example: + Dating + ====== + LMP 01/15/2025 + EDD by LMP 10/22/2025 + Assigned dating based on LMP + """ + lmp = None + edd = None + dating_method = None + + section = _extract_dating_section(text) + + for line in section.splitlines(): + line = line.strip() + + if line.upper().startswith("LMP"): + lmp = _parse_date_from_text(line) + + elif "EDD" in line.upper(): + edd = _parse_date_from_text(line) + + elif "ASSIGNED" in line.upper(): + dating_method = line + + ga_by_lmp = _calculate_ga_from_lmp(lmp) if lmp else None + return { - "lmp": None, - "edd": None, - "assigned_edd": None, + "lmp": lmp, + "edd": edd, + "assigned_edd": edd, + "dating_method": dating_method, + "ga_by_lmp": ga_by_lmp, + "ga_by_ultrasound": None, + "assigned_ga": ga_by_lmp, + "raw_data": {"text": text}, + } + + +def _extract_dating_section(text: str) -> str: + pattern = re.compile( + r"Dating\s*\n=+\n(?P<body>.*?)(?:\n[A-Z][^\n]*\n=+|\Z)", + re.DOTALL | re.IGNORECASE, + ) + match = pattern.search(text) + return match.group("body") if match else "" + + +# --------------------------------------------------------------------- +# ViewPoint HL7 +# --------------------------------------------------------------------- + + +def _parse_viewpoint_hl7_pregnancy(hl7: str) -> Dict: + lmp = None + edd = None + + for line in hl7.splitlines(): + if not line.startswith("OBX"): + continue + + fields = line.split("|") + if 
len(fields) < 6: + continue + + obs_id = fields[3] + value = fields[5] + + if "LastMenstrualPeriod" in obs_id: + lmp = _parse_date_string(value) + + elif "EDD" in obs_id: + edd = _parse_date_string(value) + + ga_by_lmp = _calculate_ga_from_lmp(lmp) if lmp else None + + return { + "lmp": lmp, + "edd": edd, + "assigned_edd": edd, "dating_method": None, - "ga_by_lmp": None, + "ga_by_lmp": ga_by_lmp, "ga_by_ultrasound": None, - "assigned_ga": None, + "assigned_ga": ga_by_lmp, + "raw_data": {"hl7": hl7}, } + + +# --------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------- + + +def _parse_date_from_text(text: str) -> Optional[str]: + for token in re.split(r"\s+", text): + parsed = _parse_date_string(token) + if parsed: + return parsed + return None + + +def _parse_date_string(value: str) -> Optional[str]: + value = value.split("^")[0].strip() + + # Fast reject: must contain digits + if not any(ch.isdigit() for ch in value): + return None + + for fmt in DATE_FORMATS: + parsed = _try_parse_date(value, fmt) + if parsed: + return parsed + + return None + + +def _try_parse_date(value: str, fmt: str) -> Optional[str]: + try: + return datetime.strptime(value, fmt).date().isoformat() + except ValueError: + return None + + +def _calculate_ga_from_lmp(lmp_iso: str) -> Optional[Dict]: + try: + ga = GestationalAge.from_lmp(lmp_iso) + return {"weeks": ga.weeks, "days": ga.days} + except Exception: + return None diff --git a/tests/etl/sections/test_pregnancy_dating.py b/tests/etl/sections/test_pregnancy_dating.py new file mode 100644 index 0000000..74e68f0 --- /dev/null +++ b/tests/etl/sections/test_pregnancy_dating.py @@ -0,0 +1,106 @@ +import json +import pytest + +from prenatalppkt.etl.sections.pregnancy_dating import parse_pregnancy_dating + + +# --------------------------------------------------------------------- +# Observer JSON +# 
--------------------------------------------------------------------- + + +class TestPregnancyDatingObserver: + def test_basic_lmp_and_edd(self): + data = json.dumps( + {"exam": {"lmp": "2025-01-15", "edd": "2025-10-22", "dating_method": "LMP"}} + ) + + result = parse_pregnancy_dating(data, "observer_json") + + assert result["lmp"] == "2025-01-15" + assert result["edd"] == "2025-10-22" + assert result["ga_by_lmp"] is None + assert result["source_format"] == "observer_json" + + def test_missing_dates(self): + data = json.dumps({"exam": {}}) + result = parse_pregnancy_dating(data, "observer_json") + assert result["lmp"] is None + assert result["ga_by_lmp"] is None + + +# --------------------------------------------------------------------- +# ViewPoint Text +# --------------------------------------------------------------------- + + +class TestPregnancyDatingViewPointText: + def test_basic_dating_section(self): + text = """Dating +====== +LMP 01/15/2025 +EDD by LMP 10/22/2025 +Assigned dating based on LMP +""" + + result = parse_pregnancy_dating(text, "viewpoint_text") + + assert result["lmp"] == "2025-01-15" + assert result["edd"] == "2025-10-22" + assert result["ga_by_lmp"] is None + assert "Assigned" in result["dating_method"] + + def test_missing_dating_section(self): + text = "Fetal Biometry\n============\nHC 175 mm" + result = parse_pregnancy_dating(text, "viewpoint_text") + assert result["lmp"] is None + assert result["edd"] is None + + +# --------------------------------------------------------------------- +# ViewPoint HL7 +# --------------------------------------------------------------------- + + +class TestPregnancyDatingViewPointHL7: + def test_basic_hl7_dates(self): + hl7 = ( + "OBX||DT|EpisodeHistory.LastMenstrualPeriod^LMP|1|20250115\n" + "OBX||DT|EpisodeHistory.EDDbyLMP^EDD|1|20251022\n" + ) + + result = parse_pregnancy_dating(hl7, "viewpoint_hl7") + + assert result["lmp"] == "2025-01-15" + assert result["edd"] == "2025-10-22" + assert 
result["ga_by_lmp"] is None + + def test_no_dates(self): + hl7 = "OBX||NM|SomeOtherField|1|123\n" + result = parse_pregnancy_dating(hl7, "viewpoint_hl7") + assert result["lmp"] is None + assert result["ga_by_lmp"] is None + + +# --------------------------------------------------------------------- +# Edge Cases +# --------------------------------------------------------------------- + + +class TestPregnancyDatingEdgeCases: + def test_invalid_format(self): + with pytest.raises(ValueError): + parse_pregnancy_dating("data", "bad_format") + + def test_non_string_text(self): + with pytest.raises(ValueError): + parse_pregnancy_dating({"bad": "data"}, "viewpoint_text") + + def test_malformed_dates(self): + text = """Dating + ====== + LMP not-a-date + """ + result = parse_pregnancy_dating(text, "viewpoint_text") + assert result["lmp"] is None + assert result["ga_by_lmp"] is None From f22926a9e74ade1ea6e71db30d4caa8374b9f4d1 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Wed, 21 Jan 2026 22:22:47 -0500 Subject: [PATCH 03/20] feat(etl): implement clinical impression section parser --- .../etl/sections/clinical_impression.py | 173 ++++++++++++------ .../etl/sections/test_clinical_impression.py | 87 +++++++++ 2 files changed, 208 insertions(+), 52 deletions(-) create mode 100644 tests/etl/sections/test_clinical_impression.py diff --git a/src/prenatalppkt/etl/sections/clinical_impression.py b/src/prenatalppkt/etl/sections/clinical_impression.py index 4925e67..86038a4 100644 --- a/src/prenatalppkt/etl/sections/clinical_impression.py +++ b/src/prenatalppkt/etl/sections/clinical_impression.py @@ -1,66 +1,135 @@ -""" -Clinical impression section parser (SKELETON). +from __future__ import annotations -Extracts clinical impressions, diagnoses, and findings from report impression. 
+import json +import re +from typing import Dict, List, Optional, Union -TODO @VarenyaJ: Complete implementation, Map clinical findings to HPO terms, Extract structured anomalies from impression text -""" +from prenatalppkt.hpo import HpoParser -from typing import Dict - -def parse_clinical_impression(data: str, source_format: str = "viewpoint_text") -> Dict: +def parse_clinical_impression( + data: Union[str, Dict], source_format: str, hpo_parser: Optional[HpoParser] = None +) -> Dict: """ - Extract clinical impression from ultrasound report. - - Args: - data: Report content (text, JSON, or HL7) - source_format: One of "observer_json", "viewpoint_text", "viewpoint_hl7" - - Returns: - Dict with keys: - - impression_text: str - Full impression narrative - - diagnoses: List[str] - Identified diagnoses - - anomalies: List[Dict] - Structured anomaly data - - gestational_age_assessment: str - GA conclusion - - growth_assessment: str - Fetal growth conclusion - - recommendations: List[str] - Follow-up recommendations - - hpo_terms: List[str] - Mapped HPO term IDs (FUTURE) - - TODO @VarenyaJ Implementation Steps: - 1. Locate impression section: - - ViewPoint Text: "Impression" section after "=========" - - Observer JSON: exam.finalize.generalComment.plain_text - - ViewPoint HL7: May be in RequestedProcedure or exam notes - 2. Parse free-text impression for key findings - 3. Extract anomalies: - - Observer JSON: fetuses[].anatomy[].anomalies[] - - Text: Look for patterns like "consistent with", "suggestive of" - 4. Identify growth conclusions (FGR, LGA, AGA) - 5. Extract recommendations for follow-up - 6. 
Map findings to HPO terms: - - Use src/prenatalppkt/hpo.cr_fetal_findings - - Handle synonyms and varied clinical language - - TODO @VarenyaJ: DO NOT: - - Assume impression section exists (optional in all formats) - - Parse impression without context (may reference biometry results) - - Miss negative findings (e.g., "no evidence of...") - - Ignore severity qualifiers (mild, moderate, severe) + Parse clinical impression / interpretation section. + + Supports: + - observer_json + - viewpoint_text + - viewpoint_hl7 """ - # SKELETON: Return empty structure + if hpo_parser is None: + hpo_parser = HpoParser() + + if source_format == "observer_json": + if isinstance(data, str): + data = json.loads(data) + impression_text = _parse_observer_impression(data) + + elif source_format == "viewpoint_text": + if not isinstance(data, str): + raise ValueError("viewpoint_text data must be a string") + impression_text = _parse_viewpoint_text_impression(data) + + elif source_format == "viewpoint_hl7": + if not isinstance(data, str): + raise ValueError("viewpoint_hl7 data must be a string") + impression_text = _parse_viewpoint_hl7_impression(data) + + else: + raise ValueError(f"Unsupported source_format: {source_format}") + + if impression_text and hasattr(hpo_parser, "extract"): + hpo_terms = hpo_parser.extract(impression_text) + else: + hpo_terms = [] + return { - "impression_text": "", + "impression_text": impression_text, "diagnoses": [], "anomalies": [], "gestational_age_assessment": None, - "growth_assessment": None, + "growth_assessment": _infer_growth_assessment(impression_text), "recommendations": [], - "hpo_terms": [], # FUTURE + "hpo_terms": hpo_terms, + "source_format": source_format, } -# TODO @VarenyaJ: Add helper functions: -# - _extract_anomalies_from_text(text: str) -> List[Dict] -# - _classify_growth_assessment(text: str) -> str -# - _extract_recommendations(text: str) -> List[str] +# --------------------------------------------------------------------- +# Observer 
JSON +# --------------------------------------------------------------------- + + +def _parse_observer_impression(json_data: Dict) -> str: + exam = json_data.get("exam", {}) + finalize = exam.get("finalize", {}) + + return finalize.get("generalComment", {}).get("plain_text", "").strip() + + +# --------------------------------------------------------------------- +# ViewPoint Text +# --------------------------------------------------------------------- + + +def _parse_viewpoint_text_impression(text: str) -> str: + """ + Impression + ========== + Free text narrative + """ + pattern = re.compile( + r"Impression\s*\n=+\n(?P<body>.*?)(?:\n[A-Z][^\n]*\n=+|\Z)", + re.DOTALL | re.IGNORECASE, + ) + + match = pattern.search(text) + return match.group("body").strip() if match else "" + + +# --------------------------------------------------------------------- +# ViewPoint HL7 +# --------------------------------------------------------------------- + + +def _parse_viewpoint_hl7_impression(hl7: str) -> str: + lines: List[str] = [] + + for line in hl7.splitlines(): + if not line.startswith("OBX"): + continue + + fields = line.split("|") + if len(fields) < 6: + continue + + obs_id = fields[3] + value = fields[5].split("^")[0].strip() + + if "Impression" in obs_id or "Interpretation" in obs_id: + if value: + lines.append(value) + + return " ".join(lines) + + +# --------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------- + + +def _infer_growth_assessment(text: str) -> Optional[str]: + if not text: + return None + + text_lower = text.lower() + + if "growth restriction" in text_lower or "fgr" in text_lower: + return "FGR" + if "large for gestational age" in text_lower or "lga" in text_lower: + return "LGA" + if "appropriate for gestational age" in text_lower or "aga" in text_lower: + return "AGA" + + return None diff --git a/tests/etl/sections/test_clinical_impression.py 
b/tests/etl/sections/test_clinical_impression.py new file mode 100644 index 0000000..715a2e1 --- /dev/null +++ b/tests/etl/sections/test_clinical_impression.py @@ -0,0 +1,87 @@ +import json +import pytest + +from prenatalppkt.etl.sections.clinical_impression import parse_clinical_impression + + +# --------------------------------------------------------------------- +# Observer JSON +# --------------------------------------------------------------------- + + +class TestClinicalImpressionObserver: + def test_basic_impression(self, hpo_cr): + data = json.dumps( + { + "exam": { + "finalize": { + "generalComment": { + "plain_text": "Normal fetal anatomy. No abnormalities." + } + } + } + } + ) + + result = parse_clinical_impression(data, "observer_json", hpo_parser=hpo_cr) + + assert "Normal fetal anatomy" in result["impression_text"] + assert result["hpo_terms"] == [] + assert result["source_format"] == "observer_json" + + +# --------------------------------------------------------------------- +# ViewPoint Text +# --------------------------------------------------------------------- + + +class TestClinicalImpressionViewPointText: + def test_basic_impression(self, hpo_cr): + text = """Impression +========= +Fetal growth restriction is suspected. +Recommend follow-up scan. 
+""" + + result = parse_clinical_impression(text, "viewpoint_text", hpo_parser=hpo_cr) + + assert "growth restriction" in result["impression_text"].lower() + assert result["growth_assessment"] == "FGR" + assert isinstance(result["hpo_terms"], list) + + def test_missing_impression(self, hpo_cr): + text = "Fetal Biometry\n============\nHC 175 mm" + result = parse_clinical_impression(text, "viewpoint_text", hpo_parser=hpo_cr) + assert result["impression_text"] == "" + + +# --------------------------------------------------------------------- +# ViewPoint HL7 +# --------------------------------------------------------------------- + + +class TestClinicalImpressionViewPointHL7: + def test_basic_hl7_impression(self, hpo_cr): + hl7 = "OBX||TX|Impression^Impression|1|Appropriate for gestational age\n" + + result = parse_clinical_impression(hl7, "viewpoint_hl7", hpo_parser=hpo_cr) + + assert "Appropriate" in result["impression_text"] + assert result["growth_assessment"] == "AGA" + + +# --------------------------------------------------------------------- +# Edge Cases +# --------------------------------------------------------------------- + + +class TestClinicalImpressionEdgeCases: + def test_invalid_format(self, hpo_cr): + with pytest.raises(ValueError): + parse_clinical_impression("data", "bad_format", hpo_parser=hpo_cr) + + def test_non_string_text(self, hpo_cr): + with pytest.raises(ValueError): + parse_clinical_impression( + {"bad": "data"}, "viewpoint_text", hpo_parser=hpo_cr + ) From bcd527fcdd826adcbd841307e2301db03c6d3a39 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Fri, 23 Jan 2026 10:19:20 -0500 Subject: [PATCH 04/20] WIP --- .../etl/sections/clinical_impression.py | 231 +++++++++++------- 1 file changed, 144 insertions(+), 87 deletions(-) diff --git a/src/prenatalppkt/etl/sections/clinical_impression.py b/src/prenatalppkt/etl/sections/clinical_impression.py index 86038a4..0003973 100644 --- a/src/prenatalppkt/etl/sections/clinical_impression.py +++ 
b/src/prenatalppkt/etl/sections/clinical_impression.py @@ -1,59 +1,80 @@ +""" +Clinical impression / interpretation section parser. + +Extracts clinical narrative text and optionally extracts HPO terms +from free text using the HPO Concept Recognizer. +""" + from __future__ import annotations import json import re from typing import Dict, List, Optional, Union -from prenatalppkt.hpo import HpoParser - def parse_clinical_impression( - data: Union[str, Dict], source_format: str, hpo_parser: Optional[HpoParser] = None + data: Union[str, Dict], source_format: str, hpo_cr=None ) -> Dict: - """ - Parse clinical impression / interpretation section. - - Supports: - - observer_json - - viewpoint_text - - viewpoint_hl7 - """ - if hpo_parser is None: - hpo_parser = HpoParser() - - if source_format == "observer_json": - if isinstance(data, str): - data = json.loads(data) - impression_text = _parse_observer_impression(data) - - elif source_format == "viewpoint_text": - if not isinstance(data, str): - raise ValueError("viewpoint_text data must be a string") - impression_text = _parse_viewpoint_text_impression(data) - - elif source_format == "viewpoint_hl7": - if not isinstance(data, str): - raise ValueError("viewpoint_hl7 data must be a string") - impression_text = _parse_viewpoint_hl7_impression(data) - - else: - raise ValueError(f"Unsupported source_format: {source_format}") - - if impression_text and hasattr(hpo_parser, "extract"): - hpo_terms = hpo_parser.extract(impression_text) - else: - hpo_terms = [] - - return { - "impression_text": impression_text, - "diagnoses": [], - "anomalies": [], - "gestational_age_assessment": None, - "growth_assessment": _infer_growth_assessment(impression_text), - "recommendations": [], - "hpo_terms": hpo_terms, - "source_format": source_format, - } + """ + Parse clinical impression / interpretation section. 
+ + Supports: + - observer_json + - viewpoint_text + - viewpoint_hl7 + + Args: + data: Raw input data (JSON string, dict, or text) + source_format: One of "observer_json", "viewpoint_text", "viewpoint_hl7" + hpo_cr: Optional HpoExactConceptRecognizer for HPO term extraction. + If provided, will extract HPO terms from impression text. + + Returns: + Dict with keys: + - impression_text: str - Full impression narrative + - diagnoses: List[str] - Identified diagnoses (future) + - anomalies: List[Dict] - Structured anomaly data (future) + - gestational_age_assessment: Optional[str] - GA conclusion + - growth_assessment: Optional[str] - FGR, LGA, AGA, or None + - recommendations: List[str] - Follow-up recommendations (future) + - hpo_terms: List[SimpleTerm] - HPO terms extracted via CR + - source_format: str + """ + if source_format == "observer_json": + if isinstance(data, str): + data = json.loads(data) + impression_text = _parse_observer_impression(data) + + elif source_format == "viewpoint_text": + if not isinstance(data, str): + raise ValueError("viewpoint_text data must be a string") + impression_text = _parse_viewpoint_text_impression(data) + + elif source_format == "viewpoint_hl7": + if not isinstance(data, str): + raise ValueError("viewpoint_hl7 data must be a string") + impression_text = _parse_viewpoint_hl7_impression(data) + + else: + raise ValueError(f"Unsupported source_format: {source_format}") + + # Extract HPO terms if concept recognizer is provided + hpo_terms = [] + if impression_text and hpo_cr is not None: + # HpoExactConceptRecognizer uses parse() method, not extract() + if hasattr(hpo_cr, "parse"): + hpo_terms = hpo_cr.parse(impression_text) + + return { + "impression_text": impression_text, + "diagnoses": [], + "anomalies": [], + "gestational_age_assessment": None, + "growth_assessment": _infer_growth_assessment(impression_text), + "recommendations": [], + "hpo_terms": hpo_terms, + "source_format": source_format, + } # 
--------------------------------------------------------------------- @@ -62,10 +83,28 @@ def parse_clinical_impression( def _parse_observer_impression(json_data: Dict) -> str: - exam = json_data.get("exam", {}) - finalize = exam.get("finalize", {}) + """ + Extract impression from Observer JSON. + + The finalize block can be at: + - Root level: json_data["finalize"]["generalComment"]["plain_text"] + - Under exam: json_data["exam"]["finalize"]["generalComment"]["plain_text"] + + We check the root level first (most common), then fall back to exam. + """ + impression = "" + + # Check root level first (this is where Apple_Sally has it) + finalize = json_data.get("finalize", {}) + impression = finalize.get("generalComment", {}).get("plain_text", "").strip() - return finalize.get("generalComment", {}).get("plain_text", "").strip() + # Fall back to exam.finalize if not found at root + if not impression: + exam = json_data.get("exam", {}) + finalize = exam.get("finalize", {}) + impression = finalize.get("generalComment", {}).get("plain_text", "").strip() + + return impression # --------------------------------------------------------------------- @@ -74,18 +113,21 @@ def _parse_observer_impression(json_data: Dict) -> str: def _parse_viewpoint_text_impression(text: str) -> str: - """ - Impression - ========== - Free text narrative - """ - pattern = re.compile( - r"Impression\s*\n=+\n(?P<body>.*?)(?:\n[A-Z][^\n]*\n=+|\Z)", - re.DOTALL | re.IGNORECASE, - ) + """ + Extract impression from ViewPoint text reports. 
+ + Expected pattern: + Impression + ========== + [free text narrative] + """ + pattern = re.compile( + r"Impression\s*\n=+\n(?P<body>.*?)(?:\n[A-Z][^\n]*\n=+|\Z)", + re.DOTALL | re.IGNORECASE, + ) - match = pattern.search(text) - return match.group("body").strip() if match else "" + match = pattern.search(text) + return match.group("body").strip() if match else "" # --------------------------------------------------------------------- @@ -94,24 +136,30 @@ def _parse_viewpoint_text_impression(text: str) -> str: def _parse_viewpoint_hl7_impression(hl7: str) -> str: - lines: List[str] = [] + """ + Extract impression from HL7 ORU^R01 messages. + + Looks for OBX segments containing "Impression" or "Interpretation" + in the observation identifier field. + """ + lines: List[str] = [] - for line in hl7.splitlines(): - if not line.startswith("OBX"): - continue + for line in hl7.splitlines(): + if not line.startswith("OBX"): + continue - fields = line.split("|") - if len(fields) < 6: - continue + fields = line.split("|") + if len(fields) < 6: + continue - obs_id = fields[3] - value = fields[5].split("^")[0].strip() + obs_id = fields[3] + value = fields[5].split("^")[0].strip() - if "Impression" in obs_id or "Interpretation" in obs_id: - if value: - lines.append(value) + if "Impression" in obs_id or "Interpretation" in obs_id: + if value: + lines.append(value) - return " ".join(lines) + return " ".join(lines) # --------------------------------------------------------------------- @@ -120,16 +168,25 @@ def _parse_viewpoint_hl7_impression(hl7: str) -> str: def _infer_growth_assessment(text: str) -> Optional[str]: - if not text: - return None - - text_lower = text.lower() - - if "growth restriction" in text_lower or "fgr" in text_lower: - return "FGR" - if "large for gestational age" in text_lower or "lga" in text_lower: - return "LGA" - if "appropriate for gestational age" in text_lower or "aga" in text_lower: - return "AGA" - - return None + """ + Infer fetal growth assessment from 
impression text. + + Returns: + "FGR" - Fetal Growth Restriction + "LGA" - Large for Gestational Age + "AGA" - Appropriate for Gestational Age + None - No assessment detected + """ + if not text: + return None + + text_lower = text.lower() + + if "growth restriction" in text_lower or "fgr" in text_lower: + return "FGR" + if "large for gestational age" in text_lower or "lga" in text_lower: + return "LGA" + if "appropriate for gestational age" in text_lower or "aga" in text_lower: + return "AGA" + + return None \ No newline at end of file From 07a2cb9a5ba95c24524d2e72174c7b8a038116a7 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Fri, 23 Jan 2026 10:20:08 -0500 Subject: [PATCH 05/20] add current draft of notebook --- prenatalppkt.ipynb | 1525 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1525 insertions(+) create mode 100644 prenatalppkt.ipynb diff --git a/prenatalppkt.ipynb b/prenatalppkt.ipynb new file mode 100644 index 0000000..2426c29 --- /dev/null +++ b/prenatalppkt.ipynb @@ -0,0 +1,1525 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8b89cf52", + "metadata": {}, + "source": [ + "# Demo" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d8f2cfce", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for head_circumference\n", + "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for biparietal_diameter\n", + "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for femur_length\n", + "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for abdominal_circumference\n", + "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for occipitofrontal_diameter\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Loaded mappings for: ['head_circumference', 'biparietal_diameter', 'femur_length', 'abdominal_circumference', 'occipitofrontal_diameter']\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Starting Observer JSON extraction\n", + 
"DEBUG:prenatalppkt.etl.extractors.observer:Processing fetus 1\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Found 6 measurements\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: AC\n", + "DEBUG:prenatalppkt.etl.extractors.observer:AC has percentile=55.6% (valid)\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for AC: value=226.20000000000002mm, percentile=55.6%, ga=\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=AC, value=226.20000000000002mm, percentile=55.6%, ga=, method=None\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Selected HPO: HP:0034207 - Abnormal fetal gastrointestinal system morphology\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Created TermBin: HP:0034207 - normal=True\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: BPD\n", + "DEBUG:prenatalppkt.etl.extractors.observer:BPD has percentile=51.2% (valid)\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for BPD: value=66.8mm, percentile=51.2%, ga=\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=BPD, value=66.8mm, percentile=51.2%, ga=, method=None\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Selected HPO: HP:0000240 - Abnormality of skull size\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Created TermBin: HP:0000240 - normal=True\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: HC\n", + "DEBUG:prenatalppkt.etl.extractors.observer:HC has percentile=42.5% (valid)\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for HC: value=250.0mm, percentile=42.5%, ga=\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=HC, value=250.0mm, percentile=42.5%, ga=, method=None\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Selected HPO: HP:0000240 - Abnormality of skull size\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Created TermBin: HP:0000240 - normal=True\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing 
measurement: Femur\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Femur has percentile=46.8% (valid)\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for Femur: value=50.099999999999994mm, percentile=46.8%, ga=\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=Femur, value=50.099999999999994mm, percentile=46.8%, ga=, method=None\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Selected HPO: HP:0002823 - Abnormal femur morphology\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Created TermBin: HP:0002823 - normal=True\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: Nuchal Fold\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Nuchal Fold has percentile=0% (valid)\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for Nuchal Fold: value=10.0mm, percentile=0%, ga=\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=Nuchal Fold, value=10.0mm, percentile=0.0%, ga=, method=None\n", + "WARNING:prenatalppkt.etl.term_bin_factory:No HPO mapping for optional measurement 'Nuchal Fold' - skipping. TODO(@VarenyaJ): Add HPO terms when available\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: Cerebellum\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Cerebellum has percentile=0% (valid)\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for Cerebellum: value=30.0mm, percentile=0%, ga=\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=Cerebellum, value=30.0mm, percentile=0.0%, ga=, method=None\n", + "WARNING:prenatalppkt.etl.term_bin_factory:No HPO mapping for optional measurement 'Cerebellum' - skipping. 
TODO(@VarenyaJ): Add HPO terms when available\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Successfully parsed 4 measurements\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Validating 4 TermBins for required measurements\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: AC\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: BPD\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: HC\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: Femur\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Present: {'Femur', 'AC', 'BPD', 'HC'}\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Required: {'Femur', 'AC', 'BPD', 'HC'}\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:All required measurements present\n", + "INFO:prenatalppkt.etl.extractors.observer:Extracted 4 TermBins from Observer JSON\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "================================================================================\n", + "PRENATALPPKT ETL PIPELINE\n", + "Observer JSON → TermBins → Phenopacket v2.0\n", + "================================================================================\n", + "\n", + " STEP 1: Loading Observer JSON...\n", + "Loaded: tests/data/Apple_Sally_pretty.json\n", + "Fetuses: 1\n", + "Measurements: 6\n", + "Sample: AC = 22.62 cm\n", + "\n", + " STEP 2: Extracting biometry measurements to TermBins...\n", + " Extracted 4 TermBins\n", + "\n", + " [1] AC: 226.2 mm (55.6%) at 26w6d [Fetus 1]\n", + " HPO: HP:0034207 - Abnormal fetal gastrointestinal system morphology\n", + " Normal: True\n", + "\n", + " [2] BPD: 66.8 mm (51.2%) at 26w6d [Fetus 1]\n", + " HPO: HP:0000240 - Abnormality of skull size\n", + " Normal: True\n", + "\n", + " [3] HC: 250.0 mm (42.5%) at 26w6d [Fetus 1]\n", + " HPO: HP:0000240 - Abnormality of skull size\n", + " Normal: True\n", + "\n", + " [4] Femur: 50.1 mm (46.8%) at 27w0d [Fetus 1]\n", + " HPO: HP:0002823 - Abnormal femur morphology\n", + " 
Normal: True\n", + "\n", + " STEP 3: Converting TermBins to PhenotypicFeatures...\n", + " Generated 4 PhenotypicFeatures\n", + "\n", + " [1] HP:0034207\n", + " Status: EXCLUDED (normal)\n", + " Description: AC: 226.2 mm (55.6%) at 26w6d [Fetus 1]\n", + "\n", + " [2] HP:0000240\n", + " Status: EXCLUDED (normal)\n", + " Description: BPD: 66.8 mm (51.2%) at 26w6d [Fetus 1]\n", + "\n", + " [3] HP:0000240\n", + " Status: EXCLUDED (normal)\n", + " Description: HC: 250.0 mm (42.5%) at 26w6d [Fetus 1]\n", + "\n", + " [4] HP:0002823\n", + " Status: EXCLUDED (normal)\n", + " Description: Femur: 50.1 mm (46.8%) at 27w0d [Fetus 1]\n", + "\n", + " STEP 4: Building Phenopacket v2.0...\n", + "✓ Phenopacket created successfully\n", + "\n", + "================================================================================\n", + " PHENOPACKET v2.0 OUTPUT (JSON)\n", + "================================================================================\n", + "{\n", + " \"id\": \"apple-sally-fetus-1\",\n", + " \"subject\": {\n", + " \"id\": \"fetus-1\",\n", + " \"time_at_last_encounter\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", + " },\n", + " \"phenotypic_features\": [\n", + " {\n", + " \"description\": \"AC: 226.2 mm (55.6%) at 26w6d [Fetus 1]\",\n", + " \"type\": {\n", + " \"id\": \"HP:0034207\",\n", + " \"label\": \"Abnormal fetal gastrointestinal system morphology\"\n", + " },\n", + " \"excluded\": true,\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"description\": \"BPD: 66.8 mm (51.2%) at 26w6d [Fetus 1]\",\n", + " \"type\": {\n", + " \"id\": \"HP:0000240\",\n", + " \"label\": \"Abnormality of skull size\"\n", + " },\n", + " \"excluded\": true,\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"description\": \"HC: 250.0 mm (42.5%) 
at 26w6d [Fetus 1]\",\n", + " \"type\": {\n", + " \"id\": \"HP:0000240\",\n", + " \"label\": \"Abnormality of skull size\"\n", + " },\n", + " \"excluded\": true,\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"description\": \"Femur: 50.1 mm (46.8%) at 27w0d [Fetus 1]\",\n", + " \"type\": {\n", + " \"id\": \"HP:0002823\",\n", + " \"label\": \"Abnormal femur morphology\"\n", + " },\n", + " \"excluded\": true,\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 27\n", + " }\n", + " }\n", + " }\n", + " ],\n", + " \"meta_data\": {\n", + " \"created\": \"2026-01-23T14:56:52.244568Z\",\n", + " \"created_by\": \"prenatalppkt-etl-pipeline\",\n", + " \"resources\": [\n", + " {\n", + " \"id\": \"hp\",\n", + " \"name\": \"Human Phenotype Ontology\",\n", + " \"url\": \"http://purl.obolibrary.org/obo/hp.owl\",\n", + " \"version\": \"2025-11-24\",\n", + " \"namespace_prefix\": \"HP\",\n", + " \"iri_prefix\": \"http://purl.obolibrary.org/obo/HP_\"\n", + " }\n", + " ],\n", + " \"phenopacket_schema_version\": \"2.0\"\n", + " }\n", + "}\n", + "\n", + "================================================================================\n", + " VALIDATION SUMMARY\n", + "================================================================================\n", + "\n", + " Phenopacket Structure:\n", + " ID: apple-sally-fetus-1\n", + " Subject ID: fetus-1\n", + " Subject GA: 26w6d\n", + " Sex: UNKNOWN_SEX\n", + " Phenotypic Features: 4\n", + " Schema Version: 2.0\n", + " HPO Resource: 2025-11-24\n", + "\n", + " Phenotypic Features Detail:\n", + "\n", + " [1] HP:0034207 - Abnormal fetal gastrointestinal system morphology\n", + " Normal (excluded)\n", + " Onset: 26w6d\n", + " Detail: AC: 226.2 mm (55.6%) at 26w6d [Fetus 1]\n", + "\n", + " [2] HP:0000240 - Abnormality of skull size\n", + " Normal (excluded)\n", + " Onset: 26w6d\n", + " Detail: BPD: 66.8 mm (51.2%) at 26w6d [Fetus 
1]\n", + "\n", + " [3] HP:0000240 - Abnormality of skull size\n", + " Normal (excluded)\n", + " Onset: 26w6d\n", + " Detail: HC: 250.0 mm (42.5%) at 26w6d [Fetus 1]\n", + "\n", + " [4] HP:0002823 - Abnormal femur morphology\n", + " Normal (excluded)\n", + " Onset: 27w0d\n", + " Detail: Femur: 50.1 mm (46.8%) at 27w0d [Fetus 1]\n", + "\n", + " Summary Statistics:\n", + " Total features: 4\n", + " Normal (excluded): 4\n", + " Abnormal (observed): 0\n", + "\n", + "================================================================================\n", + " SUCCESS: Valid Phenopacket v2.0 generated\n", + "================================================================================\n", + "\n", + " Phenopacket saved to: output/apple_sally_phenopacket_v2.json\n", + "\n", + " Validation: Round-trip test...\n", + " Validation passed\n" + ] + } + ], + "source": [ + "# Initial Demo\n", + "\"\"\"\n", + "PRENATALPPKT ETL PIPELINE\n", + "Observer JSON → TermBins → Phenopacket v2.0\n", + "\n", + "Uses the official GA4GH phenopackets library per:\n", + "https://phenopacket-schema.readthedocs.io/en/latest/python.html\n", + "\"\"\"\n", + "\n", + "import json\n", + "import re\n", + "from datetime import datetime, timezone\n", + "from pathlib import Path\n", + "\n", + "from google.protobuf.json_format import MessageToJson\n", + "from google.protobuf.timestamp_pb2 import Timestamp\n", + "import phenopackets.schema.v2 as pps2\n", + "\n", + "from prenatalppkt.etl.extractors import observer\n", + "from prenatalppkt.gestational_age import GestationalAge\n", + "\n", + "print(\"=\" * 80)\n", + "print(\"PRENATALPPKT ETL PIPELINE\")\n", + "print(\"Observer JSON → TermBins → Phenopacket v2.0\")\n", + "print(\"=\" * 80)\n", + "\n", + "# -----------------------------------------------------------------------------\n", + "# STEP 1: Load Apple Sally Observer JSON\n", + "# -----------------------------------------------------------------------------\n", + "print(\"\\n STEP 1: Loading Observer 
JSON...\")\n", + "\n", + "data_path = Path(\"tests/data/Apple_Sally_pretty.json\")\n", + "with open(data_path) as f:\n", + " observer_data = json.load(f)\n", + "\n", + "print(f\"Loaded: {data_path}\")\n", + "print(f\"Fetuses: {len(observer_data.get('fetuses', []))}\")\n", + "\n", + "first_fetus = observer_data[\"fetuses\"][0]\n", + "measurements = first_fetus.get(\"measurements\", [])\n", + "print(f\"Measurements: {len(measurements)}\")\n", + "print(\n", + " f\"Sample: {measurements[0]['label']} = \"\n", + " f\"{measurements[0]['value']} {measurements[0]['unit_of_measure']}\"\n", + ")\n", + "\n", + "# -----------------------------------------------------------------------------\n", + "# STEP 2: Extract TermBins using Observer extractor\n", + "# -----------------------------------------------------------------------------\n", + "print(\"\\n STEP 2: Extracting biometry measurements to TermBins...\")\n", + "\n", + "term_bins = observer.extract(observer_data)\n", + "print(f\" Extracted {len(term_bins)} TermBins\")\n", + "\n", + "for i, tb in enumerate(term_bins, 1):\n", + " print(f\"\\n [{i}] {tb.description}\")\n", + " print(f\" HPO: {tb.hpo_id} - {tb.hpo_label}\")\n", + " print(f\" Normal: {tb.normal}\")\n", + "\n", + "# -----------------------------------------------------------------------------\n", + "# STEP 3: Convert TermBins → Phenotypic Features (using phenopackets library)\n", + "# -----------------------------------------------------------------------------\n", + "print(\"\\n STEP 3: Converting TermBins to PhenotypicFeatures...\")\n", + "\n", + "\n", + "def parse_ga_from_description(description: str) -> tuple[int, int]:\n", + " \"\"\"Extract weeks and days from TermBin description.\"\"\"\n", + " match = re.search(r\"at (\\d+)w(\\d+)d\", description)\n", + " if match:\n", + " return int(match.group(1)), int(match.group(2))\n", + " # Fallback\n", + " first_m = observer_data[\"fetuses\"][0][\"measurements\"][0]\n", + " ga = 
GestationalAge.from_weeks(first_m.get(\"calculated_ega\", 26.9))\n", + " return ga.weeks, ga.days\n", + "\n", + "\n", + "phenotypic_features = []\n", + "\n", + "for tb in term_bins:\n", + " weeks, days = parse_ga_from_description(tb.description)\n", + "\n", + " # Create GestationalAge message\n", + " gestational_age = pps2.GestationalAge(weeks=weeks, days=days)\n", + "\n", + " # Create TimeElement with gestational_age\n", + " onset = pps2.TimeElement(gestational_age=gestational_age)\n", + "\n", + " # Create OntologyClass for the HPO term\n", + " hpo_type = pps2.OntologyClass(id=tb.hpo_id, label=tb.hpo_label)\n", + "\n", + " # Create PhenotypicFeature\n", + " pf = pps2.PhenotypicFeature(\n", + " type=hpo_type,\n", + " excluded=tb.normal, # If normal=True, abnormality is excluded\n", + " onset=onset,\n", + " description=tb.description,\n", + " )\n", + "\n", + " phenotypic_features.append(pf)\n", + "\n", + "print(f\" Generated {len(phenotypic_features)} PhenotypicFeatures\")\n", + "\n", + "for i, pf in enumerate(phenotypic_features, 1):\n", + " status = \"EXCLUDED (normal)\" if pf.excluded else \"OBSERVED (abnormal)\"\n", + " print(f\"\\n [{i}] {pf.type.id}\")\n", + " print(f\" Status: {status}\")\n", + " print(f\" Description: {pf.description}\")\n", + "\n", + "# -----------------------------------------------------------------------------\n", + "# STEP 4: Build Complete Phenopacket v2.0\n", + "# -----------------------------------------------------------------------------\n", + "print(\"\\n STEP 4: Building Phenopacket v2.0...\")\n", + "\n", + "# Get subject GA from first measurement\n", + "first_measurement = observer_data[\"fetuses\"][0][\"measurements\"][0]\n", + "subject_ga_weeks = first_measurement.get(\"calculated_ega\", 26.9)\n", + "subject_ga = GestationalAge.from_weeks(subject_ga_weeks)\n", + "\n", + "# Create Individual (subject) with GestationalAge\n", + "subject_time = pps2.TimeElement(\n", + " gestational_age=pps2.GestationalAge(weeks=subject_ga.weeks, 
days=subject_ga.days)\n", + ")\n", + "\n", + "subject = pps2.Individual(\n", + " id=\"fetus-1\",\n", + " sex=pps2.Sex.UNKNOWN_SEX,\n", + " time_at_last_encounter=subject_time,\n", + ")\n", + "\n", + "# Create timestamp for metadata\n", + "now = datetime.now(timezone.utc)\n", + "created_timestamp = Timestamp()\n", + "created_timestamp.FromDatetime(now)\n", + "\n", + "# Create HPO Resource\n", + "hpo_resource = pps2.Resource(\n", + " id=\"hp\",\n", + " name=\"Human Phenotype Ontology\",\n", + " url=\"http://purl.obolibrary.org/obo/hp.owl\",\n", + " version=\"2025-11-24\",\n", + " namespace_prefix=\"HP\",\n", + " iri_prefix=\"http://purl.obolibrary.org/obo/HP_\",\n", + ")\n", + "\n", + "# Create MetaData\n", + "metadata = pps2.MetaData(\n", + " created=created_timestamp,\n", + " created_by=\"prenatalppkt-etl-pipeline\",\n", + " phenopacket_schema_version=\"2.0\",\n", + ")\n", + "metadata.resources.append(hpo_resource)\n", + "\n", + "# Create the Phenopacket\n", + "phenopacket = pps2.Phenopacket(\n", + " id=\"apple-sally-fetus-1\",\n", + " subject=subject,\n", + " meta_data=metadata,\n", + ")\n", + "phenopacket.phenotypic_features.extend(phenotypic_features)\n", + "\n", + "print(\"✓ Phenopacket created successfully\")\n", + "\n", + "# -----------------------------------------------------------------------------\n", + "# STEP 5: Display Results as JSON\n", + "# -----------------------------------------------------------------------------\n", + "print(\"\\n\" + \"=\" * 80)\n", + "print(\" PHENOPACKET v2.0 OUTPUT (JSON)\")\n", + "print(\"=\" * 80)\n", + "\n", + "# Convert protobuf message to JSON using official method\n", + "phenopacket_json = MessageToJson(phenopacket, preserving_proto_field_name=True)\n", + "print(phenopacket_json)\n", + "\n", + "# -----------------------------------------------------------------------------\n", + "# STEP 6: Validation Summary\n", + "# -----------------------------------------------------------------------------\n", + "print(\"\\n\" + 
\"=\" * 80)\n", + "print(\" VALIDATION SUMMARY\")\n", + "print(\"=\" * 80)\n", + "\n", + "print(\"\\n Phenopacket Structure:\")\n", + "print(f\" ID: {phenopacket.id}\")\n", + "print(f\" Subject ID: {phenopacket.subject.id}\")\n", + "print(f\" Subject GA: {subject_ga.weeks}w{subject_ga.days}d\")\n", + "print(f\" Sex: {pps2.Sex.Name(phenopacket.subject.sex)}\")\n", + "print(f\" Phenotypic Features: {len(phenopacket.phenotypic_features)}\")\n", + "print(f\" Schema Version: {phenopacket.meta_data.phenopacket_schema_version}\")\n", + "print(f\" HPO Resource: {phenopacket.meta_data.resources[0].version}\")\n", + "\n", + "print(\"\\n Phenotypic Features Detail:\")\n", + "for i, pf in enumerate(phenopacket.phenotypic_features, 1):\n", + " status = \" Normal (excluded)\" if pf.excluded else \"Abnormal (observed)\"\n", + " ga = pf.onset.gestational_age\n", + " print(f\"\\n [{i}] {pf.type.id} - {pf.type.label}\")\n", + " print(f\" {status}\")\n", + " print(f\" Onset: {ga.weeks}w{ga.days}d\")\n", + " print(f\" Detail: {pf.description}\")\n", + "\n", + "# Count normal vs abnormal\n", + "normal_count = sum(1 for pf in phenopacket.phenotypic_features if pf.excluded)\n", + "abnormal_count = len(phenopacket.phenotypic_features) - normal_count\n", + "\n", + "print(\"\\n Summary Statistics:\")\n", + "print(f\" Total features: {len(phenopacket.phenotypic_features)}\")\n", + "print(f\" Normal (excluded): {normal_count}\")\n", + "print(f\" Abnormal (observed): {abnormal_count}\")\n", + "\n", + "print(\"\\n\" + \"=\" * 80)\n", + "print(\" SUCCESS: Valid Phenopacket v2.0 generated\")\n", + "print(\"=\" * 80)\n", + "\n", + "# Save to file\n", + "output_path = Path(\"output/apple_sally_phenopacket_v2.json\")\n", + "output_path.parent.mkdir(exist_ok=True)\n", + "with open(output_path, \"w\") as f:\n", + " f.write(phenopacket_json)\n", + "print(f\"\\n Phenopacket saved to: {output_path}\")\n", + "\n", + "# Validate by round-tripping\n", + "print(\"\\n Validation: Round-trip test...\")\n", + 
"from google.protobuf.json_format import Parse\n", + "\n", + "parsed_back = Parse(phenopacket_json, pps2.Phenopacket())\n", + "assert parsed_back.id == phenopacket.id\n", + "assert len(parsed_back.phenotypic_features) == len(phenopacket.phenotypic_features)\n", + "print(\" Validation passed\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3685f9e5", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for head_circumference\n", + "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for biparietal_diameter\n", + "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for femur_length\n", + "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for abdominal_circumference\n", + "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for occipitofrontal_diameter\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Loaded mappings for: ['head_circumference', 'biparietal_diameter', 'femur_length', 'abdominal_circumference', 'occipitofrontal_diameter']\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Starting Observer JSON extraction\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing fetus 1\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Found 6 measurements\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: AC\n", + "DEBUG:prenatalppkt.etl.extractors.observer:AC has percentile=55.6% (valid)\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for AC: value=226.20000000000002mm, percentile=55.6%, ga=\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=AC, value=226.20000000000002mm, percentile=55.6%, ga=, method=None\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Selected HPO: HP:0034207 - Abnormal fetal gastrointestinal system morphology\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Created TermBin: HP:0034207 - normal=True\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: BPD\n", + 
"DEBUG:prenatalppkt.etl.extractors.observer:BPD has percentile=51.2% (valid)\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for BPD: value=66.8mm, percentile=51.2%, ga=\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=BPD, value=66.8mm, percentile=51.2%, ga=, method=None\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Selected HPO: HP:0000240 - Abnormality of skull size\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Created TermBin: HP:0000240 - normal=True\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: HC\n", + "DEBUG:prenatalppkt.etl.extractors.observer:HC has percentile=42.5% (valid)\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for HC: value=250.0mm, percentile=42.5%, ga=\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=HC, value=250.0mm, percentile=42.5%, ga=, method=None\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Selected HPO: HP:0000240 - Abnormality of skull size\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Created TermBin: HP:0000240 - normal=True\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: Femur\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Femur has percentile=46.8% (valid)\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for Femur: value=50.099999999999994mm, percentile=46.8%, ga=\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=Femur, value=50.099999999999994mm, percentile=46.8%, ga=, method=None\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Selected HPO: HP:0002823 - Abnormal femur morphology\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Created TermBin: HP:0002823 - normal=True\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: Nuchal Fold\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Nuchal Fold has percentile=0% (valid)\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for Nuchal Fold: value=10.0mm, percentile=0%, 
ga=\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=Nuchal Fold, value=10.0mm, percentile=0.0%, ga=, method=None\n", + "WARNING:prenatalppkt.etl.term_bin_factory:No HPO mapping for optional measurement 'Nuchal Fold' - skipping. TODO(@VarenyaJ): Add HPO terms when available\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: Cerebellum\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Cerebellum has percentile=0% (valid)\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for Cerebellum: value=30.0mm, percentile=0%, ga=\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=Cerebellum, value=30.0mm, percentile=0.0%, ga=, method=None\n", + "WARNING:prenatalppkt.etl.term_bin_factory:No HPO mapping for optional measurement 'Cerebellum' - skipping. TODO(@VarenyaJ): Add HPO terms when available\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Successfully parsed 4 measurements\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Validating 4 TermBins for required measurements\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: AC\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: BPD\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: HC\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: Femur\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Present: {'Femur', 'AC', 'BPD', 'HC'}\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Required: {'Femur', 'AC', 'BPD', 'HC'}\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:All required measurements present\n", + "INFO:prenatalppkt.etl.extractors.observer:Extracted 4 TermBins from Observer JSON\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " STEP 1: Loading Observer JSON...\n", + "Loaded: tests/data/Apple_Sally_pretty.json\n", + "Fetuses: 1\n", + "Measurements: 6\n", + "Sample: AC = 22.62 cm\n", + "\n", + " STEP 2: Extracting biometry measurements to TermBins...\n", + " 
Extracted 4 TermBins\n", + "\n", + " [1] AC: 226.2 mm (55.6%) at 26w6d [Fetus 1]\n", + " HPO: HP:0034207 - Abnormal fetal gastrointestinal system morphology\n", + " Normal: True\n", + "\n", + " [2] BPD: 66.8 mm (51.2%) at 26w6d [Fetus 1]\n", + " HPO: HP:0000240 - Abnormality of skull size\n", + " Normal: True\n", + "\n", + " [3] HC: 250.0 mm (42.5%) at 26w6d [Fetus 1]\n", + " HPO: HP:0000240 - Abnormality of skull size\n", + " Normal: True\n", + "\n", + " [4] Femur: 50.1 mm (46.8%) at 27w0d [Fetus 1]\n", + " HPO: HP:0002823 - Abnormal femur morphology\n", + " Normal: True\n", + "\n", + " STEP 3: Converting TermBins to PhenotypicFeatures...\n", + " Generated 4 PhenotypicFeatures\n", + "\n", + " [1] HP:0034207\n", + " Status: EXCLUDED (normal)\n", + " Description: AC: 226.2 mm (55.6%) at 26w6d [Fetus 1]\n", + "\n", + " [2] HP:0000240\n", + " Status: EXCLUDED (normal)\n", + " Description: BPD: 66.8 mm (51.2%) at 26w6d [Fetus 1]\n", + "\n", + " [3] HP:0000240\n", + " Status: EXCLUDED (normal)\n", + " Description: HC: 250.0 mm (42.5%) at 26w6d [Fetus 1]\n", + "\n", + " [4] HP:0002823\n", + " Status: EXCLUDED (normal)\n", + " Description: Femur: 50.1 mm (46.8%) at 27w0d [Fetus 1]\n", + "\n", + " STEP 4: Building Phenopacket v2.0...\n", + "{\n", + " \"id\": \"apple-sally-fetus-1\",\n", + " \"subject\": {\n", + " \"id\": \"fetus-1\",\n", + " \"time_at_last_encounter\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", + " },\n", + " \"phenotypic_features\": [\n", + " {\n", + " \"description\": \"AC: 226.2 mm (55.6%) at 26w6d [Fetus 1]\",\n", + " \"type\": {\n", + " \"id\": \"HP:0034207\",\n", + " \"label\": \"Abnormal fetal gastrointestinal system morphology\"\n", + " },\n", + " \"excluded\": true,\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"description\": \"BPD: 66.8 mm (51.2%) at 26w6d [Fetus 1]\",\n", + " \"type\": {\n", 
+ " \"id\": \"HP:0000240\",\n", + " \"label\": \"Abnormality of skull size\"\n", + " },\n", + " \"excluded\": true,\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"description\": \"HC: 250.0 mm (42.5%) at 26w6d [Fetus 1]\",\n", + " \"type\": {\n", + " \"id\": \"HP:0000240\",\n", + " \"label\": \"Abnormality of skull size\"\n", + " },\n", + " \"excluded\": true,\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"description\": \"Femur: 50.1 mm (46.8%) at 27w0d [Fetus 1]\",\n", + " \"type\": {\n", + " \"id\": \"HP:0002823\",\n", + " \"label\": \"Abnormal femur morphology\"\n", + " },\n", + " \"excluded\": true,\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 27\n", + " }\n", + " }\n", + " }\n", + " ],\n", + " \"meta_data\": {\n", + " \"created\": \"2026-01-23T14:56:52.295444Z\",\n", + " \"created_by\": \"prenatalppkt-etl-pipeline\",\n", + " \"resources\": [\n", + " {\n", + " \"id\": \"hp\",\n", + " \"name\": \"Human Phenotype Ontology\",\n", + " \"url\": \"http://purl.obolibrary.org/obo/hp.owl\",\n", + " \"version\": \"2025-11-24\",\n", + " \"namespace_prefix\": \"HP\",\n", + " \"iri_prefix\": \"http://purl.obolibrary.org/obo/HP_\"\n", + " }\n", + " ],\n", + " \"phenopacket_schema_version\": \"2.0\"\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + "# Shorter Test\n", + "\n", + "import json\n", + "import re\n", + "from datetime import datetime, timezone\n", + "from pathlib import Path\n", + "from google.protobuf.json_format import MessageToJson\n", + "from google.protobuf.timestamp_pb2 import Timestamp\n", + "import phenopackets.schema.v2 as pps2\n", + "from prenatalppkt.etl.extractors import observer\n", + "from prenatalppkt.gestational_age import GestationalAge\n", + "\n", + "print(\"\\n STEP 1: Loading Observer JSON...\")\n", + "data_path = 
Path(\"tests/data/Apple_Sally_pretty.json\")\n", + "with open(data_path) as f:\n", + " observer_data = json.load(f)\n", + "print(f\"Loaded: {data_path}\")\n", + "print(f\"Fetuses: {len(observer_data.get('fetuses', []))}\")\n", + "\n", + "first_fetus = observer_data[\"fetuses\"][0]\n", + "measurements = first_fetus.get(\"measurements\", [])\n", + "print(f\"Measurements: {len(measurements)}\")\n", + "print(f\"Sample: {measurements[0]['label']} = \", f\"{measurements[0]['value']} {measurements[0]['unit_of_measure']}\")\n", + "\n", + "print(\"\\n STEP 2: Extracting biometry measurements to TermBins...\")\n", + "term_bins = observer.extract(observer_data)\n", + "print(f\" Extracted {len(term_bins)} TermBins\")\n", + "for i, tb in enumerate(term_bins, 1):\n", + " print(f\"\\n [{i}] {tb.description}\")\n", + " print(f\" HPO: {tb.hpo_id} - {tb.hpo_label}\")\n", + " print(f\" Normal: {tb.normal}\")\n", + "\n", + "print(\"\\n STEP 3: Converting TermBins to PhenotypicFeatures...\")\n", + "def parse_ga_from_description(description: str) -> tuple[int, int]:\n", + " \"\"\"Extract weeks and days from TermBin description.\"\"\"\n", + " match = re.search(r\"at (\\d+)w(\\d+)d\", description)\n", + " if match:\n", + " return int(match.group(1)), int(match.group(2))\n", + " # Fallback\n", + " first_m = observer_data[\"fetuses\"][0][\"measurements\"][0]\n", + " ga = GestationalAge.from_weeks(first_m.get(\"calculated_ega\", 26.9))\n", + " return ga.weeks, ga.days\n", + "phenotypic_features = []\n", + "for tb in term_bins:\n", + " weeks, days = parse_ga_from_description(tb.description)\n", + " # Create GestationalAge message\n", + " gestational_age = pps2.GestationalAge(weeks=weeks, days=days)\n", + " # Create TimeElement with gestational_age\n", + " onset = pps2.TimeElement(gestational_age=gestational_age)\n", + " # Create OntologyClass for the HPO term\n", + " hpo_type = pps2.OntologyClass(id=tb.hpo_id, label=tb.hpo_label)\n", + " # Create PhenotypicFeature\n", + " pf = 
pps2.PhenotypicFeature( type=hpo_type, excluded=tb.normal, onset=onset, description=tb.description)\n", + " phenotypic_features.append(pf)\n", + "print(f\" Generated {len(phenotypic_features)} PhenotypicFeatures\")\n", + "for i, pf in enumerate(phenotypic_features, 1):\n", + " status = \"EXCLUDED (normal)\" if pf.excluded else \"OBSERVED (abnormal)\"\n", + " print(f\"\\n [{i}] {pf.type.id}\")\n", + " print(f\" Status: {status}\")\n", + " print(f\" Description: {pf.description}\")\n", + "\n", + "\n", + "print(\"\\n STEP 4: Building Phenopacket v2.0...\")\n", + "# Get subject GA from first measurement\n", + "first_measurement = observer_data[\"fetuses\"][0][\"measurements\"][0]\n", + "subject_ga_weeks = first_measurement.get(\"calculated_ega\", 26.9)\n", + "subject_ga = GestationalAge.from_weeks(subject_ga_weeks)\n", + "# Create Individual (subject) with GestationalAge\n", + "subject_time = pps2.TimeElement(gestational_age=pps2.GestationalAge(weeks=subject_ga.weeks, days=subject_ga.days))\n", + "\n", + "subject = pps2.Individual(id=\"fetus-1\", sex=pps2.Sex.UNKNOWN_SEX, time_at_last_encounter=subject_time)\n", + "\n", + "# Create timestamp for metadata\n", + "now = datetime.now(timezone.utc)\n", + "created_timestamp = Timestamp()\n", + "created_timestamp.FromDatetime(now)\n", + "\n", + "# Create HPO Resource\n", + "hpo_resource = pps2.Resource(id=\"hp\", name=\"Human Phenotype Ontology\", url=\"http://purl.obolibrary.org/obo/hp.owl\", version=\"2025-11-24\", namespace_prefix=\"HP\", iri_prefix=\"http://purl.obolibrary.org/obo/HP_\")\n", + "\n", + "# Create MetaData\n", + "metadata = pps2.MetaData(created=created_timestamp, created_by=\"prenatalppkt-etl-pipeline\", phenopacket_schema_version=\"2.0\")\n", + "metadata.resources.append(hpo_resource)\n", + "\n", + "# Create the Phenopacket\n", + "phenopacket = pps2.Phenopacket(id=\"apple-sally-fetus-1\", subject=subject, meta_data=metadata)\n", + "phenopacket.phenotypic_features.extend(phenotypic_features)\n", + "\n", + 
"# Convert protobuf message to JSON using official method\n", + "phenopacket_json = MessageToJson(phenopacket, preserving_proto_field_name=True)\n", + "print(phenopacket_json)" + ] + }, + { + "cell_type": "markdown", + "id": "1e24f7ff", + "metadata": {}, + "source": [ + "# New" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "0f79d3fe", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG:hpotk.util:Using default encoding 'utf-8'\n", + "DEBUG:hpotk.util:Opening /tmp/hp.json\n", + "DEBUG:hpotk.util:Looks like a local file: /tmp/hp.json\n", + "DEBUG:hpotk.util:Looks like decompressed data\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "================================================================================\n", + "PRENATALPPKT EXPANDED ETL PIPELINE\n", + "Observer JSON -> Biometry + Clinical Sections -> Phenopacket v2.0\n", + "================================================================================\n", + "\n", + "[STEP 1] Loading HPO Concept Recognizer...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG:hpotk.ontology.load.obographs._load:Extracting ontology terms\n", + "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", + "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", + "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", + "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", + "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", + "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", + 
"DEBUG:hpotk.ontology.io.obographs:Missing node type in {'id': 'http://purl.obolibrary.org/obo/BFO_0000051', 'lbl': 'has part', 'meta': {'xrefs': [{'val': 'BFO:0000051'}], 'basicPropertyValues': [{'pred': 'http://www.geneontology.org/formats/oboInOwl#shorthand', 'val': 'has_part'}]}}\n", + "DEBUG:hpotk.ontology.io.obographs:Missing node type in {'id': 'http://purl.obolibrary.org/obo/BFO_0000066', 'lbl': 'occurs in', 'meta': {'xrefs': [{'val': 'BFO:0000066'}], 'basicPropertyValues': [{'pred': 'http://www.geneontology.org/formats/oboInOwl#shorthand', 'val': 'occurs_in'}]}}\n", + "DEBUG:hpotk.ontology.io.obographs:Missing node type in {'id': 'http://purl.obolibrary.org/obo/RO_0002503', 'lbl': 'towards', 'meta': {'xrefs': [{'val': 'RO:0002503'}], 'basicPropertyValues': [{'pred': 'http://www.geneontology.org/formats/oboInOwl#shorthand', 'val': 'towards'}]}}\n", + "DEBUG:hpotk.ontology.io.obographs:Missing node type in {'id': 'http://purl.obolibrary.org/obo/RO_0002573', 'lbl': 'has modifier', 'meta': {'comments': ['placeholder relation to indicate normality/abnormality.'], 'xrefs': [{'val': 'RO:0002180'}], 'basicPropertyValues': [{'pred': 'http://www.geneontology.org/formats/oboInOwl#shorthand', 'val': 'qualifier'}]}}\n", + "DEBUG:hpotk.ontology.load.obographs._load:Creating the edge list\n", + "DEBUG:hpotk.ontology.load.obographs._load:Building ontology graph\n", + "DEBUG:hpotk.graph._factory:Creating ontology graph from 23612 edges\n", + "DEBUG:hpotk.graph._factory:Found root HP:0000001\n", + "DEBUG:hpotk.graph._factory:Extracted 19262 nodes\n", + "DEBUG:hpotk.ontology.load.obographs._load:Assembling the ontology\n", + "DEBUG:hpotk.ontology.load.obographs._load:Done\n", + "DEBUG:prenatalppkt.hpo.hpo_parser:Instantiating HPO concept recognizer.\n", + "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for head_circumference\n", + "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for biparietal_diameter\n", + "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for 
femur_length\n", + "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for abdominal_circumference\n", + "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for occipitofrontal_diameter\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Loaded mappings for: ['head_circumference', 'biparietal_diameter', 'femur_length', 'abdominal_circumference', 'occipitofrontal_diameter']\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Starting Observer JSON extraction\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing fetus 1\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Found 6 measurements\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: AC\n", + "DEBUG:prenatalppkt.etl.extractors.observer:AC has percentile=55.6% (valid)\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for AC: value=226.20000000000002mm, percentile=55.6%, ga=\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=AC, value=226.20000000000002mm, percentile=55.6%, ga=, method=None\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Selected HPO: HP:0034207 - Abnormal fetal gastrointestinal system morphology\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Created TermBin: HP:0034207 - normal=True\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: BPD\n", + "DEBUG:prenatalppkt.etl.extractors.observer:BPD has percentile=51.2% (valid)\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for BPD: value=66.8mm, percentile=51.2%, ga=\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=BPD, value=66.8mm, percentile=51.2%, ga=, method=None\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Selected HPO: HP:0000240 - Abnormality of skull size\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Created TermBin: HP:0000240 - normal=True\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: HC\n", + "DEBUG:prenatalppkt.etl.extractors.observer:HC has percentile=42.5% (valid)\n", + 
"DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for HC: value=250.0mm, percentile=42.5%, ga=\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=HC, value=250.0mm, percentile=42.5%, ga=, method=None\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Selected HPO: HP:0000240 - Abnormality of skull size\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Created TermBin: HP:0000240 - normal=True\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: Femur\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Femur has percentile=46.8% (valid)\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for Femur: value=50.099999999999994mm, percentile=46.8%, ga=\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=Femur, value=50.099999999999994mm, percentile=46.8%, ga=, method=None\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Selected HPO: HP:0002823 - Abnormal femur morphology\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Created TermBin: HP:0002823 - normal=True\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: Nuchal Fold\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Nuchal Fold has percentile=0% (valid)\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for Nuchal Fold: value=10.0mm, percentile=0%, ga=\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=Nuchal Fold, value=10.0mm, percentile=0.0%, ga=, method=None\n", + "WARNING:prenatalppkt.etl.term_bin_factory:No HPO mapping for optional measurement 'Nuchal Fold' - skipping. 
TODO(@VarenyaJ): Add HPO terms when available\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: Cerebellum\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Cerebellum has percentile=0% (valid)\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for Cerebellum: value=30.0mm, percentile=0%, ga=\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=Cerebellum, value=30.0mm, percentile=0.0%, ga=, method=None\n", + "WARNING:prenatalppkt.etl.term_bin_factory:No HPO mapping for optional measurement 'Cerebellum' - skipping. TODO(@VarenyaJ): Add HPO terms when available\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Successfully parsed 4 measurements\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Validating 4 TermBins for required measurements\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: AC\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: BPD\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: HC\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: Femur\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Present: {'Femur', 'AC', 'BPD', 'HC'}\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Required: {'Femur', 'AC', 'BPD', 'HC'}\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:All required measurements present\n", + "INFO:prenatalppkt.etl.extractors.observer:Extracted 4 TermBins from Observer JSON\n", + "DEBUG:hpotk.store._github:Pulling tag from https://api.github.com/repos/obophenotype/human-phenotype-ontology/tags\n", + "DEBUG:hpotk.store._github:Fetched 30 tags\n", + "DEBUG:hpotk.util:Using default encoding 'utf-8'\n", + "DEBUG:hpotk.util:Opening /home/varenya/.hpo-toolkit/HP/hp.v2026-01-08.json\n", + "DEBUG:hpotk.util:Looks like a local file: /home/varenya/.hpo-toolkit/HP/hp.v2026-01-08.json\n", + "DEBUG:hpotk.util:Looks like decompressed data\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ? HPO version: 2025-10-22\n", + " ? 
Concept recognizer: HpoExactConceptRecognizer\n", + "\n", + "[STEP 2] Loading Observer JSON...\n", + " ? Loaded: tests/data/Apple_Sally_pretty.json\n", + " ? Fetuses: 1\n", + " ? Measurements: 6\n", + " ? Sample: AC = 22.62 cm\n", + "\n", + "[STEP 3] Extracting biometry measurements to TermBins...\n", + " ? Extracted 4 TermBins:\n", + " [1] HP:0034207 (Abnormal fetal gastrointestinal system morphology) - ? Normal\n", + " AC: 226.2 mm (55.6%) at 26w6d [Fetus 1]\n", + " [2] HP:0000240 (Abnormality of skull size) - ? Normal\n", + " BPD: 66.8 mm (51.2%) at 26w6d [Fetus 1]\n", + " [3] HP:0000240 (Abnormality of skull size) - ? Normal\n", + " HC: 250.0 mm (42.5%) at 26w6d [Fetus 1]\n", + " [4] HP:0002823 (Abnormal femur morphology) - ? Normal\n", + " Femur: 50.1 mm (46.8%) at 27w0d [Fetus 1]\n", + "\n", + "[STEP 4] Parsing clinical sections...\n", + "\n", + " --- Clinical Indication ---\n", + " Indication: (not found)\n", + "\n", + " --- Pregnancy Dating ---\n", + " LMP: 0001-01-01\n", + " EDD: None\n", + " Dating Method: None\n", + " GA by Ultrasound: None\n", + "\n", + " --- Clinical Impression ---\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG:hpotk.ontology.load.obographs._load:Extracting ontology terms\n", + "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", + "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", + "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", + "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", + "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", + "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type 
http://purl.obolibrary.org/obo/hp#allelic_requirement\n", + "DEBUG:hpotk.ontology.io.obographs:Missing node type in {'id': 'http://purl.obolibrary.org/obo/BFO_0000051', 'lbl': 'has part', 'meta': {'xrefs': [{'val': 'BFO:0000051'}], 'basicPropertyValues': [{'pred': 'http://www.geneontology.org/formats/oboInOwl#shorthand', 'val': 'has_part'}]}}\n", + "DEBUG:hpotk.ontology.io.obographs:Missing node type in {'id': 'http://purl.obolibrary.org/obo/BFO_0000066', 'lbl': 'occurs in', 'meta': {'xrefs': [{'val': 'BFO:0000066'}], 'basicPropertyValues': [{'pred': 'http://www.geneontology.org/formats/oboInOwl#shorthand', 'val': 'occurs_in'}]}}\n", + "DEBUG:hpotk.ontology.io.obographs:Missing node type in {'id': 'http://purl.obolibrary.org/obo/RO_0002503', 'lbl': 'towards', 'meta': {'xrefs': [{'val': 'RO:0002503'}], 'basicPropertyValues': [{'pred': 'http://www.geneontology.org/formats/oboInOwl#shorthand', 'val': 'towards'}]}}\n", + "DEBUG:hpotk.ontology.io.obographs:Missing node type in {'id': 'http://purl.obolibrary.org/obo/RO_0002573', 'lbl': 'has modifier', 'meta': {'comments': ['placeholder relation to indicate normality/abnormality.'], 'xrefs': [{'val': 'RO:0002180'}], 'basicPropertyValues': [{'pred': 'http://www.geneontology.org/formats/oboInOwl#shorthand', 'val': 'qualifier'}]}}\n", + "DEBUG:hpotk.ontology.load.obographs._load:Creating the edge list\n", + "DEBUG:hpotk.ontology.load.obographs._load:Building ontology graph\n", + "DEBUG:hpotk.graph._factory:Creating ontology graph from 23765 edges\n", + "DEBUG:hpotk.graph._factory:Found root HP:0000001\n", + "DEBUG:hpotk.graph._factory:Extracted 19408 nodes\n", + "DEBUG:hpotk.ontology.load.obographs._load:Assembling the ontology\n", + "DEBUG:hpotk.ontology.load.obographs._load:Done\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Impression: (not found)\n", + " Growth Assessment: None\n", + "\n", + " --- HPO Concept Recognition from Clinical Text ---\n", + " (no impression text to parse)\n", + " 
(no HPO terms matched)\n", + "\n", + "[STEP 5] Previewing anatomy findings...\n", + " Normal (0): ...\n", + " Abnormal (0): (none)\n", + " Not visualized (0): ...\n", + " (Note: Anatomy section parser not yet implemented in ETL)\n", + "\n", + "[STEP 6] Converting to PhenotypicFeatures...\n", + "\n", + " --- From Biometry ---\n", + " ? Added 4 features from biometry\n", + "\n", + " --- From Clinical Text ---\n", + " ? Added 0 features from clinical text\n", + "\n", + " Total PhenotypicFeatures: 4\n", + "\n", + "[STEP 7] Building Phenopacket v2.0...\n", + " ? Phenopacket assembled successfully\n", + " ID: apple-sally-fetus-1\n", + " Subject: fetus-1 at 26w6d\n", + " Features: 4\n", + "\n", + "================================================================================\n", + "PHENOPACKET v2.0 OUTPUT (JSON)\n", + "================================================================================\n", + "{\n", + " \"id\": \"apple-sally-fetus-1\",\n", + " \"subject\": {\n", + " \"id\": \"fetus-1\",\n", + " \"time_at_last_encounter\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", + " },\n", + " \"phenotypic_features\": [\n", + " {\n", + " \"description\": \"[Biometry] AC: 226.2 mm (55.6%) at 26w6d [Fetus 1]\",\n", + " \"type\": {\n", + " \"id\": \"HP:0034207\",\n", + " \"label\": \"Abnormal fetal gastrointestinal system morphology\"\n", + " },\n", + " \"excluded\": true,\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"description\": \"[Biometry] BPD: 66.8 mm (51.2%) at 26w6d [Fetus 1]\",\n", + " \"type\": {\n", + " \"id\": \"HP:0000240\",\n", + " \"label\": \"Abnormality of skull size\"\n", + " },\n", + " \"excluded\": true,\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"description\": \"[Biometry] HC: 250.0 mm (42.5%) at 
26w6d [Fetus 1]\",\n", + " \"type\": {\n", + " \"id\": \"HP:0000240\",\n", + " \"label\": \"Abnormality of skull size\"\n", + " },\n", + " \"excluded\": true,\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"description\": \"[Biometry] Femur: 50.1 mm (46.8%) at 27w0d [Fetus 1]\",\n", + " \"type\": {\n", + " \"id\": \"HP:0002823\",\n", + " \"label\": \"Abnormal femur morphology\"\n", + " },\n", + " \"excluded\": true,\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 27\n", + " }\n", + " }\n", + " }\n", + " ],\n", + " \"meta_data\": {\n", + " \"created\": \"2026-01-23T14:56:57.292752Z\",\n", + " \"created_by\": \"prenatalppkt-etl-pipeline\",\n", + " \"resources\": [\n", + " {\n", + " \"id\": \"hp\",\n", + " \"name\": \"Human Phenotype Ontology\",\n", + " \"url\": \"http://purl.obolibrary.org/obo/hp.owl\",\n", + " \"version\": \"2025-10-22\",\n", + " \"namespace_prefix\": \"HP\",\n", + " \"iri_prefix\": \"http://purl.obolibrary.org/obo/HP_\"\n", + " }\n", + " ],\n", + " \"phenopacket_schema_version\": \"2.0\"\n", + " }\n", + "}\n", + "\n", + "================================================================================\n", + "VALIDATION & SUMMARY\n", + "================================================================================\n", + "\n", + "[Validation] Round-trip test...\n", + " ? 
Round-trip validation passed\n", + "\n", + "[Summary] Phenotypic Features:\n", + " Total: 4\n", + " From Biometry: 4\n", + " From Clinical Text: 0\n", + " Normal (excluded): 4\n", + " Abnormal (observed): 0\n", + "\n", + "[Detail] All Phenotypic Features:\n", + "------------------------------------------------------------\n", + "\n", + " [1] HP:0034207 - Abnormal fetal gastrointestinal system morphology\n", + " Source: Biometry\n", + " Status: EXCLUDED (normal)\n", + " Onset: 26w6d\n", + "\n", + " [2] HP:0000240 - Abnormality of skull size\n", + " Source: Biometry\n", + " Status: EXCLUDED (normal)\n", + " Onset: 26w6d\n", + "\n", + " [3] HP:0000240 - Abnormality of skull size\n", + " Source: Biometry\n", + " Status: EXCLUDED (normal)\n", + " Onset: 26w6d\n", + "\n", + " [4] HP:0002823 - Abnormal femur morphology\n", + " Source: Biometry\n", + " Status: EXCLUDED (normal)\n", + " Onset: 27w0d\n", + "\n", + "================================================================================\n", + "SUCCESS: Phenopacket saved to output/apple_sally_phenopacket_expanded.json\n", + "================================================================================\n" + ] + } + ], + "source": [ + "\"\"\"\n", + "PRENATALPPKT EXPANDED ETL PIPELINE\n", + "Observer JSON -> Biometry + Clinical Sections -> Phenopacket v2.0\n", + "\n", + "Demonstrates the complete ETL pipeline:\n", + "1. Biometry extraction -> List[TermBin] -> quantitative HPO terms\n", + "2. Clinical indication -> reason for exam\n", + "3. Pregnancy dating -> LMP, EDD, gestational age context\n", + "4. Clinical impression -> qualitative HPO terms from free text\n", + "5. 
Phenopacket assembly -> GA4GH Phenopacket v2.0 JSON\n", + "\n", + "Uses the official GA4GH phenopackets library per:\n", + "https://phenopacket-schema.readthedocs.io/en/latest/python.html\n", + "\"\"\"\n", + "\n", + "import gzip\n", + "import json\n", + "import re\n", + "from datetime import datetime, timezone\n", + "from pathlib import Path\n", + "\n", + "from google.protobuf.json_format import MessageToJson, Parse\n", + "from google.protobuf.timestamp_pb2 import Timestamp\n", + "import phenopackets.schema.v2 as pps2\n", + "\n", + "# ETL Extractors (biometry -> TermBins)\n", + "from prenatalppkt.etl.extractors import observer\n", + "\n", + "# ETL Section Parsers (clinical metadata -> Dicts)\n", + "from prenatalppkt.etl.sections import (\n", + " parse_clinical_indication,\n", + " parse_pregnancy_dating,\n", + " parse_clinical_impression,\n", + ")\n", + "\n", + "# HPO Concept Recognition\n", + "from prenatalppkt.hpo import HpoParser\n", + "\n", + "# Gestational Age utilities\n", + "from prenatalppkt.gestational_age import GestationalAge\n", + "\n", + "print(\"=\" * 80)\n", + "print(\"PRENATALPPKT EXPANDED ETL PIPELINE\")\n", + "print(\"Observer JSON -> Biometry + Clinical Sections -> Phenopacket v2.0\")\n", + "print(\"=\" * 80)\n", + "\n", + "# =============================================================================\n", + "# STEP 1: Load HPO Concept Recognizer\n", + "# =============================================================================\n", + "print(\"\\n[STEP 1] Loading HPO Concept Recognizer...\")\n", + "\n", + "HP_JSON_GZ = Path(\"tests/data/hp.json.gz\")\n", + "TMP_HP_JSON = Path(\"/tmp/hp.json\")\n", + "\n", + "# Decompress hp.json.gz to temp location\n", + "with gzip.open(HP_JSON_GZ, \"rt\", encoding=\"utf-8\") as f_in:\n", + " with open(TMP_HP_JSON, \"w\", encoding=\"utf-8\") as f_out:\n", + " f_out.write(f_in.read())\n", + "\n", + "hpo_parser = HpoParser(hpo_json_file=str(TMP_HP_JSON))\n", + "hpo_cr = hpo_parser.get_hpo_concept_recognizer()\n", 
+ "\n", + "print(f\" ? HPO version: {hpo_parser.get_version()}\")\n", + "print(f\" ? Concept recognizer: {type(hpo_cr).__name__}\")\n", + "\n", + "# =============================================================================\n", + "# STEP 2: Load Observer JSON Data\n", + "# =============================================================================\n", + "print(\"\\n[STEP 2] Loading Observer JSON...\")\n", + "\n", + "DATA_PATH = Path(\"tests/data/Apple_Sally_pretty.json\")\n", + "\n", + "with open(DATA_PATH) as f:\n", + " observer_data = json.load(f)\n", + "\n", + "# Keep raw JSON string for section parsers\n", + "with open(DATA_PATH) as f:\n", + " observer_json_str = f.read()\n", + "\n", + "print(f\" ? Loaded: {DATA_PATH}\")\n", + "print(f\" ? Fetuses: {len(observer_data.get('fetuses', []))}\")\n", + "\n", + "first_fetus = observer_data[\"fetuses\"][0]\n", + "measurements = first_fetus.get(\"measurements\", [])\n", + "print(f\" ? Measurements: {len(measurements)}\")\n", + "print(f\" ? Sample: {measurements[0]['label']} = {measurements[0]['value']} {measurements[0]['unit_of_measure']}\")\n", + "\n", + "# =============================================================================\n", + "# STEP 3: Extract Biometry -> TermBins\n", + "# =============================================================================\n", + "print(\"\\n[STEP 3] Extracting biometry measurements to TermBins...\")\n", + "\n", + "term_bins = observer.extract(observer_data)\n", + "\n", + "print(f\" ? Extracted {len(term_bins)} TermBins:\")\n", + "for i, tb in enumerate(term_bins, 1):\n", + " status = \"? Normal\" if tb.normal else \"? 
Abnormal\"\n", + " print(f\" [{i}] {tb.hpo_id} ({tb.hpo_label}) - {status}\")\n", + " print(f\" {tb.description}\")\n", + "\n", + "# =============================================================================\n", + "# STEP 4: Parse Clinical Sections\n", + "# =============================================================================\n", + "print(\"\\n[STEP 4] Parsing clinical sections...\")\n", + "\n", + "SOURCE_FORMAT = \"observer_json\"\n", + "\n", + "# 4a: Clinical Indication\n", + "print(\"\\n --- Clinical Indication ---\")\n", + "indication = parse_clinical_indication(observer_json_str, SOURCE_FORMAT)\n", + "indication_text = indication.get(\"indication_text\", \"\")\n", + "if indication_text:\n", + " print(f\" Indication: {indication_text[:100]}{'...' if len(indication_text) > 100 else ''}\")\n", + "else:\n", + " print(\" Indication: (not found)\")\n", + "\n", + "# 4b: Pregnancy Dating\n", + "print(\"\\n --- Pregnancy Dating ---\")\n", + "dating = parse_pregnancy_dating(observer_json_str, SOURCE_FORMAT)\n", + "print(f\" LMP: {dating.get('lmp', '(not found)')}\")\n", + "print(f\" EDD: {dating.get('edd', '(not found)')}\")\n", + "print(f\" Dating Method: {dating.get('dating_method', '(not found)')}\")\n", + "print(f\" GA by Ultrasound: {dating.get('ga_by_ultrasound', '(not found)')}\")\n", + "\n", + "# 4c: Clinical Impression\n", + "print(\"\\n --- Clinical Impression ---\")\n", + "impression = parse_clinical_impression(observer_json_str, SOURCE_FORMAT)\n", + "impression_text = impression.get(\"impression_text\", \"\")\n", + "\n", + "if impression_text:\n", + " # Clean up for display\n", + " preview = impression_text[:200].replace('\\r', ' ').replace('\\n', ' ')\n", + " print(f\" Impression ({len(impression_text)} chars): \\\"{preview}...\\\"\")\n", + "else:\n", + " print(\" Impression: (not found)\")\n", + "\n", + "print(f\" Growth Assessment: {impression.get('growth_assessment', '(not detected)')}\")\n", + "\n", + "# 4d: Extract HPO terms from clinical 
narrative\n", + "print(\"\\n --- HPO Concept Recognition from Clinical Text ---\")\n", + "if impression_text:\n", + " hpo_terms_from_text = hpo_cr.parse(impression_text)\n", + " print(f\" Found {len(hpo_terms_from_text)} HPO terms in clinical narrative:\")\n", + " for term in hpo_terms_from_text:\n", + " print(f\" ? {term.hpo_id}: {term.hpo_label}\")\n", + "else:\n", + " hpo_terms_from_text = []\n", + " print(\" (no impression text to parse)\")\n", + "\n", + "if not hpo_terms_from_text:\n", + " print(\" (no HPO terms matched)\")\n", + "\n", + "# =============================================================================\n", + "# STEP 5: Preview Anatomy Findings (Structured Data)\n", + "# =============================================================================\n", + "print(\"\\n[STEP 5] Previewing anatomy findings...\")\n", + "\n", + "fetus_data = observer_data[\"fetuses\"][0].get(\"fetus\", {})\n", + "anatomy_list = fetus_data.get(\"anatomy\", [])\n", + "\n", + "normal_structures = []\n", + "abnormal_structures = []\n", + "unseen_structures = []\n", + "anomalies_found = []\n", + "\n", + "for item in anatomy_list:\n", + " main = item.get(\"main\", {})\n", + " label = main.get(\"label\", \"Unknown\")\n", + " state = main.get(\"anat_state\", \"\")\n", + " \n", + " if state == \"Normal\":\n", + " normal_structures.append(label)\n", + " elif state == \"Abnormal\":\n", + " abnormal_structures.append(label)\n", + " # Check for specific anomalies\n", + " anomalies = item.get(\"anomalies\", [])\n", + " if anomalies:\n", + " for anom in anomalies:\n", + " desc = anom.get(\"description\", \"?\")\n", + " anomalies_found.append(f\"{label}: {desc}\")\n", + " elif state == \"Unseen\":\n", + " unseen_structures.append(label)\n", + "\n", + "print(f\" Normal ({len(normal_structures)}): {', '.join(normal_structures[:5])}...\")\n", + "print(f\" Abnormal ({len(abnormal_structures)}): {', '.join(abnormal_structures) if abnormal_structures else '(none)'}\")\n", + "print(f\" Not 
visualized ({len(unseen_structures)}): {', '.join(unseen_structures[:3])}...\")\n", + "\n", + "if anomalies_found:\n", + " print(f\" ? Anomalies detected:\")\n", + " for anom in anomalies_found:\n", + " print(f\" - {anom}\")\n", + "\n", + "print(\" (Note: Anatomy section parser not yet implemented in ETL)\")\n", + "\n", + "# =============================================================================\n", + "# STEP 6: Convert to PhenotypicFeatures\n", + "# =============================================================================\n", + "print(\"\\n[STEP 6] Converting to PhenotypicFeatures...\")\n", + "\n", + "\n", + "def parse_ga_from_description(description: str, fallback_weeks: float = 26.9) -> tuple[int, int]:\n", + " \"\"\"Extract weeks and days from TermBin description.\"\"\"\n", + " match = re.search(r\"at (\\d+)w(\\d+)d\", description)\n", + " if match:\n", + " return int(match.group(1)), int(match.group(2))\n", + " ga = GestationalAge.from_weeks(fallback_weeks)\n", + " return ga.weeks, ga.days\n", + "\n", + "\n", + "# Get subject GA for features without specific timing\n", + "first_measurement = observer_data[\"fetuses\"][0][\"measurements\"][0]\n", + "subject_ga_weeks = first_measurement.get(\"calculated_ega\", 26.9)\n", + "subject_ga = GestationalAge.from_weeks(subject_ga_weeks)\n", + "\n", + "phenotypic_features = []\n", + "\n", + "# 6a: Convert biometry TermBins -> PhenotypicFeatures\n", + "print(\"\\n --- From Biometry ---\")\n", + "for tb in term_bins:\n", + " weeks, days = parse_ga_from_description(tb.description, subject_ga_weeks)\n", + " \n", + " pf = pps2.PhenotypicFeature(\n", + " type=pps2.OntologyClass(id=tb.hpo_id, label=tb.hpo_label),\n", + " excluded=tb.normal, # normal=True means abnormality is EXCLUDED\n", + " onset=pps2.TimeElement(\n", + " gestational_age=pps2.GestationalAge(weeks=weeks, days=days)\n", + " ),\n", + " description=f\"[Biometry] {tb.description}\",\n", + " )\n", + " phenotypic_features.append(pf)\n", + "\n", + "print(f\" 
? Added {len(term_bins)} features from biometry\")\n", + "\n", + "# 6b: Convert clinical text HPO terms -> PhenotypicFeatures\n", + "print(\"\\n --- From Clinical Text ---\")\n", + "text_feature_count = 0\n", + "for term in hpo_terms_from_text:\n", + " # Findings mentioned in clinical impression are OBSERVED (not excluded)\n", + " pf = pps2.PhenotypicFeature(\n", + " type=pps2.OntologyClass(id=term.hpo_id, label=term.hpo_label),\n", + " excluded=False, # These are observed findings\n", + " onset=pps2.TimeElement(\n", + " gestational_age=pps2.GestationalAge(weeks=subject_ga.weeks, days=subject_ga.days)\n", + " ),\n", + " description=f\"[Clinical Impression] Extracted from narrative text via HPO Concept Recognition\",\n", + " )\n", + " phenotypic_features.append(pf)\n", + " text_feature_count += 1\n", + "\n", + "print(f\" ? Added {text_feature_count} features from clinical text\")\n", + "print(f\"\\n Total PhenotypicFeatures: {len(phenotypic_features)}\")\n", + "\n", + "# =============================================================================\n", + "# STEP 7: Build Complete Phenopacket v2.0\n", + "# =============================================================================\n", + "print(\"\\n[STEP 7] Building Phenopacket v2.0...\")\n", + "\n", + "# Subject (fetus)\n", + "subject = pps2.Individual(\n", + " id=\"fetus-1\",\n", + " sex=pps2.Sex.UNKNOWN_SEX,\n", + " time_at_last_encounter=pps2.TimeElement(\n", + " gestational_age=pps2.GestationalAge(weeks=subject_ga.weeks, days=subject_ga.days)\n", + " ),\n", + ")\n", + "\n", + "# Metadata\n", + "now = datetime.now(timezone.utc)\n", + "created_timestamp = Timestamp()\n", + "created_timestamp.FromDatetime(now)\n", + "\n", + "hpo_resource = pps2.Resource(\n", + " id=\"hp\",\n", + " name=\"Human Phenotype Ontology\",\n", + " url=\"http://purl.obolibrary.org/obo/hp.owl\",\n", + " version=hpo_parser.get_version() or \"2025-01-01\",\n", + " namespace_prefix=\"HP\",\n", + " 
iri_prefix=\"http://purl.obolibrary.org/obo/HP_\",\n", + ")\n", + "\n", + "metadata = pps2.MetaData(\n", + " created=created_timestamp,\n", + " created_by=\"prenatalppkt-etl-pipeline\",\n", + " phenopacket_schema_version=\"2.0\",\n", + ")\n", + "metadata.resources.append(hpo_resource)\n", + "\n", + "# Assemble the Phenopacket\n", + "phenopacket = pps2.Phenopacket(\n", + " id=\"apple-sally-fetus-1\",\n", + " subject=subject,\n", + " meta_data=metadata,\n", + ")\n", + "phenopacket.phenotypic_features.extend(phenotypic_features)\n", + "\n", + "print(\" ? Phenopacket assembled successfully\")\n", + "print(f\" ID: {phenopacket.id}\")\n", + "print(f\" Subject: {phenopacket.subject.id} at {subject_ga.weeks}w{subject_ga.days}d\")\n", + "print(f\" Features: {len(phenopacket.phenotypic_features)}\")\n", + "\n", + "# =============================================================================\n", + "# STEP 8: Output JSON\n", + "# =============================================================================\n", + "print(\"\\n\" + \"=\" * 80)\n", + "print(\"PHENOPACKET v2.0 OUTPUT (JSON)\")\n", + "print(\"=\" * 80)\n", + "\n", + "phenopacket_json = MessageToJson(phenopacket, preserving_proto_field_name=True)\n", + "print(phenopacket_json)\n", + "\n", + "# =============================================================================\n", + "# STEP 9: Validation & Summary\n", + "# =============================================================================\n", + "print(\"\\n\" + \"=\" * 80)\n", + "print(\"VALIDATION & SUMMARY\")\n", + "print(\"=\" * 80)\n", + "\n", + "# Round-trip validation\n", + "print(\"\\n[Validation] Round-trip test...\")\n", + "parsed_back = Parse(phenopacket_json, pps2.Phenopacket())\n", + "assert parsed_back.id == phenopacket.id\n", + "assert len(parsed_back.phenotypic_features) == len(phenopacket.phenotypic_features)\n", + "print(\" ? 
Round-trip validation passed\")\n", + "\n", + "# Feature breakdown\n", + "biometry_features = [pf for pf in phenopacket.phenotypic_features if \"[Biometry]\" in pf.description]\n", + "clinical_features = [pf for pf in phenopacket.phenotypic_features if \"[Clinical\" in pf.description]\n", + "excluded_count = sum(1 for pf in phenopacket.phenotypic_features if pf.excluded)\n", + "observed_count = len(phenopacket.phenotypic_features) - excluded_count\n", + "\n", + "print(\"\\n[Summary] Phenotypic Features:\")\n", + "print(f\" Total: {len(phenopacket.phenotypic_features)}\")\n", + "print(f\" From Biometry: {len(biometry_features)}\")\n", + "print(f\" From Clinical Text: {len(clinical_features)}\")\n", + "print(f\" Normal (excluded): {excluded_count}\")\n", + "print(f\" Abnormal (observed): {observed_count}\")\n", + "\n", + "# Detailed feature list\n", + "print(\"\\n[Detail] All Phenotypic Features:\")\n", + "print(\"-\" * 60)\n", + "for i, pf in enumerate(phenopacket.phenotypic_features, 1):\n", + " status = \"EXCLUDED (normal)\" if pf.excluded else \"OBSERVED (abnormal)\"\n", + " ga = pf.onset.gestational_age\n", + " source = \"Biometry\" if \"[Biometry]\" in pf.description else \"Clinical Text\"\n", + " print(f\"\\n [{i}] {pf.type.id} - {pf.type.label}\")\n", + " print(f\" Source: {source}\")\n", + " print(f\" Status: {status}\")\n", + " print(f\" Onset: {ga.weeks}w{ga.days}d\")\n", + "\n", + "# Save to file\n", + "output_path = Path(\"output/apple_sally_phenopacket_expanded.json\")\n", + "output_path.parent.mkdir(exist_ok=True)\n", + "with open(output_path, \"w\") as f:\n", + " f.write(phenopacket_json)\n", + "\n", + "print(\"\\n\" + \"=\" * 80)\n", + "print(f\"SUCCESS: Phenopacket saved to {output_path}\")\n", + "print(\"=\" * 80)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "prenatalppkt", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": 
".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From fb5bbcb71ae41cd4ecc0fec972c55b0f0b862a56 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Mon, 26 Jan 2026 15:46:58 -0500 Subject: [PATCH 06/20] ruffing --- .../etl/sections/clinical_impression.py | 262 +++++++++--------- 1 file changed, 131 insertions(+), 131 deletions(-) diff --git a/src/prenatalppkt/etl/sections/clinical_impression.py b/src/prenatalppkt/etl/sections/clinical_impression.py index 0003973..6f83ee5 100644 --- a/src/prenatalppkt/etl/sections/clinical_impression.py +++ b/src/prenatalppkt/etl/sections/clinical_impression.py @@ -13,68 +13,68 @@ def parse_clinical_impression( - data: Union[str, Dict], source_format: str, hpo_cr=None + data: Union[str, Dict], source_format: str, hpo_cr=None ) -> Dict: - """ - Parse clinical impression / interpretation section. - - Supports: - - observer_json - - viewpoint_text - - viewpoint_hl7 - - Args: - data: Raw input data (JSON string, dict, or text) - source_format: One of "observer_json", "viewpoint_text", "viewpoint_hl7" - hpo_cr: Optional HpoExactConceptRecognizer for HPO term extraction. - If provided, will extract HPO terms from impression text. 
- - Returns: - Dict with keys: - - impression_text: str - Full impression narrative - - diagnoses: List[str] - Identified diagnoses (future) - - anomalies: List[Dict] - Structured anomaly data (future) - - gestational_age_assessment: Optional[str] - GA conclusion - - growth_assessment: Optional[str] - FGR, LGA, AGA, or None - - recommendations: List[str] - Follow-up recommendations (future) - - hpo_terms: List[SimpleTerm] - HPO terms extracted via CR - - source_format: str - """ - if source_format == "observer_json": - if isinstance(data, str): - data = json.loads(data) - impression_text = _parse_observer_impression(data) - - elif source_format == "viewpoint_text": - if not isinstance(data, str): - raise ValueError("viewpoint_text data must be a string") - impression_text = _parse_viewpoint_text_impression(data) - - elif source_format == "viewpoint_hl7": - if not isinstance(data, str): - raise ValueError("viewpoint_hl7 data must be a string") - impression_text = _parse_viewpoint_hl7_impression(data) - - else: - raise ValueError(f"Unsupported source_format: {source_format}") - - # Extract HPO terms if concept recognizer is provided - hpo_terms = [] - if impression_text and hpo_cr is not None: - # HpoExactConceptRecognizer uses parse() method, not extract() - if hasattr(hpo_cr, "parse"): - hpo_terms = hpo_cr.parse(impression_text) - - return { - "impression_text": impression_text, - "diagnoses": [], - "anomalies": [], - "gestational_age_assessment": None, - "growth_assessment": _infer_growth_assessment(impression_text), - "recommendations": [], - "hpo_terms": hpo_terms, - "source_format": source_format, - } + """ + Parse clinical impression / interpretation section. + + Supports: + - observer_json + - viewpoint_text + - viewpoint_hl7 + + Args: + data: Raw input data (JSON string, dict, or text) + source_format: One of "observer_json", "viewpoint_text", "viewpoint_hl7" + hpo_cr: Optional HpoExactConceptRecognizer for HPO term extraction. 
+ If provided, will extract HPO terms from impression text. + + Returns: + Dict with keys: + - impression_text: str - Full impression narrative + - diagnoses: List[str] - Identified diagnoses (future) + - anomalies: List[Dict] - Structured anomaly data (future) + - gestational_age_assessment: Optional[str] - GA conclusion + - growth_assessment: Optional[str] - FGR, LGA, AGA, or None + - recommendations: List[str] - Follow-up recommendations (future) + - hpo_terms: List[SimpleTerm] - HPO terms extracted via CR + - source_format: str + """ + if source_format == "observer_json": + if isinstance(data, str): + data = json.loads(data) + impression_text = _parse_observer_impression(data) + + elif source_format == "viewpoint_text": + if not isinstance(data, str): + raise ValueError("viewpoint_text data must be a string") + impression_text = _parse_viewpoint_text_impression(data) + + elif source_format == "viewpoint_hl7": + if not isinstance(data, str): + raise ValueError("viewpoint_hl7 data must be a string") + impression_text = _parse_viewpoint_hl7_impression(data) + + else: + raise ValueError(f"Unsupported source_format: {source_format}") + + # Extract HPO terms if concept recognizer is provided + hpo_terms = [] + if impression_text and hpo_cr is not None: + # HpoExactConceptRecognizer uses parse() method, not extract() + if hasattr(hpo_cr, "parse"): + hpo_terms = hpo_cr.parse(impression_text) + + return { + "impression_text": impression_text, + "diagnoses": [], + "anomalies": [], + "gestational_age_assessment": None, + "growth_assessment": _infer_growth_assessment(impression_text), + "recommendations": [], + "hpo_terms": hpo_terms, + "source_format": source_format, + } # --------------------------------------------------------------------- @@ -83,28 +83,28 @@ def parse_clinical_impression( def _parse_observer_impression(json_data: Dict) -> str: - """ - Extract impression from Observer JSON. + """ + Extract impression from Observer JSON. 
- The finalize block can be at: - - Root level: json_data["finalize"]["generalComment"]["plain_text"] - - Under exam: json_data["exam"]["finalize"]["generalComment"]["plain_text"] + The finalize block can be at: + - Root level: json_data["finalize"]["generalComment"]["plain_text"] + - Under exam: json_data["exam"]["finalize"]["generalComment"]["plain_text"] - We check the root level first (most common), then fall back to exam. - """ - impression = "" + We check the root level first (most common), then fall back to exam. + """ + impression = "" - # Check root level first (this is where Apple_Sally has it) - finalize = json_data.get("finalize", {}) - impression = finalize.get("generalComment", {}).get("plain_text", "").strip() + # Check root level first (this is where Apple_Sally has it) + finalize = json_data.get("finalize", {}) + impression = finalize.get("generalComment", {}).get("plain_text", "").strip() - # Fall back to exam.finalize if not found at root - if not impression: - exam = json_data.get("exam", {}) - finalize = exam.get("finalize", {}) - impression = finalize.get("generalComment", {}).get("plain_text", "").strip() + # Fall back to exam.finalize if not found at root + if not impression: + exam = json_data.get("exam", {}) + finalize = exam.get("finalize", {}) + impression = finalize.get("generalComment", {}).get("plain_text", "").strip() - return impression + return impression # --------------------------------------------------------------------- @@ -113,21 +113,21 @@ def _parse_observer_impression(json_data: Dict) -> str: def _parse_viewpoint_text_impression(text: str) -> str: - """ - Extract impression from ViewPoint text reports. + """ + Extract impression from ViewPoint text reports. 
- Expected pattern: - Impression - ========== - [free text narrative] - """ - pattern = re.compile( - r"Impression\s*\n=+\n(?P<body>.*?)(?:\n[A-Z][^\n]*\n=+|\Z)", - re.DOTALL | re.IGNORECASE, - ) + Expected pattern: + Impression + ========== + [free text narrative] + """ + pattern = re.compile( + r"Impression\s*\n=+\n(?P<body>.*?)(?:\n[A-Z][^\n]*\n=+|\Z)", + re.DOTALL | re.IGNORECASE, + ) - match = pattern.search(text) - return match.group("body").strip() if match else "" + match = pattern.search(text) + return match.group("body").strip() if match else "" # --------------------------------------------------------------------- @@ -136,30 +136,30 @@ def _parse_viewpoint_text_impression(text: str) -> str: def _parse_viewpoint_hl7_impression(hl7: str) -> str: - """ - Extract impression from HL7 ORU^R01 messages. + """ + Extract impression from HL7 ORU^R01 messages. - Looks for OBX segments containing "Impression" or "Interpretation" - in the observation identifier field. 
+ """ + lines: List[str] = [] - for line in hl7.splitlines(): - if not line.startswith("OBX"): - continue + for line in hl7.splitlines(): + if not line.startswith("OBX"): + continue - fields = line.split("|") - if len(fields) < 6: - continue + fields = line.split("|") + if len(fields) < 6: + continue - obs_id = fields[3] - value = fields[5].split("^")[0].strip() + obs_id = fields[3] + value = fields[5].split("^")[0].strip() - if "Impression" in obs_id or "Interpretation" in obs_id: - if value: - lines.append(value) + if "Impression" in obs_id or "Interpretation" in obs_id: + if value: + lines.append(value) - return " ".join(lines) + return " ".join(lines) # --------------------------------------------------------------------- @@ -168,25 +168,25 @@ def _parse_viewpoint_hl7_impression(hl7: str) -> str: def _infer_growth_assessment(text: str) -> Optional[str]: - """ - Infer fetal growth assessment from impression text. - - Returns: - "FGR" - Fetal Growth Restriction - "LGA" - Large for Gestational Age - "AGA" - Appropriate for Gestational Age - None - No assessment detected - """ - if not text: - return None - - text_lower = text.lower() - - if "growth restriction" in text_lower or "fgr" in text_lower: - return "FGR" - if "large for gestational age" in text_lower or "lga" in text_lower: - return "LGA" - if "appropriate for gestational age" in text_lower or "aga" in text_lower: - return "AGA" - - return None \ No newline at end of file + """ + Infer fetal growth assessment from impression text. 
+ + Returns: + "FGR" - Fetal Growth Restriction + "LGA" - Large for Gestational Age + "AGA" - Appropriate for Gestational Age + None - No assessment detected + """ + if not text: + return None + + text_lower = text.lower() + + if "growth restriction" in text_lower or "fgr" in text_lower: + return "FGR" + if "large for gestational age" in text_lower or "lga" in text_lower: + return "LGA" + if "appropriate for gestational age" in text_lower or "aga" in text_lower: + return "AGA" + + return None From 8703a58eebcabb28caed45c8119e9151014be981 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Mon, 26 Jan 2026 15:47:11 -0500 Subject: [PATCH 07/20] fix(tests): correct hpo_cr parameter name in clinical impression tests. Tests were using `hpo_parser` but function signature uses `hpo_cr` --- tests/etl/sections/test_clinical_impression.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tests/etl/sections/test_clinical_impression.py b/tests/etl/sections/test_clinical_impression.py index 715a2e1..dc2c057 100644 --- a/tests/etl/sections/test_clinical_impression.py +++ b/tests/etl/sections/test_clinical_impression.py @@ -23,7 +23,7 @@ def test_basic_impression(self, hpo_cr): } ) - result = parse_clinical_impression(data, "observer_json", hpo_parser=hpo_cr) + result = parse_clinical_impression(data, "observer_json", hpo_cr=hpo_cr) assert "Normal fetal anatomy" in result["impression_text"] assert result["hpo_terms"] == [] @@ -43,7 +43,7 @@ def test_basic_impression(self, hpo_cr): Recommend follow-up scan. 
""" - result = parse_clinical_impression(text, "viewpoint_text", hpo_parser=hpo_cr) + result = parse_clinical_impression(text, "viewpoint_text", hpo_cr=hpo_cr) assert "growth restriction" in result["impression_text"].lower() assert result["growth_assessment"] == "FGR" @@ -51,7 +51,7 @@ def test_basic_impression(self, hpo_cr): def test_missing_impression(self, hpo_cr): text = "Fetal Biometry\n============\nHC 175 mm" - result = parse_clinical_impression(text, "viewpoint_text", hpo_parser=hpo_cr) + result = parse_clinical_impression(text, "viewpoint_text", hpo_cr=hpo_cr) assert result["impression_text"] == "" @@ -64,7 +64,7 @@ class TestClinicalImpressionViewPointHL7: def test_basic_hl7_impression(self, hpo_cr): hl7 = "OBX||TX|Impression^Impression|1|Appropriate for gestational age\n" - result = parse_clinical_impression(hl7, "viewpoint_hl7", hpo_parser=hpo_cr) + result = parse_clinical_impression(hl7, "viewpoint_hl7", hpo_cr=hpo_cr) assert "Appropriate" in result["impression_text"] assert result["growth_assessment"] == "AGA" @@ -78,10 +78,8 @@ def test_basic_hl7_impression(self, hpo_cr): class TestClinicalImpressionEdgeCases: def test_invalid_format(self, hpo_cr): with pytest.raises(ValueError): - parse_clinical_impression("data", "bad_format", hpo_parser=hpo_cr) + parse_clinical_impression("data", "bad_format", hpo_cr=hpo_cr) def test_non_string_text(self, hpo_cr): with pytest.raises(ValueError): - parse_clinical_impression( - {"bad": "data"}, "viewpoint_text", hpo_parser=hpo_cr - ) + parse_clinical_impression({"bad": "data"}, "viewpoint_text", hpo_cr=hpo_cr) From 0c33026832ab0c61d040f5861da8722a074b6a19 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Mon, 26 Jan 2026 15:47:37 -0500 Subject: [PATCH 08/20] docs(notebook): expand ETL demo with clinical text HPO extraction --- prenatalppkt.ipynb | 171 +++++++++++++++++++++++++++------------------ 1 file changed, 102 insertions(+), 69 deletions(-) diff --git a/prenatalppkt.ipynb b/prenatalppkt.ipynb index 
2426c29..badee2e 100644 --- a/prenatalppkt.ipynb +++ b/prenatalppkt.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "id": "d8f2cfce", "metadata": {}, "outputs": [ @@ -67,8 +67,8 @@ "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: BPD\n", "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: HC\n", "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: Femur\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Present: {'Femur', 'AC', 'BPD', 'HC'}\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Required: {'Femur', 'AC', 'BPD', 'HC'}\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Present: {'HC', 'Femur', 'AC', 'BPD'}\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Required: {'HC', 'Femur', 'AC', 'BPD'}\n", "DEBUG:prenatalppkt.etl.term_bin_factory:All required measurements present\n", "INFO:prenatalppkt.etl.extractors.observer:Extracted 4 TermBins from Observer JSON\n" ] @@ -201,7 +201,7 @@ " }\n", " ],\n", " \"meta_data\": {\n", - " \"created\": \"2026-01-23T14:56:52.244568Z\",\n", + " \"created\": \"2026-01-26T15:21:08.287048Z\",\n", " \"created_by\": \"prenatalppkt-etl-pipeline\",\n", " \"resources\": [\n", " {\n", @@ -499,7 +499,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 2, "id": "3685f9e5", "metadata": {}, "outputs": [ @@ -556,8 +556,8 @@ "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: BPD\n", "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: HC\n", "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: Femur\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Present: {'Femur', 'AC', 'BPD', 'HC'}\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Required: {'Femur', 'AC', 'BPD', 'HC'}\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Present: {'HC', 'Femur', 'AC', 'BPD'}\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Required: {'HC', 'Femur', 'AC', 'BPD'}\n", "DEBUG:prenatalppkt.etl.term_bin_factory:All required measurements present\n", 
"INFO:prenatalppkt.etl.extractors.observer:Extracted 4 TermBins from Observer JSON\n" ] @@ -681,7 +681,7 @@ " }\n", " ],\n", " \"meta_data\": {\n", - " \"created\": \"2026-01-23T14:56:52.295444Z\",\n", + " \"created\": \"2026-01-26T15:21:08.337338Z\",\n", " \"created_by\": \"prenatalppkt-etl-pipeline\",\n", " \"resources\": [\n", " {\n", @@ -803,20 +803,10 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "id": "0f79d3fe", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "DEBUG:hpotk.util:Using default encoding 'utf-8'\n", - "DEBUG:hpotk.util:Opening /tmp/hp.json\n", - "DEBUG:hpotk.util:Looks like a local file: /tmp/hp.json\n", - "DEBUG:hpotk.util:Looks like decompressed data\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -833,6 +823,10 @@ "name": "stderr", "output_type": "stream", "text": [ + "DEBUG:hpotk.util:Using default encoding 'utf-8'\n", + "DEBUG:hpotk.util:Opening /tmp/hp.json\n", + "DEBUG:hpotk.util:Looks like a local file: /tmp/hp.json\n", + "DEBUG:hpotk.util:Looks like decompressed data\n", "DEBUG:hpotk.ontology.load.obographs._load:Extracting ontology terms\n", "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", @@ -901,16 +895,10 @@ "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: BPD\n", "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: HC\n", "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: Femur\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Present: {'Femur', 'AC', 'BPD', 'HC'}\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Required: {'Femur', 'AC', 'BPD', 'HC'}\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Present: {'HC', 'Femur', 'AC', 'BPD'}\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Required: {'HC', 'Femur', 'AC', 'BPD'}\n", 
"DEBUG:prenatalppkt.etl.term_bin_factory:All required measurements present\n", - "INFO:prenatalppkt.etl.extractors.observer:Extracted 4 TermBins from Observer JSON\n", - "DEBUG:hpotk.store._github:Pulling tag from https://api.github.com/repos/obophenotype/human-phenotype-ontology/tags\n", - "DEBUG:hpotk.store._github:Fetched 30 tags\n", - "DEBUG:hpotk.util:Using default encoding 'utf-8'\n", - "DEBUG:hpotk.util:Opening /home/varenya/.hpo-toolkit/HP/hp.v2026-01-08.json\n", - "DEBUG:hpotk.util:Looks like a local file: /home/varenya/.hpo-toolkit/HP/hp.v2026-01-08.json\n", - "DEBUG:hpotk.util:Looks like decompressed data\n" + "INFO:prenatalppkt.etl.extractors.observer:Extracted 4 TermBins from Observer JSON\n" ] }, { @@ -948,43 +936,16 @@ " Dating Method: None\n", " GA by Ultrasound: None\n", "\n", - " --- Clinical Impression ---\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "DEBUG:hpotk.ontology.load.obographs._load:Extracting ontology terms\n", - "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", - "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", - "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", - "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", - "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", - "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", - "DEBUG:hpotk.ontology.io.obographs:Missing node type in {'id': 'http://purl.obolibrary.org/obo/BFO_0000051', 'lbl': 'has part', 'meta': {'xrefs': [{'val': 'BFO:0000051'}], 'basicPropertyValues': [{'pred': 
'http://www.geneontology.org/formats/oboInOwl#shorthand', 'val': 'has_part'}]}}\n", - "DEBUG:hpotk.ontology.io.obographs:Missing node type in {'id': 'http://purl.obolibrary.org/obo/BFO_0000066', 'lbl': 'occurs in', 'meta': {'xrefs': [{'val': 'BFO:0000066'}], 'basicPropertyValues': [{'pred': 'http://www.geneontology.org/formats/oboInOwl#shorthand', 'val': 'occurs_in'}]}}\n", - "DEBUG:hpotk.ontology.io.obographs:Missing node type in {'id': 'http://purl.obolibrary.org/obo/RO_0002503', 'lbl': 'towards', 'meta': {'xrefs': [{'val': 'RO:0002503'}], 'basicPropertyValues': [{'pred': 'http://www.geneontology.org/formats/oboInOwl#shorthand', 'val': 'towards'}]}}\n", - "DEBUG:hpotk.ontology.io.obographs:Missing node type in {'id': 'http://purl.obolibrary.org/obo/RO_0002573', 'lbl': 'has modifier', 'meta': {'comments': ['placeholder relation to indicate normality/abnormality.'], 'xrefs': [{'val': 'RO:0002180'}], 'basicPropertyValues': [{'pred': 'http://www.geneontology.org/formats/oboInOwl#shorthand', 'val': 'qualifier'}]}}\n", - "DEBUG:hpotk.ontology.load.obographs._load:Creating the edge list\n", - "DEBUG:hpotk.ontology.load.obographs._load:Building ontology graph\n", - "DEBUG:hpotk.graph._factory:Creating ontology graph from 23765 edges\n", - "DEBUG:hpotk.graph._factory:Found root HP:0000001\n", - "DEBUG:hpotk.graph._factory:Extracted 19408 nodes\n", - "DEBUG:hpotk.ontology.load.obographs._load:Assembling the ontology\n", - "DEBUG:hpotk.ontology.load.obographs._load:Done\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Impression: (not found)\n", + " --- Clinical Impression ---\n", + " Impression (1294 chars): \"The patient was referred for a fetal anatomical survey. Sonographic measurements were consistent with the expected gestational age. The amniotic fluid volume was normal. 
A detailed fetal anatomic s...\"\n", " Growth Assessment: None\n", "\n", " --- HPO Concept Recognition from Clinical Text ---\n", - " (no impression text to parse)\n", - " (no HPO terms matched)\n", + " Found 4 HPO terms in clinical narrative:\n", + " ? HP:0001274: Agenesis of corpus callosum\n", + " ? HP:0000256: Macrocephaly\n", + " ? HP:0001305: Dandy-Walker malformation\n", + " ? HP:0002119: Ventriculomegaly\n", "\n", "[STEP 5] Previewing anatomy findings...\n", " Normal (0): ...\n", @@ -998,15 +959,15 @@ " ? Added 4 features from biometry\n", "\n", " --- From Clinical Text ---\n", - " ? Added 0 features from clinical text\n", + " ? Added 4 features from clinical text\n", "\n", - " Total PhenotypicFeatures: 4\n", + " Total PhenotypicFeatures: 8\n", "\n", "[STEP 7] Building Phenopacket v2.0...\n", " ? Phenopacket assembled successfully\n", " ID: apple-sally-fetus-1\n", " Subject: fetus-1 at 26w6d\n", - " Features: 4\n", + " Features: 8\n", "\n", "================================================================================\n", "PHENOPACKET v2.0 OUTPUT (JSON)\n", @@ -1077,10 +1038,62 @@ " \"weeks\": 27\n", " }\n", " }\n", + " },\n", + " {\n", + " \"description\": \"[Clinical Impression] Extracted from narrative text via HPO Concept Recognition\",\n", + " \"type\": {\n", + " \"id\": \"HP:0001274\",\n", + " \"label\": \"Agenesis of corpus callosum\"\n", + " },\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"description\": \"[Clinical Impression] Extracted from narrative text via HPO Concept Recognition\",\n", + " \"type\": {\n", + " \"id\": \"HP:0000256\",\n", + " \"label\": \"Macrocephaly\"\n", + " },\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"description\": \"[Clinical Impression] Extracted from narrative text via HPO Concept Recognition\",\n", + " \"type\": 
{\n", + " \"id\": \"HP:0001305\",\n", + " \"label\": \"Dandy-Walker malformation\"\n", + " },\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"description\": \"[Clinical Impression] Extracted from narrative text via HPO Concept Recognition\",\n", + " \"type\": {\n", + " \"id\": \"HP:0002119\",\n", + " \"label\": \"Ventriculomegaly\"\n", + " },\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", " }\n", " ],\n", " \"meta_data\": {\n", - " \"created\": \"2026-01-23T14:56:57.292752Z\",\n", + " \"created\": \"2026-01-26T15:21:11.051438Z\",\n", " \"created_by\": \"prenatalppkt-etl-pipeline\",\n", " \"resources\": [\n", " {\n", @@ -1104,11 +1117,11 @@ " ? Round-trip validation passed\n", "\n", "[Summary] Phenotypic Features:\n", - " Total: 4\n", + " Total: 8\n", " From Biometry: 4\n", - " From Clinical Text: 0\n", + " From Clinical Text: 4\n", " Normal (excluded): 4\n", - " Abnormal (observed): 0\n", + " Abnormal (observed): 4\n", "\n", "[Detail] All Phenotypic Features:\n", "------------------------------------------------------------\n", @@ -1133,6 +1146,26 @@ " Status: EXCLUDED (normal)\n", " Onset: 27w0d\n", "\n", + " [5] HP:0001274 - Agenesis of corpus callosum\n", + " Source: Clinical Text\n", + " Status: OBSERVED (abnormal)\n", + " Onset: 26w6d\n", + "\n", + " [6] HP:0000256 - Macrocephaly\n", + " Source: Clinical Text\n", + " Status: OBSERVED (abnormal)\n", + " Onset: 26w6d\n", + "\n", + " [7] HP:0001305 - Dandy-Walker malformation\n", + " Source: Clinical Text\n", + " Status: OBSERVED (abnormal)\n", + " Onset: 26w6d\n", + "\n", + " [8] HP:0002119 - Ventriculomegaly\n", + " Source: Clinical Text\n", + " Status: OBSERVED (abnormal)\n", + " Onset: 26w6d\n", + "\n", "================================================================================\n", "SUCCESS: Phenopacket saved to 
output/apple_sally_phenopacket_expanded.json\n", "================================================================================\n" From 3527e2d7099bd0612f4b546cf7aabeb3b4b94592 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Mon, 2 Feb 2026 02:41:48 -0500 Subject: [PATCH 09/20] feat(etl): add fetal anatomy section parser with HPO extraction --- .../etl/sections/fetal_anatomy.py | 265 +++++++++++++++++- tests/etl/sections/test_fetal_anatomy.py | 219 +++++++++++++++ 2 files changed, 469 insertions(+), 15 deletions(-) create mode 100644 tests/etl/sections/test_fetal_anatomy.py diff --git a/src/prenatalppkt/etl/sections/fetal_anatomy.py b/src/prenatalppkt/etl/sections/fetal_anatomy.py index 694eab5..5613aef 100644 --- a/src/prenatalppkt/etl/sections/fetal_anatomy.py +++ b/src/prenatalppkt/etl/sections/fetal_anatomy.py @@ -1,21 +1,256 @@ """ -Fetal anatomy section parser (SKELETON). +Fetal anatomy section parser. -TODO @VarenyaJ: Parse anatomy checklist (normal/abnormal/not visualized) -TODO @VarenyaJ: Map anatomical findings to HPO terms -TODO @VarenyaJ: Handle detailed anatomy subsections +Extracts structured anatomy findings and free-text anatomy narrative, +with optional HPO term extraction from anomaly descriptions. """ -from typing import Dict +from __future__ import annotations +import json +import re +from typing import Dict, List, Union -def parse_fetal_anatomy(data: str, source_format: str = "viewpoint_text") -> Dict: - """Extract fetal anatomy assessment.""" - return { - "structures_examined": [], - "normal_structures": [], - "abnormal_structures": [], - "not_visualized": [], - "anomalies": [], - "hpo_terms": [], - } + +def parse_fetal_anatomy( + data: Union[str, Dict], source_format: str, hpo_cr=None +) -> Dict: + """ + Parse fetal anatomy section. 
+ + Supports: + - observer_json + - viewpoint_text (skeleton) + - viewpoint_hl7 (skeleton) + + Args: + data: Raw input data (JSON string, dict, or text) + source_format: One of "observer_json", "viewpoint_text", "viewpoint_hl7" + hpo_cr: Optional HpoExactConceptRecognizer for HPO term extraction. + If provided, will extract HPO terms from anomaly descriptions. + + Returns: + Dict with keys: + - anatomy_text: str - Free text anatomy narrative + - normal_structures: List[str] - Structures marked Normal + - abnormal_structures: List[str] - Structures marked Abnormal + - not_visualized: List[str] - Structures marked Unseen + - anomalies: List[Dict] - Specific anomaly findings + - hpo_terms: List[SimpleTerm] - HPO terms extracted via CR + - source_format: str + """ + if source_format == "observer_json": + if isinstance(data, str): + data = json.loads(data) + return _parse_observer_anatomy(data, hpo_cr) + + elif source_format == "viewpoint_text": + if not isinstance(data, str): + raise ValueError("viewpoint_text data must be a string") + return _parse_viewpoint_text_anatomy(data, hpo_cr) + + elif source_format == "viewpoint_hl7": + if not isinstance(data, str): + raise ValueError("viewpoint_hl7 data must be a string") + return _parse_viewpoint_hl7_anatomy(data, hpo_cr) + + else: + raise ValueError(f"Unsupported source_format: {source_format}") + + +# --------------------------------------------------------------------- +# Observer JSON +# --------------------------------------------------------------------- + + +def _classify_structure( + label: str, + state: str, + normal: List[str], + abnormal: List[str], + unseen: List[str], +) -> None: + """Classify a structure into the appropriate list based on state.""" + if not label: + return + if state == "Normal" and label not in normal: + normal.append(label) + elif state == "Abnormal" and label not in abnormal: + abnormal.append(label) + elif state == "Unseen" and label not in unseen: + unseen.append(label) + + +def 
_process_anatomy_item( + item: Dict, + normal: List[str], + abnormal: List[str], + unseen: List[str], + anomalies: List[Dict], +) -> None: + """Process a single anatomy item, extracting structures and anomalies.""" + main = item.get("main", {}) + label = main.get("label", "") + state = main.get("anat_state", "") + + # Classify main structure + _classify_structure(label, state, normal, abnormal, unseen) + + # Process detail sub-structures + for detail in item.get("detail", []): + detail_label = detail.get("label", "") + detail_state = detail.get("anat_det_state", "") + _classify_structure(detail_label, detail_state, normal, abnormal, unseen) + + # Process anomalies + for anom in item.get("anomalies", []): + description = anom.get("description", "") + if description: + anomalies.append({ + "structure": label, + "description": description, + "variant_type": anom.get("abnormal_or_normal_variant", "Abnormal"), + }) + + +def _extract_hpo_terms(anatomy_text: str, anomalies: List[Dict], hpo_cr) -> List: + """Extract HPO terms from anatomy text and anomaly descriptions.""" + if hpo_cr is None or not hasattr(hpo_cr, "parse"): + return [] + + all_anomaly_text = " ".join( + a["description"] for a in anomalies if a.get("description") + ) + combined_text = f"{anatomy_text} {all_anomaly_text}".strip() + + if not combined_text: + return [] + + return hpo_cr.parse(combined_text) + + +def _parse_observer_anatomy(json_data: Dict, hpo_cr=None) -> Dict: + """ + Extract anatomy findings from Observer JSON. 
+ + Paths: + - fetuses[i].fetus.anatomy_text - free text narrative + - fetuses[i].fetus.anatomy[] - structured findings + - main.label - structure name (e.g., "Head", "Face") + - main.anat_state - "Normal", "Abnormal", or "Unseen" + - detail[].label - sub-structure name + - detail[].anat_det_state - sub-structure state + - anomalies[].description - specific finding text + - anomalies[].abnormal_or_normal_variant - classification + """ + fetuses = json_data.get("fetuses", []) + if not fetuses: + return _empty_result("observer_json") + + fetus_block = fetuses[0].get("fetus", {}) + anatomy_text = fetus_block.get("anatomy_text", "") + + normal_structures: List[str] = [] + abnormal_structures: List[str] = [] + not_visualized: List[str] = [] + anomalies: List[Dict] = [] + + for item in fetus_block.get("anatomy", []): + _process_anatomy_item( + item, normal_structures, abnormal_structures, not_visualized, anomalies + ) + + hpo_terms = _extract_hpo_terms(anatomy_text, anomalies, hpo_cr) + + return { + "anatomy_text": anatomy_text, + "normal_structures": normal_structures, + "abnormal_structures": abnormal_structures, + "not_visualized": not_visualized, + "anomalies": anomalies, + "hpo_terms": hpo_terms, + "source_format": "observer_json", + } + + +# --------------------------------------------------------------------- +# ViewPoint Text (SKELETON) +# --------------------------------------------------------------------- + + +def _parse_viewpoint_text_anatomy(text: str, hpo_cr=None) -> Dict: + """ + Extract anatomy from ViewPoint text reports. + + Expected pattern: + Fetal Anatomy + ============= + The following structures appear normal: + Cranium. Brain. Face. ... + + The following structures appear abnormal: + GI tract: dilated bowel loops. + + The following structures could not be adequately visualized: + LVOT view. RVOT view. ... 
+ + TODO @VarenyaJ: Implement full parsing + """ + # Skeleton: Extract the Fetal Anatomy section + pattern = re.compile( + r"Fetal Anatomy\s*\n=+\n(?P<body>.*?)(?:\n[A-Z][^\n]*\n=+|\Z)", + re.DOTALL | re.IGNORECASE, + ) + match = pattern.search(text) + anatomy_text = match.group("body").strip() if match else "" + + # TODO @VarenyaJ: Parse "appear normal", "appear abnormal", "could not be visualized" lists + + hpo_terms = [] + if anatomy_text and hpo_cr is not None and hasattr(hpo_cr, "parse"): + hpo_terms = hpo_cr.parse(anatomy_text) + + return { + "anatomy_text": anatomy_text, + "normal_structures": [], + "abnormal_structures": [], + "not_visualized": [], + "anomalies": [], + "hpo_terms": hpo_terms, + "source_format": "viewpoint_text", + } + + +# --------------------------------------------------------------------- +# ViewPoint HL7 (SKELETON) +# --------------------------------------------------------------------- + + +def _parse_viewpoint_hl7_anatomy(hl7: str, hpo_cr=None) -> Dict: + """ + Extract anatomy from HL7 ORU^R01 messages. + + Note: Anatomy is typically not encoded in discrete HL7 fields. + This is a skeleton for potential future implementation. 
+ + TODO @VarenyaJ: Implement if HL7 anatomy encoding is discovered + """ + return _empty_result("viewpoint_hl7") + + +# --------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------- + + +def _empty_result(source_format: str) -> Dict: + """Return empty result structure.""" + return { + "anatomy_text": "", + "normal_structures": [], + "abnormal_structures": [], + "not_visualized": [], + "anomalies": [], + "hpo_terms": [], + "source_format": source_format, + } \ No newline at end of file diff --git a/tests/etl/sections/test_fetal_anatomy.py b/tests/etl/sections/test_fetal_anatomy.py new file mode 100644 index 0000000..d2f42ff --- /dev/null +++ b/tests/etl/sections/test_fetal_anatomy.py @@ -0,0 +1,219 @@ +"""Tests for fetal anatomy section parser.""" + +import json +import pytest + +from prenatalppkt.etl.sections.fetal_anatomy import parse_fetal_anatomy + + +# --------------------------------------------------------------------- +# Observer JSON +# --------------------------------------------------------------------- + + +class TestFetalAnatomyObserver: + def test_basic_anatomy_structures(self): + """Test parsing of normal/abnormal/unseen structures.""" + data = { + "fetuses": [ + { + "fetus": { + "anatomy_text": "The fetal anatomy was assessed.", + "anatomy": [ + { + "main": {"label": "Head", "anat_state": "Normal"}, + "detail": [], + "anomalies": [], + }, + { + "main": {"label": "Heart", "anat_state": "Abnormal"}, + "detail": [], + "anomalies": [], + }, + { + "main": {"label": "Spine", "anat_state": "Unseen"}, + "detail": [], + "anomalies": [], + }, + ], + } + } + ] + } + + result = parse_fetal_anatomy(data, "observer_json") + + assert "Head" in result["normal_structures"] + assert "Heart" in result["abnormal_structures"] + assert "Spine" in result["not_visualized"] + assert result["anatomy_text"] == "The fetal anatomy was assessed." 
+ assert result["source_format"] == "observer_json" + + def test_anatomy_with_anomalies(self): + """Test parsing of specific anomaly descriptions.""" + data = { + "fetuses": [ + { + "fetus": { + "anatomy_text": "", + "anatomy": [ + { + "main": {"label": "Head", "anat_state": "Abnormal"}, + "detail": [ + { + "label": "Cerebellum", + "anat_det_state": "Abnormal", + } + ], + "anomalies": [ + { + "description": "Dandy Walker", + "abnormal_or_normal_variant": "Abnormal", + } + ], + } + ], + } + } + ] + } + + result = parse_fetal_anatomy(data, "observer_json") + + assert "Head" in result["abnormal_structures"] + assert "Cerebellum" in result["abnormal_structures"] + assert len(result["anomalies"]) == 1 + assert result["anomalies"][0]["structure"] == "Head" + assert result["anomalies"][0]["description"] == "Dandy Walker" + assert result["anomalies"][0]["variant_type"] == "Abnormal" + + def test_anatomy_with_hpo_extraction(self, hpo_cr): + """Test HPO term extraction from anomaly descriptions.""" + data = { + "fetuses": [ + { + "fetus": { + "anatomy_text": "Findings consistent with Dandy-Walker malformation.", + "anatomy": [ + { + "main": {"label": "Brain", "anat_state": "Abnormal"}, + "detail": [], + "anomalies": [ + {"description": "Ventriculomegaly noted"} + ], + } + ], + } + } + ] + } + + result = parse_fetal_anatomy(data, "observer_json", hpo_cr=hpo_cr) + + # Should find HPO terms from the combined text + assert len(result["hpo_terms"]) > 0 + hpo_ids = [t.hpo_id for t in result["hpo_terms"]] + # Dandy-Walker malformation is HP:0001305 + assert "HP:0001305" in hpo_ids or "HP:0002119" in hpo_ids # Ventriculomegaly + + def test_anatomy_json_string_input(self): + """Test that JSON string input is handled correctly.""" + data = json.dumps( + { + "fetuses": [ + { + "fetus": { + "anatomy_text": "Normal anatomy.", + "anatomy": [ + {"main": {"label": "Face", "anat_state": "Normal"}} + ], + } + } + ] + } + ) + + result = parse_fetal_anatomy(data, "observer_json") + + assert 
"Face" in result["normal_structures"] + + def test_empty_fetuses(self): + """Test handling of empty fetuses array.""" + data = {"fetuses": []} + + result = parse_fetal_anatomy(data, "observer_json") + + assert result["normal_structures"] == [] + assert result["abnormal_structures"] == [] + assert result["anomalies"] == [] + + def test_missing_anatomy_key(self): + """Test handling of fetus without anatomy key.""" + data = {"fetuses": [{"fetus": {"anatomy_text": "Some text."}}]} + + result = parse_fetal_anatomy(data, "observer_json") + + assert result["anatomy_text"] == "Some text." + assert result["normal_structures"] == [] + + +# --------------------------------------------------------------------- +# ViewPoint Text (Skeleton) +# --------------------------------------------------------------------- + + +class TestFetalAnatomyViewPointText: + def test_skeleton_returns_empty_structures(self): + """Test that skeleton implementation returns expected structure.""" + text = """Fetal Anatomy +============= +The following structures appear normal: +Cranium. Brain. Face. +""" + + result = parse_fetal_anatomy(text, "viewpoint_text") + + assert result["source_format"] == "viewpoint_text" + assert isinstance(result["normal_structures"], list) + assert isinstance(result["abnormal_structures"], list) + # Skeleton extracts anatomy_text but doesn't parse structure lists yet + assert "normal" in result["anatomy_text"].lower() + + +# --------------------------------------------------------------------- +# ViewPoint HL7 (Skeleton) +# --------------------------------------------------------------------- + + +class TestFetalAnatomyViewPointHL7: + def test_skeleton_returns_empty(self): + """Test that HL7 skeleton returns empty result.""" + hl7 = "MSH|...\nOBX|..." 
+ + result = parse_fetal_anatomy(hl7, "viewpoint_hl7") + + assert result["source_format"] == "viewpoint_hl7" + assert result["normal_structures"] == [] + assert result["anatomy_text"] == "" + + +# --------------------------------------------------------------------- +# Edge Cases +# --------------------------------------------------------------------- + + +class TestFetalAnatomyEdgeCases: + def test_invalid_format(self): + """Test that invalid format raises ValueError.""" + with pytest.raises(ValueError): + parse_fetal_anatomy("data", "invalid_format") + + def test_non_string_viewpoint_text(self): + """Test that non-string viewpoint_text raises ValueError.""" + with pytest.raises(ValueError): + parse_fetal_anatomy({"not": "string"}, "viewpoint_text") + + def test_non_string_viewpoint_hl7(self): + """Test that non-string viewpoint_hl7 raises ValueError.""" + with pytest.raises(ValueError): + parse_fetal_anatomy({"not": "string"}, "viewpoint_hl7") From 7bfc89e554164894eddf45d2545d33cb096bb7be Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Mon, 2 Feb 2026 02:41:56 -0500 Subject: [PATCH 10/20] feat(etl): add estimated fetal weight section parser with growth classification --- .../etl/sections/estimated_fetal_weight.py | 249 +++++++++++++++ .../sections/test_estimated_fetal_weight.py | 295 ++++++++++++++++++ 2 files changed, 544 insertions(+) create mode 100644 src/prenatalppkt/etl/sections/estimated_fetal_weight.py create mode 100644 tests/etl/sections/test_estimated_fetal_weight.py diff --git a/src/prenatalppkt/etl/sections/estimated_fetal_weight.py b/src/prenatalppkt/etl/sections/estimated_fetal_weight.py new file mode 100644 index 0000000..04ac910 --- /dev/null +++ b/src/prenatalppkt/etl/sections/estimated_fetal_weight.py @@ -0,0 +1,249 @@ +""" +Estimated fetal weight (EFW) section parser. + +Extracts EFW values, percentiles, and growth classification. 
+""" + +from __future__ import annotations + +import json +import re +from typing import Dict, List, Optional, Union + + +def parse_estimated_fetal_weight(data: Union[str, Dict], source_format: str) -> Dict: + """ + Parse estimated fetal weight section. + + Supports: + - observer_json + - viewpoint_text (skeleton) + - viewpoint_hl7 (skeleton) + + Args: + data: Raw input data (JSON string, dict, or text) + source_format: One of "observer_json", "viewpoint_text", "viewpoint_hl7" + + Returns: + Dict with keys: + - efw_grams: float - Primary EFW value in grams + - percentile: float - Percentile for primary EFW + - method: str - Calculation method (e.g., "Hadlock (AC, FL, HC)") + - within_normal_range: bool - True if 10th-90th percentile + - growth_category: str - "SGA", "AGA", or "LGA" + - all_estimates: List[Dict] - All EFW calculations + - source_format: str + """ + if source_format == "observer_json": + if isinstance(data, str): + data = json.loads(data) + return _parse_observer_efw(data) + + elif source_format == "viewpoint_text": + if not isinstance(data, str): + raise ValueError("viewpoint_text data must be a string") + return _parse_viewpoint_text_efw(data) + + elif source_format == "viewpoint_hl7": + if not isinstance(data, str): + raise ValueError("viewpoint_hl7 data must be a string") + return _parse_viewpoint_hl7_efw(data) + + else: + raise ValueError(f"Unsupported source_format: {source_format}") + + +# --------------------------------------------------------------------- +# Observer JSON +# --------------------------------------------------------------------- + + +def _parse_observer_efw(json_data: Dict) -> Dict: + """ + Extract EFW from Observer JSON. 
+ + Path: fetuses[i].efws[] + - fetus_number: int + - label: str - method description (e.g., "EFW (AC, FL, HC)") + - value: float - weight in grams + - calculated_percentile: float + - percentile_for_display: str + - print_in_report: int - 1 if this is the primary EFW + - range: str - optional expected range + """ + all_estimates: List[Dict] = [] + primary_efw: Optional[Dict] = None + + # Get first fetus + fetuses = json_data.get("fetuses", []) + if not fetuses: + return _empty_result("observer_json") + + efws = fetuses[0].get("efws", []) + if not efws: + return _empty_result("observer_json") + + for efw in efws: + label = efw.get("label", "") + value = efw.get("value", 0) + percentile = efw.get("calculated_percentile", 0) + print_in_report = efw.get("print_in_report", 0) + + # Extract method from label (e.g., "EFW (AC, FL, HC)" -> "Hadlock (AC, FL, HC)") + method = _extract_method_from_label(label) + + estimate = { + "method": method, + "grams": round(value, 1), + "percentile": round(percentile, 1), + "print_in_report": bool(print_in_report), + } + all_estimates.append(estimate) + + # Select primary EFW (print_in_report=1 or first one) + if print_in_report == 1 and primary_efw is None: + primary_efw = estimate + + # Fallback to first estimate if none marked for report + if primary_efw is None and all_estimates: + primary_efw = all_estimates[0] + + if primary_efw is None: + return _empty_result("observer_json") + + # Classify growth + percentile = primary_efw["percentile"] + growth_category = _classify_growth(percentile) + within_normal = 10 <= percentile <= 90 + + return { + "efw_grams": primary_efw["grams"], + "percentile": primary_efw["percentile"], + "method": primary_efw["method"], + "within_normal_range": within_normal, + "growth_category": growth_category, + "all_estimates": all_estimates, + "source_format": "observer_json", + } + + +# --------------------------------------------------------------------- +# ViewPoint Text (SKELETON) +# 
--------------------------------------------------------------------- + + +def _parse_viewpoint_text_efw(text: str) -> Dict: + """ + Extract EFW from ViewPoint text reports. + + Expected patterns: + EFW 2,042 g 2% + EFW (lb,oz) 4 lb 8 oz + EFW by Hadlock (BPD-HC-AC-FL) + + TODO @VarenyaJ: Implement full parsing + """ + efw_grams = None + percentile = None + method = None + + # Try to find EFW line with grams + efw_pattern = re.compile(r"EFW\s+([0-9,]+)\s+g\s+(\d+)%", re.IGNORECASE) + match = efw_pattern.search(text) + if match: + efw_grams = float(match.group(1).replace(",", "")) + percentile = float(match.group(2)) + + # Try to find method + method_pattern = re.compile(r"EFW by\s+(.+)", re.IGNORECASE) + method_match = method_pattern.search(text) + if method_match: + method = method_match.group(1).strip() + + if efw_grams is None: + return _empty_result("viewpoint_text") + + growth_category = _classify_growth(percentile) if percentile else "Unknown" + within_normal = 10 <= percentile <= 90 if percentile else False + + return { + "efw_grams": efw_grams, + "percentile": percentile, + "method": method or "Unknown", + "within_normal_range": within_normal, + "growth_category": growth_category, + "all_estimates": [ + { + "method": method or "Unknown", + "grams": efw_grams, + "percentile": percentile, + } + ], + "source_format": "viewpoint_text", + } + + +# --------------------------------------------------------------------- +# ViewPoint HL7 (SKELETON) +# --------------------------------------------------------------------- + + +def _parse_viewpoint_hl7_efw(hl7: str) -> Dict: + """ + Extract EFW from HL7 ORU^R01 messages. + + Note: EFW may not be present in all HL7 exports. + This is a skeleton for potential future implementation. 
+ + TODO @VarenyaJ: Implement if HL7 EFW encoding is discovered + """ + return _empty_result("viewpoint_hl7") + + +# --------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------- + + +def _extract_method_from_label(label: str) -> str: + """ + Extract method from EFW label. + + Examples: + "EFW (AC, FL, HC)" -> "Hadlock (AC, FL, HC)" + "EFW (AC, FL)" -> "Hadlock (AC, FL)" + """ + match = re.search(r"\(([^)]+)\)", label) + if match: + params = match.group(1) + return f"Hadlock ({params})" + return "Hadlock" + + +def _classify_growth(percentile: float) -> str: + """ + Classify fetal growth based on EFW percentile. + + - SGA (Small for Gestational Age): <10th percentile + - AGA (Appropriate for Gestational Age): 10th-90th percentile + - LGA (Large for Gestational Age): >90th percentile + """ + if percentile < 10: + return "SGA" + elif percentile > 90: + return "LGA" + else: + return "AGA" + + +def _empty_result(source_format: str) -> Dict: + """Return empty result structure.""" + return { + "efw_grams": None, + "percentile": None, + "method": None, + "within_normal_range": None, + "growth_category": None, + "all_estimates": [], + "source_format": source_format, + } diff --git a/tests/etl/sections/test_estimated_fetal_weight.py b/tests/etl/sections/test_estimated_fetal_weight.py new file mode 100644 index 0000000..68bc210 --- /dev/null +++ b/tests/etl/sections/test_estimated_fetal_weight.py @@ -0,0 +1,295 @@ +"""Tests for estimated fetal weight section parser.""" + +import json +import pytest + +from prenatalppkt.etl.sections.estimated_fetal_weight import ( + parse_estimated_fetal_weight, +) + + +# --------------------------------------------------------------------- +# Observer JSON +# --------------------------------------------------------------------- + + +class TestEstimatedFetalWeightObserver: + def test_basic_efw(self): + """Test parsing of basic EFW data.""" + data = { + 
"fetuses": [ + { + "efws": [ + { + "fetus_number": 1, + "label": "EFW (AC, FL, HC)", + "value": 1014.828, + "calculated_percentile": 55.6, + "percentile_for_display": "56%", + "print_in_report": 1, + } + ] + } + ] + } + + result = parse_estimated_fetal_weight(data, "observer_json") + + assert result["efw_grams"] == 1014.8 + assert result["percentile"] == 55.6 + assert result["method"] == "Hadlock (AC, FL, HC)" + assert result["within_normal_range"] is True + assert result["growth_category"] == "AGA" + assert result["source_format"] == "observer_json" + + def test_multiple_efw_estimates(self): + """Test that primary EFW is selected correctly.""" + data = { + "fetuses": [ + { + "efws": [ + { + "label": "EFW (AC, FL, HC)", + "value": 1014.828, + "calculated_percentile": 55.6, + "print_in_report": 1, + }, + { + "label": "EFW (AC, FL)", + "value": 1042.214, + "calculated_percentile": 63.7, + "print_in_report": 0, + }, + { + "label": "EFW (AC, BPD)", + "value": 1000.887, + "calculated_percentile": 51.2, + "print_in_report": 0, + }, + ] + } + ] + } + + result = parse_estimated_fetal_weight(data, "observer_json") + + # Should select the one with print_in_report=1 + assert result["efw_grams"] == 1014.8 + assert len(result["all_estimates"]) == 3 + + def test_sga_classification(self): + """Test SGA (Small for Gestational Age) classification.""" + data = { + "fetuses": [ + { + "efws": [ + { + "label": "EFW (AC, FL, HC)", + "value": 800.0, + "calculated_percentile": 5.0, + "print_in_report": 1, + } + ] + } + ] + } + + result = parse_estimated_fetal_weight(data, "observer_json") + + assert result["growth_category"] == "SGA" + assert result["within_normal_range"] is False + + def test_lga_classification(self): + """Test LGA (Large for Gestational Age) classification.""" + data = { + "fetuses": [ + { + "efws": [ + { + "label": "EFW (AC, FL, HC)", + "value": 2500.0, + "calculated_percentile": 95.0, + "print_in_report": 1, + } + ] + } + ] + } + + result = 
parse_estimated_fetal_weight(data, "observer_json") + + assert result["growth_category"] == "LGA" + assert result["within_normal_range"] is False + + def test_json_string_input(self): + """Test that JSON string input is handled correctly.""" + data = json.dumps( + { + "fetuses": [ + { + "efws": [ + { + "label": "EFW (AC, FL)", + "value": 1200.0, + "calculated_percentile": 50.0, + "print_in_report": 1, + } + ] + } + ] + } + ) + + result = parse_estimated_fetal_weight(data, "observer_json") + + assert result["efw_grams"] == 1200.0 + + def test_empty_fetuses(self): + """Test handling of empty fetuses array.""" + data = {"fetuses": []} + + result = parse_estimated_fetal_weight(data, "observer_json") + + assert result["efw_grams"] is None + assert result["all_estimates"] == [] + + def test_missing_efws_key(self): + """Test handling of fetus without efws key.""" + data = {"fetuses": [{"fetus": {}}]} + + result = parse_estimated_fetal_weight(data, "observer_json") + + assert result["efw_grams"] is None + + def test_fallback_to_first_estimate(self): + """Test fallback when no estimate has print_in_report=1.""" + data = { + "fetuses": [ + { + "efws": [ + { + "label": "EFW (AC, FL)", + "value": 1100.0, + "calculated_percentile": 45.0, + "print_in_report": 0, + }, + { + "label": "EFW (AC, BPD)", + "value": 1050.0, + "calculated_percentile": 40.0, + "print_in_report": 0, + }, + ] + } + ] + } + + result = parse_estimated_fetal_weight(data, "observer_json") + + # Should fall back to first estimate + assert result["efw_grams"] == 1100.0 + + +# --------------------------------------------------------------------- +# ViewPoint Text (Skeleton) +# --------------------------------------------------------------------- + + +class TestEstimatedFetalWeightViewPointText: + def test_skeleton_returns_structure(self): + """Test that skeleton implementation returns expected structure.""" + text = "EFW 2,042 g 2%\nEFW by Hadlock" + + result = parse_estimated_fetal_weight(text, "viewpoint_text") 
+ + assert result["source_format"] == "viewpoint_text" + # Skeleton may parse basic patterns + assert isinstance(result["all_estimates"], list) + + def test_no_efw_in_text(self): + """Test handling when no EFW is found.""" + text = "Fetal Biometry\nHC 250 mm" + + result = parse_estimated_fetal_weight(text, "viewpoint_text") + + assert result["efw_grams"] is None + + +# --------------------------------------------------------------------- +# ViewPoint HL7 (Skeleton) +# --------------------------------------------------------------------- + + +class TestEstimatedFetalWeightViewPointHL7: + def test_skeleton_returns_empty(self): + """Test that HL7 skeleton returns empty result.""" + hl7 = "MSH|...\nOBX|..." + + result = parse_estimated_fetal_weight(hl7, "viewpoint_hl7") + + assert result["source_format"] == "viewpoint_hl7" + assert result["efw_grams"] is None + + +# --------------------------------------------------------------------- +# Edge Cases +# --------------------------------------------------------------------- + + +class TestEstimatedFetalWeightEdgeCases: + def test_invalid_format(self): + """Test that invalid format raises ValueError.""" + with pytest.raises(ValueError): + parse_estimated_fetal_weight("data", "invalid_format") + + def test_non_string_viewpoint_text(self): + """Test that non-string viewpoint_text raises ValueError.""" + with pytest.raises(ValueError): + parse_estimated_fetal_weight({"not": "string"}, "viewpoint_text") + + def test_non_string_viewpoint_hl7(self): + """Test that non-string viewpoint_hl7 raises ValueError.""" + with pytest.raises(ValueError): + parse_estimated_fetal_weight({"not": "string"}, "viewpoint_hl7") + + def test_boundary_aga_at_10_percentile(self): + """Test AGA classification at exactly 10th percentile.""" + data = { + "fetuses": [ + { + "efws": [ + { + "label": "EFW", + "value": 900.0, + "calculated_percentile": 10.0, + "print_in_report": 1, + } + ] + } + ] + } + + result = parse_estimated_fetal_weight(data, 
"observer_json") + assert result["growth_category"] == "AGA" + assert result["within_normal_range"] is True + + def test_boundary_aga_at_90_percentile(self): + """Test AGA classification at exactly 90th percentile.""" + data = { + "fetuses": [ + { + "efws": [ + { + "label": "EFW", + "value": 2000.0, + "calculated_percentile": 90.0, + "print_in_report": 1, + } + ] + } + ] + } + + result = parse_estimated_fetal_weight(data, "observer_json") + assert result["growth_category"] == "AGA" + assert result["within_normal_range"] is True From 52cd807bc791f024373e2b4e4973b0137078dccb Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Mon, 2 Feb 2026 02:42:19 -0500 Subject: [PATCH 11/20] feat(etl): add fetal ratios section parser with proportionality assessment --- src/prenatalppkt/etl/sections/fetal_ratios.py | 246 ++++++++++++++++ tests/etl/sections/test_fetal_ratios.py | 271 ++++++++++++++++++ 2 files changed, 517 insertions(+) create mode 100644 src/prenatalppkt/etl/sections/fetal_ratios.py create mode 100644 tests/etl/sections/test_fetal_ratios.py diff --git a/src/prenatalppkt/etl/sections/fetal_ratios.py b/src/prenatalppkt/etl/sections/fetal_ratios.py new file mode 100644 index 0000000..732b715 --- /dev/null +++ b/src/prenatalppkt/etl/sections/fetal_ratios.py @@ -0,0 +1,246 @@ +""" +Fetal ratios section parser. + +Extracts biometric ratios (HC/AC, FL/BPD, FL/AC) and assesses proportionality. +""" + +from __future__ import annotations + +import json +import re +from typing import Dict, List, Optional, Tuple, Union + + +def parse_fetal_ratios(data: Union[str, Dict], source_format: str) -> Dict: + """ + Parse fetal ratios section. 
+ + Supports: + - observer_json + - viewpoint_text (skeleton) + - viewpoint_hl7 (skeleton) + + Args: + data: Raw input data (JSON string, dict, or text) + source_format: One of "observer_json", "viewpoint_text", "viewpoint_hl7" + + Returns: + Dict with keys: + - ratios: List[Dict] - Individual ratio data + - all_within_range: bool - True if all ratios are normal + - proportionality_assessment: str - "Normal" or "Asymmetric" + - source_format: str + """ + if source_format == "observer_json": + if isinstance(data, str): + data = json.loads(data) + return _parse_observer_ratios(data) + + elif source_format == "viewpoint_text": + if not isinstance(data, str): + raise ValueError("viewpoint_text data must be a string") + return _parse_viewpoint_text_ratios(data) + + elif source_format == "viewpoint_hl7": + if not isinstance(data, str): + raise ValueError("viewpoint_hl7 data must be a string") + return _parse_viewpoint_hl7_ratios(data) + + else: + raise ValueError(f"Unsupported source_format: {source_format}") + + +# --------------------------------------------------------------------- +# Observer JSON +# --------------------------------------------------------------------- + + +def _parse_observer_ratios(json_data: Dict) -> Dict: + """ + Extract ratios from Observer JSON. 
+ + Path: fetuses[i].ratios[] + - label: str - ratio name (e.g., "HC/AC", "FL/BPD") + - value: float - calculated ratio value + - range: str - expected normal range (e.g., "1.04 - 1.22") + - fetus_number: int + """ + ratios: List[Dict] = [] + + # Get first fetus + fetuses = json_data.get("fetuses", []) + if not fetuses: + return _empty_result("observer_json") + + ratio_list = fetuses[0].get("ratios", []) + if not ratio_list: + return _empty_result("observer_json") + + all_within_range = True + + for ratio in ratio_list: + label = ratio.get("label", "") + value = ratio.get("value", 0) + range_str = ratio.get("range", "") + + # Parse expected range + expected_range = _parse_range_string(range_str) + + # Check if within range + within_range = _is_within_range(value, expected_range) + if not within_range: + all_within_range = False + + ratios.append( + { + "name": label, + "value": round(value, 3) if isinstance(value, float) else value, + "expected_range": expected_range, + "within_range": within_range, + } + ) + + # Assess overall proportionality + # Asymmetric growth typically indicated by abnormal HC/AC ratio + proportionality = _assess_proportionality(ratios) + + return { + "ratios": ratios, + "all_within_range": all_within_range, + "proportionality_assessment": proportionality, + "source_format": "observer_json", + } + + +# --------------------------------------------------------------------- +# ViewPoint Text (SKELETON) +# --------------------------------------------------------------------- + + +def _parse_viewpoint_text_ratios(text: str) -> Dict: + """ + Extract ratios from ViewPoint text reports. 
+ + Expected pattern (under Fetal Biometry section): + FL / HC 0.23 + + TODO @VarenyaJ: Implement full parsing + """ + ratios: List[Dict] = [] + + # Try to find ratio lines + # Pattern: FL / HC 0.23 + ratio_pattern = re.compile( + r"(FL|HC|AC|BPD)\s*/\s*(FL|HC|AC|BPD)\s+([\d.]+)", re.IGNORECASE + ) + + for match in ratio_pattern.finditer(text): + name = f"{match.group(1).upper()}/{match.group(2).upper()}" + value = float(match.group(3)) + ratios.append( + { + "name": name, + "value": value, + "expected_range": None, # Not available in text format + "within_range": None, + } + ) + + return { + "ratios": ratios, + "all_within_range": None, # Cannot assess without ranges + "proportionality_assessment": "Unknown", + "source_format": "viewpoint_text", + } + + +# --------------------------------------------------------------------- +# ViewPoint HL7 (SKELETON) +# --------------------------------------------------------------------- + + +def _parse_viewpoint_hl7_ratios(hl7: str) -> Dict: + """ + Extract ratios from HL7 ORU^R01 messages. + + Note: Ratios may not be present in all HL7 exports. + This is a skeleton for potential future implementation. + + TODO @VarenyaJ: Implement if HL7 ratio encoding is discovered + """ + return _empty_result("viewpoint_hl7") + + +# --------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------- + + +def _parse_range_string(range_str: str) -> Optional[Tuple[float, float]]: + """ + Parse a range string into a tuple. 
+ + Examples: + "1.04 - 1.22" -> (1.04, 1.22) + "20 - 24" -> (20.0, 24.0) + "" -> None + """ + if not range_str: + return None + + # Pattern: "min - max" or "min-max" + match = re.match(r"([\d.]+)\s*-\s*([\d.]+)", range_str.strip()) + if match: + return (float(match.group(1)), float(match.group(2))) + + return None + + +def _is_within_range( + value: float, expected_range: Optional[Tuple[float, float]] +) -> Optional[bool]: + """ + Check if a value is within the expected range. + + Returns None if range is not available. + """ + if expected_range is None: + return None + + min_val, max_val = expected_range + return min_val <= value <= max_val + + +def _assess_proportionality(ratios: List[Dict]) -> str: + """ + Assess overall fetal proportionality based on ratios. + + Asymmetric growth is typically indicated when: + - HC/AC ratio is abnormal (head-sparing or brain-sparing pattern) + - FL/AC ratio is abnormal + """ + if not ratios: + return "Unknown" + + # Check HC/AC specifically for asymmetric growth + for ratio in ratios: + if ratio["name"] == "HC/AC" and ratio["within_range"] is False: + return "Asymmetric" + + # Check if all ratios with known ranges are within range + ratios_with_ranges = [r for r in ratios if r["within_range"] is not None] + if not ratios_with_ranges: + return "Unknown" + + all_normal = all(r["within_range"] for r in ratios_with_ranges) + return "Normal" if all_normal else "Asymmetric" + + +def _empty_result(source_format: str) -> Dict: + """Return empty result structure.""" + return { + "ratios": [], + "all_within_range": None, + "proportionality_assessment": "Unknown", + "source_format": source_format, + } diff --git a/tests/etl/sections/test_fetal_ratios.py b/tests/etl/sections/test_fetal_ratios.py new file mode 100644 index 0000000..b5acd66 --- /dev/null +++ b/tests/etl/sections/test_fetal_ratios.py @@ -0,0 +1,271 @@ +"""Tests for fetal ratios section parser.""" + +import json +import pytest + +from prenatalppkt.etl.sections.fetal_ratios 
import parse_fetal_ratios + + +# --------------------------------------------------------------------- +# Observer JSON +# --------------------------------------------------------------------- + + +class TestFetalRatiosObserver: + def test_basic_ratios(self): + """Test parsing of basic ratio data.""" + data = { + "fetuses": [ + { + "ratios": [ + { + "label": "HC/AC", + "value": 1.105, + "range": "1.04 - 1.22", + "fetus_number": 1, + }, + { + "label": "FL/AC", + "value": 22.149, + "range": "20 - 24", + "fetus_number": 1, + }, + { + "label": "FL/BPD", + "value": 75, + "range": "71 - 87", + "fetus_number": 1, + }, + ] + } + ] + } + + result = parse_fetal_ratios(data, "observer_json") + + assert len(result["ratios"]) == 3 + assert result["all_within_range"] is True + assert result["proportionality_assessment"] == "Normal" + assert result["source_format"] == "observer_json" + + # Check specific ratio + hc_ac = next(r for r in result["ratios"] if r["name"] == "HC/AC") + assert hc_ac["value"] == 1.105 + assert hc_ac["expected_range"] == (1.04, 1.22) + assert hc_ac["within_range"] is True + + def test_ratio_out_of_range(self): + """Test detection of out-of-range ratio.""" + data = { + "fetuses": [ + { + "ratios": [ + { + "label": "HC/AC", + "value": 1.35, # Above normal range + "range": "1.04 - 1.22", + "fetus_number": 1, + } + ] + } + ] + } + + result = parse_fetal_ratios(data, "observer_json") + + assert result["all_within_range"] is False + assert result["proportionality_assessment"] == "Asymmetric" + assert result["ratios"][0]["within_range"] is False + + def test_asymmetric_growth_detection(self): + """Test asymmetric growth pattern detection via HC/AC.""" + data = { + "fetuses": [ + { + "ratios": [ + { + "label": "HC/AC", + "value": 0.95, # Below normal - head-sparing + "range": "1.04 - 1.22", + }, + { + "label": "FL/BPD", + "value": 80, # Within range + "range": "71 - 87", + }, + ] + } + ] + } + + result = parse_fetal_ratios(data, "observer_json") + + assert 
result["proportionality_assessment"] == "Asymmetric" + + def test_json_string_input(self): + """Test that JSON string input is handled correctly.""" + data = json.dumps( + { + "fetuses": [ + { + "ratios": [ + {"label": "HC/AC", "value": 1.1, "range": "1.04 - 1.22"} + ] + } + ] + } + ) + + result = parse_fetal_ratios(data, "observer_json") + + assert len(result["ratios"]) == 1 + + def test_empty_fetuses(self): + """Test handling of empty fetuses array.""" + data = {"fetuses": []} + + result = parse_fetal_ratios(data, "observer_json") + + assert result["ratios"] == [] + assert result["all_within_range"] is None + + def test_missing_ratios_key(self): + """Test handling of fetus without ratios key.""" + data = {"fetuses": [{"fetus": {}}]} + + result = parse_fetal_ratios(data, "observer_json") + + assert result["ratios"] == [] + + def test_ratio_without_range(self): + """Test handling of ratio without expected range.""" + data = { + "fetuses": [ + { + "ratios": [ + { + "label": "HC/AC", + "value": 1.1, + "range": "", # Empty range + } + ] + } + ] + } + + result = parse_fetal_ratios(data, "observer_json") + + assert result["ratios"][0]["expected_range"] is None + assert result["ratios"][0]["within_range"] is None + + def test_boundary_values(self): + """Test boundary values at exactly min and max of range.""" + data = { + "fetuses": [ + { + "ratios": [ + { + "label": "HC/AC", + "value": 1.04, + "range": "1.04 - 1.22", + }, # At min + {"label": "FL/AC", "value": 24, "range": "20 - 24"}, # At max + ] + } + ] + } + + result = parse_fetal_ratios(data, "observer_json") + + assert all(r["within_range"] for r in result["ratios"]) + + +# --------------------------------------------------------------------- +# ViewPoint Text (Skeleton) +# --------------------------------------------------------------------- + + +class TestFetalRatiosViewPointText: + def test_skeleton_parses_ratio_pattern(self): + """Test that skeleton can parse basic ratio patterns.""" + text = """Fetal Biometry 
+============ +FL / HC 0.23 +""" + + result = parse_fetal_ratios(text, "viewpoint_text") + + assert result["source_format"] == "viewpoint_text" + # Skeleton may parse the FL/HC ratio + assert isinstance(result["ratios"], list) + + def test_no_ratios_in_text(self): + """Test handling when no ratios are found.""" + text = "Fetal Biometry\nHC 250 mm" + + result = parse_fetal_ratios(text, "viewpoint_text") + + assert result["ratios"] == [] + + +# --------------------------------------------------------------------- +# ViewPoint HL7 (Skeleton) +# --------------------------------------------------------------------- + + +class TestFetalRatiosViewPointHL7: + def test_skeleton_returns_empty(self): + """Test that HL7 skeleton returns empty result.""" + hl7 = "MSH|...\nOBX|..." + + result = parse_fetal_ratios(hl7, "viewpoint_hl7") + + assert result["source_format"] == "viewpoint_hl7" + assert result["ratios"] == [] + + +# --------------------------------------------------------------------- +# Edge Cases +# --------------------------------------------------------------------- + + +class TestFetalRatiosEdgeCases: + def test_invalid_format(self): + """Test that invalid format raises ValueError.""" + with pytest.raises(ValueError): + parse_fetal_ratios("data", "invalid_format") + + def test_non_string_viewpoint_text(self): + """Test that non-string viewpoint_text raises ValueError.""" + with pytest.raises(ValueError): + parse_fetal_ratios({"not": "string"}, "viewpoint_text") + + def test_non_string_viewpoint_hl7(self): + """Test that non-string viewpoint_hl7 raises ValueError.""" + with pytest.raises(ValueError): + parse_fetal_ratios({"not": "string"}, "viewpoint_hl7") + + def test_malformed_range_string(self): + """Test handling of malformed range string.""" + data = { + "fetuses": [ + {"ratios": [{"label": "HC/AC", "value": 1.1, "range": "invalid"}]} + ] + } + + result = parse_fetal_ratios(data, "observer_json") + + assert result["ratios"][0]["expected_range"] is None + + def 
test_integer_ratio_value(self): + """Test that integer ratio values are handled.""" + data = { + "fetuses": [ + {"ratios": [{"label": "FL/BPD", "value": 75, "range": "71 - 87"}]} + ] + } + + result = parse_fetal_ratios(data, "observer_json") + + assert result["ratios"][0]["value"] == 75 + assert result["ratios"][0]["within_range"] is True From 0972eb5e062783b65067af440d6dc21aa35953c5 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Mon, 2 Feb 2026 02:42:35 -0500 Subject: [PATCH 12/20] feat(etl): export new section parsers from sections module --- src/prenatalppkt/etl/sections/__init__.py | 29 +++++++++++++++-------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/src/prenatalppkt/etl/sections/__init__.py b/src/prenatalppkt/etl/sections/__init__.py index 2d3fc17..3893145 100644 --- a/src/prenatalppkt/etl/sections/__init__.py +++ b/src/prenatalppkt/etl/sections/__init__.py @@ -2,18 +2,21 @@ Section parsers for non-biometry clinical data. These parsers extract additional clinical information from ultrasound reports -beyond fetal biometry measurements. They are designed to eventually integrate -with HPO Clinical Record (CR) modules for comprehensive phenotype capture. +beyond fetal biometry measurements. They return Dict objects with parsed data. 
-Current Status: SKELETON IMPLEMENTATIONS -- Basic parsing structure in place -- Returns placeholder data -- TODO comments describe future implementation +Implemented parsers: +- parse_clinical_indication: Extract reason for exam +- parse_pregnancy_dating: Extract LMP, EDD, gestational age +- parse_clinical_impression: Extract clinical narrative and HPO terms +- parse_fetal_anatomy: Extract anatomy findings and HPO terms +- parse_estimated_fetal_weight: Extract EFW and growth classification +- parse_fetal_ratios: Extract biometric ratios and proportionality -Future Integration: -- Map findings to HPO terms using src/prenatalppkt/hpo modules -- Support symmetric processing across Observer JSON, ViewPoint Text, and HL7 -- Enable full phenotype packet generation +Skeleton parsers (TODO): +- parse_maternal_history: OB history, complications +- parse_placenta: Placental assessment +- parse_amniotic_fluid: AFI, MVP measurements +- parse_umbilical_cord: Vessel count, insertion site """ from prenatalppkt.etl.sections.maternal_history import parse_maternal_history @@ -21,6 +24,10 @@ from prenatalppkt.etl.sections.clinical_indication import parse_clinical_indication from prenatalppkt.etl.sections.pregnancy_dating import parse_pregnancy_dating from prenatalppkt.etl.sections.fetal_anatomy import parse_fetal_anatomy +from prenatalppkt.etl.sections.estimated_fetal_weight import ( + parse_estimated_fetal_weight, +) +from prenatalppkt.etl.sections.fetal_ratios import parse_fetal_ratios from prenatalppkt.etl.sections.placenta import parse_placenta from prenatalppkt.etl.sections.amniotic_fluid import parse_amniotic_fluid from prenatalppkt.etl.sections.umbilical_cord import parse_umbilical_cord @@ -31,6 +38,8 @@ "parse_clinical_indication", "parse_pregnancy_dating", "parse_fetal_anatomy", + "parse_estimated_fetal_weight", + "parse_fetal_ratios", "parse_placenta", "parse_amniotic_fluid", "parse_umbilical_cord", From f4285b4204f1d285579d0da2eabeb876a9dcfc03 Mon Sep 17 00:00:00 2001 
From: VarenyaJ Date: Mon, 2 Feb 2026 02:43:23 -0500 Subject: [PATCH 13/20] ruffing --- .../etl/sections/fetal_anatomy.py | 382 +++++++++--------- 1 file changed, 190 insertions(+), 192 deletions(-) diff --git a/src/prenatalppkt/etl/sections/fetal_anatomy.py b/src/prenatalppkt/etl/sections/fetal_anatomy.py index 5613aef..34c0351 100644 --- a/src/prenatalppkt/etl/sections/fetal_anatomy.py +++ b/src/prenatalppkt/etl/sections/fetal_anatomy.py @@ -13,49 +13,49 @@ def parse_fetal_anatomy( - data: Union[str, Dict], source_format: str, hpo_cr=None + data: Union[str, Dict], source_format: str, hpo_cr=None ) -> Dict: - """ - Parse fetal anatomy section. - - Supports: - - observer_json - - viewpoint_text (skeleton) - - viewpoint_hl7 (skeleton) - - Args: - data: Raw input data (JSON string, dict, or text) - source_format: One of "observer_json", "viewpoint_text", "viewpoint_hl7" - hpo_cr: Optional HpoExactConceptRecognizer for HPO term extraction. - If provided, will extract HPO terms from anomaly descriptions. 
- - Returns: - Dict with keys: - - anatomy_text: str - Free text anatomy narrative - - normal_structures: List[str] - Structures marked Normal - - abnormal_structures: List[str] - Structures marked Abnormal - - not_visualized: List[str] - Structures marked Unseen - - anomalies: List[Dict] - Specific anomaly findings - - hpo_terms: List[SimpleTerm] - HPO terms extracted via CR - - source_format: str - """ - if source_format == "observer_json": - if isinstance(data, str): - data = json.loads(data) - return _parse_observer_anatomy(data, hpo_cr) - - elif source_format == "viewpoint_text": - if not isinstance(data, str): - raise ValueError("viewpoint_text data must be a string") - return _parse_viewpoint_text_anatomy(data, hpo_cr) - - elif source_format == "viewpoint_hl7": - if not isinstance(data, str): - raise ValueError("viewpoint_hl7 data must be a string") - return _parse_viewpoint_hl7_anatomy(data, hpo_cr) - - else: - raise ValueError(f"Unsupported source_format: {source_format}") + """ + Parse fetal anatomy section. + + Supports: + - observer_json + - viewpoint_text (skeleton) + - viewpoint_hl7 (skeleton) + + Args: + data: Raw input data (JSON string, dict, or text) + source_format: One of "observer_json", "viewpoint_text", "viewpoint_hl7" + hpo_cr: Optional HpoExactConceptRecognizer for HPO term extraction. + If provided, will extract HPO terms from anomaly descriptions. 
+ + Returns: + Dict with keys: + - anatomy_text: str - Free text anatomy narrative + - normal_structures: List[str] - Structures marked Normal + - abnormal_structures: List[str] - Structures marked Abnormal + - not_visualized: List[str] - Structures marked Unseen + - anomalies: List[Dict] - Specific anomaly findings + - hpo_terms: List[SimpleTerm] - HPO terms extracted via CR + - source_format: str + """ + if source_format == "observer_json": + if isinstance(data, str): + data = json.loads(data) + return _parse_observer_anatomy(data, hpo_cr) + + elif source_format == "viewpoint_text": + if not isinstance(data, str): + raise ValueError("viewpoint_text data must be a string") + return _parse_viewpoint_text_anatomy(data, hpo_cr) + + elif source_format == "viewpoint_hl7": + if not isinstance(data, str): + raise ValueError("viewpoint_hl7 data must be a string") + return _parse_viewpoint_hl7_anatomy(data, hpo_cr) + + else: + raise ValueError(f"Unsupported source_format: {source_format}") # --------------------------------------------------------------------- @@ -64,113 +64,111 @@ def parse_fetal_anatomy( def _classify_structure( - label: str, - state: str, - normal: List[str], - abnormal: List[str], - unseen: List[str], + label: str, state: str, normal: List[str], abnormal: List[str], unseen: List[str] ) -> None: - """Classify a structure into the appropriate list based on state.""" - if not label: - return - if state == "Normal" and label not in normal: - normal.append(label) - elif state == "Abnormal" and label not in abnormal: - abnormal.append(label) - elif state == "Unseen" and label not in unseen: - unseen.append(label) + """Classify a structure into the appropriate list based on state.""" + if not label: + return + if state == "Normal" and label not in normal: + normal.append(label) + elif state == "Abnormal" and label not in abnormal: + abnormal.append(label) + elif state == "Unseen" and label not in unseen: + unseen.append(label) def _process_anatomy_item( - 
item: Dict, - normal: List[str], - abnormal: List[str], - unseen: List[str], - anomalies: List[Dict], + item: Dict, + normal: List[str], + abnormal: List[str], + unseen: List[str], + anomalies: List[Dict], ) -> None: - """Process a single anatomy item, extracting structures and anomalies.""" - main = item.get("main", {}) - label = main.get("label", "") - state = main.get("anat_state", "") - - # Classify main structure - _classify_structure(label, state, normal, abnormal, unseen) - - # Process detail sub-structures - for detail in item.get("detail", []): - detail_label = detail.get("label", "") - detail_state = detail.get("anat_det_state", "") - _classify_structure(detail_label, detail_state, normal, abnormal, unseen) - - # Process anomalies - for anom in item.get("anomalies", []): - description = anom.get("description", "") - if description: - anomalies.append({ - "structure": label, - "description": description, - "variant_type": anom.get("abnormal_or_normal_variant", "Abnormal"), - }) + """Process a single anatomy item, extracting structures and anomalies.""" + main = item.get("main", {}) + label = main.get("label", "") + state = main.get("anat_state", "") + + # Classify main structure + _classify_structure(label, state, normal, abnormal, unseen) + + # Process detail sub-structures + for detail in item.get("detail", []): + detail_label = detail.get("label", "") + detail_state = detail.get("anat_det_state", "") + _classify_structure(detail_label, detail_state, normal, abnormal, unseen) + + # Process anomalies + for anom in item.get("anomalies", []): + description = anom.get("description", "") + if description: + anomalies.append( + { + "structure": label, + "description": description, + "variant_type": anom.get("abnormal_or_normal_variant", "Abnormal"), + } + ) def _extract_hpo_terms(anatomy_text: str, anomalies: List[Dict], hpo_cr) -> List: - """Extract HPO terms from anatomy text and anomaly descriptions.""" - if hpo_cr is None or not hasattr(hpo_cr, "parse"): - 
return [] + """Extract HPO terms from anatomy text and anomaly descriptions.""" + if hpo_cr is None or not hasattr(hpo_cr, "parse"): + return [] - all_anomaly_text = " ".join( - a["description"] for a in anomalies if a.get("description") - ) - combined_text = f"{anatomy_text} {all_anomaly_text}".strip() + all_anomaly_text = " ".join( + a["description"] for a in anomalies if a.get("description") + ) + combined_text = f"{anatomy_text} {all_anomaly_text}".strip() - if not combined_text: - return [] + if not combined_text: + return [] - return hpo_cr.parse(combined_text) + return hpo_cr.parse(combined_text) def _parse_observer_anatomy(json_data: Dict, hpo_cr=None) -> Dict: - """ - Extract anatomy findings from Observer JSON. - - Paths: - - fetuses[i].fetus.anatomy_text - free text narrative - - fetuses[i].fetus.anatomy[] - structured findings - - main.label - structure name (e.g., "Head", "Face") - - main.anat_state - "Normal", "Abnormal", or "Unseen" - - detail[].label - sub-structure name - - detail[].anat_det_state - sub-structure state - - anomalies[].description - specific finding text - - anomalies[].abnormal_or_normal_variant - classification - """ - fetuses = json_data.get("fetuses", []) - if not fetuses: - return _empty_result("observer_json") - - fetus_block = fetuses[0].get("fetus", {}) - anatomy_text = fetus_block.get("anatomy_text", "") - - normal_structures: List[str] = [] - abnormal_structures: List[str] = [] - not_visualized: List[str] = [] - anomalies: List[Dict] = [] - - for item in fetus_block.get("anatomy", []): - _process_anatomy_item( - item, normal_structures, abnormal_structures, not_visualized, anomalies - ) - - hpo_terms = _extract_hpo_terms(anatomy_text, anomalies, hpo_cr) - - return { - "anatomy_text": anatomy_text, - "normal_structures": normal_structures, - "abnormal_structures": abnormal_structures, - "not_visualized": not_visualized, - "anomalies": anomalies, - "hpo_terms": hpo_terms, - "source_format": "observer_json", - } + """ + 
Extract anatomy findings from Observer JSON. + + Paths: + - fetuses[i].fetus.anatomy_text - free text narrative + - fetuses[i].fetus.anatomy[] - structured findings + - main.label - structure name (e.g., "Head", "Face") + - main.anat_state - "Normal", "Abnormal", or "Unseen" + - detail[].label - sub-structure name + - detail[].anat_det_state - sub-structure state + - anomalies[].description - specific finding text + - anomalies[].abnormal_or_normal_variant - classification + """ + fetuses = json_data.get("fetuses", []) + if not fetuses: + return _empty_result("observer_json") + + fetus_block = fetuses[0].get("fetus", {}) + anatomy_text = fetus_block.get("anatomy_text", "") + + normal_structures: List[str] = [] + abnormal_structures: List[str] = [] + not_visualized: List[str] = [] + anomalies: List[Dict] = [] + + for item in fetus_block.get("anatomy", []): + _process_anatomy_item( + item, normal_structures, abnormal_structures, not_visualized, anomalies + ) + + hpo_terms = _extract_hpo_terms(anatomy_text, anomalies, hpo_cr) + + return { + "anatomy_text": anatomy_text, + "normal_structures": normal_structures, + "abnormal_structures": abnormal_structures, + "not_visualized": not_visualized, + "anomalies": anomalies, + "hpo_terms": hpo_terms, + "source_format": "observer_json", + } # --------------------------------------------------------------------- @@ -179,46 +177,46 @@ def _parse_observer_anatomy(json_data: Dict, hpo_cr=None) -> Dict: def _parse_viewpoint_text_anatomy(text: str, hpo_cr=None) -> Dict: - """ - Extract anatomy from ViewPoint text reports. - - Expected pattern: - Fetal Anatomy - ============= - The following structures appear normal: - Cranium. Brain. Face. ... - - The following structures appear abnormal: - GI tract: dilated bowel loops. - - The following structures could not be adequately visualized: - LVOT view. RVOT view. ... 
- - TODO @VarenyaJ: Implement full parsing - """ - # Skeleton: Extract the Fetal Anatomy section - pattern = re.compile( - r"Fetal Anatomy\s*\n=+\n(?P<body>.*?)(?:\n[A-Z][^\n]*\n=+|\Z)", - re.DOTALL | re.IGNORECASE, - ) - match = pattern.search(text) - anatomy_text = match.group("body").strip() if match else "" - - # TODO @VarenyaJ: Parse "appear normal", "appear abnormal", "could not be visualized" lists - - hpo_terms = [] - if anatomy_text and hpo_cr is not None and hasattr(hpo_cr, "parse"): - hpo_terms = hpo_cr.parse(anatomy_text) - - return { - "anatomy_text": anatomy_text, - "normal_structures": [], - "abnormal_structures": [], - "not_visualized": [], - "anomalies": [], - "hpo_terms": hpo_terms, - "source_format": "viewpoint_text", - } + """ + Extract anatomy from ViewPoint text reports. + + Expected pattern: + Fetal Anatomy + ============= + The following structures appear normal: + Cranium. Brain. Face. ... + + The following structures appear abnormal: + GI tract: dilated bowel loops. + + The following structures could not be adequately visualized: + LVOT view. RVOT view. ...
+ + TODO @VarenyaJ: Implement full parsing + """ + # Skeleton: Extract the Fetal Anatomy section + pattern = re.compile( + r"Fetal Anatomy\s*\n=+\n(?P<body>.*?)(?:\n[A-Z][^\n]*\n=+|\Z)", + re.DOTALL | re.IGNORECASE, + ) + match = pattern.search(text) + anatomy_text = match.group("body").strip() if match else "" + + # TODO @VarenyaJ: Parse "appear normal", "appear abnormal", "could not be visualized" lists + + hpo_terms = [] + if anatomy_text and hpo_cr is not None and hasattr(hpo_cr, "parse"): + hpo_terms = hpo_cr.parse(anatomy_text) + + return { + "anatomy_text": anatomy_text, + "normal_structures": [], + "abnormal_structures": [], + "not_visualized": [], + "anomalies": [], + "hpo_terms": hpo_terms, + "source_format": "viewpoint_text", + } # --------------------------------------------------------------------- @@ -227,15 +225,15 @@ def _parse_viewpoint_text_anatomy(text: str, hpo_cr=None) -> Dict: def _parse_viewpoint_hl7_anatomy(hl7: str, hpo_cr=None) -> Dict: - """ - Extract anatomy from HL7 ORU^R01 messages. + """ + Extract anatomy from HL7 ORU^R01 messages. - Note: Anatomy is typically not encoded in discrete HL7 fields. - This is a skeleton for potential future implementation. + Note: Anatomy is typically not encoded in discrete HL7 fields. + This is a skeleton for potential future implementation.
- TODO @VarenyaJ: Implement if HL7 anatomy encoding is discovered - """ - return _empty_result("viewpoint_hl7") + TODO @VarenyaJ: Implement if HL7 anatomy encoding is discovered + """ + return _empty_result("viewpoint_hl7") # --------------------------------------------------------------------- @@ -244,13 +242,13 @@ def _parse_viewpoint_hl7_anatomy(hl7: str, hpo_cr=None) -> Dict: def _empty_result(source_format: str) -> Dict: - """Return empty result structure.""" - return { - "anatomy_text": "", - "normal_structures": [], - "abnormal_structures": [], - "not_visualized": [], - "anomalies": [], - "hpo_terms": [], - "source_format": source_format, - } \ No newline at end of file + """Return empty result structure.""" + return { + "anatomy_text": "", + "normal_structures": [], + "abnormal_structures": [], + "not_visualized": [], + "anomalies": [], + "hpo_terms": [], + "source_format": source_format, + } From c2e73534de7ab44690ce38594ee73e9706cd0351 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Mon, 2 Feb 2026 09:46:18 -0500 Subject: [PATCH 14/20] update yq parsing of toml --- .github/actions/python_from_pyproject/action.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/python_from_pyproject/action.yaml b/.github/actions/python_from_pyproject/action.yaml index 79ee605..e3ce1cb 100644 --- a/.github/actions/python_from_pyproject/action.yaml +++ b/.github/actions/python_from_pyproject/action.yaml @@ -12,7 +12,7 @@ runs: steps: - name: Get project version with yq id: get_python_version - uses: mikefarah/yq@v4.46.1 + uses: mikefarah/yq@v4.52.2 with: cmd: yq '.project.requires-python' ${{ inputs.pyproject-file-path }} From 580d4a262675878f8fe5bd59692a9470beed7e49 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Mon, 2 Feb 2026 09:49:27 -0500 Subject: [PATCH 15/20] ci: bump yq to v4.52.2 to fix TOML comment parsing and force toml input for yq to avoid comment parsing bug --- .github/actions/python_from_pyproject/action.yaml | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/python_from_pyproject/action.yaml b/.github/actions/python_from_pyproject/action.yaml index e3ce1cb..9f2359c 100644 --- a/.github/actions/python_from_pyproject/action.yaml +++ b/.github/actions/python_from_pyproject/action.yaml @@ -14,7 +14,7 @@ runs: id: get_python_version uses: mikefarah/yq@v4.52.2 with: - cmd: yq '.project.requires-python' ${{ inputs.pyproject-file-path }} + cmd: yq --input-format toml '.project.requires-python' ${{ inputs.pyproject-file-path }} - name: Set up Python uses: actions/setup-python@v5.6.0 From cb055f96368ce72a1a3a7abf9269051658a83843 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Mon, 2 Feb 2026 10:00:03 -0500 Subject: [PATCH 16/20] ci: read pyproject.toml using python tomllib instead of yq --- .../actions/python_from_pyproject/action.yaml | 39 +++++++++++-------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/.github/actions/python_from_pyproject/action.yaml b/.github/actions/python_from_pyproject/action.yaml index 9f2359c..5330579 100644 --- a/.github/actions/python_from_pyproject/action.yaml +++ b/.github/actions/python_from_pyproject/action.yaml @@ -1,22 +1,29 @@ name: Install python from pyproject.toml -description: 'Installs Python from the version found in the pyproject.toml' +description: Installs Python from the version found in pyproject.toml inputs: - pyproject-file-path: - required: False - description: "Path to the pyproject.toml including filename" - default: "./pyproject.toml" + pyproject-file-path: + required: false + description: Path to the pyproject.toml including filename + default: ./pyproject.toml runs: - using: composite - steps: - - name: Get project version with yq - id: get_python_version - uses: mikefarah/yq@v4.52.2 - with: - cmd: yq --input-format toml '.project.requires-python' ${{ inputs.pyproject-file-path }} + using: composite + steps: + - name: Read requires-python from pyproject.toml + id: get_python_version + shell: 
bash + run: | + python - <<'EOF' + import tomllib + from pathlib import Path - - name: Set up Python - uses: actions/setup-python@v5.6.0 - with: - python-version: ${{ steps.get_python_version.outputs.result }} \ No newline at end of file + path = Path("${{ inputs.pyproject-file-path }}") + data = tomllib.loads(path.read_text()) + print(f"result={data['project']['requires-python']}") + EOF >> "$GITHUB_OUTPUT" + + - name: Set up Python + uses: actions/setup-python@v5.6.0 + with: + python-version: ${{ steps.get_python_version.outputs.result }} \ No newline at end of file From 5c4e04029a2df1699ba3e02b3776217b459e33a8 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Mon, 2 Feb 2026 10:26:56 -0500 Subject: [PATCH 17/20] ci: retrigger workflow after removing yq From a39156af0da0a81062d553eebd98fd594638b197 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Mon, 2 Feb 2026 10:48:43 -0500 Subject: [PATCH 18/20] ci: replace yq with tomllib for pyproject.toml parsing --- .github/actions/python_from_pyproject/action.yaml | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/.github/actions/python_from_pyproject/action.yaml b/.github/actions/python_from_pyproject/action.yaml index 5330579..ab44d39 100644 --- a/.github/actions/python_from_pyproject/action.yaml +++ b/.github/actions/python_from_pyproject/action.yaml @@ -14,14 +14,10 @@ runs: id: get_python_version shell: bash run: | - python - <<'EOF' - import tomllib - from pathlib import Path - - path = Path("${{ inputs.pyproject-file-path }}") - data = tomllib.loads(path.read_text()) - print(f"result={data['project']['requires-python']}") - EOF >> "$GITHUB_OUTPUT" + python -c "import tomllib, pathlib; \ + p = pathlib.Path('${{ inputs.pyproject-file-path }}'); \ + req = tomllib.loads(p.read_text())['project']['requires-python']; \ + print(f'result={req}')" >> \"$GITHUB_OUTPUT\" - name: Set up Python uses: actions/setup-python@v5.6.0 From bccfd4e9e094aebec1170469b22391bb45e91c68 Mon Sep 17 00:00:00 2001 From: 
VarenyaJ Date: Mon, 2 Feb 2026 10:50:52 -0500 Subject: [PATCH 19/20] ci: replace tomllib fix with a one-liner --- .github/actions/python_from_pyproject/action.yaml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/actions/python_from_pyproject/action.yaml b/.github/actions/python_from_pyproject/action.yaml index ab44d39..13d6210 100644 --- a/.github/actions/python_from_pyproject/action.yaml +++ b/.github/actions/python_from_pyproject/action.yaml @@ -14,10 +14,7 @@ runs: id: get_python_version shell: bash run: | - python -c "import tomllib, pathlib; \ - p = pathlib.Path('${{ inputs.pyproject-file-path }}'); \ - req = tomllib.loads(p.read_text())['project']['requires-python']; \ - print(f'result={req}')" >> \"$GITHUB_OUTPUT\" + python -c "import tomllib, pathlib; p = pathlib.Path('${{ inputs.pyproject-file-path }}'); req = tomllib.loads(p.read_text())['project']['requires-python']; print(f'result={req}')" >> $GITHUB_OUTPUT - name: Set up Python uses: actions/setup-python@v5.6.0 From 31ead32174a9d690db5bf392618b1d42ce6cdbd5 Mon Sep 17 00:00:00 2001 From: VarenyaJ Date: Thu, 5 Feb 2026 20:09:03 -0500 Subject: [PATCH 20/20] feat(notebook): include new section parsers in revised demo --- prenatalppkt.ipynb | 1470 ++++++++++---------------------------------- 1 file changed, 332 insertions(+), 1138 deletions(-) diff --git a/prenatalppkt.ipynb b/prenatalppkt.ipynb index badee2e..cb51d6d 100644 --- a/prenatalppkt.ipynb +++ b/prenatalppkt.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "d8f2cfce", "metadata": {}, "outputs": [ @@ -18,815 +18,28 @@ "name": "stderr", "output_type": "stream", "text": [ - "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for head_circumference\n", - "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for biparietal_diameter\n", - "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for femur_length\n", - "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for 
abdominal_circumference\n", - "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for occipitofrontal_diameter\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Loaded mappings for: ['head_circumference', 'biparietal_diameter', 'femur_length', 'abdominal_circumference', 'occipitofrontal_diameter']\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Starting Observer JSON extraction\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Processing fetus 1\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Found 6 measurements\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: AC\n", - "DEBUG:prenatalppkt.etl.extractors.observer:AC has percentile=55.6% (valid)\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for AC: value=226.20000000000002mm, percentile=55.6%, ga=\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=AC, value=226.20000000000002mm, percentile=55.6%, ga=, method=None\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Selected HPO: HP:0034207 - Abnormal fetal gastrointestinal system morphology\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Created TermBin: HP:0034207 - normal=True\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: BPD\n", - "DEBUG:prenatalppkt.etl.extractors.observer:BPD has percentile=51.2% (valid)\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for BPD: value=66.8mm, percentile=51.2%, ga=\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=BPD, value=66.8mm, percentile=51.2%, ga=, method=None\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Selected HPO: HP:0000240 - Abnormality of skull size\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Created TermBin: HP:0000240 - normal=True\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: HC\n", - "DEBUG:prenatalppkt.etl.extractors.observer:HC has percentile=42.5% (valid)\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for HC: value=250.0mm, 
percentile=42.5%, ga=\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=HC, value=250.0mm, percentile=42.5%, ga=, method=None\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Selected HPO: HP:0000240 - Abnormality of skull size\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Created TermBin: HP:0000240 - normal=True\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: Femur\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Femur has percentile=46.8% (valid)\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for Femur: value=50.099999999999994mm, percentile=46.8%, ga=\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=Femur, value=50.099999999999994mm, percentile=46.8%, ga=, method=None\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Selected HPO: HP:0002823 - Abnormal femur morphology\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Created TermBin: HP:0002823 - normal=True\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: Nuchal Fold\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Nuchal Fold has percentile=0% (valid)\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for Nuchal Fold: value=10.0mm, percentile=0%, ga=\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=Nuchal Fold, value=10.0mm, percentile=0.0%, ga=, method=None\n", - "WARNING:prenatalppkt.etl.term_bin_factory:No HPO mapping for optional measurement 'Nuchal Fold' - skipping. 
TODO(@VarenyaJ): Add HPO terms when available\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: Cerebellum\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Cerebellum has percentile=0% (valid)\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for Cerebellum: value=30.0mm, percentile=0%, ga=\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=Cerebellum, value=30.0mm, percentile=0.0%, ga=, method=None\n", - "WARNING:prenatalppkt.etl.term_bin_factory:No HPO mapping for optional measurement 'Cerebellum' - skipping. TODO(@VarenyaJ): Add HPO terms when available\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Successfully parsed 4 measurements\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Validating 4 TermBins for required measurements\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: AC\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: BPD\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: HC\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: Femur\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Present: {'HC', 'Femur', 'AC', 'BPD'}\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Required: {'HC', 'Femur', 'AC', 'BPD'}\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:All required measurements present\n", - "INFO:prenatalppkt.etl.extractors.observer:Extracted 4 TermBins from Observer JSON\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "================================================================================\n", - "PRENATALPPKT ETL PIPELINE\n", - "Observer JSON → TermBins → Phenopacket v2.0\n", - "================================================================================\n", - "\n", - " STEP 1: Loading Observer JSON...\n", - "Loaded: tests/data/Apple_Sally_pretty.json\n", - "Fetuses: 1\n", - "Measurements: 6\n", - "Sample: AC = 22.62 cm\n", - "\n", - " STEP 2: Extracting biometry measurements to 
TermBins...\n", - " Extracted 4 TermBins\n", - "\n", - " [1] AC: 226.2 mm (55.6%) at 26w6d [Fetus 1]\n", - " HPO: HP:0034207 - Abnormal fetal gastrointestinal system morphology\n", - " Normal: True\n", - "\n", - " [2] BPD: 66.8 mm (51.2%) at 26w6d [Fetus 1]\n", - " HPO: HP:0000240 - Abnormality of skull size\n", - " Normal: True\n", - "\n", - " [3] HC: 250.0 mm (42.5%) at 26w6d [Fetus 1]\n", - " HPO: HP:0000240 - Abnormality of skull size\n", - " Normal: True\n", - "\n", - " [4] Femur: 50.1 mm (46.8%) at 27w0d [Fetus 1]\n", - " HPO: HP:0002823 - Abnormal femur morphology\n", - " Normal: True\n", - "\n", - " STEP 3: Converting TermBins to PhenotypicFeatures...\n", - " Generated 4 PhenotypicFeatures\n", - "\n", - " [1] HP:0034207\n", - " Status: EXCLUDED (normal)\n", - " Description: AC: 226.2 mm (55.6%) at 26w6d [Fetus 1]\n", - "\n", - " [2] HP:0000240\n", - " Status: EXCLUDED (normal)\n", - " Description: BPD: 66.8 mm (51.2%) at 26w6d [Fetus 1]\n", - "\n", - " [3] HP:0000240\n", - " Status: EXCLUDED (normal)\n", - " Description: HC: 250.0 mm (42.5%) at 26w6d [Fetus 1]\n", - "\n", - " [4] HP:0002823\n", - " Status: EXCLUDED (normal)\n", - " Description: Femur: 50.1 mm (46.8%) at 27w0d [Fetus 1]\n", - "\n", - " STEP 4: Building Phenopacket v2.0...\n", - "✓ Phenopacket created successfully\n", - "\n", - "================================================================================\n", - " PHENOPACKET v2.0 OUTPUT (JSON)\n", - "================================================================================\n", - "{\n", - " \"id\": \"apple-sally-fetus-1\",\n", - " \"subject\": {\n", - " \"id\": \"fetus-1\",\n", - " \"time_at_last_encounter\": {\n", - " \"gestational_age\": {\n", - " \"weeks\": 26,\n", - " \"days\": 6\n", - " }\n", - " }\n", - " },\n", - " \"phenotypic_features\": [\n", - " {\n", - " \"description\": \"AC: 226.2 mm (55.6%) at 26w6d [Fetus 1]\",\n", - " \"type\": {\n", - " \"id\": \"HP:0034207\",\n", - " \"label\": \"Abnormal fetal gastrointestinal 
system morphology\"\n", - " },\n", - " \"excluded\": true,\n", - " \"onset\": {\n", - " \"gestational_age\": {\n", - " \"weeks\": 26,\n", - " \"days\": 6\n", - " }\n", - " }\n", - " },\n", - " {\n", - " \"description\": \"BPD: 66.8 mm (51.2%) at 26w6d [Fetus 1]\",\n", - " \"type\": {\n", - " \"id\": \"HP:0000240\",\n", - " \"label\": \"Abnormality of skull size\"\n", - " },\n", - " \"excluded\": true,\n", - " \"onset\": {\n", - " \"gestational_age\": {\n", - " \"weeks\": 26,\n", - " \"days\": 6\n", - " }\n", - " }\n", - " },\n", - " {\n", - " \"description\": \"HC: 250.0 mm (42.5%) at 26w6d [Fetus 1]\",\n", - " \"type\": {\n", - " \"id\": \"HP:0000240\",\n", - " \"label\": \"Abnormality of skull size\"\n", - " },\n", - " \"excluded\": true,\n", - " \"onset\": {\n", - " \"gestational_age\": {\n", - " \"weeks\": 26,\n", - " \"days\": 6\n", - " }\n", - " }\n", - " },\n", - " {\n", - " \"description\": \"Femur: 50.1 mm (46.8%) at 27w0d [Fetus 1]\",\n", - " \"type\": {\n", - " \"id\": \"HP:0002823\",\n", - " \"label\": \"Abnormal femur morphology\"\n", - " },\n", - " \"excluded\": true,\n", - " \"onset\": {\n", - " \"gestational_age\": {\n", - " \"weeks\": 27\n", - " }\n", - " }\n", - " }\n", - " ],\n", - " \"meta_data\": {\n", - " \"created\": \"2026-01-26T15:21:08.287048Z\",\n", - " \"created_by\": \"prenatalppkt-etl-pipeline\",\n", - " \"resources\": [\n", - " {\n", - " \"id\": \"hp\",\n", - " \"name\": \"Human Phenotype Ontology\",\n", - " \"url\": \"http://purl.obolibrary.org/obo/hp.owl\",\n", - " \"version\": \"2025-11-24\",\n", - " \"namespace_prefix\": \"HP\",\n", - " \"iri_prefix\": \"http://purl.obolibrary.org/obo/HP_\"\n", - " }\n", - " ],\n", - " \"phenopacket_schema_version\": \"2.0\"\n", - " }\n", - "}\n", - "\n", - "================================================================================\n", - " VALIDATION SUMMARY\n", - "================================================================================\n", - "\n", - " Phenopacket Structure:\n", - " 
ID: apple-sally-fetus-1\n", - " Subject ID: fetus-1\n", - " Subject GA: 26w6d\n", - " Sex: UNKNOWN_SEX\n", - " Phenotypic Features: 4\n", - " Schema Version: 2.0\n", - " HPO Resource: 2025-11-24\n", - "\n", - " Phenotypic Features Detail:\n", - "\n", - " [1] HP:0034207 - Abnormal fetal gastrointestinal system morphology\n", - " Normal (excluded)\n", - " Onset: 26w6d\n", - " Detail: AC: 226.2 mm (55.6%) at 26w6d [Fetus 1]\n", - "\n", - " [2] HP:0000240 - Abnormality of skull size\n", - " Normal (excluded)\n", - " Onset: 26w6d\n", - " Detail: BPD: 66.8 mm (51.2%) at 26w6d [Fetus 1]\n", - "\n", - " [3] HP:0000240 - Abnormality of skull size\n", - " Normal (excluded)\n", - " Onset: 26w6d\n", - " Detail: HC: 250.0 mm (42.5%) at 26w6d [Fetus 1]\n", - "\n", - " [4] HP:0002823 - Abnormal femur morphology\n", - " Normal (excluded)\n", - " Onset: 27w0d\n", - " Detail: Femur: 50.1 mm (46.8%) at 27w0d [Fetus 1]\n", - "\n", - " Summary Statistics:\n", - " Total features: 4\n", - " Normal (excluded): 4\n", - " Abnormal (observed): 0\n", - "\n", - "================================================================================\n", - " SUCCESS: Valid Phenopacket v2.0 generated\n", - "================================================================================\n", - "\n", - " Phenopacket saved to: output/apple_sally_phenopacket_v2.json\n", - "\n", - " Validation: Round-trip test...\n", - " Validation passed\n" - ] - } - ], - "source": [ - "# Initial Demo\n", - "\"\"\"\n", - "PRENATALPPKT ETL PIPELINE\n", - "Observer JSON → TermBins → Phenopacket v2.0\n", - "\n", - "Uses the official GA4GH phenopackets library per:\n", - "https://phenopacket-schema.readthedocs.io/en/latest/python.html\n", - "\"\"\"\n", - "\n", - "import json\n", - "import re\n", - "from datetime import datetime, timezone\n", - "from pathlib import Path\n", - "\n", - "from google.protobuf.json_format import MessageToJson\n", - "from google.protobuf.timestamp_pb2 import Timestamp\n", - "import 
phenopackets.schema.v2 as pps2\n", - "\n", - "from prenatalppkt.etl.extractors import observer\n", - "from prenatalppkt.gestational_age import GestationalAge\n", - "\n", - "print(\"=\" * 80)\n", - "print(\"PRENATALPPKT ETL PIPELINE\")\n", - "print(\"Observer JSON → TermBins → Phenopacket v2.0\")\n", - "print(\"=\" * 80)\n", - "\n", - "# -----------------------------------------------------------------------------\n", - "# STEP 1: Load Apple Sally Observer JSON\n", - "# -----------------------------------------------------------------------------\n", - "print(\"\\n STEP 1: Loading Observer JSON...\")\n", - "\n", - "data_path = Path(\"tests/data/Apple_Sally_pretty.json\")\n", - "with open(data_path) as f:\n", - " observer_data = json.load(f)\n", - "\n", - "print(f\"Loaded: {data_path}\")\n", - "print(f\"Fetuses: {len(observer_data.get('fetuses', []))}\")\n", - "\n", - "first_fetus = observer_data[\"fetuses\"][0]\n", - "measurements = first_fetus.get(\"measurements\", [])\n", - "print(f\"Measurements: {len(measurements)}\")\n", - "print(\n", - " f\"Sample: {measurements[0]['label']} = \"\n", - " f\"{measurements[0]['value']} {measurements[0]['unit_of_measure']}\"\n", - ")\n", - "\n", - "# -----------------------------------------------------------------------------\n", - "# STEP 2: Extract TermBins using Observer extractor\n", - "# -----------------------------------------------------------------------------\n", - "print(\"\\n STEP 2: Extracting biometry measurements to TermBins...\")\n", - "\n", - "term_bins = observer.extract(observer_data)\n", - "print(f\" Extracted {len(term_bins)} TermBins\")\n", - "\n", - "for i, tb in enumerate(term_bins, 1):\n", - " print(f\"\\n [{i}] {tb.description}\")\n", - " print(f\" HPO: {tb.hpo_id} - {tb.hpo_label}\")\n", - " print(f\" Normal: {tb.normal}\")\n", - "\n", - "# -----------------------------------------------------------------------------\n", - "# STEP 3: Convert TermBins → Phenotypic Features (using phenopackets 
library)\n", - "# -----------------------------------------------------------------------------\n", - "print(\"\\n STEP 3: Converting TermBins to PhenotypicFeatures...\")\n", - "\n", - "\n", - "def parse_ga_from_description(description: str) -> tuple[int, int]:\n", - " \"\"\"Extract weeks and days from TermBin description.\"\"\"\n", - " match = re.search(r\"at (\\d+)w(\\d+)d\", description)\n", - " if match:\n", - " return int(match.group(1)), int(match.group(2))\n", - " # Fallback\n", - " first_m = observer_data[\"fetuses\"][0][\"measurements\"][0]\n", - " ga = GestationalAge.from_weeks(first_m.get(\"calculated_ega\", 26.9))\n", - " return ga.weeks, ga.days\n", - "\n", - "\n", - "phenotypic_features = []\n", - "\n", - "for tb in term_bins:\n", - " weeks, days = parse_ga_from_description(tb.description)\n", - "\n", - " # Create GestationalAge message\n", - " gestational_age = pps2.GestationalAge(weeks=weeks, days=days)\n", - "\n", - " # Create TimeElement with gestational_age\n", - " onset = pps2.TimeElement(gestational_age=gestational_age)\n", - "\n", - " # Create OntologyClass for the HPO term\n", - " hpo_type = pps2.OntologyClass(id=tb.hpo_id, label=tb.hpo_label)\n", - "\n", - " # Create PhenotypicFeature\n", - " pf = pps2.PhenotypicFeature(\n", - " type=hpo_type,\n", - " excluded=tb.normal, # If normal=True, abnormality is excluded\n", - " onset=onset,\n", - " description=tb.description,\n", - " )\n", - "\n", - " phenotypic_features.append(pf)\n", - "\n", - "print(f\" Generated {len(phenotypic_features)} PhenotypicFeatures\")\n", - "\n", - "for i, pf in enumerate(phenotypic_features, 1):\n", - " status = \"EXCLUDED (normal)\" if pf.excluded else \"OBSERVED (abnormal)\"\n", - " print(f\"\\n [{i}] {pf.type.id}\")\n", - " print(f\" Status: {status}\")\n", - " print(f\" Description: {pf.description}\")\n", - "\n", - "# -----------------------------------------------------------------------------\n", - "# STEP 4: Build Complete Phenopacket v2.0\n", - "# 
-----------------------------------------------------------------------------\n", - "print(\"\\n STEP 4: Building Phenopacket v2.0...\")\n", - "\n", - "# Get subject GA from first measurement\n", - "first_measurement = observer_data[\"fetuses\"][0][\"measurements\"][0]\n", - "subject_ga_weeks = first_measurement.get(\"calculated_ega\", 26.9)\n", - "subject_ga = GestationalAge.from_weeks(subject_ga_weeks)\n", - "\n", - "# Create Individual (subject) with GestationalAge\n", - "subject_time = pps2.TimeElement(\n", - " gestational_age=pps2.GestationalAge(weeks=subject_ga.weeks, days=subject_ga.days)\n", - ")\n", - "\n", - "subject = pps2.Individual(\n", - " id=\"fetus-1\",\n", - " sex=pps2.Sex.UNKNOWN_SEX,\n", - " time_at_last_encounter=subject_time,\n", - ")\n", - "\n", - "# Create timestamp for metadata\n", - "now = datetime.now(timezone.utc)\n", - "created_timestamp = Timestamp()\n", - "created_timestamp.FromDatetime(now)\n", - "\n", - "# Create HPO Resource\n", - "hpo_resource = pps2.Resource(\n", - " id=\"hp\",\n", - " name=\"Human Phenotype Ontology\",\n", - " url=\"http://purl.obolibrary.org/obo/hp.owl\",\n", - " version=\"2025-11-24\",\n", - " namespace_prefix=\"HP\",\n", - " iri_prefix=\"http://purl.obolibrary.org/obo/HP_\",\n", - ")\n", - "\n", - "# Create MetaData\n", - "metadata = pps2.MetaData(\n", - " created=created_timestamp,\n", - " created_by=\"prenatalppkt-etl-pipeline\",\n", - " phenopacket_schema_version=\"2.0\",\n", - ")\n", - "metadata.resources.append(hpo_resource)\n", - "\n", - "# Create the Phenopacket\n", - "phenopacket = pps2.Phenopacket(\n", - " id=\"apple-sally-fetus-1\",\n", - " subject=subject,\n", - " meta_data=metadata,\n", - ")\n", - "phenopacket.phenotypic_features.extend(phenotypic_features)\n", - "\n", - "print(\"✓ Phenopacket created successfully\")\n", - "\n", - "# -----------------------------------------------------------------------------\n", - "# STEP 5: Display Results as JSON\n", - "# 
-----------------------------------------------------------------------------\n", - "print(\"\\n\" + \"=\" * 80)\n", - "print(\" PHENOPACKET v2.0 OUTPUT (JSON)\")\n", - "print(\"=\" * 80)\n", - "\n", - "# Convert protobuf message to JSON using official method\n", - "phenopacket_json = MessageToJson(phenopacket, preserving_proto_field_name=True)\n", - "print(phenopacket_json)\n", - "\n", - "# -----------------------------------------------------------------------------\n", - "# STEP 6: Validation Summary\n", - "# -----------------------------------------------------------------------------\n", - "print(\"\\n\" + \"=\" * 80)\n", - "print(\" VALIDATION SUMMARY\")\n", - "print(\"=\" * 80)\n", - "\n", - "print(\"\\n Phenopacket Structure:\")\n", - "print(f\" ID: {phenopacket.id}\")\n", - "print(f\" Subject ID: {phenopacket.subject.id}\")\n", - "print(f\" Subject GA: {subject_ga.weeks}w{subject_ga.days}d\")\n", - "print(f\" Sex: {pps2.Sex.Name(phenopacket.subject.sex)}\")\n", - "print(f\" Phenotypic Features: {len(phenopacket.phenotypic_features)}\")\n", - "print(f\" Schema Version: {phenopacket.meta_data.phenopacket_schema_version}\")\n", - "print(f\" HPO Resource: {phenopacket.meta_data.resources[0].version}\")\n", - "\n", - "print(\"\\n Phenotypic Features Detail:\")\n", - "for i, pf in enumerate(phenopacket.phenotypic_features, 1):\n", - " status = \" Normal (excluded)\" if pf.excluded else \"Abnormal (observed)\"\n", - " ga = pf.onset.gestational_age\n", - " print(f\"\\n [{i}] {pf.type.id} - {pf.type.label}\")\n", - " print(f\" {status}\")\n", - " print(f\" Onset: {ga.weeks}w{ga.days}d\")\n", - " print(f\" Detail: {pf.description}\")\n", - "\n", - "# Count normal vs abnormal\n", - "normal_count = sum(1 for pf in phenopacket.phenotypic_features if pf.excluded)\n", - "abnormal_count = len(phenopacket.phenotypic_features) - normal_count\n", - "\n", - "print(\"\\n Summary Statistics:\")\n", - "print(f\" Total features: {len(phenopacket.phenotypic_features)}\")\n", - 
"print(f\" Normal (excluded): {normal_count}\")\n", - "print(f\" Abnormal (observed): {abnormal_count}\")\n", - "\n", - "print(\"\\n\" + \"=\" * 80)\n", - "print(\" SUCCESS: Valid Phenopacket v2.0 generated\")\n", - "print(\"=\" * 80)\n", - "\n", - "# Save to file\n", - "output_path = Path(\"output/apple_sally_phenopacket_v2.json\")\n", - "output_path.parent.mkdir(exist_ok=True)\n", - "with open(output_path, \"w\") as f:\n", - " f.write(phenopacket_json)\n", - "print(f\"\\n Phenopacket saved to: {output_path}\")\n", - "\n", - "# Validate by round-tripping\n", - "print(\"\\n Validation: Round-trip test...\")\n", - "from google.protobuf.json_format import Parse\n", - "\n", - "parsed_back = Parse(phenopacket_json, pps2.Phenopacket())\n", - "assert parsed_back.id == phenopacket.id\n", - "assert len(parsed_back.phenotypic_features) == len(phenopacket.phenotypic_features)\n", - "print(\" Validation passed\")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "3685f9e5", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for head_circumference\n", - "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for biparietal_diameter\n", - "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for femur_length\n", - "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for abdominal_circumference\n", - "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for occipitofrontal_diameter\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Loaded mappings for: ['head_circumference', 'biparietal_diameter', 'femur_length', 'abdominal_circumference', 'occipitofrontal_diameter']\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Starting Observer JSON extraction\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Processing fetus 1\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Found 6 measurements\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: AC\n", - 
"DEBUG:prenatalppkt.etl.extractors.observer:AC has percentile=55.6% (valid)\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for AC: value=226.20000000000002mm, percentile=55.6%, ga=\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=AC, value=226.20000000000002mm, percentile=55.6%, ga=, method=None\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Selected HPO: HP:0034207 - Abnormal fetal gastrointestinal system morphology\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Created TermBin: HP:0034207 - normal=True\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: BPD\n", - "DEBUG:prenatalppkt.etl.extractors.observer:BPD has percentile=51.2% (valid)\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for BPD: value=66.8mm, percentile=51.2%, ga=\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=BPD, value=66.8mm, percentile=51.2%, ga=, method=None\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Selected HPO: HP:0000240 - Abnormality of skull size\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Created TermBin: HP:0000240 - normal=True\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: HC\n", - "DEBUG:prenatalppkt.etl.extractors.observer:HC has percentile=42.5% (valid)\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for HC: value=250.0mm, percentile=42.5%, ga=\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=HC, value=250.0mm, percentile=42.5%, ga=, method=None\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Selected HPO: HP:0000240 - Abnormality of skull size\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Created TermBin: HP:0000240 - normal=True\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: Femur\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Femur has percentile=46.8% (valid)\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for Femur: value=50.099999999999994mm, 
percentile=46.8%, ga=\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=Femur, value=50.099999999999994mm, percentile=46.8%, ga=, method=None\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Selected HPO: HP:0002823 - Abnormal femur morphology\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Created TermBin: HP:0002823 - normal=True\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: Nuchal Fold\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Nuchal Fold has percentile=0% (valid)\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for Nuchal Fold: value=10.0mm, percentile=0%, ga=\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=Nuchal Fold, value=10.0mm, percentile=0.0%, ga=, method=None\n", - "WARNING:prenatalppkt.etl.term_bin_factory:No HPO mapping for optional measurement 'Nuchal Fold' - skipping. TODO(@VarenyaJ): Add HPO terms when available\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: Cerebellum\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Cerebellum has percentile=0% (valid)\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for Cerebellum: value=30.0mm, percentile=0%, ga=\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=Cerebellum, value=30.0mm, percentile=0.0%, ga=, method=None\n", - "WARNING:prenatalppkt.etl.term_bin_factory:No HPO mapping for optional measurement 'Cerebellum' - skipping. 
TODO(@VarenyaJ): Add HPO terms when available\n", - "DEBUG:prenatalppkt.etl.extractors.observer:Successfully parsed 4 measurements\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Validating 4 TermBins for required measurements\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: AC\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: BPD\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: HC\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: Femur\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Present: {'HC', 'Femur', 'AC', 'BPD'}\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:Required: {'HC', 'Femur', 'AC', 'BPD'}\n", - "DEBUG:prenatalppkt.etl.term_bin_factory:All required measurements present\n", - "INFO:prenatalppkt.etl.extractors.observer:Extracted 4 TermBins from Observer JSON\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - " STEP 1: Loading Observer JSON...\n", - "Loaded: tests/data/Apple_Sally_pretty.json\n", - "Fetuses: 1\n", - "Measurements: 6\n", - "Sample: AC = 22.62 cm\n", - "\n", - " STEP 2: Extracting biometry measurements to TermBins...\n", - " Extracted 4 TermBins\n", - "\n", - " [1] AC: 226.2 mm (55.6%) at 26w6d [Fetus 1]\n", - " HPO: HP:0034207 - Abnormal fetal gastrointestinal system morphology\n", - " Normal: True\n", - "\n", - " [2] BPD: 66.8 mm (51.2%) at 26w6d [Fetus 1]\n", - " HPO: HP:0000240 - Abnormality of skull size\n", - " Normal: True\n", - "\n", - " [3] HC: 250.0 mm (42.5%) at 26w6d [Fetus 1]\n", - " HPO: HP:0000240 - Abnormality of skull size\n", - " Normal: True\n", - "\n", - " [4] Femur: 50.1 mm (46.8%) at 27w0d [Fetus 1]\n", - " HPO: HP:0002823 - Abnormal femur morphology\n", - " Normal: True\n", - "\n", - " STEP 3: Converting TermBins to PhenotypicFeatures...\n", - " Generated 4 PhenotypicFeatures\n", - "\n", - " [1] HP:0034207\n", - " Status: EXCLUDED (normal)\n", - " Description: AC: 226.2 mm (55.6%) at 26w6d [Fetus 1]\n", - "\n", - " [2] 
HP:0000240\n", - " Status: EXCLUDED (normal)\n", - " Description: BPD: 66.8 mm (51.2%) at 26w6d [Fetus 1]\n", - "\n", - " [3] HP:0000240\n", - " Status: EXCLUDED (normal)\n", - " Description: HC: 250.0 mm (42.5%) at 26w6d [Fetus 1]\n", - "\n", - " [4] HP:0002823\n", - " Status: EXCLUDED (normal)\n", - " Description: Femur: 50.1 mm (46.8%) at 27w0d [Fetus 1]\n", - "\n", - " STEP 4: Building Phenopacket v2.0...\n", - "{\n", - " \"id\": \"apple-sally-fetus-1\",\n", - " \"subject\": {\n", - " \"id\": \"fetus-1\",\n", - " \"time_at_last_encounter\": {\n", - " \"gestational_age\": {\n", - " \"weeks\": 26,\n", - " \"days\": 6\n", - " }\n", - " }\n", - " },\n", - " \"phenotypic_features\": [\n", - " {\n", - " \"description\": \"AC: 226.2 mm (55.6%) at 26w6d [Fetus 1]\",\n", - " \"type\": {\n", - " \"id\": \"HP:0034207\",\n", - " \"label\": \"Abnormal fetal gastrointestinal system morphology\"\n", - " },\n", - " \"excluded\": true,\n", - " \"onset\": {\n", - " \"gestational_age\": {\n", - " \"weeks\": 26,\n", - " \"days\": 6\n", - " }\n", - " }\n", - " },\n", - " {\n", - " \"description\": \"BPD: 66.8 mm (51.2%) at 26w6d [Fetus 1]\",\n", - " \"type\": {\n", - " \"id\": \"HP:0000240\",\n", - " \"label\": \"Abnormality of skull size\"\n", - " },\n", - " \"excluded\": true,\n", - " \"onset\": {\n", - " \"gestational_age\": {\n", - " \"weeks\": 26,\n", - " \"days\": 6\n", - " }\n", - " }\n", - " },\n", - " {\n", - " \"description\": \"HC: 250.0 mm (42.5%) at 26w6d [Fetus 1]\",\n", - " \"type\": {\n", - " \"id\": \"HP:0000240\",\n", - " \"label\": \"Abnormality of skull size\"\n", - " },\n", - " \"excluded\": true,\n", - " \"onset\": {\n", - " \"gestational_age\": {\n", - " \"weeks\": 26,\n", - " \"days\": 6\n", - " }\n", - " }\n", - " },\n", - " {\n", - " \"description\": \"Femur: 50.1 mm (46.8%) at 27w0d [Fetus 1]\",\n", - " \"type\": {\n", - " \"id\": \"HP:0002823\",\n", - " \"label\": \"Abnormal femur morphology\"\n", - " },\n", - " \"excluded\": true,\n", - " \"onset\": 
{\n", - " \"gestational_age\": {\n", - " \"weeks\": 27\n", - " }\n", - " }\n", - " }\n", - " ],\n", - " \"meta_data\": {\n", - " \"created\": \"2026-01-26T15:21:08.337338Z\",\n", - " \"created_by\": \"prenatalppkt-etl-pipeline\",\n", - " \"resources\": [\n", - " {\n", - " \"id\": \"hp\",\n", - " \"name\": \"Human Phenotype Ontology\",\n", - " \"url\": \"http://purl.obolibrary.org/obo/hp.owl\",\n", - " \"version\": \"2025-11-24\",\n", - " \"namespace_prefix\": \"HP\",\n", - " \"iri_prefix\": \"http://purl.obolibrary.org/obo/HP_\"\n", - " }\n", - " ],\n", - " \"phenopacket_schema_version\": \"2.0\"\n", - " }\n", - "}\n" + "DEBUG:hpotk.util:Using default encoding 'utf-8'\n", + "DEBUG:hpotk.util:Opening /tmp/hp.json\n", + "DEBUG:hpotk.util:Looks like a local file: /tmp/hp.json\n", + "DEBUG:hpotk.util:Looks like decompressed data\n" ] - } - ], - "source": [ - "# Shorter Test\n", - "\n", - "import json\n", - "import re\n", - "from datetime import datetime, timezone\n", - "from pathlib import Path\n", - "from google.protobuf.json_format import MessageToJson\n", - "from google.protobuf.timestamp_pb2 import Timestamp\n", - "import phenopackets.schema.v2 as pps2\n", - "from prenatalppkt.etl.extractors import observer\n", - "from prenatalppkt.gestational_age import GestationalAge\n", - "\n", - "print(\"\\n STEP 1: Loading Observer JSON...\")\n", - "data_path = Path(\"tests/data/Apple_Sally_pretty.json\")\n", - "with open(data_path) as f:\n", - " observer_data = json.load(f)\n", - "print(f\"Loaded: {data_path}\")\n", - "print(f\"Fetuses: {len(observer_data.get('fetuses', []))}\")\n", - "\n", - "first_fetus = observer_data[\"fetuses\"][0]\n", - "measurements = first_fetus.get(\"measurements\", [])\n", - "print(f\"Measurements: {len(measurements)}\")\n", - "print(f\"Sample: {measurements[0]['label']} = \", f\"{measurements[0]['value']} {measurements[0]['unit_of_measure']}\")\n", - "\n", - "print(\"\\n STEP 2: Extracting biometry measurements to TermBins...\")\n", - "term_bins = 
observer.extract(observer_data)\n", - "print(f\" Extracted {len(term_bins)} TermBins\")\n", - "for i, tb in enumerate(term_bins, 1):\n", - " print(f\"\\n [{i}] {tb.description}\")\n", - " print(f\" HPO: {tb.hpo_id} - {tb.hpo_label}\")\n", - " print(f\" Normal: {tb.normal}\")\n", - "\n", - "print(\"\\n STEP 3: Converting TermBins to PhenotypicFeatures...\")\n", - "def parse_ga_from_description(description: str) -> tuple[int, int]:\n", - " \"\"\"Extract weeks and days from TermBin description.\"\"\"\n", - " match = re.search(r\"at (\\d+)w(\\d+)d\", description)\n", - " if match:\n", - " return int(match.group(1)), int(match.group(2))\n", - " # Fallback\n", - " first_m = observer_data[\"fetuses\"][0][\"measurements\"][0]\n", - " ga = GestationalAge.from_weeks(first_m.get(\"calculated_ega\", 26.9))\n", - " return ga.weeks, ga.days\n", - "phenotypic_features = []\n", - "for tb in term_bins:\n", - " weeks, days = parse_ga_from_description(tb.description)\n", - " # Create GestationalAge message\n", - " gestational_age = pps2.GestationalAge(weeks=weeks, days=days)\n", - " # Create TimeElement with gestational_age\n", - " onset = pps2.TimeElement(gestational_age=gestational_age)\n", - " # Create OntologyClass for the HPO term\n", - " hpo_type = pps2.OntologyClass(id=tb.hpo_id, label=tb.hpo_label)\n", - " # Create PhenotypicFeature\n", - " pf = pps2.PhenotypicFeature( type=hpo_type, excluded=tb.normal, onset=onset, description=tb.description)\n", - " phenotypic_features.append(pf)\n", - "print(f\" Generated {len(phenotypic_features)} PhenotypicFeatures\")\n", - "for i, pf in enumerate(phenotypic_features, 1):\n", - " status = \"EXCLUDED (normal)\" if pf.excluded else \"OBSERVED (abnormal)\"\n", - " print(f\"\\n [{i}] {pf.type.id}\")\n", - " print(f\" Status: {status}\")\n", - " print(f\" Description: {pf.description}\")\n", - "\n", - "\n", - "print(\"\\n STEP 4: Building Phenopacket v2.0...\")\n", - "# Get subject GA from first measurement\n", - "first_measurement = 
observer_data[\"fetuses\"][0][\"measurements\"][0]\n", - "subject_ga_weeks = first_measurement.get(\"calculated_ega\", 26.9)\n", - "subject_ga = GestationalAge.from_weeks(subject_ga_weeks)\n", - "# Create Individual (subject) with GestationalAge\n", - "subject_time = pps2.TimeElement(gestational_age=pps2.GestationalAge(weeks=subject_ga.weeks, days=subject_ga.days))\n", - "\n", - "subject = pps2.Individual(id=\"fetus-1\", sex=pps2.Sex.UNKNOWN_SEX, time_at_last_encounter=subject_time)\n", - "\n", - "# Create timestamp for metadata\n", - "now = datetime.now(timezone.utc)\n", - "created_timestamp = Timestamp()\n", - "created_timestamp.FromDatetime(now)\n", - "\n", - "# Create HPO Resource\n", - "hpo_resource = pps2.Resource(id=\"hp\", name=\"Human Phenotype Ontology\", url=\"http://purl.obolibrary.org/obo/hp.owl\", version=\"2025-11-24\", namespace_prefix=\"HP\", iri_prefix=\"http://purl.obolibrary.org/obo/HP_\")\n", - "\n", - "# Create MetaData\n", - "metadata = pps2.MetaData(created=created_timestamp, created_by=\"prenatalppkt-etl-pipeline\", phenopacket_schema_version=\"2.0\")\n", - "metadata.resources.append(hpo_resource)\n", - "\n", - "# Create the Phenopacket\n", - "phenopacket = pps2.Phenopacket(id=\"apple-sally-fetus-1\", subject=subject, meta_data=metadata)\n", - "phenopacket.phenotypic_features.extend(phenotypic_features)\n", - "\n", - "# Convert protobuf message to JSON using official method\n", - "phenopacket_json = MessageToJson(phenopacket, preserving_proto_field_name=True)\n", - "print(phenopacket_json)" - ] - }, - { - "cell_type": "markdown", - "id": "1e24f7ff", - "metadata": {}, - "source": [ - "# New" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "0f79d3fe", - "metadata": {}, - "outputs": [ + }, { "name": "stdout", "output_type": "stream", "text": [ "================================================================================\n", - "PRENATALPPKT EXPANDED ETL PIPELINE\n", - "Observer JSON -> Biometry + Clinical Sections -> 
Phenopacket v2.0\n", + "PRENATALPPKT ETL PIPELINE\n", + "Observer JSON -> Section Parsing -> Phenopacket v2.0\n", "================================================================================\n", "\n", - "[STEP 1] Loading HPO Concept Recognizer...\n" + "[STEP 1] Loading the HPO Concept Recognizer...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "DEBUG:hpotk.util:Using default encoding 'utf-8'\n", - "DEBUG:hpotk.util:Opening /tmp/hp.json\n", - "DEBUG:hpotk.util:Looks like a local file: /tmp/hp.json\n", - "DEBUG:hpotk.util:Looks like decompressed data\n", "DEBUG:hpotk.ontology.load.obographs._load:Extracting ontology terms\n", "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", @@ -905,75 +118,105 @@ "name": "stdout", "output_type": "stream", "text": [ - " ? HPO version: 2025-10-22\n", - " ? Concept recognizer: HpoExactConceptRecognizer\n", + "HPO version: 2025-10-22\n", "\n", "[STEP 2] Loading Observer JSON...\n", - " ? Loaded: tests/data/Apple_Sally_pretty.json\n", - " ? Fetuses: 1\n", - " ? Measurements: 6\n", - " ? Sample: AC = 22.62 cm\n", + "Loaded: Apple_Sally_pretty.json\n", + "Fetuses: 1\n", "\n", - "[STEP 3] Extracting biometry measurements to TermBins...\n", - " ? Extracted 4 TermBins:\n", - " [1] HP:0034207 (Abnormal fetal gastrointestinal system morphology) - ? Normal\n", - " AC: 226.2 mm (55.6%) at 26w6d [Fetus 1]\n", - " [2] HP:0000240 (Abnormality of skull size) - ? Normal\n", - " BPD: 66.8 mm (51.2%) at 26w6d [Fetus 1]\n", - " [3] HP:0000240 (Abnormality of skull size) - ? Normal\n", - " HC: 250.0 mm (42.5%) at 26w6d [Fetus 1]\n", - " [4] HP:0002823 (Abnormal femur morphology) - ? 
Normal\n", - " Femur: 50.1 mm (46.8%) at 27w0d [Fetus 1]\n", + "[STEP 3] Extracting biometry measurements...\n", + "Extracted 4 TermBins\n", + " - AC: 226.2 mm (55.6%) at 26w6d [Fetus 1] [Normal]\n", + " HPO: HP:0034207 - Abnormal fetal gastrointestinal system morphology\n", + " - BPD: 66.8 mm (51.2%) at 26w6d [Fetus 1] [Normal]\n", + " HPO: HP:0000240 - Abnormality of skull size\n", + " - HC: 250.0 mm (42.5%) at 26w6d [Fetus 1] [Normal]\n", + " HPO: HP:0000240 - Abnormality of skull size\n", + " - Femur: 50.1 mm (46.8%) at 27w0d [Fetus 1] [Normal]\n", + " HPO: HP:0002823 - Abnormal femur morphology\n", "\n", "[STEP 4] Parsing clinical sections...\n", "\n", - " --- Clinical Indication ---\n", - " Indication: (not found)\n", - "\n", - " --- Pregnancy Dating ---\n", - " LMP: 0001-01-01\n", - " EDD: None\n", - " Dating Method: None\n", - " GA by Ultrasound: None\n", - "\n", - " --- Clinical Impression ---\n", - " Impression (1294 chars): \"The patient was referred for a fetal anatomical survey. Sonographic measurements were consistent with the expected gestational age. The amniotic fluid volume was normal. A detailed fetal anatomic s...\"\n", - " Growth Assessment: None\n", + " [4a] Clinical Indication:\n", + " Reason: N/A...\n", + "\n", + " [4b] Pregnancy Dating:\n", + " LMP: 0001-01-01\n", + " EDD: None\n", + " GA at exam: N/A weeks\n", + "\n", + " [4c] Clinical Impression:\n", + "S... Text: The patient was referred for a fetal anatomical survey. 
\n", + " HPO terms found: 4\n", + "\n", + "[STEP 5] Parsing fetal-specific sections...\n", + "\n", + " [5a] Fetal Anatomy:\n", + " Normal structures: 0\n", + " Abnormal structures: 0\n", + " Not visualized: 0\n", + " Anomalies detected: 0\n", + " HPO terms extracted: 1\n", + "\n", + " [5b] Estimated Fetal Weight:\n", + " EFW: 1014.8 grams\n", + " Percentile: 55.6%\n", + " Method: Hadlock (AC, FL, HC)\n", + " Growth category: AGA\n", + " Within normal range: True\n", + "\n", + " [5c] Fetal Ratios:\n", + " Ratios calculated: 3\n", + " All within range: True\n", + " Proportionality: Normal\n", + " ? HC/AC: 1.105\n", + " ? FL/AC: 22.149\n", + " ? FL/BPD: 75\n", + "\n", + "[STEP 6] Building PhenotypicFeatures...\n", + "Growth category AGA (normal) - no HPO term needed\n", + "\n", + " Summary by source:\n", + " - Biometry: 4 features\n", + " - Clinical Text: 4 features\n", + " - Anatomy: 1 features\n", + " Total: 9 PhenotypicFeatures\n", + "\n", + "[STEP 7] Assembling Phenopacket v2.0...\n", + "Phenopacket ID: apple-sally-fetus-1-complete\n", + "Subject: fetus-1\n", + "Features: 9\n", + "\n", + "[STEP 8] Output & Validation...\n", + "Round-trip validation passed\n", + "Saved to: output/apple_sally_phenopacket_complete.json\n", "\n", - " --- HPO Concept Recognition from Clinical Text ---\n", - " Found 4 HPO terms in clinical narrative:\n", - " ? HP:0001274: Agenesis of corpus callosum\n", - " ? HP:0000256: Macrocephaly\n", - " ? HP:0001305: Dandy-Walker malformation\n", - " ? HP:0002119: Ventriculomegaly\n", - "\n", - "[STEP 5] Previewing anatomy findings...\n", - " Normal (0): ...\n", - " Abnormal (0): (none)\n", - " Not visualized (0): ...\n", - " (Note: Anatomy section parser not yet implemented in ETL)\n", - "\n", - "[STEP 6] Converting to PhenotypicFeatures...\n", - "\n", - " --- From Biometry ---\n", - " ? 
Added 4 features from biometry\n", + "================================================================================\n", + "PHENOPACKET GENERATION COMPLETE\n", + "================================================================================\n", "\n", - " --- From Clinical Text ---\n", - " ? Added 4 features from clinical text\n", + "[Clinical Context]\n", + " Indication: N/A...\n", + " GA at exam: N/A weeks\n", + " EFW: 1014.8g (AGA)\n", + " Proportionality: Normal\n", "\n", - " Total PhenotypicFeatures: 8\n", + "[Phenotypic Features by Source]\n", + " Biometry: 4\n", + " Clinical Text: 4\n", + " Anatomy: 1\n", "\n", - "[STEP 7] Building Phenopacket v2.0...\n", - " ? Phenopacket assembled successfully\n", - " ID: apple-sally-fetus-1\n", - " Subject: fetus-1 at 26w6d\n", - " Features: 8\n", + "[Feature Status]\n", + " Observed (abnormal): 5\n", + " Excluded (normal): 4\n", "\n", "================================================================================\n", - "PHENOPACKET v2.0 OUTPUT (JSON)\n", + "SUCCESS: Complete phenopacket at output/apple_sally_phenopacket_complete.json\n", "================================================================================\n", + "\n", + "[Phenopacket JSON Output]\n", "{\n", - " \"id\": \"apple-sally-fetus-1\",\n", + " \"id\": \"apple-sally-fetus-1-complete\",\n", " \"subject\": {\n", " \"id\": \"fetus-1\",\n", " \"time_at_last_encounter\": {\n", @@ -985,7 +228,7 @@ " },\n", " \"phenotypic_features\": [\n", " {\n", - " \"description\": \"[Biometry] AC: 226.2 mm (55.6%) at 26w6d [Fetus 1]\",\n", + " \"description\": \"Biometry: AC: 226.2 mm (55.6%) at 26w6d [Fetus 1]\",\n", " \"type\": {\n", " \"id\": \"HP:0034207\",\n", " \"label\": \"Abnormal fetal gastrointestinal system morphology\"\n", @@ -999,7 +242,7 @@ " }\n", " },\n", " {\n", - " \"description\": \"[Biometry] BPD: 66.8 mm (51.2%) at 26w6d [Fetus 1]\",\n", + " \"description\": \"Biometry: BPD: 66.8 mm (51.2%) at 26w6d [Fetus 1]\",\n", " \"type\": {\n", " \"id\": 
\"HP:0000240\",\n", " \"label\": \"Abnormality of skull size\"\n", @@ -1013,7 +256,7 @@ " }\n", " },\n", " {\n", - " \"description\": \"[Biometry] HC: 250.0 mm (42.5%) at 26w6d [Fetus 1]\",\n", + " \"description\": \"Biometry: HC: 250.0 mm (42.5%) at 26w6d [Fetus 1]\",\n", " \"type\": {\n", " \"id\": \"HP:0000240\",\n", " \"label\": \"Abnormality of skull size\"\n", @@ -1027,7 +270,7 @@ " }\n", " },\n", " {\n", - " \"description\": \"[Biometry] Femur: 50.1 mm (46.8%) at 27w0d [Fetus 1]\",\n", + " \"description\": \"Biometry: Femur: 50.1 mm (46.8%) at 27w0d [Fetus 1]\",\n", " \"type\": {\n", " \"id\": \"HP:0002823\",\n", " \"label\": \"Abnormal femur morphology\"\n", @@ -1040,10 +283,10 @@ " }\n", " },\n", " {\n", - " \"description\": \"[Clinical Impression] Extracted from narrative text via HPO Concept Recognition\",\n", + " \"description\": \"Clinical impression: Macrocephaly\",\n", " \"type\": {\n", - " \"id\": \"HP:0001274\",\n", - " \"label\": \"Agenesis of corpus callosum\"\n", + " \"id\": \"HP:0000256\",\n", + " \"label\": \"Macrocephaly\"\n", " },\n", " \"onset\": {\n", " \"gestational_age\": {\n", @@ -1053,10 +296,10 @@ " }\n", " },\n", " {\n", - " \"description\": \"[Clinical Impression] Extracted from narrative text via HPO Concept Recognition\",\n", + " \"description\": \"Clinical impression: Agenesis of corpus callosum\",\n", " \"type\": {\n", - " \"id\": \"HP:0000256\",\n", - " \"label\": \"Macrocephaly\"\n", + " \"id\": \"HP:0001274\",\n", + " \"label\": \"Agenesis of corpus callosum\"\n", " },\n", " \"onset\": {\n", " \"gestational_age\": {\n", @@ -1066,7 +309,7 @@ " }\n", " },\n", " {\n", - " \"description\": \"[Clinical Impression] Extracted from narrative text via HPO Concept Recognition\",\n", + " \"description\": \"Clinical impression: Dandy-Walker malformation\",\n", " \"type\": {\n", " \"id\": \"HP:0001305\",\n", " \"label\": \"Dandy-Walker malformation\"\n", @@ -1079,7 +322,7 @@ " }\n", " },\n", " {\n", - " \"description\": \"[Clinical 
Impression] Extracted from narrative text via HPO Concept Recognition\",\n", + " \"description\": \"Clinical impression: Ventriculomegaly\",\n", " \"type\": {\n", " \"id\": \"HP:0002119\",\n", " \"label\": \"Ventriculomegaly\"\n", @@ -1090,11 +333,24 @@ " \"days\": 6\n", " }\n", " }\n", + " },\n", + " {\n", + " \"description\": \"Anatomy finding: Neural tube defect\",\n", + " \"type\": {\n", + " \"id\": \"HP:0045005\",\n", + " \"label\": \"Neural tube defect\"\n", + " },\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", " }\n", " ],\n", " \"meta_data\": {\n", - " \"created\": \"2026-01-26T15:21:11.051438Z\",\n", - " \"created_by\": \"prenatalppkt-etl-pipeline\",\n", + " \"created\": \"2026-02-05T19:49:31.564591Z\",\n", + " \"created_by\": \"prenatalppkt-etl-pipeline-v2\",\n", " \"resources\": [\n", " {\n", " \"id\": \"hp\",\n", @@ -1107,85 +363,22 @@ " ],\n", " \"phenopacket_schema_version\": \"2.0\"\n", " }\n", - "}\n", - "\n", - "================================================================================\n", - "VALIDATION & SUMMARY\n", - "================================================================================\n", - "\n", - "[Validation] Round-trip test...\n", - " ? 
Round-trip validation passed\n", - "\n", - "[Summary] Phenotypic Features:\n", - " Total: 8\n", - " From Biometry: 4\n", - " From Clinical Text: 4\n", - " Normal (excluded): 4\n", - " Abnormal (observed): 4\n", - "\n", - "[Detail] All Phenotypic Features:\n", - "------------------------------------------------------------\n", - "\n", - " [1] HP:0034207 - Abnormal fetal gastrointestinal system morphology\n", - " Source: Biometry\n", - " Status: EXCLUDED (normal)\n", - " Onset: 26w6d\n", - "\n", - " [2] HP:0000240 - Abnormality of skull size\n", - " Source: Biometry\n", - " Status: EXCLUDED (normal)\n", - " Onset: 26w6d\n", - "\n", - " [3] HP:0000240 - Abnormality of skull size\n", - " Source: Biometry\n", - " Status: EXCLUDED (normal)\n", - " Onset: 26w6d\n", - "\n", - " [4] HP:0002823 - Abnormal femur morphology\n", - " Source: Biometry\n", - " Status: EXCLUDED (normal)\n", - " Onset: 27w0d\n", - "\n", - " [5] HP:0001274 - Agenesis of corpus callosum\n", - " Source: Clinical Text\n", - " Status: OBSERVED (abnormal)\n", - " Onset: 26w6d\n", - "\n", - " [6] HP:0000256 - Macrocephaly\n", - " Source: Clinical Text\n", - " Status: OBSERVED (abnormal)\n", - " Onset: 26w6d\n", - "\n", - " [7] HP:0001305 - Dandy-Walker malformation\n", - " Source: Clinical Text\n", - " Status: OBSERVED (abnormal)\n", - " Onset: 26w6d\n", - "\n", - " [8] HP:0002119 - Ventriculomegaly\n", - " Source: Clinical Text\n", - " Status: OBSERVED (abnormal)\n", - " Onset: 26w6d\n", - "\n", - "================================================================================\n", - "SUCCESS: Phenopacket saved to output/apple_sally_phenopacket_expanded.json\n", - "================================================================================\n" + "}\n" ] } ], "source": [ "\"\"\"\n", - "PRENATALPPKT EXPANDED ETL PIPELINE\n", - "Observer JSON -> Biometry + Clinical Sections -> Phenopacket v2.0\n", "\n", - "Demonstrates the complete ETL pipeline:\n", + "Steps:\n", "1. 
Biometry extraction -> List[TermBin] -> quantitative HPO terms\n", "2. Clinical indication -> reason for exam\n", "3. Pregnancy dating -> LMP, EDD, gestational age context\n", "4. Clinical impression -> qualitative HPO terms from free text\n", - "5. Phenopacket assembly -> GA4GH Phenopacket v2.0 JSON\n", - "\n", - "Uses the official GA4GH phenopackets library per:\n", - "https://phenopacket-schema.readthedocs.io/en/latest/python.html\n", + "5. Fetal anatomy -> structured findings + HPO terms from anomalies\n", + "6. Estimated fetal weight -> SGA/AGA/LGA classification\n", + "7. Fetal ratios -> proportionality assessment\n", + "8. Phenopacket assembly -> GA4GH Phenopacket v2.0 JSON\n", "\"\"\"\n", "\n", "import gzip\n", @@ -1206,6 +399,9 @@ " parse_clinical_indication,\n", " parse_pregnancy_dating,\n", " parse_clinical_impression,\n", + " parse_fetal_anatomy,\n", + " parse_estimated_fetal_weight,\n", + " parse_fetal_ratios,\n", ")\n", "\n", "# HPO Concept Recognition\n", @@ -1215,19 +411,18 @@ "from prenatalppkt.gestational_age import GestationalAge\n", "\n", "print(\"=\" * 80)\n", - "print(\"PRENATALPPKT EXPANDED ETL PIPELINE\")\n", - "print(\"Observer JSON -> Biometry + Clinical Sections -> Phenopacket v2.0\")\n", + "print(\"PRENATALPPKT ETL PIPELINE\")\n", + "print(\"Observer JSON -> Section Parsing -> Phenopacket v2.0\")\n", "print(\"=\" * 80)\n", "\n", "# =============================================================================\n", "# STEP 1: Load HPO Concept Recognizer\n", "# =============================================================================\n", - "print(\"\\n[STEP 1] Loading HPO Concept Recognizer...\")\n", + "print(\"\\n[STEP 1] Loading the HPO Concept Recognizer...\")\n", "\n", "HP_JSON_GZ = Path(\"tests/data/hp.json.gz\")\n", "TMP_HP_JSON = Path(\"/tmp/hp.json\")\n", "\n", - "# Decompress hp.json.gz to temp location\n", "with gzip.open(HP_JSON_GZ, \"rt\", encoding=\"utf-8\") as f_in:\n", " with open(TMP_HP_JSON, \"w\", encoding=\"utf-8\") as 
f_out:\n", " f_out.write(f_in.read())\n", @@ -1235,202 +430,210 @@ "hpo_parser = HpoParser(hpo_json_file=str(TMP_HP_JSON))\n", "hpo_cr = hpo_parser.get_hpo_concept_recognizer()\n", "\n", - "print(f\" ? HPO version: {hpo_parser.get_version()}\")\n", - "print(f\" ? Concept recognizer: {type(hpo_cr).__name__}\")\n", + "print(f\"HPO version: {hpo_parser.get_version()}\")\n", "\n", "# =============================================================================\n", - "# STEP 2: Load Observer JSON Data\n", + "# STEP 2: Load Observer JSON\n", "# =============================================================================\n", "print(\"\\n[STEP 2] Loading Observer JSON...\")\n", "\n", - "DATA_PATH = Path(\"tests/data/Apple_Sally_pretty.json\")\n", - "\n", - "with open(DATA_PATH) as f:\n", + "data_path = Path(\"tests/data/Apple_Sally_pretty.json\")\n", + "with open(data_path) as f:\n", " observer_data = json.load(f)\n", "\n", - "# Keep raw JSON string for section parsers\n", - "with open(DATA_PATH) as f:\n", - " observer_json_str = f.read()\n", - "\n", - "print(f\" ? Loaded: {DATA_PATH}\")\n", - "print(f\" ? Fetuses: {len(observer_data.get('fetuses', []))}\")\n", - "\n", - "first_fetus = observer_data[\"fetuses\"][0]\n", - "measurements = first_fetus.get(\"measurements\", [])\n", - "print(f\" ? Measurements: {len(measurements)}\")\n", - "print(f\" ? 
Sample: {measurements[0]['label']} = {measurements[0]['value']} {measurements[0]['unit_of_measure']}\")\n", + "print(f\"Loaded: {data_path.name}\")\n", + "print(f\"Fetuses: {len(observer_data.get('fetuses', []))}\")\n", "\n", "# =============================================================================\n", "# STEP 3: Extract Biometry -> TermBins\n", "# =============================================================================\n", - "print(\"\\n[STEP 3] Extracting biometry measurements to TermBins...\")\n", + "print(\"\\n[STEP 3] Extracting biometry measurements...\")\n", "\n", "term_bins = observer.extract(observer_data)\n", + "print(f\"Extracted {len(term_bins)} TermBins\")\n", + "\n", + "# Helper function to parse GA from TermBin description\n", + "def parse_ga_from_description(description: str) -> tuple[int, int]:\n", + " \"\"\"Extract weeks and days from TermBin description like 'HC: 250.0 mm (42.5%) at 26w6d'\"\"\"\n", + " match = re.search(r\"at (\\d+)w(\\d+)d\", description)\n", + " if match:\n", + " return int(match.group(1)), int(match.group(2))\n", + " return 27, 0 # fallback values for XwYd\n", "\n", - "print(f\" ? Extracted {len(term_bins)} TermBins:\")\n", - "for i, tb in enumerate(term_bins, 1):\n", - " status = \"? Normal\" if tb.normal else \"? 
Abnormal\"\n", - " print(f\" [{i}] {tb.hpo_id} ({tb.hpo_label}) - {status}\")\n", - " print(f\" {tb.description}\")\n", + "# Display TermBins - note: TermBin has description, hpo_id, hpo_label, normal, range\n", + "# NOT label, value_mm, percentile directly\n", + "for tb in term_bins:\n", + " status = \"Normal\" if tb.normal else \"ABNORMAL\"\n", + " print(f\" - {tb.description} [{status}]\")\n", + " print(f\" HPO: {tb.hpo_id} - {tb.hpo_label}\")\n", "\n", "# =============================================================================\n", "# STEP 4: Parse Clinical Sections\n", "# =============================================================================\n", "print(\"\\n[STEP 4] Parsing clinical sections...\")\n", "\n", - "SOURCE_FORMAT = \"observer_json\"\n", - "\n", - "# 4a: Clinical Indication\n", - "print(\"\\n --- Clinical Indication ---\")\n", - "indication = parse_clinical_indication(observer_json_str, SOURCE_FORMAT)\n", - "indication_text = indication.get(\"indication_text\", \"\")\n", - "if indication_text:\n", - " print(f\" Indication: {indication_text[:100]}{'...' 
if len(indication_text) > 100 else ''}\")\n", - "else:\n", - " print(\" Indication: (not found)\")\n", - "\n", - "# 4b: Pregnancy Dating\n", - "print(\"\\n --- Pregnancy Dating ---\")\n", - "dating = parse_pregnancy_dating(observer_json_str, SOURCE_FORMAT)\n", - "print(f\" LMP: {dating.get('lmp', '(not found)')}\")\n", - "print(f\" EDD: {dating.get('edd', '(not found)')}\")\n", - "print(f\" Dating Method: {dating.get('dating_method', '(not found)')}\")\n", - "print(f\" GA by Ultrasound: {dating.get('ga_by_ultrasound', '(not found)')}\")\n", - "\n", - "# 4c: Clinical Impression\n", - "print(\"\\n --- Clinical Impression ---\")\n", - "impression = parse_clinical_impression(observer_json_str, SOURCE_FORMAT)\n", - "impression_text = impression.get(\"impression_text\", \"\")\n", - "\n", - "if impression_text:\n", - " # Clean up for display\n", - " preview = impression_text[:200].replace('\\r', ' ').replace('\\n', ' ')\n", - " print(f\" Impression ({len(impression_text)} chars): \\\"{preview}...\\\"\")\n", - "else:\n", - " print(\" Impression: (not found)\")\n", - "\n", - "print(f\" Growth Assessment: {impression.get('growth_assessment', '(not detected)')}\")\n", - "\n", - "# 4d: Extract HPO terms from clinical narrative\n", - "print(\"\\n --- HPO Concept Recognition from Clinical Text ---\")\n", - "if impression_text:\n", - " hpo_terms_from_text = hpo_cr.parse(impression_text)\n", - " print(f\" Found {len(hpo_terms_from_text)} HPO terms in clinical narrative:\")\n", - " for term in hpo_terms_from_text:\n", - " print(f\" ? {term.hpo_id}: {term.hpo_label}\")\n", - "else:\n", - " hpo_terms_from_text = []\n", - " print(\" (no impression text to parse)\")\n", - "\n", - "if not hpo_terms_from_text:\n", - " print(\" (no HPO terms matched)\")\n", + "# 4a. 
Clinical Indication\n", + "indication = parse_clinical_indication(observer_data, \"observer_json\")\n", + "indication_text = indication.get('indication_text', 'N/A') or 'N/A'\n", + "print(f\"\\n [4a] Clinical Indication:\")\n", + "print(f\" Reason: {indication_text[:60]}...\")\n", + "\n", + "# 4b. Pregnancy Dating\n", + "dating = parse_pregnancy_dating(observer_data, \"observer_json\")\n", + "print(f\"\\n [4b] Pregnancy Dating:\")\n", + "print(f\" LMP: {dating.get('lmp', 'N/A')}\")\n", + "print(f\" EDD: {dating.get('edd', 'N/A')}\")\n", + "print(f\" GA at exam: {dating.get('ga_weeks', 'N/A')} weeks\")\n", + "\n", + "# 4c. Clinical Impression (with HPO extraction)\n", + "impression = parse_clinical_impression(observer_data, \"observer_json\", hpo_cr=hpo_cr)\n", + "impression_text = impression.get('impression_text', 'N/A') or 'N/A'\n", + "print(f\"\\n [4c] Clinical Impression:\")\n", + "print(f\" Text: {impression_text[:60]}...\")\n", + "print(f\" HPO terms found: {len(impression.get('hpo_terms', []))}\")\n", "\n", "# =============================================================================\n", - "# STEP 5: Preview Anatomy Findings (Structured Data)\n", + "# STEP 5: Parse Fetal-Specific Sections (NEW)\n", "# =============================================================================\n", - "print(\"\\n[STEP 5] Previewing anatomy findings...\")\n", - "\n", - "fetus_data = observer_data[\"fetuses\"][0].get(\"fetus\", {})\n", - "anatomy_list = fetus_data.get(\"anatomy\", [])\n", - "\n", - "normal_structures = []\n", - "abnormal_structures = []\n", - "unseen_structures = []\n", - "anomalies_found = []\n", - "\n", - "for item in anatomy_list:\n", - " main = item.get(\"main\", {})\n", - " label = main.get(\"label\", \"Unknown\")\n", - " state = main.get(\"anat_state\", \"\")\n", - " \n", - " if state == \"Normal\":\n", - " normal_structures.append(label)\n", - " elif state == \"Abnormal\":\n", - " abnormal_structures.append(label)\n", - " # Check for specific 
anomalies\n", - " anomalies = item.get(\"anomalies\", [])\n", - " if anomalies:\n", - " for anom in anomalies:\n", - " desc = anom.get(\"description\", \"?\")\n", - " anomalies_found.append(f\"{label}: {desc}\")\n", - " elif state == \"Unseen\":\n", - " unseen_structures.append(label)\n", - "\n", - "print(f\" Normal ({len(normal_structures)}): {', '.join(normal_structures[:5])}...\")\n", - "print(f\" Abnormal ({len(abnormal_structures)}): {', '.join(abnormal_structures) if abnormal_structures else '(none)'}\")\n", - "print(f\" Not visualized ({len(unseen_structures)}): {', '.join(unseen_structures[:3])}...\")\n", - "\n", - "if anomalies_found:\n", - " print(f\" ? Anomalies detected:\")\n", - " for anom in anomalies_found:\n", - " print(f\" - {anom}\")\n", - "\n", - "print(\" (Note: Anatomy section parser not yet implemented in ETL)\")\n", + "print(\"\\n[STEP 5] Parsing fetal-specific sections...\")\n", + "\n", + "# 5a. Fetal Anatomy (with HPO extraction from anomalies)\n", + "anatomy = parse_fetal_anatomy(observer_data, \"observer_json\", hpo_cr=hpo_cr)\n", + "print(f\"\\n [5a] Fetal Anatomy:\")\n", + "print(f\" Normal structures: {len(anatomy.get('normal_structures', []))}\")\n", + "print(f\" Abnormal structures: {len(anatomy.get('abnormal_structures', []))}\")\n", + "print(f\" Not visualized: {len(anatomy.get('not_visualized', []))}\")\n", + "print(f\" Anomalies detected: {len(anatomy.get('anomalies', []))}\")\n", + "print(f\" HPO terms extracted: {len(anatomy.get('hpo_terms', []))}\")\n", + "\n", + "for anomaly in anatomy.get(\"anomalies\", [])[:3]:\n", + " print(f\" o {anomaly.get('description', 'N/A')} ({anomaly.get('variant_type', 'N/A')})\")\n", + "\n", + "# 5b. 
Estimated Fetal Weight\n", + "efw = parse_estimated_fetal_weight(observer_data, \"observer_json\")\n", + "print(f\"\\n [5b] Estimated Fetal Weight:\")\n", + "print(f\" EFW: {efw.get('efw_grams', 'N/A')} grams\")\n", + "print(f\" Percentile: {efw.get('percentile', 'N/A')}%\")\n", + "print(f\" Method: {efw.get('method', 'N/A')}\")\n", + "print(f\" Growth category: {efw.get('growth_category', 'N/A')}\")\n", + "print(f\" Within normal range: {efw.get('within_normal_range', 'N/A')}\")\n", + "\n", + "# 5c. Fetal Ratios\n", + "ratios = parse_fetal_ratios(observer_data, \"observer_json\")\n", + "print(f\"\\n [5c] Fetal Ratios:\")\n", + "print(f\" Ratios calculated: {len(ratios.get('ratios', []))}\")\n", + "print(f\" All within range: {ratios.get('all_within_range', 'N/A')}\")\n", + "print(f\" Proportionality: {ratios.get('proportionality_assessment', 'N/A')}\")\n", + "\n", + "for ratio in ratios.get(\"ratios\", [])[:3]:\n", + " name = ratio.get(\"name\", \"N/A\")\n", + " value = ratio.get(\"value\", \"N/A\")\n", + " in_range = \"[OK]\" if ratio.get(\"within_range\") else \"[!]\"\n", + " print(f\" {in_range} {name}: {value}\")\n", "\n", "# =============================================================================\n", - "# STEP 6: Convert to PhenotypicFeatures\n", + "# STEP 6: Build PhenotypicFeatures from ALL sources\n", "# =============================================================================\n", - "print(\"\\n[STEP 6] Converting to PhenotypicFeatures...\")\n", - "\n", - "\n", - "def parse_ga_from_description(description: str, fallback_weeks: float = 26.9) -> tuple[int, int]:\n", - " \"\"\"Extract weeks and days from TermBin description.\"\"\"\n", - " match = re.search(r\"at (\\d+)w(\\d+)d\", description)\n", - " if match:\n", - " return int(match.group(1)), int(match.group(2))\n", - " ga = GestationalAge.from_weeks(fallback_weeks)\n", - " return ga.weeks, ga.days\n", - "\n", - "\n", - "# Get subject GA for features without specific timing\n", - 
"first_measurement = observer_data[\"fetuses\"][0][\"measurements\"][0]\n", - "subject_ga_weeks = first_measurement.get(\"calculated_ega\", 26.9)\n", - "subject_ga = GestationalAge.from_weeks(subject_ga_weeks)\n", + "print(\"\\n[STEP 6] Building PhenotypicFeatures...\")\n", "\n", "phenotypic_features = []\n", "\n", - "# 6a: Convert biometry TermBins -> PhenotypicFeatures\n", - "print(\"\\n --- From Biometry ---\")\n", + "# Get subject GA from dating or fallback to first measurement\n", + "ga_weeks = dating.get(\"ga_weeks\")\n", + "if ga_weeks:\n", + " subject_ga = GestationalAge.from_weeks(float(ga_weeks))\n", + "else:\n", + " # Fallback: parse from first TermBin description\n", + " if term_bins:\n", + " weeks, days = parse_ga_from_description(term_bins[0].description)\n", + " subject_ga = GestationalAge(weeks=weeks, days=days)\n", + " else:\n", + " subject_ga = GestationalAge(weeks=27, days=0)\n", + "\n", + "# 6a. From biometry TermBins\n", "for tb in term_bins:\n", - " weeks, days = parse_ga_from_description(tb.description, subject_ga_weeks)\n", - " \n", + " weeks, days = parse_ga_from_description(tb.description)\n", + " onset = pps2.TimeElement(\n", + " gestational_age=pps2.GestationalAge(weeks=weeks, days=days)\n", + " )\n", " pf = pps2.PhenotypicFeature(\n", " type=pps2.OntologyClass(id=tb.hpo_id, label=tb.hpo_label),\n", " excluded=tb.normal, # normal=True means abnormality is EXCLUDED\n", + " description=f\"Biometry: {tb.description}\",\n", + " onset=onset,\n", + " )\n", + " phenotypic_features.append((\"Biometry\", pf))\n", + "\n", + "# 6b. 
From clinical impression HPO terms (SimpleTerm objects with hpo_id, hpo_label)\n", + "for term in impression.get(\"hpo_terms\", []):\n", + " pf = pps2.PhenotypicFeature(\n", + " type=pps2.OntologyClass(id=term.hpo_id, label=term.hpo_label),\n", + " excluded=False,\n", + " description=f\"Clinical impression: {term.hpo_label}\",\n", " onset=pps2.TimeElement(\n", - " gestational_age=pps2.GestationalAge(weeks=weeks, days=days)\n", + " gestational_age=pps2.GestationalAge(weeks=subject_ga.weeks, days=subject_ga.days)\n", " ),\n", - " description=f\"[Biometry] {tb.description}\",\n", " )\n", - " phenotypic_features.append(pf)\n", - "\n", - "print(f\" ? Added {len(term_bins)} features from biometry\")\n", + " phenotypic_features.append((\"Clinical Text\", pf))\n", "\n", - "# 6b: Convert clinical text HPO terms -> PhenotypicFeatures\n", - "print(\"\\n --- From Clinical Text ---\")\n", - "text_feature_count = 0\n", - "for term in hpo_terms_from_text:\n", - " # Findings mentioned in clinical impression are OBSERVED (not excluded)\n", + "# 6c. From fetal anatomy HPO terms (SimpleTerm objects with hpo_id, hpo_label) (NEW)\n", + "for term in anatomy.get(\"hpo_terms\", []):\n", " pf = pps2.PhenotypicFeature(\n", " type=pps2.OntologyClass(id=term.hpo_id, label=term.hpo_label),\n", - " excluded=False, # These are observed findings\n", + " excluded=False,\n", + " description=f\"Anatomy finding: {term.hpo_label}\",\n", + " onset=pps2.TimeElement(\n", + " gestational_age=pps2.GestationalAge(weeks=subject_ga.weeks, days=subject_ga.days)\n", + " ),\n", + " )\n", + " phenotypic_features.append((\"Anatomy\", pf))\n", + "\n", + "# 6d. 
Growth category as phenotypic feature (NEW)\n", + "growth_hpo_map = {\n", + " \"SGA\": (\"HP:0001518\", \"Small for gestational age\"),\n", + " \"LGA\": (\"HP:0001520\", \"Large for gestational age\"),\n", + "}\n", + "growth_cat = efw.get(\"growth_category\")\n", + "if growth_cat in growth_hpo_map:\n", + " hpo_id, hpo_label = growth_hpo_map[growth_cat]\n", + " pf = pps2.PhenotypicFeature(\n", + " type=pps2.OntologyClass(id=hpo_id, label=hpo_label),\n", + " excluded=False,\n", + " description=f\"EFW {efw.get('efw_grams')}g at {efw.get('percentile')}th percentile\",\n", + " onset=pps2.TimeElement(\n", + " gestational_age=pps2.GestationalAge(weeks=subject_ga.weeks, days=subject_ga.days)\n", + " ),\n", + " )\n", + " phenotypic_features.append((\"Growth\", pf))\n", + "# AGA is normal - we could add as excluded feature or skip\n", + "elif growth_cat == \"AGA\":\n", + " print(\"Growth category AGA (normal) - no HPO term needed\")\n", + "\n", + "# 6e. Proportionality assessment as phenotypic feature (NEW)\n", + "if ratios.get(\"proportionality_assessment\") == \"Asymmetric\":\n", + " pf = pps2.PhenotypicFeature(\n", + " type=pps2.OntologyClass(id=\"HP:0001511\", label=\"Intrauterine growth retardation\"),\n", + " excluded=False,\n", + " description=\"Asymmetric growth pattern detected from biometric ratios\",\n", " onset=pps2.TimeElement(\n", " gestational_age=pps2.GestationalAge(weeks=subject_ga.weeks, days=subject_ga.days)\n", " ),\n", - " description=f\"[Clinical Impression] Extracted from narrative text via HPO Concept Recognition\",\n", " )\n", - " phenotypic_features.append(pf)\n", - " text_feature_count += 1\n", + " phenotypic_features.append((\"Ratios\", pf))\n", "\n", - "print(f\" ? 
Added {text_feature_count} features from clinical text\")\n", - "print(f\"\\n Total PhenotypicFeatures: {len(phenotypic_features)}\")\n", + "print(f\"\\n Summary by source:\")\n", + "sources = {}\n", + "for source, pf in phenotypic_features:\n", + " sources[source] = sources.get(source, 0) + 1\n", + "for source, count in sources.items():\n", + " print(f\" - {source}: {count} features\")\n", + "print(f\" Total: {len(phenotypic_features)} PhenotypicFeatures\")\n", "\n", "# =============================================================================\n", - "# STEP 7: Build Complete Phenopacket v2.0\n", + "# STEP 7: Assemble Phenopacket v2.0\n", "# =============================================================================\n", - "print(\"\\n[STEP 7] Building Phenopacket v2.0...\")\n", + "print(\"\\n[STEP 7] Assembling Phenopacket v2.0...\")\n", "\n", - "# Subject (fetus)\n", "subject = pps2.Individual(\n", " id=\"fetus-1\",\n", " sex=pps2.Sex.UNKNOWN_SEX,\n", @@ -1439,7 +642,6 @@ " ),\n", ")\n", "\n", - "# Metadata\n", "now = datetime.now(timezone.utc)\n", "created_timestamp = Timestamp()\n", "created_timestamp.FromDatetime(now)\n", @@ -1448,89 +650,81 @@ " id=\"hp\",\n", " name=\"Human Phenotype Ontology\",\n", " url=\"http://purl.obolibrary.org/obo/hp.owl\",\n", - " version=hpo_parser.get_version() or \"2025-01-01\",\n", + " version=hpo_parser.get_version() or \"2025-01-01\", # TODO (@VarenyaJ): Change version date if update the compressed hp.json\n", " namespace_prefix=\"HP\",\n", " iri_prefix=\"http://purl.obolibrary.org/obo/HP_\",\n", ")\n", "\n", "metadata = pps2.MetaData(\n", " created=created_timestamp,\n", - " created_by=\"prenatalppkt-etl-pipeline\",\n", + " created_by=\"prenatalppkt-etl-pipeline-v2\",\n", " phenopacket_schema_version=\"2.0\",\n", ")\n", "metadata.resources.append(hpo_resource)\n", "\n", - "# Assemble the Phenopacket\n", "phenopacket = pps2.Phenopacket(\n", - " id=\"apple-sally-fetus-1\",\n", + " id=\"apple-sally-fetus-1-complete\",\n", " 
subject=subject,\n", " meta_data=metadata,\n", ")\n", - "phenopacket.phenotypic_features.extend(phenotypic_features)\n", + "phenopacket.phenotypic_features.extend([pf for _, pf in phenotypic_features])\n", "\n", - "print(\" ? Phenopacket assembled successfully\")\n", - "print(f\" ID: {phenopacket.id}\")\n", - "print(f\" Subject: {phenopacket.subject.id} at {subject_ga.weeks}w{subject_ga.days}d\")\n", - "print(f\" Features: {len(phenopacket.phenotypic_features)}\")\n", + "print(f\"Phenopacket ID: {phenopacket.id}\")\n", + "print(f\"Subject: {phenopacket.subject.id}\")\n", + "print(f\"Features: {len(phenopacket.phenotypic_features)}\")\n", "\n", "# =============================================================================\n", - "# STEP 8: Output JSON\n", + "# STEP 8: Output & Validation\n", "# =============================================================================\n", - "print(\"\\n\" + \"=\" * 80)\n", - "print(\"PHENOPACKET v2.0 OUTPUT (JSON)\")\n", - "print(\"=\" * 80)\n", + "print(\"\\n[STEP 8] Output & Validation...\")\n", "\n", "phenopacket_json = MessageToJson(phenopacket, preserving_proto_field_name=True)\n", - "print(phenopacket_json)\n", - "\n", - "# =============================================================================\n", - "# STEP 9: Validation & Summary\n", - "# =============================================================================\n", - "print(\"\\n\" + \"=\" * 80)\n", - "print(\"VALIDATION & SUMMARY\")\n", - "print(\"=\" * 80)\n", "\n", "# Round-trip validation\n", - "print(\"\\n[Validation] Round-trip test...\")\n", "parsed_back = Parse(phenopacket_json, pps2.Phenopacket())\n", "assert parsed_back.id == phenopacket.id\n", "assert len(parsed_back.phenotypic_features) == len(phenopacket.phenotypic_features)\n", - "print(\" ? 
Round-trip validation passed\")\n", - "\n", - "# Feature breakdown\n", - "biometry_features = [pf for pf in phenopacket.phenotypic_features if \"[Biometry]\" in pf.description]\n", - "clinical_features = [pf for pf in phenopacket.phenotypic_features if \"[Clinical\" in pf.description]\n", - "excluded_count = sum(1 for pf in phenopacket.phenotypic_features if pf.excluded)\n", - "observed_count = len(phenopacket.phenotypic_features) - excluded_count\n", - "\n", - "print(\"\\n[Summary] Phenotypic Features:\")\n", - "print(f\" Total: {len(phenopacket.phenotypic_features)}\")\n", - "print(f\" From Biometry: {len(biometry_features)}\")\n", - "print(f\" From Clinical Text: {len(clinical_features)}\")\n", - "print(f\" Normal (excluded): {excluded_count}\")\n", - "print(f\" Abnormal (observed): {observed_count}\")\n", - "\n", - "# Detailed feature list\n", - "print(\"\\n[Detail] All Phenotypic Features:\")\n", - "print(\"-\" * 60)\n", - "for i, pf in enumerate(phenopacket.phenotypic_features, 1):\n", - " status = \"EXCLUDED (normal)\" if pf.excluded else \"OBSERVED (abnormal)\"\n", - " ga = pf.onset.gestational_age\n", - " source = \"Biometry\" if \"[Biometry]\" in pf.description else \"Clinical Text\"\n", - " print(f\"\\n [{i}] {pf.type.id} - {pf.type.label}\")\n", - " print(f\" Source: {source}\")\n", - " print(f\" Status: {status}\")\n", - " print(f\" Onset: {ga.weeks}w{ga.days}d\")\n", + "print(\"Round-trip validation passed\")\n", "\n", "# Save to file\n", - "output_path = Path(\"output/apple_sally_phenopacket_expanded.json\")\n", + "output_path = Path(\"output/apple_sally_phenopacket_complete.json\")\n", "output_path.parent.mkdir(exist_ok=True)\n", "with open(output_path, \"w\") as f:\n", " f.write(phenopacket_json)\n", + "print(f\"Saved to: {output_path}\")\n", + "\n", + "# =============================================================================\n", + "# STEP 9: Summary Report\n", + "# 
=============================================================================\n", + "print(\"\\n\" + \"=\" * 80)\n", + "print(\"PHENOPACKET GENERATION COMPLETE\")\n", + "print(\"=\" * 80)\n", + "\n", + "print(\"\\n[Clinical Context]\")\n", + "print(f\" Indication: {indication_text[:50]}...\")\n", + "print(f\" GA at exam: {dating.get('ga_weeks', 'N/A')} weeks\")\n", + "print(f\" EFW: {efw.get('efw_grams', 'N/A')}g ({efw.get('growth_category', 'N/A')})\")\n", + "print(f\" Proportionality: {ratios.get('proportionality_assessment', 'N/A')}\")\n", + "\n", + "print(\"\\n[Phenotypic Features by Source]\")\n", + "for source, count in sources.items():\n", + " print(f\" {source}: {count}\")\n", + "\n", + "observed = sum(1 for _, pf in phenotypic_features if not pf.excluded)\n", + "excluded = sum(1 for _, pf in phenotypic_features if pf.excluded)\n", + "print(f\"\\n[Feature Status]\")\n", + "print(f\" Observed (abnormal): {observed}\")\n", + "print(f\" Excluded (normal): {excluded}\")\n", "\n", "print(\"\\n\" + \"=\" * 80)\n", - "print(f\"SUCCESS: Phenopacket saved to {output_path}\")\n", - "print(\"=\" * 80)" + "print(f\"SUCCESS: Complete phenopacket at {output_path}\")\n", + "print(\"=\" * 80)\n", + "\n", + "# =============================================================================\n", + "# STEP 10: Display JSON Output\n", + "# =============================================================================\n", + "print(\"\\n[Phenopacket JSON Output]\")\n", + "print(phenopacket_json)" ] } ],