diff --git a/.github/actions/python_from_pyproject/action.yaml b/.github/actions/python_from_pyproject/action.yaml
index 79ee605..13d6210 100644
--- a/.github/actions/python_from_pyproject/action.yaml
+++ b/.github/actions/python_from_pyproject/action.yaml
@@ -1,22 +1,29 @@
 name: Install python from pyproject.toml
-description: 'Installs Python from the version found in the pyproject.toml'
+description: Installs Python from the version found in pyproject.toml
 
 inputs:
-    pyproject-file-path:
-        required: False
-        description: "Path to the pyproject.toml including filename"
-        default: "./pyproject.toml"
+  pyproject-file-path:
+    required: false
+    description: Path to the pyproject.toml including filename
+    default: ./pyproject.toml
 
 runs:
-    using: composite
-    steps:
-        - name: Get project version with yq
-          id: get_python_version
-          uses: mikefarah/yq@v4.46.1
-          with:
-              cmd: yq '.project.requires-python' ${{ inputs.pyproject-file-path }}
+  using: composite
+  steps:
+    # Read project.requires-python and expose it as the step output `result`.
+    # The input path reaches Python through the environment instead of being
+    # pasted into the script, so quotes in the path cannot change the command.
+    # NOTE(review): tomllib is stdlib only from Python 3.11 and this runs before
+    # setup-python - confirm the runner's preinstalled `python` is new enough.
+    - name: Read requires-python from pyproject.toml
+      id: get_python_version
+      shell: bash
+      env:
+        PYPROJECT_FILE: ${{ inputs.pyproject-file-path }}
+      run: |
+        python -c "import os, pathlib, tomllib; p = pathlib.Path(os.environ['PYPROJECT_FILE']); req = tomllib.loads(p.read_text(encoding='utf-8'))['project']['requires-python']; print(f'result={req}')" >> "$GITHUB_OUTPUT"
 
-        - name: Set up Python
-          uses: actions/setup-python@v5.6.0
-          with:
-              python-version: ${{ steps.get_python_version.outputs.result }}
\ No newline at end of file
+    - name: Set up Python
+      uses: actions/setup-python@v5.6.0
+      with:
+        python-version: ${{ steps.get_python_version.outputs.result }}
diff --git a/prenatalppkt.ipynb b/prenatalppkt.ipynb
new file mode 100644
index 0000000..cb51d6d
--- /dev/null
+++ b/prenatalppkt.ipynb
@@ -0,0 +1,752 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "8b89cf52",
+   "metadata": {},
+   "source": [
+    "# Demo"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d8f2cfce",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+
"output_type": "stream", + "text": [ + "DEBUG:hpotk.util:Using default encoding 'utf-8'\n", + "DEBUG:hpotk.util:Opening /tmp/hp.json\n", + "DEBUG:hpotk.util:Looks like a local file: /tmp/hp.json\n", + "DEBUG:hpotk.util:Looks like decompressed data\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "================================================================================\n", + "PRENATALPPKT ETL PIPELINE\n", + "Observer JSON -> Section Parsing -> Phenopacket v2.0\n", + "================================================================================\n", + "\n", + "[STEP 1] Loading the HPO Concept Recognizer...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG:hpotk.ontology.load.obographs._load:Extracting ontology terms\n", + "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", + "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", + "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", + "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", + "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", + "DEBUG:hpotk.ontology.load.obographs._factory:Unknown synonym type http://purl.obolibrary.org/obo/hp#allelic_requirement\n", + "DEBUG:hpotk.ontology.io.obographs:Missing node type in {'id': 'http://purl.obolibrary.org/obo/BFO_0000051', 'lbl': 'has part', 'meta': {'xrefs': [{'val': 'BFO:0000051'}], 'basicPropertyValues': [{'pred': 'http://www.geneontology.org/formats/oboInOwl#shorthand', 'val': 'has_part'}]}}\n", + "DEBUG:hpotk.ontology.io.obographs:Missing node type in {'id': 'http://purl.obolibrary.org/obo/BFO_0000066', 'lbl': 'occurs in', 'meta': 
{'xrefs': [{'val': 'BFO:0000066'}], 'basicPropertyValues': [{'pred': 'http://www.geneontology.org/formats/oboInOwl#shorthand', 'val': 'occurs_in'}]}}\n", + "DEBUG:hpotk.ontology.io.obographs:Missing node type in {'id': 'http://purl.obolibrary.org/obo/RO_0002503', 'lbl': 'towards', 'meta': {'xrefs': [{'val': 'RO:0002503'}], 'basicPropertyValues': [{'pred': 'http://www.geneontology.org/formats/oboInOwl#shorthand', 'val': 'towards'}]}}\n", + "DEBUG:hpotk.ontology.io.obographs:Missing node type in {'id': 'http://purl.obolibrary.org/obo/RO_0002573', 'lbl': 'has modifier', 'meta': {'comments': ['placeholder relation to indicate normality/abnormality.'], 'xrefs': [{'val': 'RO:0002180'}], 'basicPropertyValues': [{'pred': 'http://www.geneontology.org/formats/oboInOwl#shorthand', 'val': 'qualifier'}]}}\n", + "DEBUG:hpotk.ontology.load.obographs._load:Creating the edge list\n", + "DEBUG:hpotk.ontology.load.obographs._load:Building ontology graph\n", + "DEBUG:hpotk.graph._factory:Creating ontology graph from 23612 edges\n", + "DEBUG:hpotk.graph._factory:Found root HP:0000001\n", + "DEBUG:hpotk.graph._factory:Extracted 19262 nodes\n", + "DEBUG:hpotk.ontology.load.obographs._load:Assembling the ontology\n", + "DEBUG:hpotk.ontology.load.obographs._load:Done\n", + "DEBUG:prenatalppkt.hpo.hpo_parser:Instantiating HPO concept recognizer.\n", + "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for head_circumference\n", + "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for biparietal_diameter\n", + "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for femur_length\n", + "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for abdominal_circumference\n", + "DEBUG:prenatalppkt.mapping_loader:Loaded 8 bins for occipitofrontal_diameter\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Loaded mappings for: ['head_circumference', 'biparietal_diameter', 'femur_length', 'abdominal_circumference', 'occipitofrontal_diameter']\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Starting Observer JSON 
extraction\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing fetus 1\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Found 6 measurements\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: AC\n", + "DEBUG:prenatalppkt.etl.extractors.observer:AC has percentile=55.6% (valid)\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for AC: value=226.20000000000002mm, percentile=55.6%, ga=\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=AC, value=226.20000000000002mm, percentile=55.6%, ga=, method=None\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Selected HPO: HP:0034207 - Abnormal fetal gastrointestinal system morphology\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Created TermBin: HP:0034207 - normal=True\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: BPD\n", + "DEBUG:prenatalppkt.etl.extractors.observer:BPD has percentile=51.2% (valid)\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for BPD: value=66.8mm, percentile=51.2%, ga=\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=BPD, value=66.8mm, percentile=51.2%, ga=, method=None\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Selected HPO: HP:0000240 - Abnormality of skull size\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Created TermBin: HP:0000240 - normal=True\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: HC\n", + "DEBUG:prenatalppkt.etl.extractors.observer:HC has percentile=42.5% (valid)\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for HC: value=250.0mm, percentile=42.5%, ga=\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=HC, value=250.0mm, percentile=42.5%, ga=, method=None\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Selected HPO: HP:0000240 - Abnormality of skull size\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Created TermBin: HP:0000240 - normal=True\n", + 
"DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: Femur\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Femur has percentile=46.8% (valid)\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for Femur: value=50.099999999999994mm, percentile=46.8%, ga=\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=Femur, value=50.099999999999994mm, percentile=46.8%, ga=, method=None\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Selected HPO: HP:0002823 - Abnormal femur morphology\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Created TermBin: HP:0002823 - normal=True\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: Nuchal Fold\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Nuchal Fold has percentile=0% (valid)\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for Nuchal Fold: value=10.0mm, percentile=0%, ga=\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=Nuchal Fold, value=10.0mm, percentile=0.0%, ga=, method=None\n", + "WARNING:prenatalppkt.etl.term_bin_factory:No HPO mapping for optional measurement 'Nuchal Fold' - skipping. TODO(@VarenyaJ): Add HPO terms when available\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Processing measurement: Cerebellum\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Cerebellum has percentile=0% (valid)\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Creating TermBin for Cerebellum: value=30.0mm, percentile=0%, ga=\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Creating TermBin: name=Cerebellum, value=30.0mm, percentile=0.0%, ga=, method=None\n", + "WARNING:prenatalppkt.etl.term_bin_factory:No HPO mapping for optional measurement 'Cerebellum' - skipping. 
TODO(@VarenyaJ): Add HPO terms when available\n", + "DEBUG:prenatalppkt.etl.extractors.observer:Successfully parsed 4 measurements\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Validating 4 TermBins for required measurements\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: AC\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: BPD\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: HC\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Found measurement: Femur\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Present: {'HC', 'Femur', 'AC', 'BPD'}\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:Required: {'HC', 'Femur', 'AC', 'BPD'}\n", + "DEBUG:prenatalppkt.etl.term_bin_factory:All required measurements present\n", + "INFO:prenatalppkt.etl.extractors.observer:Extracted 4 TermBins from Observer JSON\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HPO version: 2025-10-22\n", + "\n", + "[STEP 2] Loading Observer JSON...\n", + "Loaded: Apple_Sally_pretty.json\n", + "Fetuses: 1\n", + "\n", + "[STEP 3] Extracting biometry measurements...\n", + "Extracted 4 TermBins\n", + " - AC: 226.2 mm (55.6%) at 26w6d [Fetus 1] [Normal]\n", + " HPO: HP:0034207 - Abnormal fetal gastrointestinal system morphology\n", + " - BPD: 66.8 mm (51.2%) at 26w6d [Fetus 1] [Normal]\n", + " HPO: HP:0000240 - Abnormality of skull size\n", + " - HC: 250.0 mm (42.5%) at 26w6d [Fetus 1] [Normal]\n", + " HPO: HP:0000240 - Abnormality of skull size\n", + " - Femur: 50.1 mm (46.8%) at 27w0d [Fetus 1] [Normal]\n", + " HPO: HP:0002823 - Abnormal femur morphology\n", + "\n", + "[STEP 4] Parsing clinical sections...\n", + "\n", + " [4a] Clinical Indication:\n", + " Reason: N/A...\n", + "\n", + " [4b] Pregnancy Dating:\n", + " LMP: 0001-01-01\n", + " EDD: None\n", + " GA at exam: N/A weeks\n", + "\n", + " [4c] Clinical Impression:\n", + "S... Text: The patient was referred for a fetal anatomical survey. 
\n", + " HPO terms found: 4\n", + "\n", + "[STEP 5] Parsing fetal-specific sections...\n", + "\n", + " [5a] Fetal Anatomy:\n", + " Normal structures: 0\n", + " Abnormal structures: 0\n", + " Not visualized: 0\n", + " Anomalies detected: 0\n", + " HPO terms extracted: 1\n", + "\n", + " [5b] Estimated Fetal Weight:\n", + " EFW: 1014.8 grams\n", + " Percentile: 55.6%\n", + " Method: Hadlock (AC, FL, HC)\n", + " Growth category: AGA\n", + " Within normal range: True\n", + "\n", + " [5c] Fetal Ratios:\n", + " Ratios calculated: 3\n", + " All within range: True\n", + " Proportionality: Normal\n", + " ? HC/AC: 1.105\n", + " ? FL/AC: 22.149\n", + " ? FL/BPD: 75\n", + "\n", + "[STEP 6] Building PhenotypicFeatures...\n", + "Growth category AGA (normal) - no HPO term needed\n", + "\n", + " Summary by source:\n", + " - Biometry: 4 features\n", + " - Clinical Text: 4 features\n", + " - Anatomy: 1 features\n", + " Total: 9 PhenotypicFeatures\n", + "\n", + "[STEP 7] Assembling Phenopacket v2.0...\n", + "Phenopacket ID: apple-sally-fetus-1-complete\n", + "Subject: fetus-1\n", + "Features: 9\n", + "\n", + "[STEP 8] Output & Validation...\n", + "Round-trip validation passed\n", + "Saved to: output/apple_sally_phenopacket_complete.json\n", + "\n", + "================================================================================\n", + "PHENOPACKET GENERATION COMPLETE\n", + "================================================================================\n", + "\n", + "[Clinical Context]\n", + " Indication: N/A...\n", + " GA at exam: N/A weeks\n", + " EFW: 1014.8g (AGA)\n", + " Proportionality: Normal\n", + "\n", + "[Phenotypic Features by Source]\n", + " Biometry: 4\n", + " Clinical Text: 4\n", + " Anatomy: 1\n", + "\n", + "[Feature Status]\n", + " Observed (abnormal): 5\n", + " Excluded (normal): 4\n", + "\n", + "================================================================================\n", + "SUCCESS: Complete phenopacket at output/apple_sally_phenopacket_complete.json\n", + 
"================================================================================\n", + "\n", + "[Phenopacket JSON Output]\n", + "{\n", + " \"id\": \"apple-sally-fetus-1-complete\",\n", + " \"subject\": {\n", + " \"id\": \"fetus-1\",\n", + " \"time_at_last_encounter\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", + " },\n", + " \"phenotypic_features\": [\n", + " {\n", + " \"description\": \"Biometry: AC: 226.2 mm (55.6%) at 26w6d [Fetus 1]\",\n", + " \"type\": {\n", + " \"id\": \"HP:0034207\",\n", + " \"label\": \"Abnormal fetal gastrointestinal system morphology\"\n", + " },\n", + " \"excluded\": true,\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"description\": \"Biometry: BPD: 66.8 mm (51.2%) at 26w6d [Fetus 1]\",\n", + " \"type\": {\n", + " \"id\": \"HP:0000240\",\n", + " \"label\": \"Abnormality of skull size\"\n", + " },\n", + " \"excluded\": true,\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"description\": \"Biometry: HC: 250.0 mm (42.5%) at 26w6d [Fetus 1]\",\n", + " \"type\": {\n", + " \"id\": \"HP:0000240\",\n", + " \"label\": \"Abnormality of skull size\"\n", + " },\n", + " \"excluded\": true,\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"description\": \"Biometry: Femur: 50.1 mm (46.8%) at 27w0d [Fetus 1]\",\n", + " \"type\": {\n", + " \"id\": \"HP:0002823\",\n", + " \"label\": \"Abnormal femur morphology\"\n", + " },\n", + " \"excluded\": true,\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 27\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"description\": \"Clinical impression: Macrocephaly\",\n", + " \"type\": {\n", + " \"id\": \"HP:0000256\",\n", + " \"label\": 
\"Macrocephaly\"\n", + " },\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"description\": \"Clinical impression: Agenesis of corpus callosum\",\n", + " \"type\": {\n", + " \"id\": \"HP:0001274\",\n", + " \"label\": \"Agenesis of corpus callosum\"\n", + " },\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"description\": \"Clinical impression: Dandy-Walker malformation\",\n", + " \"type\": {\n", + " \"id\": \"HP:0001305\",\n", + " \"label\": \"Dandy-Walker malformation\"\n", + " },\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"description\": \"Clinical impression: Ventriculomegaly\",\n", + " \"type\": {\n", + " \"id\": \"HP:0002119\",\n", + " \"label\": \"Ventriculomegaly\"\n", + " },\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"description\": \"Anatomy finding: Neural tube defect\",\n", + " \"type\": {\n", + " \"id\": \"HP:0045005\",\n", + " \"label\": \"Neural tube defect\"\n", + " },\n", + " \"onset\": {\n", + " \"gestational_age\": {\n", + " \"weeks\": 26,\n", + " \"days\": 6\n", + " }\n", + " }\n", + " }\n", + " ],\n", + " \"meta_data\": {\n", + " \"created\": \"2026-02-05T19:49:31.564591Z\",\n", + " \"created_by\": \"prenatalppkt-etl-pipeline-v2\",\n", + " \"resources\": [\n", + " {\n", + " \"id\": \"hp\",\n", + " \"name\": \"Human Phenotype Ontology\",\n", + " \"url\": \"http://purl.obolibrary.org/obo/hp.owl\",\n", + " \"version\": \"2025-10-22\",\n", + " \"namespace_prefix\": \"HP\",\n", + " \"iri_prefix\": \"http://purl.obolibrary.org/obo/HP_\"\n", + " }\n", + " ],\n", + " \"phenopacket_schema_version\": \"2.0\"\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + 
"\"\"\"\n", + "\n", + "Steps:\n", + "1. Biometry extraction -> List[TermBin] -> quantitative HPO terms\n", + "2. Clinical indication -> reason for exam\n", + "3. Pregnancy dating -> LMP, EDD, gestational age context\n", + "4. Clinical impression -> qualitative HPO terms from free text\n", + "5. Fetal anatomy -> structured findings + HPO terms from anomalies\n", + "6. Estimated fetal weight -> SGA/AGA/LGA classification\n", + "7. Fetal ratios -> proportionality assessment\n", + "8. Phenopacket assembly -> GA4GH Phenopacket v2.0 JSON\n", + "\"\"\"\n", + "\n", + "import gzip\n", + "import json\n", + "import re\n", + "from datetime import datetime, timezone\n", + "from pathlib import Path\n", + "\n", + "from google.protobuf.json_format import MessageToJson, Parse\n", + "from google.protobuf.timestamp_pb2 import Timestamp\n", + "import phenopackets.schema.v2 as pps2\n", + "\n", + "# ETL Extractors (biometry -> TermBins)\n", + "from prenatalppkt.etl.extractors import observer\n", + "\n", + "# ETL Section Parsers (clinical metadata -> Dicts)\n", + "from prenatalppkt.etl.sections import (\n", + " parse_clinical_indication,\n", + " parse_pregnancy_dating,\n", + " parse_clinical_impression,\n", + " parse_fetal_anatomy,\n", + " parse_estimated_fetal_weight,\n", + " parse_fetal_ratios,\n", + ")\n", + "\n", + "# HPO Concept Recognition\n", + "from prenatalppkt.hpo import HpoParser\n", + "\n", + "# Gestational Age utilities\n", + "from prenatalppkt.gestational_age import GestationalAge\n", + "\n", + "print(\"=\" * 80)\n", + "print(\"PRENATALPPKT ETL PIPELINE\")\n", + "print(\"Observer JSON -> Section Parsing -> Phenopacket v2.0\")\n", + "print(\"=\" * 80)\n", + "\n", + "# =============================================================================\n", + "# STEP 1: Load HPO Concept Recognizer\n", + "# =============================================================================\n", + "print(\"\\n[STEP 1] Loading the HPO Concept Recognizer...\")\n", + "\n", + "HP_JSON_GZ = 
Path(\"tests/data/hp.json.gz\")\n", + "TMP_HP_JSON = Path(\"/tmp/hp.json\")\n", + "\n", + "with gzip.open(HP_JSON_GZ, \"rt\", encoding=\"utf-8\") as f_in:\n", + " with open(TMP_HP_JSON, \"w\", encoding=\"utf-8\") as f_out:\n", + " f_out.write(f_in.read())\n", + "\n", + "hpo_parser = HpoParser(hpo_json_file=str(TMP_HP_JSON))\n", + "hpo_cr = hpo_parser.get_hpo_concept_recognizer()\n", + "\n", + "print(f\"HPO version: {hpo_parser.get_version()}\")\n", + "\n", + "# =============================================================================\n", + "# STEP 2: Load Observer JSON\n", + "# =============================================================================\n", + "print(\"\\n[STEP 2] Loading Observer JSON...\")\n", + "\n", + "data_path = Path(\"tests/data/Apple_Sally_pretty.json\")\n", + "with open(data_path) as f:\n", + " observer_data = json.load(f)\n", + "\n", + "print(f\"Loaded: {data_path.name}\")\n", + "print(f\"Fetuses: {len(observer_data.get('fetuses', []))}\")\n", + "\n", + "# =============================================================================\n", + "# STEP 3: Extract Biometry -> TermBins\n", + "# =============================================================================\n", + "print(\"\\n[STEP 3] Extracting biometry measurements...\")\n", + "\n", + "term_bins = observer.extract(observer_data)\n", + "print(f\"Extracted {len(term_bins)} TermBins\")\n", + "\n", + "# Helper function to parse GA from TermBin description\n", + "def parse_ga_from_description(description: str) -> tuple[int, int]:\n", + " \"\"\"Extract weeks and days from TermBin description like 'HC: 250.0 mm (42.5%) at 26w6d'\"\"\"\n", + " match = re.search(r\"at (\\d+)w(\\d+)d\", description)\n", + " if match:\n", + " return int(match.group(1)), int(match.group(2))\n", + " return 27, 0 # fallback values for XwYd\n", + "\n", + "# Display TermBins - note: TermBin has description, hpo_id, hpo_label, normal, range\n", + "# NOT label, value_mm, percentile directly\n", + "for tb in 
term_bins:\n", + " status = \"Normal\" if tb.normal else \"ABNORMAL\"\n", + " print(f\" - {tb.description} [{status}]\")\n", + " print(f\" HPO: {tb.hpo_id} - {tb.hpo_label}\")\n", + "\n", + "# =============================================================================\n", + "# STEP 4: Parse Clinical Sections\n", + "# =============================================================================\n", + "print(\"\\n[STEP 4] Parsing clinical sections...\")\n", + "\n", + "# 4a. Clinical Indication\n", + "indication = parse_clinical_indication(observer_data, \"observer_json\")\n", + "indication_text = indication.get('indication_text', 'N/A') or 'N/A'\n", + "print(f\"\\n [4a] Clinical Indication:\")\n", + "print(f\" Reason: {indication_text[:60]}...\")\n", + "\n", + "# 4b. Pregnancy Dating\n", + "dating = parse_pregnancy_dating(observer_data, \"observer_json\")\n", + "print(f\"\\n [4b] Pregnancy Dating:\")\n", + "print(f\" LMP: {dating.get('lmp', 'N/A')}\")\n", + "print(f\" EDD: {dating.get('edd', 'N/A')}\")\n", + "print(f\" GA at exam: {dating.get('ga_weeks', 'N/A')} weeks\")\n", + "\n", + "# 4c. Clinical Impression (with HPO extraction)\n", + "impression = parse_clinical_impression(observer_data, \"observer_json\", hpo_cr=hpo_cr)\n", + "impression_text = impression.get('impression_text', 'N/A') or 'N/A'\n", + "print(f\"\\n [4c] Clinical Impression:\")\n", + "print(f\" Text: {impression_text[:60]}...\")\n", + "print(f\" HPO terms found: {len(impression.get('hpo_terms', []))}\")\n", + "\n", + "# =============================================================================\n", + "# STEP 5: Parse Fetal-Specific Sections (NEW)\n", + "# =============================================================================\n", + "print(\"\\n[STEP 5] Parsing fetal-specific sections...\")\n", + "\n", + "# 5a. 
Fetal Anatomy (with HPO extraction from anomalies)\n", + "anatomy = parse_fetal_anatomy(observer_data, \"observer_json\", hpo_cr=hpo_cr)\n", + "print(f\"\\n [5a] Fetal Anatomy:\")\n", + "print(f\" Normal structures: {len(anatomy.get('normal_structures', []))}\")\n", + "print(f\" Abnormal structures: {len(anatomy.get('abnormal_structures', []))}\")\n", + "print(f\" Not visualized: {len(anatomy.get('not_visualized', []))}\")\n", + "print(f\" Anomalies detected: {len(anatomy.get('anomalies', []))}\")\n", + "print(f\" HPO terms extracted: {len(anatomy.get('hpo_terms', []))}\")\n", + "\n", + "for anomaly in anatomy.get(\"anomalies\", [])[:3]:\n", + " print(f\" o {anomaly.get('description', 'N/A')} ({anomaly.get('variant_type', 'N/A')})\")\n", + "\n", + "# 5b. Estimated Fetal Weight\n", + "efw = parse_estimated_fetal_weight(observer_data, \"observer_json\")\n", + "print(f\"\\n [5b] Estimated Fetal Weight:\")\n", + "print(f\" EFW: {efw.get('efw_grams', 'N/A')} grams\")\n", + "print(f\" Percentile: {efw.get('percentile', 'N/A')}%\")\n", + "print(f\" Method: {efw.get('method', 'N/A')}\")\n", + "print(f\" Growth category: {efw.get('growth_category', 'N/A')}\")\n", + "print(f\" Within normal range: {efw.get('within_normal_range', 'N/A')}\")\n", + "\n", + "# 5c. 
Fetal Ratios\n", + "ratios = parse_fetal_ratios(observer_data, \"observer_json\")\n", + "print(f\"\\n [5c] Fetal Ratios:\")\n", + "print(f\" Ratios calculated: {len(ratios.get('ratios', []))}\")\n", + "print(f\" All within range: {ratios.get('all_within_range', 'N/A')}\")\n", + "print(f\" Proportionality: {ratios.get('proportionality_assessment', 'N/A')}\")\n", + "\n", + "for ratio in ratios.get(\"ratios\", [])[:3]:\n", + " name = ratio.get(\"name\", \"N/A\")\n", + " value = ratio.get(\"value\", \"N/A\")\n", + " in_range = \"[OK]\" if ratio.get(\"within_range\") else \"[!]\"\n", + " print(f\" {in_range} {name}: {value}\")\n", + "\n", + "# =============================================================================\n", + "# STEP 6: Build PhenotypicFeatures from ALL sources\n", + "# =============================================================================\n", + "print(\"\\n[STEP 6] Building PhenotypicFeatures...\")\n", + "\n", + "phenotypic_features = []\n", + "\n", + "# Get subject GA from dating or fallback to first measurement\n", + "ga_weeks = dating.get(\"ga_weeks\")\n", + "if ga_weeks:\n", + " subject_ga = GestationalAge.from_weeks(float(ga_weeks))\n", + "else:\n", + " # Fallback: parse from first TermBin description\n", + " if term_bins:\n", + " weeks, days = parse_ga_from_description(term_bins[0].description)\n", + " subject_ga = GestationalAge(weeks=weeks, days=days)\n", + " else:\n", + " subject_ga = GestationalAge(weeks=27, days=0)\n", + "\n", + "# 6a. 
From biometry TermBins\n", + "for tb in term_bins:\n", + " weeks, days = parse_ga_from_description(tb.description)\n", + " onset = pps2.TimeElement(\n", + " gestational_age=pps2.GestationalAge(weeks=weeks, days=days)\n", + " )\n", + " pf = pps2.PhenotypicFeature(\n", + " type=pps2.OntologyClass(id=tb.hpo_id, label=tb.hpo_label),\n", + " excluded=tb.normal, # normal=True means abnormality is EXCLUDED\n", + " description=f\"Biometry: {tb.description}\",\n", + " onset=onset,\n", + " )\n", + " phenotypic_features.append((\"Biometry\", pf))\n", + "\n", + "# 6b. From clinical impression HPO terms (SimpleTerm objects with hpo_id, hpo_label)\n", + "for term in impression.get(\"hpo_terms\", []):\n", + " pf = pps2.PhenotypicFeature(\n", + " type=pps2.OntologyClass(id=term.hpo_id, label=term.hpo_label),\n", + " excluded=False,\n", + " description=f\"Clinical impression: {term.hpo_label}\",\n", + " onset=pps2.TimeElement(\n", + " gestational_age=pps2.GestationalAge(weeks=subject_ga.weeks, days=subject_ga.days)\n", + " ),\n", + " )\n", + " phenotypic_features.append((\"Clinical Text\", pf))\n", + "\n", + "# 6c. From fetal anatomy HPO terms (SimpleTerm objects with hpo_id, hpo_label) (NEW)\n", + "for term in anatomy.get(\"hpo_terms\", []):\n", + " pf = pps2.PhenotypicFeature(\n", + " type=pps2.OntologyClass(id=term.hpo_id, label=term.hpo_label),\n", + " excluded=False,\n", + " description=f\"Anatomy finding: {term.hpo_label}\",\n", + " onset=pps2.TimeElement(\n", + " gestational_age=pps2.GestationalAge(weeks=subject_ga.weeks, days=subject_ga.days)\n", + " ),\n", + " )\n", + " phenotypic_features.append((\"Anatomy\", pf))\n", + "\n", + "# 6d. 
Growth category as phenotypic feature (NEW)\n", + "growth_hpo_map = {\n", + " \"SGA\": (\"HP:0001518\", \"Small for gestational age\"),\n", + " \"LGA\": (\"HP:0001520\", \"Large for gestational age\"),\n", + "}\n", + "growth_cat = efw.get(\"growth_category\")\n", + "if growth_cat in growth_hpo_map:\n", + " hpo_id, hpo_label = growth_hpo_map[growth_cat]\n", + " pf = pps2.PhenotypicFeature(\n", + " type=pps2.OntologyClass(id=hpo_id, label=hpo_label),\n", + " excluded=False,\n", + " description=f\"EFW {efw.get('efw_grams')}g at {efw.get('percentile')}th percentile\",\n", + " onset=pps2.TimeElement(\n", + " gestational_age=pps2.GestationalAge(weeks=subject_ga.weeks, days=subject_ga.days)\n", + " ),\n", + " )\n", + " phenotypic_features.append((\"Growth\", pf))\n", + "# AGA is normal - we could add as excluded feature or skip\n", + "elif growth_cat == \"AGA\":\n", + " print(\"Growth category AGA (normal) - no HPO term needed\")\n", + "\n", + "# 6e. Proportionality assessment as phenotypic feature (NEW)\n", + "if ratios.get(\"proportionality_assessment\") == \"Asymmetric\":\n", + " pf = pps2.PhenotypicFeature(\n", + " type=pps2.OntologyClass(id=\"HP:0001511\", label=\"Intrauterine growth retardation\"),\n", + " excluded=False,\n", + " description=\"Asymmetric growth pattern detected from biometric ratios\",\n", + " onset=pps2.TimeElement(\n", + " gestational_age=pps2.GestationalAge(weeks=subject_ga.weeks, days=subject_ga.days)\n", + " ),\n", + " )\n", + " phenotypic_features.append((\"Ratios\", pf))\n", + "\n", + "print(f\"\\n Summary by source:\")\n", + "sources = {}\n", + "for source, pf in phenotypic_features:\n", + " sources[source] = sources.get(source, 0) + 1\n", + "for source, count in sources.items():\n", + " print(f\" - {source}: {count} features\")\n", + "print(f\" Total: {len(phenotypic_features)} PhenotypicFeatures\")\n", + "\n", + "# =============================================================================\n", + "# STEP 7: Assemble Phenopacket v2.0\n", + 
"# =============================================================================\n", + "print(\"\\n[STEP 7] Assembling Phenopacket v2.0...\")\n", + "\n", + "subject = pps2.Individual(\n", + " id=\"fetus-1\",\n", + " sex=pps2.Sex.UNKNOWN_SEX,\n", + " time_at_last_encounter=pps2.TimeElement(\n", + " gestational_age=pps2.GestationalAge(weeks=subject_ga.weeks, days=subject_ga.days)\n", + " ),\n", + ")\n", + "\n", + "now = datetime.now(timezone.utc)\n", + "created_timestamp = Timestamp()\n", + "created_timestamp.FromDatetime(now)\n", + "\n", + "hpo_resource = pps2.Resource(\n", + " id=\"hp\",\n", + " name=\"Human Phenotype Ontology\",\n", + " url=\"http://purl.obolibrary.org/obo/hp.owl\",\n", + " version=hpo_parser.get_version() or \"2025-01-01\", # TODO (@VarenyaJ): Change version date if update the compressed hp.json\n", + " namespace_prefix=\"HP\",\n", + " iri_prefix=\"http://purl.obolibrary.org/obo/HP_\",\n", + ")\n", + "\n", + "metadata = pps2.MetaData(\n", + " created=created_timestamp,\n", + " created_by=\"prenatalppkt-etl-pipeline-v2\",\n", + " phenopacket_schema_version=\"2.0\",\n", + ")\n", + "metadata.resources.append(hpo_resource)\n", + "\n", + "phenopacket = pps2.Phenopacket(\n", + " id=\"apple-sally-fetus-1-complete\",\n", + " subject=subject,\n", + " meta_data=metadata,\n", + ")\n", + "phenopacket.phenotypic_features.extend([pf for _, pf in phenotypic_features])\n", + "\n", + "print(f\"Phenopacket ID: {phenopacket.id}\")\n", + "print(f\"Subject: {phenopacket.subject.id}\")\n", + "print(f\"Features: {len(phenopacket.phenotypic_features)}\")\n", + "\n", + "# =============================================================================\n", + "# STEP 8: Output & Validation\n", + "# =============================================================================\n", + "print(\"\\n[STEP 8] Output & Validation...\")\n", + "\n", + "phenopacket_json = MessageToJson(phenopacket, preserving_proto_field_name=True)\n", + "\n", + "# Round-trip validation\n", + 
"parsed_back = Parse(phenopacket_json, pps2.Phenopacket())\n", + "assert parsed_back.id == phenopacket.id\n", + "assert len(parsed_back.phenotypic_features) == len(phenopacket.phenotypic_features)\n", + "print(\"Round-trip validation passed\")\n", + "\n", + "# Save to file\n", + "output_path = Path(\"output/apple_sally_phenopacket_complete.json\")\n", + "output_path.parent.mkdir(exist_ok=True)\n", + "with open(output_path, \"w\") as f:\n", + " f.write(phenopacket_json)\n", + "print(f\"Saved to: {output_path}\")\n", + "\n", + "# =============================================================================\n", + "# STEP 9: Summary Report\n", + "# =============================================================================\n", + "print(\"\\n\" + \"=\" * 80)\n", + "print(\"PHENOPACKET GENERATION COMPLETE\")\n", + "print(\"=\" * 80)\n", + "\n", + "print(\"\\n[Clinical Context]\")\n", + "print(f\" Indication: {indication_text[:50]}...\")\n", + "print(f\" GA at exam: {dating.get('ga_weeks', 'N/A')} weeks\")\n", + "print(f\" EFW: {efw.get('efw_grams', 'N/A')}g ({efw.get('growth_category', 'N/A')})\")\n", + "print(f\" Proportionality: {ratios.get('proportionality_assessment', 'N/A')}\")\n", + "\n", + "print(\"\\n[Phenotypic Features by Source]\")\n", + "for source, count in sources.items():\n", + " print(f\" {source}: {count}\")\n", + "\n", + "observed = sum(1 for _, pf in phenotypic_features if not pf.excluded)\n", + "excluded = sum(1 for _, pf in phenotypic_features if pf.excluded)\n", + "print(f\"\\n[Feature Status]\")\n", + "print(f\" Observed (abnormal): {observed}\")\n", + "print(f\" Excluded (normal): {excluded}\")\n", + "\n", + "print(\"\\n\" + \"=\" * 80)\n", + "print(f\"SUCCESS: Complete phenopacket at {output_path}\")\n", + "print(\"=\" * 80)\n", + "\n", + "# =============================================================================\n", + "# STEP 10: Display JSON Output\n", + "# =============================================================================\n", 
+ "print(\"\\n[Phenopacket JSON Output]\")\n", + "print(phenopacket_json)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "prenatalppkt", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/prenatalppkt/etl/sections/__init__.py b/src/prenatalppkt/etl/sections/__init__.py index 2d3fc17..3893145 100644 --- a/src/prenatalppkt/etl/sections/__init__.py +++ b/src/prenatalppkt/etl/sections/__init__.py @@ -2,18 +2,21 @@ Section parsers for non-biometry clinical data. These parsers extract additional clinical information from ultrasound reports -beyond fetal biometry measurements. They are designed to eventually integrate -with HPO Clinical Record (CR) modules for comprehensive phenotype capture. +beyond fetal biometry measurements. They return Dict objects with parsed data. 
-Current Status: SKELETON IMPLEMENTATIONS -- Basic parsing structure in place -- Returns placeholder data -- TODO comments describe future implementation +Implemented parsers: +- parse_clinical_indication: Extract reason for exam +- parse_pregnancy_dating: Extract LMP, EDD, gestational age +- parse_clinical_impression: Extract clinical narrative and HPO terms +- parse_fetal_anatomy: Extract anatomy findings and HPO terms +- parse_estimated_fetal_weight: Extract EFW and growth classification +- parse_fetal_ratios: Extract biometric ratios and proportionality -Future Integration: -- Map findings to HPO terms using src/prenatalppkt/hpo modules -- Support symmetric processing across Observer JSON, ViewPoint Text, and HL7 -- Enable full phenotype packet generation +Skeleton parsers (TODO): +- parse_maternal_history: OB history, complications +- parse_placenta: Placental assessment +- parse_amniotic_fluid: AFI, MVP measurements +- parse_umbilical_cord: Vessel count, insertion site """ from prenatalppkt.etl.sections.maternal_history import parse_maternal_history @@ -21,6 +24,10 @@ from prenatalppkt.etl.sections.clinical_indication import parse_clinical_indication from prenatalppkt.etl.sections.pregnancy_dating import parse_pregnancy_dating from prenatalppkt.etl.sections.fetal_anatomy import parse_fetal_anatomy +from prenatalppkt.etl.sections.estimated_fetal_weight import ( + parse_estimated_fetal_weight, +) +from prenatalppkt.etl.sections.fetal_ratios import parse_fetal_ratios from prenatalppkt.etl.sections.placenta import parse_placenta from prenatalppkt.etl.sections.amniotic_fluid import parse_amniotic_fluid from prenatalppkt.etl.sections.umbilical_cord import parse_umbilical_cord @@ -31,6 +38,8 @@ "parse_clinical_indication", "parse_pregnancy_dating", "parse_fetal_anatomy", + "parse_estimated_fetal_weight", + "parse_fetal_ratios", "parse_placenta", "parse_amniotic_fluid", "parse_umbilical_cord", diff --git a/src/prenatalppkt/etl/sections/clinical_impression.py 
b/src/prenatalppkt/etl/sections/clinical_impression.py index 4925e67..6f83ee5 100644 --- a/src/prenatalppkt/etl/sections/clinical_impression.py +++ b/src/prenatalppkt/etl/sections/clinical_impression.py @@ -1,66 +1,192 @@ """ -Clinical impression section parser (SKELETON). +Clinical impression / interpretation section parser. -Extracts clinical impressions, diagnoses, and findings from report impression. - -TODO @VarenyaJ: Complete implementation, Map clinical findings to HPO terms, Extract structured anomalies from impression text +Extracts clinical narrative text and optionally extracts HPO terms +from free text using the HPO Concept Recognizer. """ -from typing import Dict +from __future__ import annotations + +import json +import re +from typing import Dict, List, Optional, Union -def parse_clinical_impression(data: str, source_format: str = "viewpoint_text") -> Dict: +def parse_clinical_impression( + data: Union[str, Dict], source_format: str, hpo_cr=None +) -> Dict: """ - Extract clinical impression from ultrasound report. + Parse clinical impression / interpretation section. + + Supports: + - observer_json + - viewpoint_text + - viewpoint_hl7 Args: - data: Report content (text, JSON, or HL7) + data: Raw input data (JSON string, dict, or text) source_format: One of "observer_json", "viewpoint_text", "viewpoint_hl7" + hpo_cr: Optional HpoExactConceptRecognizer for HPO term extraction. + If provided, will extract HPO terms from impression text. Returns: Dict with keys: - impression_text: str - Full impression narrative - - diagnoses: List[str] - Identified diagnoses - - anomalies: List[Dict] - Structured anomaly data - - gestational_age_assessment: str - GA conclusion - - growth_assessment: str - Fetal growth conclusion - - recommendations: List[str] - Follow-up recommendations - - hpo_terms: List[str] - Mapped HPO term IDs (FUTURE) - - TODO @VarenyaJ Implementation Steps: - 1. 
Locate impression section: - - ViewPoint Text: "Impression" section after "=========" - - Observer JSON: exam.finalize.generalComment.plain_text - - ViewPoint HL7: May be in RequestedProcedure or exam notes - 2. Parse free-text impression for key findings - 3. Extract anomalies: - - Observer JSON: fetuses[].anatomy[].anomalies[] - - Text: Look for patterns like "consistent with", "suggestive of" - 4. Identify growth conclusions (FGR, LGA, AGA) - 5. Extract recommendations for follow-up - 6. Map findings to HPO terms: - - Use src/prenatalppkt/hpo.cr_fetal_findings - - Handle synonyms and varied clinical language - - TODO @VarenyaJ: DO NOT: - - Assume impression section exists (optional in all formats) - - Parse impression without context (may reference biometry results) - - Miss negative findings (e.g., "no evidence of...") - - Ignore severity qualifiers (mild, moderate, severe) + - diagnoses: List[str] - Identified diagnoses (future) + - anomalies: List[Dict] - Structured anomaly data (future) + - gestational_age_assessment: Optional[str] - GA conclusion + - growth_assessment: Optional[str] - FGR, LGA, AGA, or None + - recommendations: List[str] - Follow-up recommendations (future) + - hpo_terms: List[SimpleTerm] - HPO terms extracted via CR + - source_format: str """ - # SKELETON: Return empty structure + if source_format == "observer_json": + if isinstance(data, str): + data = json.loads(data) + impression_text = _parse_observer_impression(data) + + elif source_format == "viewpoint_text": + if not isinstance(data, str): + raise ValueError("viewpoint_text data must be a string") + impression_text = _parse_viewpoint_text_impression(data) + + elif source_format == "viewpoint_hl7": + if not isinstance(data, str): + raise ValueError("viewpoint_hl7 data must be a string") + impression_text = _parse_viewpoint_hl7_impression(data) + + else: + raise ValueError(f"Unsupported source_format: {source_format}") + + # Extract HPO terms if concept recognizer is provided + 
hpo_terms = [] + if impression_text and hpo_cr is not None: + # HpoExactConceptRecognizer uses parse() method, not extract() + if hasattr(hpo_cr, "parse"): + hpo_terms = hpo_cr.parse(impression_text) + return { - "impression_text": "", + "impression_text": impression_text, "diagnoses": [], "anomalies": [], "gestational_age_assessment": None, - "growth_assessment": None, + "growth_assessment": _infer_growth_assessment(impression_text), "recommendations": [], - "hpo_terms": [], # FUTURE + "hpo_terms": hpo_terms, + "source_format": source_format, } -# TODO @VarenyaJ: Add helper functions: -# - _extract_anomalies_from_text(text: str) -> List[Dict] -# - _classify_growth_assessment(text: str) -> str -# - _extract_recommendations(text: str) -> List[str] +# --------------------------------------------------------------------- +# Observer JSON +# --------------------------------------------------------------------- + + +def _parse_observer_impression(json_data: Dict) -> str: + """ + Extract impression from Observer JSON. + + The finalize block can be at: + - Root level: json_data["finalize"]["generalComment"]["plain_text"] + - Under exam: json_data["exam"]["finalize"]["generalComment"]["plain_text"] + + We check the root level first (most common), then fall back to exam. 
+ """ + impression = "" + + # Check root level first (this is where Apple_Sally has it) + finalize = json_data.get("finalize", {}) + impression = finalize.get("generalComment", {}).get("plain_text", "").strip() + + # Fall back to exam.finalize if not found at root + if not impression: + exam = json_data.get("exam", {}) + finalize = exam.get("finalize", {}) + impression = finalize.get("generalComment", {}).get("plain_text", "").strip() + + return impression + + +# --------------------------------------------------------------------- +# ViewPoint Text +# --------------------------------------------------------------------- + + +def _parse_viewpoint_text_impression(text: str) -> str: + """ + Extract impression from ViewPoint text reports. + + Expected pattern: + Impression + ========== + [free text narrative] + """ + pattern = re.compile( + r"Impression\s*\n=+\n(?P<body>.*?)(?:\n[A-Z][^\n]*\n=+|\Z)", + re.DOTALL | re.IGNORECASE, + ) + + match = pattern.search(text) + return match.group("body").strip() if match else "" + + +# --------------------------------------------------------------------- +# ViewPoint HL7 +# --------------------------------------------------------------------- + + +def _parse_viewpoint_hl7_impression(hl7: str) -> str: + """ + Extract impression from HL7 ORU^R01 messages. + + Looks for OBX segments containing "Impression" or "Interpretation" + in the observation identifier field.
+ """ + lines: List[str] = [] + + for line in hl7.splitlines(): + if not line.startswith("OBX"): + continue + + fields = line.split("|") + if len(fields) < 6: + continue + + obs_id = fields[3] + value = fields[5].split("^")[0].strip() + + if "Impression" in obs_id or "Interpretation" in obs_id: + if value: + lines.append(value) + + return " ".join(lines) + + +# --------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------- + + +def _infer_growth_assessment(text: str) -> Optional[str]: + """ + Infer fetal growth assessment from impression text. + + Returns: + "FGR" - Fetal Growth Restriction + "LGA" - Large for Gestational Age + "AGA" - Appropriate for Gestational Age + None - No assessment detected + """ + if not text: + return None + + text_lower = text.lower() + + if "growth restriction" in text_lower or "fgr" in text_lower: + return "FGR" + if "large for gestational age" in text_lower or "lga" in text_lower: + return "LGA" + if "appropriate for gestational age" in text_lower or "aga" in text_lower: + return "AGA" + + return None diff --git a/src/prenatalppkt/etl/sections/clinical_indication.py b/src/prenatalppkt/etl/sections/clinical_indication.py index de31325..4cf3431 100644 --- a/src/prenatalppkt/etl/sections/clinical_indication.py +++ b/src/prenatalppkt/etl/sections/clinical_indication.py @@ -1,12 +1,128 @@ -""" -Clinical indication section parser (SKELETON). +from __future__ import annotations -TODO @VarenyaJ: Map indications to ICD-10 and HPO terms -""" +import json +import re +from typing import Dict, List, Union -from typing import Dict +def parse_clinical_indication(data: Union[str, Dict], source_format: str) -> Dict: + """ + Parse clinical indication / reason for exam from different source formats. 
-def parse_clinical_indication(data: str, source_format: str = "viewpoint_text") -> Dict: - """Extract indication for ultrasound exam.""" - return {"indication_text": "", "icd10_codes": [], "hpo_terms": []} + Supported formats: + - observer_json + - viewpoint_text + - viewpoint_hl7 + + Returns a normalized Dict with indication metadata. + """ + if source_format == "observer_json": + if isinstance(data, str): + data = json.loads(data) + result = _parse_observer_indication(data) + + elif source_format == "viewpoint_text": + if not isinstance(data, str): + raise ValueError("viewpoint_text data must be a string") + result = _parse_viewpoint_text_indication(data) + + elif source_format == "viewpoint_hl7": + if not isinstance(data, str): + raise ValueError("viewpoint_hl7 data must be a string") + result = _parse_viewpoint_hl7_indication(data) + + else: + raise ValueError(f"Unsupported source_format: {source_format}") + + # Standardized return schema + result.setdefault("icd10_codes", []) + result.setdefault("hpo_terms", []) + result["source_format"] = source_format + return result + + +# --------------------------------------------------------------------- +# Observer JSON +# --------------------------------------------------------------------- + + +def _parse_observer_indication(json_data: Dict) -> Dict: + """ + Extract indication from Observer JSON. 
+ Known locations: + - exam.indication + - exam.finalize.indication + """ + indication_text = "" + + exam = json_data.get("exam", {}) + if isinstance(exam, dict): + indication_text = ( + exam.get("indication") or exam.get("finalize", {}).get("indication") or "" + ) + + return {"indication_text": indication_text.strip(), "raw_data": json_data} + + +# --------------------------------------------------------------------- +# ViewPoint Text +# --------------------------------------------------------------------- + + +def _parse_viewpoint_text_indication(text: str) -> Dict: + """ + Extract indication section from ViewPoint text reports. + + Expected pattern: + Indication + ========== + [free text] + """ + indication_text = "" + + pattern = re.compile( + r"Indication\s*\n=+\n(?P<body>.*?)(?:\n[A-Z][^\n]*\n=+|\Z)", + re.DOTALL | re.IGNORECASE, + ) + + match = pattern.search(text) + if match: + indication_text = match.group("body").strip() + + return {"indication_text": indication_text, "raw_data": {"text": text}} + + +# --------------------------------------------------------------------- +# ViewPoint HL7 +# --------------------------------------------------------------------- + + +def _parse_viewpoint_hl7_indication(hl7: str) -> Dict: + """ + Extract indication from HL7 ORU^R01 messages.
+ + Common pattern: + OBX||ST|RequestedProcedure.Indication^Indication|1|Advanced maternal age + """ + indication_lines: List[str] = [] + + for line in hl7.splitlines(): + if not line.startswith("OBX"): + continue + + fields = line.split("|") + if len(fields) < 6: + continue + + observation_id = fields[3] + value_field = fields[5] + + if "RequestedProcedure.Indication" in observation_id: + # HL7 values may be caret-delimited + value = value_field.split("^")[0] + if value: + indication_lines.append(value.strip()) + + indication_text = " ".join(indication_lines) + + return {"indication_text": indication_text, "raw_data": {"hl7": hl7}} diff --git a/src/prenatalppkt/etl/sections/estimated_fetal_weight.py b/src/prenatalppkt/etl/sections/estimated_fetal_weight.py new file mode 100644 index 0000000..04ac910 --- /dev/null +++ b/src/prenatalppkt/etl/sections/estimated_fetal_weight.py @@ -0,0 +1,249 @@ +""" +Estimated fetal weight (EFW) section parser. + +Extracts EFW values, percentiles, and growth classification. +""" + +from __future__ import annotations + +import json +import re +from typing import Dict, List, Optional, Union + + +def parse_estimated_fetal_weight(data: Union[str, Dict], source_format: str) -> Dict: + """ + Parse estimated fetal weight section. 
+ + Supports: + - observer_json + - viewpoint_text (skeleton) + - viewpoint_hl7 (skeleton) + + Args: + data: Raw input data (JSON string, dict, or text) + source_format: One of "observer_json", "viewpoint_text", "viewpoint_hl7" + + Returns: + Dict with keys: + - efw_grams: float - Primary EFW value in grams + - percentile: float - Percentile for primary EFW + - method: str - Calculation method (e.g., "Hadlock (AC, FL, HC)") + - within_normal_range: bool - True if 10th-90th percentile + - growth_category: str - "SGA", "AGA", or "LGA" + - all_estimates: List[Dict] - All EFW calculations + - source_format: str + """ + if source_format == "observer_json": + if isinstance(data, str): + data = json.loads(data) + return _parse_observer_efw(data) + + elif source_format == "viewpoint_text": + if not isinstance(data, str): + raise ValueError("viewpoint_text data must be a string") + return _parse_viewpoint_text_efw(data) + + elif source_format == "viewpoint_hl7": + if not isinstance(data, str): + raise ValueError("viewpoint_hl7 data must be a string") + return _parse_viewpoint_hl7_efw(data) + + else: + raise ValueError(f"Unsupported source_format: {source_format}") + + +# --------------------------------------------------------------------- +# Observer JSON +# --------------------------------------------------------------------- + + +def _parse_observer_efw(json_data: Dict) -> Dict: + """ + Extract EFW from Observer JSON. 
+ + Path: fetuses[i].efws[] + - fetus_number: int + - label: str - method description (e.g., "EFW (AC, FL, HC)") + - value: float - weight in grams + - calculated_percentile: float + - percentile_for_display: str + - print_in_report: int - 1 if this is the primary EFW + - range: str - optional expected range + """ + all_estimates: List[Dict] = [] + primary_efw: Optional[Dict] = None + + # Get first fetus + fetuses = json_data.get("fetuses", []) + if not fetuses: + return _empty_result("observer_json") + + efws = fetuses[0].get("efws", []) + if not efws: + return _empty_result("observer_json") + + for efw in efws: + label = efw.get("label", "") + value = efw.get("value", 0) + percentile = efw.get("calculated_percentile", 0) + print_in_report = efw.get("print_in_report", 0) + + # Extract method from label (e.g., "EFW (AC, FL, HC)" -> "AC, FL, HC") + method = _extract_method_from_label(label) + + estimate = { + "method": method, + "grams": round(value, 1), + "percentile": round(percentile, 1), + "print_in_report": bool(print_in_report), + } + all_estimates.append(estimate) + + # Select primary EFW (print_in_report=1 or first one) + if print_in_report == 1 and primary_efw is None: + primary_efw = estimate + + # Fallback to first estimate if none marked for report + if primary_efw is None and all_estimates: + primary_efw = all_estimates[0] + + if primary_efw is None: + return _empty_result("observer_json") + + # Classify growth + percentile = primary_efw["percentile"] + growth_category = _classify_growth(percentile) + within_normal = 10 <= percentile <= 90 + + return { + "efw_grams": primary_efw["grams"], + "percentile": primary_efw["percentile"], + "method": primary_efw["method"], + "within_normal_range": within_normal, + "growth_category": growth_category, + "all_estimates": all_estimates, + "source_format": "observer_json", + } + + +# --------------------------------------------------------------------- +# ViewPoint Text (SKELETON) +# 
--------------------------------------------------------------------- + + +def _parse_viewpoint_text_efw(text: str) -> Dict: + """ + Extract EFW from ViewPoint text reports. + + Expected patterns: + EFW 2,042 g 2% + EFW (lb,oz) 4 lb 8 oz + EFW by Hadlock (BPD-HC-AC-FL) + + TODO @VarenyaJ: Implement full parsing + """ + efw_grams = None + percentile = None + method = None + + # Try to find EFW line with grams + efw_pattern = re.compile(r"EFW\s+([0-9,]+)\s+g\s+(\d+)%", re.IGNORECASE) + match = efw_pattern.search(text) + if match: + efw_grams = float(match.group(1).replace(",", "")) + percentile = float(match.group(2)) + + # Try to find method + method_pattern = re.compile(r"EFW by\s+(.+)", re.IGNORECASE) + method_match = method_pattern.search(text) + if method_match: + method = method_match.group(1).strip() + + if efw_grams is None: + return _empty_result("viewpoint_text") + + growth_category = _classify_growth(percentile) if percentile else "Unknown" + within_normal = 10 <= percentile <= 90 if percentile else False + + return { + "efw_grams": efw_grams, + "percentile": percentile, + "method": method or "Unknown", + "within_normal_range": within_normal, + "growth_category": growth_category, + "all_estimates": [ + { + "method": method or "Unknown", + "grams": efw_grams, + "percentile": percentile, + } + ], + "source_format": "viewpoint_text", + } + + +# --------------------------------------------------------------------- +# ViewPoint HL7 (SKELETON) +# --------------------------------------------------------------------- + + +def _parse_viewpoint_hl7_efw(hl7: str) -> Dict: + """ + Extract EFW from HL7 ORU^R01 messages. + + Note: EFW may not be present in all HL7 exports. + This is a skeleton for potential future implementation. 
+ + TODO @VarenyaJ: Implement if HL7 EFW encoding is discovered + """ + return _empty_result("viewpoint_hl7") + + +# --------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------- + + +def _extract_method_from_label(label: str) -> str: + """ + Extract method from EFW label. + + Examples: + "EFW (AC, FL, HC)" -> "Hadlock (AC, FL, HC)" + "EFW (AC, FL)" -> "Hadlock (AC, FL)" + """ + match = re.search(r"\(([^)]+)\)", label) + if match: + params = match.group(1) + return f"Hadlock ({params})" + return "Hadlock" + + +def _classify_growth(percentile: float) -> str: + """ + Classify fetal growth based on EFW percentile. + + - SGA (Small for Gestational Age): <10th percentile + - AGA (Appropriate for Gestational Age): 10th-90th percentile + - LGA (Large for Gestational Age): >90th percentile + """ + if percentile < 10: + return "SGA" + elif percentile > 90: + return "LGA" + else: + return "AGA" + + +def _empty_result(source_format: str) -> Dict: + """Return empty result structure.""" + return { + "efw_grams": None, + "percentile": None, + "method": None, + "within_normal_range": None, + "growth_category": None, + "all_estimates": [], + "source_format": source_format, + } diff --git a/src/prenatalppkt/etl/sections/fetal_anatomy.py b/src/prenatalppkt/etl/sections/fetal_anatomy.py index 694eab5..34c0351 100644 --- a/src/prenatalppkt/etl/sections/fetal_anatomy.py +++ b/src/prenatalppkt/etl/sections/fetal_anatomy.py @@ -1,21 +1,254 @@ """ -Fetal anatomy section parser (SKELETON). +Fetal anatomy section parser. -TODO @VarenyaJ: Parse anatomy checklist (normal/abnormal/not visualized) -TODO @VarenyaJ: Map anatomical findings to HPO terms -TODO @VarenyaJ: Handle detailed anatomy subsections +Extracts structured anatomy findings and free-text anatomy narrative, +with optional HPO term extraction from anomaly descriptions. 
""" -from typing import Dict +from __future__ import annotations +import json +import re +from typing import Dict, List, Union -def parse_fetal_anatomy(data: str, source_format: str = "viewpoint_text") -> Dict: - """Extract fetal anatomy assessment.""" + +def parse_fetal_anatomy( + data: Union[str, Dict], source_format: str, hpo_cr=None +) -> Dict: + """ + Parse fetal anatomy section. + + Supports: + - observer_json + - viewpoint_text (skeleton) + - viewpoint_hl7 (skeleton) + + Args: + data: Raw input data (JSON string, dict, or text) + source_format: One of "observer_json", "viewpoint_text", "viewpoint_hl7" + hpo_cr: Optional HpoExactConceptRecognizer for HPO term extraction. + If provided, will extract HPO terms from anomaly descriptions. + + Returns: + Dict with keys: + - anatomy_text: str - Free text anatomy narrative + - normal_structures: List[str] - Structures marked Normal + - abnormal_structures: List[str] - Structures marked Abnormal + - not_visualized: List[str] - Structures marked Unseen + - anomalies: List[Dict] - Specific anomaly findings + - hpo_terms: List[SimpleTerm] - HPO terms extracted via CR + - source_format: str + """ + if source_format == "observer_json": + if isinstance(data, str): + data = json.loads(data) + return _parse_observer_anatomy(data, hpo_cr) + + elif source_format == "viewpoint_text": + if not isinstance(data, str): + raise ValueError("viewpoint_text data must be a string") + return _parse_viewpoint_text_anatomy(data, hpo_cr) + + elif source_format == "viewpoint_hl7": + if not isinstance(data, str): + raise ValueError("viewpoint_hl7 data must be a string") + return _parse_viewpoint_hl7_anatomy(data, hpo_cr) + + else: + raise ValueError(f"Unsupported source_format: {source_format}") + + +# --------------------------------------------------------------------- +# Observer JSON +# --------------------------------------------------------------------- + + +def _classify_structure( + label: str, state: str, normal: List[str], abnormal: 
List[str], unseen: List[str] +) -> None: + """Classify a structure into the appropriate list based on state.""" + if not label: + return + if state == "Normal" and label not in normal: + normal.append(label) + elif state == "Abnormal" and label not in abnormal: + abnormal.append(label) + elif state == "Unseen" and label not in unseen: + unseen.append(label) + + +def _process_anatomy_item( + item: Dict, + normal: List[str], + abnormal: List[str], + unseen: List[str], + anomalies: List[Dict], +) -> None: + """Process a single anatomy item, extracting structures and anomalies.""" + main = item.get("main", {}) + label = main.get("label", "") + state = main.get("anat_state", "") + + # Classify main structure + _classify_structure(label, state, normal, abnormal, unseen) + + # Process detail sub-structures + for detail in item.get("detail", []): + detail_label = detail.get("label", "") + detail_state = detail.get("anat_det_state", "") + _classify_structure(detail_label, detail_state, normal, abnormal, unseen) + + # Process anomalies + for anom in item.get("anomalies", []): + description = anom.get("description", "") + if description: + anomalies.append( + { + "structure": label, + "description": description, + "variant_type": anom.get("abnormal_or_normal_variant", "Abnormal"), + } + ) + + +def _extract_hpo_terms(anatomy_text: str, anomalies: List[Dict], hpo_cr) -> List: + """Extract HPO terms from anatomy text and anomaly descriptions.""" + if hpo_cr is None or not hasattr(hpo_cr, "parse"): + return [] + + all_anomaly_text = " ".join( + a["description"] for a in anomalies if a.get("description") + ) + combined_text = f"{anatomy_text} {all_anomaly_text}".strip() + + if not combined_text: + return [] + + return hpo_cr.parse(combined_text) + + +def _parse_observer_anatomy(json_data: Dict, hpo_cr=None) -> Dict: + """ + Extract anatomy findings from Observer JSON. 
+ + Paths: + - fetuses[i].fetus.anatomy_text - free text narrative + - fetuses[i].fetus.anatomy[] - structured findings + - main.label - structure name (e.g., "Head", "Face") + - main.anat_state - "Normal", "Abnormal", or "Unseen" + - detail[].label - sub-structure name + - detail[].anat_det_state - sub-structure state + - anomalies[].description - specific finding text + - anomalies[].abnormal_or_normal_variant - classification + """ + fetuses = json_data.get("fetuses", []) + if not fetuses: + return _empty_result("observer_json") + + fetus_block = fetuses[0].get("fetus", {}) + anatomy_text = fetus_block.get("anatomy_text", "") + + normal_structures: List[str] = [] + abnormal_structures: List[str] = [] + not_visualized: List[str] = [] + anomalies: List[Dict] = [] + + for item in fetus_block.get("anatomy", []): + _process_anatomy_item( + item, normal_structures, abnormal_structures, not_visualized, anomalies + ) + + hpo_terms = _extract_hpo_terms(anatomy_text, anomalies, hpo_cr) + + return { + "anatomy_text": anatomy_text, + "normal_structures": normal_structures, + "abnormal_structures": abnormal_structures, + "not_visualized": not_visualized, + "anomalies": anomalies, + "hpo_terms": hpo_terms, + "source_format": "observer_json", + } + + +# --------------------------------------------------------------------- +# ViewPoint Text (SKELETON) +# --------------------------------------------------------------------- + + +def _parse_viewpoint_text_anatomy(text: str, hpo_cr=None) -> Dict: + """ + Extract anatomy from ViewPoint text reports. + + Expected pattern: + Fetal Anatomy + ============= + The following structures appear normal: + Cranium. Brain. Face. ... + + The following structures appear abnormal: + GI tract: dilated bowel loops. + + The following structures could not be adequately visualized: + LVOT view. RVOT view. ... 
+ + TODO @VarenyaJ: Implement full parsing + """ + # Skeleton: Extract the Fetal Anatomy section + pattern = re.compile( + r"Fetal Anatomy\s*\n=+\n(?P<body>.*?)(?:\n[A-Z][^\n]*\n=+|\Z)", + re.DOTALL | re.IGNORECASE, + ) + match = pattern.search(text) + anatomy_text = match.group("body").strip() if match else "" + + # TODO @VarenyaJ: Parse "appear normal", "appear abnormal", "could not be visualized" lists + + hpo_terms = [] + if anatomy_text and hpo_cr is not None and hasattr(hpo_cr, "parse"): + hpo_terms = hpo_cr.parse(anatomy_text) + + return { + "anatomy_text": anatomy_text, + "normal_structures": [], + "abnormal_structures": [], + "not_visualized": [], + "anomalies": [], + "hpo_terms": hpo_terms, + "source_format": "viewpoint_text", + } + + +# --------------------------------------------------------------------- +# ViewPoint HL7 (SKELETON) +# --------------------------------------------------------------------- + + +def _parse_viewpoint_hl7_anatomy(hl7: str, hpo_cr=None) -> Dict: + """ + Extract anatomy from HL7 ORU^R01 messages. + + Note: Anatomy is typically not encoded in discrete HL7 fields. + This is a skeleton for potential future implementation.
+ + TODO @VarenyaJ: Implement if HL7 anatomy encoding is discovered + """ + return _empty_result("viewpoint_hl7") + + +# --------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------- + + +def _empty_result(source_format: str) -> Dict: + """Return empty result structure.""" return { - "structures_examined": [], + "anatomy_text": "", "normal_structures": [], "abnormal_structures": [], "not_visualized": [], "anomalies": [], "hpo_terms": [], + "source_format": source_format, } diff --git a/src/prenatalppkt/etl/sections/fetal_ratios.py b/src/prenatalppkt/etl/sections/fetal_ratios.py new file mode 100644 index 0000000..732b715 --- /dev/null +++ b/src/prenatalppkt/etl/sections/fetal_ratios.py @@ -0,0 +1,246 @@ +""" +Fetal ratios section parser. + +Extracts biometric ratios (HC/AC, FL/BPD, FL/AC) and assesses proportionality. +""" + +from __future__ import annotations + +import json +import re +from typing import Dict, List, Optional, Tuple, Union + + +def parse_fetal_ratios(data: Union[str, Dict], source_format: str) -> Dict: + """ + Parse fetal ratios section. 
+ + Supports: + - observer_json + - viewpoint_text (skeleton) + - viewpoint_hl7 (skeleton) + + Args: + data: Raw input data (JSON string, dict, or text) + source_format: One of "observer_json", "viewpoint_text", "viewpoint_hl7" + + Returns: + Dict with keys: + - ratios: List[Dict] - Individual ratio data + - all_within_range: bool - True if all ratios are normal + - proportionality_assessment: str - "Normal" or "Asymmetric" + - source_format: str + """ + if source_format == "observer_json": + if isinstance(data, str): + data = json.loads(data) + return _parse_observer_ratios(data) + + elif source_format == "viewpoint_text": + if not isinstance(data, str): + raise ValueError("viewpoint_text data must be a string") + return _parse_viewpoint_text_ratios(data) + + elif source_format == "viewpoint_hl7": + if not isinstance(data, str): + raise ValueError("viewpoint_hl7 data must be a string") + return _parse_viewpoint_hl7_ratios(data) + + else: + raise ValueError(f"Unsupported source_format: {source_format}") + + +# --------------------------------------------------------------------- +# Observer JSON +# --------------------------------------------------------------------- + + +def _parse_observer_ratios(json_data: Dict) -> Dict: + """ + Extract ratios from Observer JSON. 
+ + Path: fetuses[i].ratios[] + - label: str - ratio name (e.g., "HC/AC", "FL/BPD") + - value: float - calculated ratio value + - range: str - expected normal range (e.g., "1.04 - 1.22") + - fetus_number: int + """ + ratios: List[Dict] = [] + + # Get first fetus + fetuses = json_data.get("fetuses", []) + if not fetuses: + return _empty_result("observer_json") + + ratio_list = fetuses[0].get("ratios", []) + if not ratio_list: + return _empty_result("observer_json") + + all_within_range = True + + for ratio in ratio_list: + label = ratio.get("label", "") + value = ratio.get("value", 0) + range_str = ratio.get("range", "") + + # Parse expected range + expected_range = _parse_range_string(range_str) + + # Check if within range + within_range = _is_within_range(value, expected_range) + if not within_range: + all_within_range = False + + ratios.append( + { + "name": label, + "value": round(value, 3) if isinstance(value, float) else value, + "expected_range": expected_range, + "within_range": within_range, + } + ) + + # Assess overall proportionality + # Asymmetric growth typically indicated by abnormal HC/AC ratio + proportionality = _assess_proportionality(ratios) + + return { + "ratios": ratios, + "all_within_range": all_within_range, + "proportionality_assessment": proportionality, + "source_format": "observer_json", + } + + +# --------------------------------------------------------------------- +# ViewPoint Text (SKELETON) +# --------------------------------------------------------------------- + + +def _parse_viewpoint_text_ratios(text: str) -> Dict: + """ + Extract ratios from ViewPoint text reports. 
+ + Expected pattern (under Fetal Biometry section): + FL / HC 0.23 + + TODO @VarenyaJ: : Implement full parsing + """ + ratios: List[Dict] = [] + + # Try to find ratio lines + # Pattern: FL / HC 0.23 + ratio_pattern = re.compile( + r"(FL|HC|AC|BPD)\s*/\s*(FL|HC|AC|BPD)\s+([\d.]+)", re.IGNORECASE + ) + + for match in ratio_pattern.finditer(text): + name = f"{match.group(1).upper()}/{match.group(2).upper()}" + value = float(match.group(3)) + ratios.append( + { + "name": name, + "value": value, + "expected_range": None, # Not available in text format + "within_range": None, + } + ) + + return { + "ratios": ratios, + "all_within_range": None, # Cannot assess without ranges + "proportionality_assessment": "Unknown", + "source_format": "viewpoint_text", + } + + +# --------------------------------------------------------------------- +# ViewPoint HL7 (SKELETON) +# --------------------------------------------------------------------- + + +def _parse_viewpoint_hl7_ratios(hl7: str) -> Dict: + """ + Extract ratios from HL7 ORU^R01 messages. + + Note: Ratios may not be present in all HL7 exports. + This is a skeleton for potential future implementation. + + TODO @VarenyaJ: : Implement if HL7 ratio encoding is discovered + """ + return _empty_result("viewpoint_hl7") + + +# --------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------- + + +def _parse_range_string(range_str: str) -> Optional[Tuple[float, float]]: + """ + Parse a range string into a tuple. 
+ + Examples: + "1.04 - 1.22" -> (1.04, 1.22) + "20 - 24" -> (20.0, 24.0) + "" -> None + """ + if not range_str: + return None + + # Pattern: "min - max" or "min-max" + match = re.match(r"([\d.]+)\s*-\s*([\d.]+)", range_str.strip()) + if match: + return (float(match.group(1)), float(match.group(2))) + + return None + + +def _is_within_range( + value: float, expected_range: Optional[Tuple[float, float]] +) -> Optional[bool]: + """ + Check if a value is within the expected range. + + Returns None if range is not available. + """ + if expected_range is None: + return None + + min_val, max_val = expected_range + return min_val <= value <= max_val + + +def _assess_proportionality(ratios: List[Dict]) -> str: + """ + Assess overall fetal proportionality based on ratios. + + Asymmetric growth is typically indicated when: + - HC/AC ratio is abnormal (head-sparing or brain-sparing pattern) + - FL/AC ratio is abnormal + """ + if not ratios: + return "Unknown" + + # Check HC/AC specifically for asymmetric growth + for ratio in ratios: + if ratio["name"] == "HC/AC" and ratio["within_range"] is False: + return "Asymmetric" + + # Check if all ratios with known ranges are within range + ratios_with_ranges = [r for r in ratios if r["within_range"] is not None] + if not ratios_with_ranges: + return "Unknown" + + all_normal = all(r["within_range"] for r in ratios_with_ranges) + return "Normal" if all_normal else "Asymmetric" + + +def _empty_result(source_format: str) -> Dict: + """Return empty result structure.""" + return { + "ratios": [], + "all_within_range": None, + "proportionality_assessment": "Unknown", + "source_format": source_format, + } diff --git a/src/prenatalppkt/etl/sections/pregnancy_dating.py b/src/prenatalppkt/etl/sections/pregnancy_dating.py index 4c79114..93cf017 100644 --- a/src/prenatalppkt/etl/sections/pregnancy_dating.py +++ b/src/prenatalppkt/etl/sections/pregnancy_dating.py @@ -1,20 +1,208 @@ -""" -Pregnancy dating section parser (SKELETON). 
+from __future__ import annotations -TODO @VarenyaJ: Parse LMP, EDD, assigned dating method; Handle multiple dating methods (LMP, US, IVF) -""" +import json +import re +from datetime import datetime +from typing import Dict, Optional, Union -from typing import Dict +from prenatalppkt.gestational_age import GestationalAge -def parse_pregnancy_dating(data: str, source_format: str = "viewpoint_text") -> Dict: - """Extract pregnancy dating information.""" +DATE_FORMATS = ["%Y-%m-%d", "%m/%d/%Y", "%Y%m%d"] + + +def parse_pregnancy_dating(data: Union[str, Dict], source_format: str) -> Dict: + """ + Parse pregnancy dating information from ultrasound reports. + + Supported formats: + - observer_json + - viewpoint_text + - viewpoint_hl7 + """ + if source_format == "observer_json": + if isinstance(data, str): + data = json.loads(data) + result = _parse_observer_pregnancy(data) + + elif source_format == "viewpoint_text": + if not isinstance(data, str): + raise ValueError("viewpoint_text data must be a string") + result = _parse_viewpoint_text_pregnancy(data) + + elif source_format == "viewpoint_hl7": + if not isinstance(data, str): + raise ValueError("viewpoint_hl7 data must be a string") + result = _parse_viewpoint_hl7_pregnancy(data) + + else: + raise ValueError(f"Unsupported source_format: {source_format}") + + result["source_format"] = source_format + return result + + +# --------------------------------------------------------------------- +# Observer JSON +# --------------------------------------------------------------------- + + +def _parse_observer_pregnancy(json_data: Dict) -> Dict: + exam = json_data.get("exam", {}) + + lmp = exam.get("lmp") + edd = exam.get("edd") or exam.get("estimated_due_date") + dating_method = exam.get("dating_method") + + ga_by_lmp = _calculate_ga_from_lmp(lmp) if lmp else None + + return { + "lmp": lmp, + "edd": edd, + "assigned_edd": edd, + "dating_method": dating_method, + "ga_by_lmp": ga_by_lmp, + "ga_by_ultrasound": None, + 
"assigned_ga": ga_by_lmp, + "raw_data": json_data, + } + + +# --------------------------------------------------------------------- +# ViewPoint Text +# --------------------------------------------------------------------- + + +def _parse_viewpoint_text_pregnancy(text: str) -> Dict: + """ + Extract pregnancy dating from ViewPoint text reports. + + Example: + Dating + ====== + LMP 01/15/2025 + EDD by LMP 10/22/2025 + Assigned dating based on LMP + """ + lmp = None + edd = None + dating_method = None + + section = _extract_dating_section(text) + + for line in section.splitlines(): + line = line.strip() + + if line.upper().startswith("LMP"): + lmp = _parse_date_from_text(line) + + elif "EDD" in line.upper(): + edd = _parse_date_from_text(line) + + elif "ASSIGNED" in line.upper(): + dating_method = line + + ga_by_lmp = _calculate_ga_from_lmp(lmp) if lmp else None + return { - "lmp": None, - "edd": None, - "assigned_edd": None, + "lmp": lmp, + "edd": edd, + "assigned_edd": edd, + "dating_method": dating_method, + "ga_by_lmp": ga_by_lmp, + "ga_by_ultrasound": None, + "assigned_ga": ga_by_lmp, + "raw_data": {"text": text}, + } + + +def _extract_dating_section(text: str) -> str: + pattern = re.compile( + r"Dating\s*\n=+\n(?P.*?)(?:\n[A-Z][^\n]*\n=+|\Z)", + re.DOTALL | re.IGNORECASE, + ) + match = pattern.search(text) + return match.group("body") if match else "" + + +# --------------------------------------------------------------------- +# ViewPoint HL7 +# --------------------------------------------------------------------- + + +def _parse_viewpoint_hl7_pregnancy(hl7: str) -> Dict: + lmp = None + edd = None + + for line in hl7.splitlines(): + if not line.startswith("OBX"): + continue + + fields = line.split("|") + if len(fields) < 6: + continue + + obs_id = fields[3] + value = fields[5] + + if "LastMenstrualPeriod" in obs_id: + lmp = _parse_date_string(value) + + elif "EDD" in obs_id: + edd = _parse_date_string(value) + + ga_by_lmp = _calculate_ga_from_lmp(lmp) if lmp 
else None + + return { + "lmp": lmp, + "edd": edd, + "assigned_edd": edd, "dating_method": None, - "ga_by_lmp": None, + "ga_by_lmp": ga_by_lmp, "ga_by_ultrasound": None, - "assigned_ga": None, + "assigned_ga": ga_by_lmp, + "raw_data": {"hl7": hl7}, } + + +# --------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------- + + +def _parse_date_from_text(text: str) -> Optional[str]: + for token in re.split(r"\s+", text): + parsed = _parse_date_string(token) + if parsed: + return parsed + return None + + +def _parse_date_string(value: str) -> Optional[str]: + value = value.split("^")[0].strip() + + # Fast reject: must contain digits + if not any(ch.isdigit() for ch in value): + return None + + for fmt in DATE_FORMATS: + parsed = _try_parse_date(value, fmt) + if parsed: + return parsed + + return None + + +def _try_parse_date(value: str, fmt: str) -> Optional[str]: + try: + return datetime.strptime(value, fmt).date().isoformat() + except ValueError: + return None + + +def _calculate_ga_from_lmp(lmp_iso: str) -> Optional[Dict]: + try: + ga = GestationalAge.from_lmp(lmp_iso) + return {"weeks": ga.weeks, "days": ga.days} + except Exception: + return None diff --git a/tests/etl/sections/__init__.py b/tests/etl/sections/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/etl/sections/test_clinical_impression.py b/tests/etl/sections/test_clinical_impression.py new file mode 100644 index 0000000..dc2c057 --- /dev/null +++ b/tests/etl/sections/test_clinical_impression.py @@ -0,0 +1,85 @@ +import json +import pytest + +from prenatalppkt.etl.sections.clinical_impression import parse_clinical_impression + + +# --------------------------------------------------------------------- +# Observer JSON +# --------------------------------------------------------------------- + + +class TestClinicalImpressionObserver: + def test_basic_impression(self, hpo_cr): + data = 
json.dumps( + { + "exam": { + "finalize": { + "generalComment": { + "plain_text": "Normal fetal anatomy. No abnormalities." + } + } + } + } + ) + + result = parse_clinical_impression(data, "observer_json", hpo_cr=hpo_cr) + + assert "Normal fetal anatomy" in result["impression_text"] + assert result["hpo_terms"] == [] + assert result["source_format"] == "observer_json" + + +# --------------------------------------------------------------------- +# ViewPoint Text +# --------------------------------------------------------------------- + + +class TestClinicalImpressionViewPointText: + def test_basic_impression(self, hpo_cr): + text = """Impression +========= +Fetal growth restriction is suspected. +Recommend follow-up scan. +""" + + result = parse_clinical_impression(text, "viewpoint_text", hpo_cr=hpo_cr) + + assert "growth restriction" in result["impression_text"].lower() + assert result["growth_assessment"] == "FGR" + assert isinstance(result["hpo_terms"], list) + + def test_missing_impression(self, hpo_cr): + text = "Fetal Biometry\n============\nHC 175 mm" + result = parse_clinical_impression(text, "viewpoint_text", hpo_cr=hpo_cr) + assert result["impression_text"] == "" + + +# --------------------------------------------------------------------- +# ViewPoint HL7 +# --------------------------------------------------------------------- + + +class TestClinicalImpressionViewPointHL7: + def test_basic_hl7_impression(self, hpo_cr): + hl7 = "OBX||TX|Impression^Impression|1|Appropriate for gestational age\n" + + result = parse_clinical_impression(hl7, "viewpoint_hl7", hpo_cr=hpo_cr) + + assert "Appropriate" in result["impression_text"] + assert result["growth_assessment"] == "AGA" + + +# --------------------------------------------------------------------- +# Edge Cases +# --------------------------------------------------------------------- + + +class TestClinicalImpressionEdgeCases: + def test_invalid_format(self, hpo_cr): + with pytest.raises(ValueError): + 
parse_clinical_impression("data", "bad_format", hpo_cr=hpo_cr) + + def test_non_string_text(self, hpo_cr): + with pytest.raises(ValueError): + parse_clinical_impression({"bad": "data"}, "viewpoint_text", hpo_cr=hpo_cr) diff --git a/tests/etl/sections/test_clinical_indication.py b/tests/etl/sections/test_clinical_indication.py new file mode 100644 index 0000000..9d1116c --- /dev/null +++ b/tests/etl/sections/test_clinical_indication.py @@ -0,0 +1,124 @@ +import json +import pytest + +from prenatalppkt.etl.sections.clinical_indication import parse_clinical_indication + + +# --------------------------------------------------------------------- +# Observer JSON +# --------------------------------------------------------------------- + + +class TestClinicalIndicationObserver: + def test_basic_indication(self): + data = json.dumps({"exam": {"indication": "Advanced maternal age, dating"}}) + + result = parse_clinical_indication(data, "observer_json") + + assert "Advanced maternal age" in result["indication_text"] + assert result["source_format"] == "observer_json" + assert result["icd10_codes"] == [] + assert result["hpo_terms"] == [] + + def test_fallback_finalize_indication(self): + data = json.dumps( + {"exam": {"finalize": {"indication": "Poor obstetric history"}}} + ) + + result = parse_clinical_indication(data, "observer_json") + assert result["indication_text"] == "Poor obstetric history" + + def test_missing_indication(self): + data = json.dumps({"exam": {}}) + result = parse_clinical_indication(data, "observer_json") + assert result["indication_text"] == "" + + +# --------------------------------------------------------------------- +# ViewPoint Text +# --------------------------------------------------------------------- + + +class TestClinicalIndicationViewPointText: + def test_basic_indication(self): + text = """Indication +========== +Advanced maternal age, dating + +History +======= +Previous cesarean section +""" + result = parse_clinical_indication(text, 
"viewpoint_text") + + assert "Advanced maternal age" in result["indication_text"] + assert "History" not in result["indication_text"] + assert result["source_format"] == "viewpoint_text" + + def test_multiline_indication(self): + text = """Indication +========== +Advanced maternal age +Previous cesarean section +IVF pregnancy +""" + result = parse_clinical_indication(text, "viewpoint_text") + + assert "IVF pregnancy" in result["indication_text"] + assert result["indication_text"].count("\n") >= 1 + + def test_missing_indication_section(self): + text = """Fetal Biometry +============ +HC 175.0 mm +""" + result = parse_clinical_indication(text, "viewpoint_text") + assert result["indication_text"] == "" + + +# --------------------------------------------------------------------- +# ViewPoint HL7 +# --------------------------------------------------------------------- + + +class TestClinicalIndicationViewPointHL7: + def test_basic_indication(self): + hl7 = ( + "MSH|^~\\&|\n" + "OBX||ST|RequestedProcedure.Indication^Indication|1|Advanced maternal age\n" + "OBX||ST|RequestedProcedure.Indication^Indication|2|Dating scan\n" + ) + + result = parse_clinical_indication(hl7, "viewpoint_hl7") + + assert "Advanced maternal age" in result["indication_text"] + assert "Dating scan" in result["indication_text"] + assert result["source_format"] == "viewpoint_hl7" + + def test_no_indication_obx(self): + hl7 = "MSH|^~\\&|\nOBX||NM|SomeOtherField|1|123\n" + result = parse_clinical_indication(hl7, "viewpoint_hl7") + assert result["indication_text"] == "" + + +# --------------------------------------------------------------------- +# Edge Cases +# --------------------------------------------------------------------- + + +class TestClinicalIndicationEdgeCases: + def test_invalid_format(self): + with pytest.raises(ValueError): + parse_clinical_indication("data", "unknown_format") + + def test_non_string_text(self): + with pytest.raises(ValueError): + parse_clinical_indication({"bad": 
"data"}, "viewpoint_text") + + def test_special_characters(self): + text = """Indication +========== +Advanced maternal age - >=35 years +""" + result = parse_clinical_indication(text, "viewpoint_text") + assert ">=35" in result["indication_text"] diff --git a/tests/etl/sections/test_estimated_fetal_weight.py b/tests/etl/sections/test_estimated_fetal_weight.py new file mode 100644 index 0000000..68bc210 --- /dev/null +++ b/tests/etl/sections/test_estimated_fetal_weight.py @@ -0,0 +1,295 @@ +"""Tests for estimated fetal weight section parser.""" + +import json +import pytest + +from prenatalppkt.etl.sections.estimated_fetal_weight import ( + parse_estimated_fetal_weight, +) + + +# --------------------------------------------------------------------- +# Observer JSON +# --------------------------------------------------------------------- + + +class TestEstimatedFetalWeightObserver: + def test_basic_efw(self): + """Test parsing of basic EFW data.""" + data = { + "fetuses": [ + { + "efws": [ + { + "fetus_number": 1, + "label": "EFW (AC, FL, HC)", + "value": 1014.828, + "calculated_percentile": 55.6, + "percentile_for_display": "56%", + "print_in_report": 1, + } + ] + } + ] + } + + result = parse_estimated_fetal_weight(data, "observer_json") + + assert result["efw_grams"] == 1014.8 + assert result["percentile"] == 55.6 + assert result["method"] == "Hadlock (AC, FL, HC)" + assert result["within_normal_range"] is True + assert result["growth_category"] == "AGA" + assert result["source_format"] == "observer_json" + + def test_multiple_efw_estimates(self): + """Test that primary EFW is selected correctly.""" + data = { + "fetuses": [ + { + "efws": [ + { + "label": "EFW (AC, FL, HC)", + "value": 1014.828, + "calculated_percentile": 55.6, + "print_in_report": 1, + }, + { + "label": "EFW (AC, FL)", + "value": 1042.214, + "calculated_percentile": 63.7, + "print_in_report": 0, + }, + { + "label": "EFW (AC, BPD)", + "value": 1000.887, + "calculated_percentile": 51.2, + 
"print_in_report": 0, + }, + ] + } + ] + } + + result = parse_estimated_fetal_weight(data, "observer_json") + + # Should select the one with print_in_report=1 + assert result["efw_grams"] == 1014.8 + assert len(result["all_estimates"]) == 3 + + def test_sga_classification(self): + """Test SGA (Small for Gestational Age) classification.""" + data = { + "fetuses": [ + { + "efws": [ + { + "label": "EFW (AC, FL, HC)", + "value": 800.0, + "calculated_percentile": 5.0, + "print_in_report": 1, + } + ] + } + ] + } + + result = parse_estimated_fetal_weight(data, "observer_json") + + assert result["growth_category"] == "SGA" + assert result["within_normal_range"] is False + + def test_lga_classification(self): + """Test LGA (Large for Gestational Age) classification.""" + data = { + "fetuses": [ + { + "efws": [ + { + "label": "EFW (AC, FL, HC)", + "value": 2500.0, + "calculated_percentile": 95.0, + "print_in_report": 1, + } + ] + } + ] + } + + result = parse_estimated_fetal_weight(data, "observer_json") + + assert result["growth_category"] == "LGA" + assert result["within_normal_range"] is False + + def test_json_string_input(self): + """Test that JSON string input is handled correctly.""" + data = json.dumps( + { + "fetuses": [ + { + "efws": [ + { + "label": "EFW (AC, FL)", + "value": 1200.0, + "calculated_percentile": 50.0, + "print_in_report": 1, + } + ] + } + ] + } + ) + + result = parse_estimated_fetal_weight(data, "observer_json") + + assert result["efw_grams"] == 1200.0 + + def test_empty_fetuses(self): + """Test handling of empty fetuses array.""" + data = {"fetuses": []} + + result = parse_estimated_fetal_weight(data, "observer_json") + + assert result["efw_grams"] is None + assert result["all_estimates"] == [] + + def test_missing_efws_key(self): + """Test handling of fetus without efws key.""" + data = {"fetuses": [{"fetus": {}}]} + + result = parse_estimated_fetal_weight(data, "observer_json") + + assert result["efw_grams"] is None + + def 
test_fallback_to_first_estimate(self): + """Test fallback when no estimate has print_in_report=1.""" + data = { + "fetuses": [ + { + "efws": [ + { + "label": "EFW (AC, FL)", + "value": 1100.0, + "calculated_percentile": 45.0, + "print_in_report": 0, + }, + { + "label": "EFW (AC, BPD)", + "value": 1050.0, + "calculated_percentile": 40.0, + "print_in_report": 0, + }, + ] + } + ] + } + + result = parse_estimated_fetal_weight(data, "observer_json") + + # Should fall back to first estimate + assert result["efw_grams"] == 1100.0 + + +# --------------------------------------------------------------------- +# ViewPoint Text (Skeleton) +# --------------------------------------------------------------------- + + +class TestEstimatedFetalWeightViewPointText: + def test_skeleton_returns_structure(self): + """Test that skeleton implementation returns expected structure.""" + text = "EFW 2,042 g 2%\nEFW by Hadlock" + + result = parse_estimated_fetal_weight(text, "viewpoint_text") + + assert result["source_format"] == "viewpoint_text" + # Skeleton may parse basic patterns + assert isinstance(result["all_estimates"], list) + + def test_no_efw_in_text(self): + """Test handling when no EFW is found.""" + text = "Fetal Biometry\nHC 250 mm" + + result = parse_estimated_fetal_weight(text, "viewpoint_text") + + assert result["efw_grams"] is None + + +# --------------------------------------------------------------------- +# ViewPoint HL7 (Skeleton) +# --------------------------------------------------------------------- + + +class TestEstimatedFetalWeightViewPointHL7: + def test_skeleton_returns_empty(self): + """Test that HL7 skeleton returns empty result.""" + hl7 = "MSH|...\nOBX|..." 
+ + result = parse_estimated_fetal_weight(hl7, "viewpoint_hl7") + + assert result["source_format"] == "viewpoint_hl7" + assert result["efw_grams"] is None + + +# --------------------------------------------------------------------- +# Edge Cases +# --------------------------------------------------------------------- + + +class TestEstimatedFetalWeightEdgeCases: + def test_invalid_format(self): + """Test that invalid format raises ValueError.""" + with pytest.raises(ValueError): + parse_estimated_fetal_weight("data", "invalid_format") + + def test_non_string_viewpoint_text(self): + """Test that non-string viewpoint_text raises ValueError.""" + with pytest.raises(ValueError): + parse_estimated_fetal_weight({"not": "string"}, "viewpoint_text") + + def test_non_string_viewpoint_hl7(self): + """Test that non-string viewpoint_hl7 raises ValueError.""" + with pytest.raises(ValueError): + parse_estimated_fetal_weight({"not": "string"}, "viewpoint_hl7") + + def test_boundary_aga_at_10_percentile(self): + """Test AGA classification at exactly 10th percentile.""" + data = { + "fetuses": [ + { + "efws": [ + { + "label": "EFW", + "value": 900.0, + "calculated_percentile": 10.0, + "print_in_report": 1, + } + ] + } + ] + } + + result = parse_estimated_fetal_weight(data, "observer_json") + assert result["growth_category"] == "AGA" + assert result["within_normal_range"] is True + + def test_boundary_aga_at_90_percentile(self): + """Test AGA classification at exactly 90th percentile.""" + data = { + "fetuses": [ + { + "efws": [ + { + "label": "EFW", + "value": 2000.0, + "calculated_percentile": 90.0, + "print_in_report": 1, + } + ] + } + ] + } + + result = parse_estimated_fetal_weight(data, "observer_json") + assert result["growth_category"] == "AGA" + assert result["within_normal_range"] is True diff --git a/tests/etl/sections/test_fetal_anatomy.py b/tests/etl/sections/test_fetal_anatomy.py new file mode 100644 index 0000000..d2f42ff --- /dev/null +++ 
b/tests/etl/sections/test_fetal_anatomy.py @@ -0,0 +1,219 @@ +"""Tests for fetal anatomy section parser.""" + +import json +import pytest + +from prenatalppkt.etl.sections.fetal_anatomy import parse_fetal_anatomy + + +# --------------------------------------------------------------------- +# Observer JSON +# --------------------------------------------------------------------- + + +class TestFetalAnatomyObserver: + def test_basic_anatomy_structures(self): + """Test parsing of normal/abnormal/unseen structures.""" + data = { + "fetuses": [ + { + "fetus": { + "anatomy_text": "The fetal anatomy was assessed.", + "anatomy": [ + { + "main": {"label": "Head", "anat_state": "Normal"}, + "detail": [], + "anomalies": [], + }, + { + "main": {"label": "Heart", "anat_state": "Abnormal"}, + "detail": [], + "anomalies": [], + }, + { + "main": {"label": "Spine", "anat_state": "Unseen"}, + "detail": [], + "anomalies": [], + }, + ], + } + } + ] + } + + result = parse_fetal_anatomy(data, "observer_json") + + assert "Head" in result["normal_structures"] + assert "Heart" in result["abnormal_structures"] + assert "Spine" in result["not_visualized"] + assert result["anatomy_text"] == "The fetal anatomy was assessed." 
+ assert result["source_format"] == "observer_json" + + def test_anatomy_with_anomalies(self): + """Test parsing of specific anomaly descriptions.""" + data = { + "fetuses": [ + { + "fetus": { + "anatomy_text": "", + "anatomy": [ + { + "main": {"label": "Head", "anat_state": "Abnormal"}, + "detail": [ + { + "label": "Cerebellum", + "anat_det_state": "Abnormal", + } + ], + "anomalies": [ + { + "description": "Dandy Walker", + "abnormal_or_normal_variant": "Abnormal", + } + ], + } + ], + } + } + ] + } + + result = parse_fetal_anatomy(data, "observer_json") + + assert "Head" in result["abnormal_structures"] + assert "Cerebellum" in result["abnormal_structures"] + assert len(result["anomalies"]) == 1 + assert result["anomalies"][0]["structure"] == "Head" + assert result["anomalies"][0]["description"] == "Dandy Walker" + assert result["anomalies"][0]["variant_type"] == "Abnormal" + + def test_anatomy_with_hpo_extraction(self, hpo_cr): + """Test HPO term extraction from anomaly descriptions.""" + data = { + "fetuses": [ + { + "fetus": { + "anatomy_text": "Findings consistent with Dandy-Walker malformation.", + "anatomy": [ + { + "main": {"label": "Brain", "anat_state": "Abnormal"}, + "detail": [], + "anomalies": [ + {"description": "Ventriculomegaly noted"} + ], + } + ], + } + } + ] + } + + result = parse_fetal_anatomy(data, "observer_json", hpo_cr=hpo_cr) + + # Should find HPO terms from the combined text + assert len(result["hpo_terms"]) > 0 + hpo_ids = [t.hpo_id for t in result["hpo_terms"]] + # Dandy-Walker malformation is HP:0001305 + assert "HP:0001305" in hpo_ids or "HP:0002119" in hpo_ids # Ventriculomegaly + + def test_anatomy_json_string_input(self): + """Test that JSON string input is handled correctly.""" + data = json.dumps( + { + "fetuses": [ + { + "fetus": { + "anatomy_text": "Normal anatomy.", + "anatomy": [ + {"main": {"label": "Face", "anat_state": "Normal"}} + ], + } + } + ] + } + ) + + result = parse_fetal_anatomy(data, "observer_json") + + assert 
"Face" in result["normal_structures"] + + def test_empty_fetuses(self): + """Test handling of empty fetuses array.""" + data = {"fetuses": []} + + result = parse_fetal_anatomy(data, "observer_json") + + assert result["normal_structures"] == [] + assert result["abnormal_structures"] == [] + assert result["anomalies"] == [] + + def test_missing_anatomy_key(self): + """Test handling of fetus without anatomy key.""" + data = {"fetuses": [{"fetus": {"anatomy_text": "Some text."}}]} + + result = parse_fetal_anatomy(data, "observer_json") + + assert result["anatomy_text"] == "Some text." + assert result["normal_structures"] == [] + + +# --------------------------------------------------------------------- +# ViewPoint Text (Skeleton) +# --------------------------------------------------------------------- + + +class TestFetalAnatomyViewPointText: + def test_skeleton_returns_empty_structures(self): + """Test that skeleton implementation returns expected structure.""" + text = """Fetal Anatomy +============= +The following structures appear normal: +Cranium. Brain. Face. +""" + + result = parse_fetal_anatomy(text, "viewpoint_text") + + assert result["source_format"] == "viewpoint_text" + assert isinstance(result["normal_structures"], list) + assert isinstance(result["abnormal_structures"], list) + # Skeleton extracts anatomy_text but doesn't parse structure lists yet + assert "normal" in result["anatomy_text"].lower() + + +# --------------------------------------------------------------------- +# ViewPoint HL7 (Skeleton) +# --------------------------------------------------------------------- + + +class TestFetalAnatomyViewPointHL7: + def test_skeleton_returns_empty(self): + """Test that HL7 skeleton returns empty result.""" + hl7 = "MSH|...\nOBX|..." 
+ + result = parse_fetal_anatomy(hl7, "viewpoint_hl7") + + assert result["source_format"] == "viewpoint_hl7" + assert result["normal_structures"] == [] + assert result["anatomy_text"] == "" + + +# --------------------------------------------------------------------- +# Edge Cases +# --------------------------------------------------------------------- + + +class TestFetalAnatomyEdgeCases: + def test_invalid_format(self): + """Test that invalid format raises ValueError.""" + with pytest.raises(ValueError): + parse_fetal_anatomy("data", "invalid_format") + + def test_non_string_viewpoint_text(self): + """Test that non-string viewpoint_text raises ValueError.""" + with pytest.raises(ValueError): + parse_fetal_anatomy({"not": "string"}, "viewpoint_text") + + def test_non_string_viewpoint_hl7(self): + """Test that non-string viewpoint_hl7 raises ValueError.""" + with pytest.raises(ValueError): + parse_fetal_anatomy({"not": "string"}, "viewpoint_hl7") diff --git a/tests/etl/sections/test_fetal_ratios.py b/tests/etl/sections/test_fetal_ratios.py new file mode 100644 index 0000000..b5acd66 --- /dev/null +++ b/tests/etl/sections/test_fetal_ratios.py @@ -0,0 +1,271 @@ +"""Tests for fetal ratios section parser.""" + +import json +import pytest + +from prenatalppkt.etl.sections.fetal_ratios import parse_fetal_ratios + + +# --------------------------------------------------------------------- +# Observer JSON +# --------------------------------------------------------------------- + + +class TestFetalRatiosObserver: + def test_basic_ratios(self): + """Test parsing of basic ratio data.""" + data = { + "fetuses": [ + { + "ratios": [ + { + "label": "HC/AC", + "value": 1.105, + "range": "1.04 - 1.22", + "fetus_number": 1, + }, + { + "label": "FL/AC", + "value": 22.149, + "range": "20 - 24", + "fetus_number": 1, + }, + { + "label": "FL/BPD", + "value": 75, + "range": "71 - 87", + "fetus_number": 1, + }, + ] + } + ] + } + + result = parse_fetal_ratios(data, "observer_json") + + 
assert len(result["ratios"]) == 3 + assert result["all_within_range"] is True + assert result["proportionality_assessment"] == "Normal" + assert result["source_format"] == "observer_json" + + # Check specific ratio + hc_ac = next(r for r in result["ratios"] if r["name"] == "HC/AC") + assert hc_ac["value"] == 1.105 + assert hc_ac["expected_range"] == (1.04, 1.22) + assert hc_ac["within_range"] is True + + def test_ratio_out_of_range(self): + """Test detection of out-of-range ratio.""" + data = { + "fetuses": [ + { + "ratios": [ + { + "label": "HC/AC", + "value": 1.35, # Above normal range + "range": "1.04 - 1.22", + "fetus_number": 1, + } + ] + } + ] + } + + result = parse_fetal_ratios(data, "observer_json") + + assert result["all_within_range"] is False + assert result["proportionality_assessment"] == "Asymmetric" + assert result["ratios"][0]["within_range"] is False + + def test_asymmetric_growth_detection(self): + """Test asymmetric growth pattern detection via HC/AC.""" + data = { + "fetuses": [ + { + "ratios": [ + { + "label": "HC/AC", + "value": 0.95, # Below normal - head-sparing + "range": "1.04 - 1.22", + }, + { + "label": "FL/BPD", + "value": 80, # Within range + "range": "71 - 87", + }, + ] + } + ] + } + + result = parse_fetal_ratios(data, "observer_json") + + assert result["proportionality_assessment"] == "Asymmetric" + + def test_json_string_input(self): + """Test that JSON string input is handled correctly.""" + data = json.dumps( + { + "fetuses": [ + { + "ratios": [ + {"label": "HC/AC", "value": 1.1, "range": "1.04 - 1.22"} + ] + } + ] + } + ) + + result = parse_fetal_ratios(data, "observer_json") + + assert len(result["ratios"]) == 1 + + def test_empty_fetuses(self): + """Test handling of empty fetuses array.""" + data = {"fetuses": []} + + result = parse_fetal_ratios(data, "observer_json") + + assert result["ratios"] == [] + assert result["all_within_range"] is None + + def test_missing_ratios_key(self): + """Test handling of fetus without ratios 
key.""" + data = {"fetuses": [{"fetus": {}}]} + + result = parse_fetal_ratios(data, "observer_json") + + assert result["ratios"] == [] + + def test_ratio_without_range(self): + """Test handling of ratio without expected range.""" + data = { + "fetuses": [ + { + "ratios": [ + { + "label": "HC/AC", + "value": 1.1, + "range": "", # Empty range + } + ] + } + ] + } + + result = parse_fetal_ratios(data, "observer_json") + + assert result["ratios"][0]["expected_range"] is None + assert result["ratios"][0]["within_range"] is None + + def test_boundary_values(self): + """Test boundary values at exactly min and max of range.""" + data = { + "fetuses": [ + { + "ratios": [ + { + "label": "HC/AC", + "value": 1.04, + "range": "1.04 - 1.22", + }, # At min + {"label": "FL/AC", "value": 24, "range": "20 - 24"}, # At max + ] + } + ] + } + + result = parse_fetal_ratios(data, "observer_json") + + assert all(r["within_range"] for r in result["ratios"]) + + +# --------------------------------------------------------------------- +# ViewPoint Text (Skeleton) +# --------------------------------------------------------------------- + + +class TestFetalRatiosViewPointText: + def test_skeleton_parses_ratio_pattern(self): + """Test that skeleton can parse basic ratio patterns.""" + text = """Fetal Biometry +============ +FL / HC 0.23 +""" + + result = parse_fetal_ratios(text, "viewpoint_text") + + assert result["source_format"] == "viewpoint_text" + # Skeleton may parse the FL/HC ratio + assert isinstance(result["ratios"], list) + + def test_no_ratios_in_text(self): + """Test handling when no ratios are found.""" + text = "Fetal Biometry\nHC 250 mm" + + result = parse_fetal_ratios(text, "viewpoint_text") + + assert result["ratios"] == [] + + +# --------------------------------------------------------------------- +# ViewPoint HL7 (Skeleton) +# --------------------------------------------------------------------- + + +class TestFetalRatiosViewPointHL7: + def test_skeleton_returns_empty(self): 
+ """Test that HL7 skeleton returns empty result.""" + hl7 = "MSH|...\nOBX|..." + + result = parse_fetal_ratios(hl7, "viewpoint_hl7") + + assert result["source_format"] == "viewpoint_hl7" + assert result["ratios"] == [] + + +# --------------------------------------------------------------------- +# Edge Cases +# --------------------------------------------------------------------- + + +class TestFetalRatiosEdgeCases: + def test_invalid_format(self): + """Test that invalid format raises ValueError.""" + with pytest.raises(ValueError): + parse_fetal_ratios("data", "invalid_format") + + def test_non_string_viewpoint_text(self): + """Test that non-string viewpoint_text raises ValueError.""" + with pytest.raises(ValueError): + parse_fetal_ratios({"not": "string"}, "viewpoint_text") + + def test_non_string_viewpoint_hl7(self): + """Test that non-string viewpoint_hl7 raises ValueError.""" + with pytest.raises(ValueError): + parse_fetal_ratios({"not": "string"}, "viewpoint_hl7") + + def test_malformed_range_string(self): + """Test handling of malformed range string.""" + data = { + "fetuses": [ + {"ratios": [{"label": "HC/AC", "value": 1.1, "range": "invalid"}]} + ] + } + + result = parse_fetal_ratios(data, "observer_json") + + assert result["ratios"][0]["expected_range"] is None + + def test_integer_ratio_value(self): + """Test that integer ratio values are handled.""" + data = { + "fetuses": [ + {"ratios": [{"label": "FL/BPD", "value": 75, "range": "71 - 87"}]} + ] + } + + result = parse_fetal_ratios(data, "observer_json") + + assert result["ratios"][0]["value"] == 75 + assert result["ratios"][0]["within_range"] is True diff --git a/tests/etl/sections/test_pregnancy_dating.py b/tests/etl/sections/test_pregnancy_dating.py new file mode 100644 index 0000000..74e68f0 --- /dev/null +++ b/tests/etl/sections/test_pregnancy_dating.py @@ -0,0 +1,106 @@ +import json +import pytest + +from prenatalppkt.etl.sections.pregnancy_dating import parse_pregnancy_dating + + +# 
--------------------------------------------------------------------- +# Observer JSON +# --------------------------------------------------------------------- + + +class TestPregnancyDatingObserver: + def test_basic_lmp_and_edd(self): + data = json.dumps( + {"exam": {"lmp": "2025-01-15", "edd": "2025-10-22", "dating_method": "LMP"}} + ) + + result = parse_pregnancy_dating(data, "observer_json") + + assert result["lmp"] == "2025-01-15" + assert result["edd"] == "2025-10-22" + assert result["ga_by_lmp"] is None + assert result["source_format"] == "observer_json" + + def test_missing_dates(self): + data = json.dumps({"exam": {}}) + result = parse_pregnancy_dating(data, "observer_json") + assert result["lmp"] is None + assert result["ga_by_lmp"] is None + + +# --------------------------------------------------------------------- +# ViewPoint Text +# --------------------------------------------------------------------- + + +class TestPregnancyDatingViewPointText: + def test_basic_dating_section(self): + text = """Dating +====== +LMP 01/15/2025 +EDD by LMP 10/22/2025 +Assigned dating based on LMP +""" + + result = parse_pregnancy_dating(text, "viewpoint_text") + + assert result["lmp"] == "2025-01-15" + assert result["edd"] == "2025-10-22" + assert result["ga_by_lmp"] is None + assert "Assigned" in result["dating_method"] + + def test_missing_dating_section(self): + text = "Fetal Biometry\n============\nHC 175 mm" + result = parse_pregnancy_dating(text, "viewpoint_text") + assert result["lmp"] is None + assert result["edd"] is None + + +# --------------------------------------------------------------------- +# ViewPoint HL7 +# --------------------------------------------------------------------- + + +class TestPregnancyDatingViewPointHL7: + def test_basic_hl7_dates(self): + hl7 = ( + "OBX||DT|EpisodeHistory.LastMenstrualPeriod^LMP|1|20250115\n" + "OBX||DT|EpisodeHistory.EDDbyLMP^EDD|1|20251022\n" + ) + + result = parse_pregnancy_dating(hl7, "viewpoint_hl7") + + assert 
result["lmp"] == "2025-01-15" + assert result["edd"] == "2025-10-22" + assert result["ga_by_lmp"] is None + + def test_no_dates(self): + hl7 = "OBX||NM|SomeOtherField|1|123\n" + result = parse_pregnancy_dating(hl7, "viewpoint_hl7") + assert result["lmp"] is None + assert result["ga_by_lmp"] is None + + +# --------------------------------------------------------------------- +# Edge Cases +# --------------------------------------------------------------------- + + +class TestPregnancyDatingEdgeCases: + def test_invalid_format(self): + with pytest.raises(ValueError): + parse_pregnancy_dating("data", "bad_format") + + def test_non_string_text(self): + with pytest.raises(ValueError): + parse_pregnancy_dating({"bad": "data"}, "viewpoint_text") + + def test_malformed_dates(self): + text = """Dating + ====== + LMP not-a-date + """ + result = parse_pregnancy_dating(text, "viewpoint_text") + assert result["lmp"] is None + assert result["ga_by_lmp"] is None