Skip to content

Commit 7731f0f

Browse files
committed
feat: add phenotype to LLM query
1 parent 1edc3f9 commit 7731f0f

File tree

4 files changed

+69
-12
lines changed

4 files changed

+69
-12
lines changed

backend/api/services/anvil_config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
ANVIL_FACET_NAME_MAPPING: Dict[str, str] = {
1212
# Primary facets
1313
"Diagnosis": "diagnoses.disease",
14+
"Phenotype": "diagnoses.phenotype",
1415
"Reported Ethnicity": "donors.reported_ethnicity",
1516
"File Format": "files.file_format",
1617
"Anatomical Site": "biosamples.anatomical_site",
@@ -24,7 +25,6 @@
2425
# Unmatched terms - no OpenSearch mapping
2526
"unmatched": "unmatched",
2627
# Legacy mappings (for backwards compatibility)
27-
"Phenotype": "diagnoses.phenotype",
2828
"Data Use Permission": "datasets.data_use_permission",
2929
"Activity Type": "activities.activity_type",
3030
"Assay Type": "activities.assay_type",

backend/api/services/llm_mention_extractor.py

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -28,17 +28,26 @@
2828
independent of the technology or methods used to produce the information.
2929
3030
- Diagnosis:
31-
A disease, condition, phenotypic abnormality, or clinical finding reported in an entity.
31+
A disease or individual phenotypic feature.
3232
This includes:
33-
* Diseases (e.g., diabetes, cancer, Alzheimer's disease)
34-
* Clinical conditions and phenotypes (e.g., accessory oral frenulum, cleft palate)
35-
* HPO (Human Phenotype Ontology) terms describing abnormalities
33+
* Named diseases (e.g., diabetes, cancer, Alzheimer's disease, autism)
3634
* MONDO disease terms
37-
* Any medical/clinical abnormality or pathological condition
35+
* Individual HPO phenotypic features (e.g., cleft palate, seizure, hypotonia)
36+
* Single observable characteristics or abnormalities
3837
39-
IMPORTANT: Even if a term contains anatomical words (e.g., "oral", "cardiac"),
40-
it should be categorized as Diagnosis if it describes an abnormality or condition,
41-
not Anatomical Site.
38+
IMPORTANT: Use Diagnosis for both named diseases AND individual phenotypic traits.
39+
Only use Phenotype for complex phenotype syndromes.
40+
41+
- Phenotype:
42+
Complex phenotype syndromes and named phenotypic conditions.
43+
This includes:
44+
* Complex phenotype syndromes (e.g., Coffin-Siris syndrome, Epileptic Encephalopathy)
45+
* Named phenotypic conditions (e.g., Agenesis of the Corpus Callosum)
46+
* Multi-feature phenotypic presentations
47+
* Specific syndrome names
48+
49+
IMPORTANT: Use Phenotype only for complex/named syndromes. For individual features
50+
like "cleft palate" or "seizure", use Diagnosis instead.
4251
4352
- Organism Type:
4453
A human-readable reference to the organism type.
@@ -73,9 +82,11 @@
7382
Instructions:
7483
- Extract exact substrings from the query.
7584
- Assign mentions to the most appropriate facet listed above.
76-
- When choosing between Diagnosis and Anatomical Site: if the term describes an abnormality,
77-
condition, or disease, choose Diagnosis. Only use Anatomical Site for normal body parts
78-
where samples are collected.
85+
- When choosing between Diagnosis and Phenotype: use Diagnosis for diseases AND individual
86+
phenotypic features (like "cleft palate", "seizure"). Use Phenotype only for complex
87+
syndrome names (like "Coffin-Siris syndrome", "Epileptic Encephalopathy").
88+
- When choosing between Diagnosis/Phenotype and Anatomical Site: use Diagnosis/Phenotype
89+
for abnormalities; use Anatomical Site only for normal body parts where samples are collected.
7990
- If a meaningful term does not map to any facet, use facet = 'unmatched'.
8091
- Do not invent new facet names.
8192
""".strip()
@@ -86,6 +97,7 @@
8697
"Consent Group",
8798
"Data Modality",
8899
"Diagnosis",
100+
"Phenotype",
89101
"Organism Type",
90102
"Reported Ethnicity",
91103
"Phenotypic Sex",

backend/api/tests/mock_llm_extractor.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,14 @@ class MockLLMMentionExtractor:
2020
(r"\bcancer\b", "cancer", "Diagnosis"),
2121
(r"\bals\b", "als", "Diagnosis"),
2222
(r"\btype\s+2\s+diabetes\b", "type 2 diabetes", "Diagnosis"),
23+
# Individual phenotypic features (go in Diagnosis)
24+
(r"\bcleft\s+palate\b", "cleft palate", "Diagnosis"),
25+
(r"\bseizure\b", "seizure", "Diagnosis"),
26+
(r"\bhypotonia\b", "hypotonia", "Diagnosis"),
27+
# Complex phenotype syndromes (go in Phenotype); patterns are lowercase because the query is lowercased
28+
(r"\bcoffin-siris\s+syndrome\b", "Coffin-Siris syndrome", "Phenotype"),
29+
(r"\bepileptic\s+encephalopathy\b", "Epileptic Encephalopathy", "Phenotype"),
30+
(r"\bagenesis\s+of\s+(?:the\s+)?corpus\s+callosum\b", "agenesis of the corpus callosum", "Phenotype"),
2331
# Ethnicity
2432
(r"\blatino\b", "latino", "Reported Ethnicity"),
2533
(r"\bhispanic\b", "hispanic", "Reported Ethnicity"),

backend/api/tests/test_llm_extraction.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import pytest
33

44
from services.normalization_service import Mention
5+
from services.anvil_config import get_anvil_facet_mapping
56
from tests.mock_llm_extractor import MockLLMMentionExtractor
67

78

@@ -11,6 +12,12 @@ def mock_extractor() -> MockLLMMentionExtractor:
1112
return MockLLMMentionExtractor()
1213

1314

15+
@pytest.fixture
16+
def mock_extractor_with_mapping() -> MockLLMMentionExtractor:
17+
"""Fixture providing a mock LLM extractor with facet name mapping."""
18+
return MockLLMMentionExtractor(facet_name_mapping=get_anvil_facet_mapping())
19+
20+
1421
def test_simple_query_diabetes(mock_extractor: MockLLMMentionExtractor) -> None:
1522
"""Test extraction from simple query with one term."""
1623
query = "diabetes"
@@ -184,3 +191,33 @@ def test_phenotypic_sex_extraction(mock_extractor: MockLLMMentionExtractor) -> N
184191
mentions = mock_extractor.extract_mentions(query)
185192

186193
assert any(m.text == "male" and m.facet == "Phenotypic Sex" for m in mentions)
194+
195+
196+
def test_individual_phenotype_feature_as_diagnosis(mock_extractor_with_mapping: MockLLMMentionExtractor) -> None:
197+
"""Test that individual phenotypic features are extracted as Diagnosis."""
198+
query = "patients with cleft palate"
199+
200+
mentions = mock_extractor_with_mapping.extract_mentions(query)
201+
202+
assert any(m.text == "cleft palate" and m.facet == "diagnoses.disease" for m in mentions)
203+
204+
205+
def test_complex_phenotype_syndrome(mock_extractor_with_mapping: MockLLMMentionExtractor) -> None:
206+
"""Test that complex phenotype syndromes are extracted as Phenotype."""
207+
query = "patients with Coffin-Siris syndrome"
208+
209+
mentions = mock_extractor_with_mapping.extract_mentions(query)
210+
211+
assert any(m.text == "Coffin-Siris syndrome" and m.facet == "diagnoses.phenotype" for m in mentions)
212+
213+
214+
def test_diagnosis_vs_phenotype_distinction(mock_extractor_with_mapping: MockLLMMentionExtractor) -> None:
215+
"""Test that the mock extractor distinguishes between Diagnosis and Phenotype."""
216+
query = "patients with diabetes and Epileptic Encephalopathy"
217+
218+
mentions = mock_extractor_with_mapping.extract_mentions(query)
219+
220+
# diabetes should be Diagnosis (diagnoses.disease)
221+
assert any(m.text == "diabetes" and m.facet == "diagnoses.disease" for m in mentions)
222+
# Epileptic Encephalopathy should be Phenotype (diagnoses.phenotype)
223+
assert any(m.text == "Epileptic Encephalopathy" and m.facet == "diagnoses.phenotype" for m in mentions)

0 commit comments

Comments
 (0)